12 files changed, 371 insertions, 167 deletions
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 67db607f6..2238727c5 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -228,6 +228,23 @@ struct SamplerComparisonState
 
 // Texture
 
+struct ITexture1D
+{
+    virtual void Load(const int2& v, void* out) = 0;
+    virtual void Sample(SamplerState samplerState, float loc, void* out) = 0;
+    virtual void SampleLevel(SamplerState samplerState, float loc, float level, void* out) = 0;
+};
+
+template <typename T>
+struct Texture1D
+{
+    T Load(const int2& v) const { T out; texture->Load(v, &out); return out; }
+    T Sample(SamplerState samplerState, float v) const { T out; texture->Sample(samplerState, v, &out); return out; }
+    T SampleLevel(SamplerState samplerState, float v, float level) { T out; texture->SampleLevel(samplerState, v, level, &out); return out; }
+    
+    ITexture1D* texture;              
+};
+
 struct ITexture2D
 {
     virtual void Load(const int3& v, void* out) = 0;
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 6efb383fa..450cc4512 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -897,19 +897,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
-                    if (kBaseTextureTypes[tt].coordCount == 1)
-                    {
-                        sb << ", $2";
-                    }
-                    else
+                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                     {
-                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                        sb << ", ($2)";
+                        if (kBaseTextureTypes[tt].coordCount > 1)
                         {
-                            sb << ", ($2)." << char(i + 'x');
+                            sb << '.' << char(i + 'x');
                         }
                     }
                     sb << ")\")\n";
                 }
+                else
+                {
+                    sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+                }
 
                 sb << "T Sample(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1028,7 +1029,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
                 }
 
-
+                // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on
+                // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+                // I could just assume 0 for elements not supplied, and ignore z. For now will just leave                  
                 sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
                 sb << "T SampleGrad(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1053,23 +1056,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 // CUDA
                 if (!isArray)
                 {
-                    sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
-                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                    if( baseShape != TextureFlavor::Shape::ShapeCube )
                     {
-                        sb << ", $2";
-                        if (kBaseTextureTypes[tt].coordCount > 1)
+                        sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                         {
-                            sb << '.' << char(i + 'x');
+                            sb << ", ($2)";
+                            if (kBaseTextureTypes[tt].coordCount > 1)
+                            {
+                                sb << '.' << char(i + 'x');
+                            }
                         }
+                        sb << ", $3)\")\n";
+                    }
+                    else
+                    {
+                        sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
                     }
-                    sb << ", $3)\")\n";
                 }
 
                 sb << "T SampleLevel(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
                 sb << "float level);\n";
 
-                
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1145,6 +1154,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     EMIT_LINE_DIRECTIVE();
 
                     sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+                    if (kBaseTextureTypes[tt].coordCount == 2)
+                    {
+                        // Gather only works on 2D in CUDA
+                        // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+                        sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+                    }
                     sb << outputType << " Gather" << componentName << "(SamplerState s, ";
                     sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
 
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index 5f185ca8f..cca8f2e51 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -918,19 +918,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
-                    if (kBaseTextureTypes[tt].coordCount == 1)
-                    {
-                        sb << ", $2";
-                    }
-                    else
+                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                     {
-                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                        sb << ", ($2)";
+                        if (kBaseTextureTypes[tt].coordCount > 1)
                         {
-                            sb << ", ($2)." << char(i + 'x');
+                            sb << '.' << char(i + 'x');
                         }
                     }
                     sb << ")\")\n";
                 }
+                else
+                {
+                    sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+                }
 
                 sb << "T Sample(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1049,7 +1050,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
                 }
 
-
+                // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on
+                // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+                // I could just assume 0 for elements not supplied, and ignore z. For now will just leave                  
                 sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
                 sb << "T SampleGrad(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1074,23 +1077,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 // CUDA
                 if (!isArray)
                 {
-                    sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
-                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                    if( baseShape != TextureFlavor::Shape::ShapeCube )
                     {
-                        sb << ", $2";
-                        if (kBaseTextureTypes[tt].coordCount > 1)
+                        sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                         {
-                            sb << '.' << char(i + 'x');
+                            sb << ", ($2)";
+                            if (kBaseTextureTypes[tt].coordCount > 1)
+                            {
+                                sb << '.' << char(i + 'x');
+                            }
                         }
+                        sb << ", $3)\")\n";
+                    }
+                    else
+                    {
+                        sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
                     }
-                    sb << ", $3)\")\n";
                 }
 
                 sb << "T SampleLevel(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
                 sb << "float level);\n";
 
-                
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1166,6 +1175,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     EMIT_LINE_DIRECTIVE();
 
                     sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+                    if (kBaseTextureTypes[tt].coordCount == 2)
+                    {
+                        // Gather only works on 2D in CUDA
+                        // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+                        sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+                    }
                     sb << outputType << " Gather" << componentName << "(SamplerState s, ";
                     sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
 
@@ -1299,7 +1314,7 @@ for (auto op : binaryOps)
         sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
     }
 }
-SLANG_RAW("#line 1281 \"core.meta.slang\"")
+SLANG_RAW("#line 1296 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index f8ae340bc..f7707cc6d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -347,8 +347,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x)
 
 // Barrier for writes to all memory spaces (HLSL SM 5.0)
 __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
-// TODO(JS): Doesn't seem to be weaker form of sync, so use this?
-__target_intrinsic(cuda, "__syncthreads()")
+__target_intrinsic(cuda, "__threadfence()")
 void AllMemoryBarrier();
 
 // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)
@@ -648,6 +647,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int> T determinant(matrix<T,N,
 
 // Barrier for device memory
 __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
+__target_intrinsic(cuda, "__threadfence()")
 void DeviceMemoryBarrier();
 
 __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()")
@@ -814,6 +814,7 @@ float2 GetRenderTargetSamplePosition(int Index);
 
 // Group memory barrier
 __target_intrinsic(glsl, "groupMemoryBarrier")
+__target_intrinsic(cuda, "__threadfence_block")
 void GroupMemoryBarrier();
 
 
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 215d18670..c0b875df5 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -396,8 +396,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri
 SLANG_RAW("\n")
 SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
-SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n")
-SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
 SLANG_RAW("void AllMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n")
@@ -724,6 +723,7 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> T determinant(
 SLANG_RAW("\n")
 SLANG_RAW("// Barrier for device memory\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
 SLANG_RAW("void DeviceMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n")
@@ -890,6 +890,7 @@ SLANG_RAW("float2 GetRenderTargetSamplePosition(int Index);\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Group memory barrier\n")
 SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence_block\")\n")
 SLANG_RAW("void GroupMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
@@ -1641,7 +1642,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1568 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
diff --git a/source/slang/slang-ir-type-set.cpp b/source/slang/slang-ir-type-set.cpp
index a4ebf8242..e5271698c 100644
--- a/source/slang/slang-ir-type-set.cpp
+++ b/source/slang/slang-ir-type-set.cpp
@@ -115,50 +115,74 @@ IRInst* IRTypeSet::cloneInst(IRInst* inst)
             clone = m_builder.getStringValue(stringLit->getStringSlice());
             break;
         }
-        default:
+        case kIROp_VectorType:
         {
-            if (IRBasicType::isaImpl(inst->op))
+            auto vecType = static_cast<IRVectorType*>(inst);
+            const Index elementCount = Index(GetIntVal(vecType->getElementCount()));
+
+            if (elementCount <= 1)
             {
-                clone = m_builder.getType(inst->op);
+                clone = cloneType(vecType->getElementType());
             }
-            else
+            break;
+        }
+        case kIROp_MatrixType:
+        {
+            auto matType = static_cast<IRMatrixType*>(inst);
+            const Index columnCount = Index(GetIntVal(matType->getColumnCount()));
+            const Index rowCount = Index(GetIntVal(matType->getRowCount()));
+
+            if (columnCount <= 1 && rowCount <= 1)
             {
-                IRType* irType = dynamicCast<IRType>(inst);
-                if (irType)
-                {
-                    auto clonedType = cloneType(inst->getFullType());
-                    Index operandCount = Index(inst->getOperandCount());
+                clone = cloneType(matType->getElementType());
+            }
+            break;
+        }
+        default: break;
+    }
+
+    if (!clone)
+    {
+        if (IRBasicType::isaImpl(inst->op))
+        {
+            clone = m_builder.getType(inst->op);
+        }
+        else
+        {
+            IRType* irType = dynamicCast<IRType>(inst);
+            if (irType)
+            {
+                auto clonedType = cloneType(inst->getFullType());
+                Index operandCount = Index(inst->getOperandCount());
 
-                    List<IRInst*> cloneOperands;
-                    cloneOperands.setCount(operandCount);
+                List<IRInst*> cloneOperands;
+                cloneOperands.setCount(operandCount);
 
-                    for (Index i = 0; i < operandCount; ++i)
-                    {
-                        cloneOperands[i] = cloneInst(inst->getOperand(i));
-                    }
+                for (Index i = 0; i < operandCount; ++i)
+                {
+                    cloneOperands[i] = cloneInst(inst->getOperand(i));
+                }
 
-                    //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
+                //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
 
-                    UInt operandCounts[1] = { UInt(operandCount) };
-                    IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
+                UInt operandCounts[1] = { UInt(operandCount) };
+                IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
 
-                    clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
-                }
-                else
+                clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
+            }
+            else
+            {
+                // This cloning style only works on insts that are not unique
+                auto clonedType = cloneType(inst->getFullType());
+
+                Index operandCount = Index(inst->getOperandCount());
+                clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
+                for (Index i = 0; i < operandCount; ++i)
                 {
-                    // This cloning style only works on insts that are not unique
-                    auto clonedType = cloneType(inst->getFullType());
-
-                    Index operandCount = Index(inst->getOperandCount());
-                    clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
-                    for (Index i = 0; i < operandCount; ++i)
-                    {
-                        auto cloneOperand = cloneInst(inst->getOperand(i));
-                        clone->getOperands()[i].init(clone, cloneOperand);
-                    }
+                    auto cloneOperand = cloneInst(inst->getOperand(i));
+                    clone->getOperands()[i].init(clone, cloneOperand);
                 }
             }
-            break;
         }
     }
 
@@ -226,6 +250,10 @@ void IRTypeSet::getTypes(Kind kind, List<IRType*>& outTypes) const
 IRType* IRTypeSet::addVectorType(IRType* inElementType, int colsCount)
 {
     IRType* elementType = cloneType(inElementType);
+    if (colsCount == 1)
+    {
+        return elementType;
+    }
     return m_builder.getVectorType(elementType, m_builder.getIntValue(m_builder.getIntType(), colsCount));
 }
 
diff --git a/source/slang/slang-ir-type-set.h b/source/slang/slang-ir-type-set.h
index 09abdf2ad..958d71cf1 100644
--- a/source/slang/slang-ir-type-set.h
+++ b/source/slang/slang-ir-type-set.h
@@ -34,7 +34,10 @@ works, but probably needs to be handled in a better way. The better way may invo
 enabled in other code generation and making de-duping possible in emit code.
 
 Note that one pro for this approach is that it does not alter the source module. That as it stands it's not necessary
-for the source module to be immutable, because it is created for emitting and then discarded. 
+for the source module to be immutable, because it is created for emitting and then discarded.
+
+NOTE! That Vector<X, 1> or Matrix<X, 1, 1> will be turned into the type X.
+
  */
 class IRTypeSet
 {
diff --git a/tests/compute/texture-simple.slang b/tests/compute/texture-simple.slang
new file mode 100644
index 000000000..040af2784
--- /dev/null
+++ b/tests/compute/texture-simple.slang
@@ -0,0 +1,31 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil
+// TODO(JS): Doesn't work on vk currently
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute 
+
+//TEST_INPUT: Texture1D(size=4, content = one):name t1D
+Texture1D<float> t1D;
+//TEST_INPUT: Texture2D(size=4, content = one):name t2D
+Texture2D<float> t2D;
+
+//TEST_INPUT: Sampler:name samplerState
+SamplerState samplerState;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(4, 4, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = dispatchThreadID.x;
+    float u = idx * (1.0f / 4);
+    
+    float val = 0.0f;
+    val += t1D.SampleLevel(samplerState, u, 0);
+    val += t2D.SampleLevel(samplerState, float2(u, u), 0);
+ 
+    outputBuffer[idx] = val;
+}
diff --git a/tests/compute/texture-simple.slang.expected.txt b/tests/compute/texture-simple.slang.expected.txt
new file mode 100644
index 000000000..f5cf6fb10
--- /dev/null
+++ b/tests/compute/texture-simple.slang.expected.txt
@@ -0,0 +1,4 @@
+40000000
+40000000
+40000000
+40000000
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp
index 2bb0baf88..d0907482c 100644
--- a/tools/render-test/cpu-compute-util.cpp
+++ b/tools/render-test/cpu-compute-util.cpp
@@ -16,48 +16,106 @@ namespace renderer_test {
 using namespace Slang;
 
 template <int COUNT>
-struct OneTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D
+struct ValueTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D
 {
-    void setOne(void* out)
+    void set(void* out)
     {
         float* dst = (float*)out;
         for (int i = 0; i < COUNT; ++i)
         {
-            dst[i] = 1.0f;
+            dst[i] = m_value;
         }
     }
 
     virtual void Load(const CPPPrelude::int3& v, void* out) SLANG_OVERRIDE
     {
-        setOne(out);
+        set(out);
     }
     virtual void Sample(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, void* out) SLANG_OVERRIDE
     {
-        setOne(out);
+        set(out);
     }
     virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, float level, void* out) SLANG_OVERRIDE
     {
-        setOne(out);
+        set(out);
     }
 
-    OneTexture2D()
+    ValueTexture2D(float value):
+        m_value(value)
     {
         m_interface = static_cast<CPPPrelude::ITexture2D*>(this);
     }
+
+    float m_value;
 };
 
-static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
+template <int COUNT>
+struct ValueTexture1D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture1D
 {
-    switch (elemCount)
+    void set(void* out)
+    {
+        float* dst = (float*)out;
+        for (int i = 0; i < COUNT; ++i)
+        {
+            dst[i] = m_value;
+        }
+    }
+
+    virtual void Load(const CPPPrelude::int2& v, void* out) SLANG_OVERRIDE
     {
-        case 1: return new OneTexture2D<1>();
-        case 2: return new OneTexture2D<2>();
-        case 3: return new OneTexture2D<3>();
-        case 4: return new OneTexture2D<4>();
-        default: return nullptr;
+        set(out);
     }
+    virtual void Sample(CPPPrelude::SamplerState samplerState, float loc, void* out) SLANG_OVERRIDE
+    {
+        set(out);
+    }
+    virtual void SampleLevel(CPPPrelude::SamplerState samplerState, float loc, float level, void* out) SLANG_OVERRIDE
+    {
+        set(out);
+    }
+
+    ValueTexture1D(float value) :
+        m_value(value)
+    {
+        m_interface = static_cast<CPPPrelude::ITexture1D*>(this);
+    }
+
+    float m_value;
+};
+
+static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape baseShape, int elemCount, float value)
+{
+    switch (baseShape)
+    {
+        case SLANG_TEXTURE_1D:
+        {
+            switch (elemCount)
+            {
+                case 1: return new ValueTexture1D<1>(value);
+                case 2: return new ValueTexture1D<2>(value);
+                case 3: return new ValueTexture1D<3>(value);
+                case 4: return new ValueTexture1D<4>(value);
+                default: break;
+            }
+            break;
+        }
+        case SLANG_TEXTURE_2D:
+        {
+            switch (elemCount)
+            {
+                case 1: return new ValueTexture2D<1>(value);
+                case 2: return new ValueTexture2D<2>(value);
+                case 3: return new ValueTexture2D<3>(value);
+                case 4: return new ValueTexture2D<4>(value);
+                default: break;
+            }
+        }
+        default: break;
+    }
+    return nullptr;
 }
 
+
 /* static */SlangResult CPUComputeUtil::calcBindings(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext)
 {
     auto request = compilationAndLayout.output.request;
@@ -109,13 +167,16 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
 
                         //auto access = type->getResourceAccess();
 
-                        switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+                        auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+                        switch (baseShape)
                         {
+                            case SLANG_TEXTURE_1D:
                             case SLANG_TEXTURE_2D:
                             {
                                 SLANG_ASSERT(value->m_userIndex >= 0);
                                 auto& srcEntry = layout.entries[value->m_userIndex];
 
+                 
                                 // TODO(JS):
                                 // We should use the srcEntry to determine what data to store in the texture,
                                 // it's dimensions etc. For now we just support it being 1.
@@ -128,12 +189,23 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
                                     count = int(typeReflection->getElementCount());
                                 }
 
-                                // TODO(JS): Should use the input setup to work how to create this texture
-                                // Store the target specific value
-                                value->m_target = _newOneTexture2D(count);
+                                switch (srcEntry.textureDesc.content)
+                                {
+                                    case InputTextureContent::One:
+                                    {
+                                        value->m_target = _newValueTexture(baseShape, count, 1.0f);
+                                        break;                                        
+                                    }
+                                    case InputTextureContent::Zero:
+                                    {
+                                        value->m_target = _newValueTexture(baseShape, count, 0.0f);
+                                        break;
+                                    }
+                                    default: break;
+                                }
                                 break;
                             }
-                            case SLANG_TEXTURE_1D:
+                            
                             case SLANG_TEXTURE_3D:
                             case SLANG_TEXTURE_CUBE:
                             case SLANG_TEXTURE_BUFFER:
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index c6862d2d3..f471c2961 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -9,6 +9,7 @@
 #include "../bind-location.h"
 
 #include <cuda.h>
+
 #include <cuda_runtime_api.h>
 
 namespace renderer_test {
@@ -33,14 +34,11 @@ public:
     typedef RefObject Super;
 
         /// Dtor
-    CUDAResource(): m_cudaMemory(nullptr) {}
-    CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {}
-
     ~CUDAResource()
     {
         if (m_cudaMemory)
         {
-            SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+            SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
         }
     }
 
@@ -49,13 +47,13 @@ public:
         return value ? dynamic_cast<CUDAResource*>(value->m_target.Ptr()) : nullptr;
     }
         /// Helper function to get the cuda memory pointer when given a value
-    static void* getCUDAData(BindSet::Value* value)
+    static CUdeviceptr getCUDAData(BindSet::Value* value)
     {
         auto resource = getCUDAResource(value);
-        return resource ? resource->m_cudaMemory : nullptr;
+        return resource ? resource->m_cudaMemory : CUdeviceptr();
     }
 
-    void* m_cudaMemory;
+    CUdeviceptr m_cudaMemory = CUdeviceptr();
 };
 
 class CUDATextureResource : public RefObject
@@ -63,23 +61,12 @@ class CUDATextureResource : public RefObject
 public:
     typedef RefObject Super;
 
-    CUDATextureResource() {}
-    CUDATextureResource(CUtexObject cudaTexObj, CUdeviceptr cudaMemory, CUarray cudaArray):
-        m_cudaTexObj(cudaTexObj),
-        m_cudaMemory(cudaMemory),
-        m_cudaArray(cudaArray)
-    {
-    }
     ~CUDATextureResource()
     {
         if (m_cudaTexObj)
         {
             SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
         }
-        if (m_cudaMemory)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
-        }
         if (m_cudaArray)
         {
             SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
@@ -98,10 +85,8 @@ public:
         return resource ? resource->m_cudaTexObj : CUtexObject(0);
     }
 
-protected:
     // This is an opaque type, that's backed by a long long
     CUtexObject m_cudaTexObj = CUtexObject();
-    CUdeviceptr m_cudaMemory = CUdeviceptr();
     CUarray m_cudaArray = CUarray();
 };
 
@@ -140,7 +125,7 @@ public:
     {
         release();
         SLANG_ASSERT(m_stream == nullptr);
-        SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&m_stream, flags));
+        SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags));
         return SLANG_OK;
     }
 
@@ -148,7 +133,7 @@ public:
     {
         if (m_stream)
         {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(m_stream));
+            SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream));
         }
         else
         {
@@ -162,7 +147,7 @@ public:
         if (m_stream)
         {
             sync();
-            SLANG_CUDA_ASSERT_ON_FAIL(cudaStreamDestroy(m_stream));
+            SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream));
             m_stream = nullptr;
         }
     }
@@ -171,9 +156,9 @@ public:
 
     ~ScopeCUDAStream() { release(); }
 
-    operator cudaStream_t () const { return m_stream; }
+    operator CUstream () const { return m_stream; }
 
-    cudaStream_t m_stream;
+    CUstream m_stream;
 };
 
 
@@ -408,10 +393,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                     case slang::TypeReflection::Kind::ParameterBlock:
                     {
                         // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
-
-                        void* cudaMem = nullptr;
-                        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
-                        value->m_target = new CUDAResource(cudaMem);
+                        RefPtr<CUDAResource> resource = new CUDAResource;
+                        SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+                        value->m_target = resource;
                         break;
                     }
                     case slang::TypeReflection::Kind::Resource:
@@ -419,11 +403,15 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         auto type = typeLayout->getType();
                         auto shape = type->getResourceShape();
 
-                        //auto access = type->getResourceAccess();
+                        auto access = type->getResourceAccess();
 
-                        switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+                        auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+
+                        switch (baseShape)
                         {
+                            case SLANG_TEXTURE_1D:
                             case SLANG_TEXTURE_2D:
+                            case SLANG_TEXTURE_3D:
                             {
                                 SLANG_ASSERT(value->m_userIndex >= 0);
                                 auto& srcEntry = entries[value->m_userIndex];
@@ -439,18 +427,38 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                                 const auto& textureDesc = srcEntry.textureDesc;
 
                                 int width = textureDesc.size;
-                                int height = textureDesc.size;
+                                int height = 1;
+                                int depth = 1;
 
+                                switch (baseShape)
+                                {
+                                    case SLANG_TEXTURE_1D: break;
+                                    case SLANG_TEXTURE_2D:
+                                    {
+                                        height = textureDesc.size;
+                                        break;
+                                    }
+                                    case SLANG_TEXTURE_3D:
+                                    {
+                                        height = textureDesc.size;
+                                        depth = textureDesc.size;
+                                        break;
+                                    }
+                                }
+                                
                                 TextureData texData;
                                 generateTextureData(texData, textureDesc);
 
+                                RefPtr<CUDATextureResource> tex = new CUDATextureResource;
+
                                 size_t elementSize = 0;
 
-                                CUarray cudaArray;
                                 {
                                     CUDA_ARRAY_DESCRIPTOR arrayDesc;
                                     arrayDesc.Width = width;
-                                    arrayDesc.Height = height;
+
+                                    // Width, and Height are the width, and height of the CUDA array (in elements); the CUDA array is one-dimensional if height is 0, two-dimensional otherwise;
+                                    arrayDesc.Height = (baseShape == SLANG_TEXTURE_1D) ? 0 : height;
 
                                     switch (textureDesc.format)
                                     {
@@ -476,37 +484,44 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                                     }
 
                                     // Allocate the array
-                                    SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&cudaArray, &arrayDesc));
-                                }
-
-                                CUdeviceptr cudaMemory = (CUdeviceptr)nullptr;
-                                {
-                                    const size_t size = width  * height * elementSize;
-                                     // allocate device memory for result
-                                    SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&cudaMemory, size));
+                                    SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
                                 }
 
+                                switch (baseShape)
                                 {
-                                    CUDA_MEMCPY2D copyParam;
-                                    memset(&copyParam, 0, sizeof(copyParam));
-                                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                                    copyParam.dstArray = cudaArray;
-                                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
-                                    copyParam.srcHost = texData.dataBuffer[0].getBuffer();
-                                    copyParam.srcPitch = width * elementSize;
-                                    copyParam.WidthInBytes = copyParam.srcPitch;
-                                    copyParam.Height = height;
-                                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+                                    case SLANG_TEXTURE_1D:
+                                    case SLANG_TEXTURE_2D:
+                                    {
+                                        // TODO(JS):
+                                        // Not clear how the copy should be done for 1D, but seeing as it is copying to an 'array'
+                                        // doing it with cuMemcpy2D is appropriate.
+                                        // Not clear if the height should be 0 or 1. The array required it to be 0.
+                                        CUDA_MEMCPY2D copyParam;
+                                        memset(&copyParam, 0, sizeof(copyParam));
+                                        copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                                        copyParam.dstArray = tex->m_cudaArray;
+                                        copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                                        copyParam.srcHost = texData.dataBuffer[0].getBuffer();
+                                        copyParam.srcPitch = width * elementSize;
+                                        copyParam.WidthInBytes = copyParam.srcPitch; 
+                                        copyParam.Height = height;
+                                        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+                                        break;
+                                    }
+                                    case SLANG_TEXTURE_3D:
+                                    {
+                                        SLANG_ASSERT(!"Not implemented");
+                                        break;
+                                    }
                                 }
 
                                 // set texture parameters
 
-                                CUtexObject cudaTexObj;
                                 {
                                     CUDA_RESOURCE_DESC resDesc;
                                     memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
                                     resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-                                    resDesc.res.array.hArray = cudaArray;
+                                    resDesc.res.array.hArray = tex->m_cudaArray;
 
                                     CUDA_TEXTURE_DESC texDesc;
                                     memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
@@ -516,14 +531,13 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                                     texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
                                     texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
 
-                                    SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&cudaTexObj, &resDesc, &texDesc, nullptr));
+                                    SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
                                 }
 
-                                value->m_target = new CUDATextureResource(cudaTexObj, cudaMemory, cudaArray);
+                                value->m_target = tex;
                                 break;
                             }
-                            case SLANG_TEXTURE_1D:
-                            case SLANG_TEXTURE_3D:
+
                             case SLANG_TEXTURE_CUBE:
                             case SLANG_TEXTURE_BUFFER:
                             {
@@ -536,10 +550,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_STRUCTURED_BUFFER:
                             {
                                 // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
-                                void* cudaMem = nullptr;
-                                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
-                                value->m_target = new CUDAResource(cudaMem);
-
+                                RefPtr<CUDAResource> resource = new CUDAResource;
+                                SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+                                value->m_target = resource;
                                 break;
                             }
                         }
@@ -572,7 +585,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         auto elementCount = int(typeLayout->getElementCount());
                         if (elementCount == 0)
                         {
-                            CUDAComputeUtil::Array array = { nullptr, 0 };
+                            CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
                             auto resource = CUDAResource::getCUDAResource(value);
                             if (resource)
                             {
@@ -588,7 +601,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                     case slang::TypeReflection::Kind::ParameterBlock:
                     {
                         // These map down to just pointers
-                        *location.getUniform<void*>() = CUDAResource::getCUDAData(value);
+                        *location.getUniform<CUdeviceptr>() = CUDAResource::getCUDAData(value);
                         break;
                     }
                     case slang::TypeReflection::Kind::Resource:
@@ -602,7 +615,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         {
                             case SLANG_STRUCTURED_BUFFER:
                             {
-                                CUDAComputeUtil::StructuredBuffer buffer = { nullptr, 0 };
+                                CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
                                 auto resource = CUDAResource::getCUDAResource(value);
                                 if (resource)
                                 {
@@ -615,7 +628,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             }
                             case SLANG_BYTE_ADDRESS_BUFFER:
                             {
-                                CUDAComputeUtil::ByteAddressBuffer buffer = { nullptr, 0 };
+                                CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
 
                                 auto resource = CUDAResource::getCUDAResource(value);
                                 if (resource)
@@ -649,11 +662,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
             const auto& values = bindSet.getValues();
             for (BindSet::Value* value : values)
             {
-                void* cudaMem = CUDAResource::getCUDAData(value);
+                CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
                 if (value && value->m_data && cudaMem)
                 {
                     // Okay copy the data over...
-                    SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice));
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
                 }
             }
         }
@@ -670,8 +683,8 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
             SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
 
             // Work out the args
-            void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
-            void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+            CUdeviceptr uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
+            CUdeviceptr entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
 
             // NOTE! These are pointers to the cuda memory pointers
             void* args[] = { &entryPointCUDAData , &uniformCUDAData };
@@ -683,10 +696,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
             auto cudaLaunchResult = cuLaunchKernel(kernel,
                 dispatchSize[0], dispatchSize[1], dispatchSize[2], 
                 int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
-                0,                                                                                      // Shared memory size
-                cudaStream,                                                                                 // Stream. 0 is no stream.
-                args,                                                                                   // Args
-                nullptr);                                                                               // extra
+                0,              // Shared memory size
+                cudaStream,     // Stream. 0 is no stream.
+                args,           // Args
+                nullptr);       // extra
 
             SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
 
@@ -707,11 +720,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                 if (entry.isOutput)
                 {
                     // Copy back to CPU memory
-                    void* cudaMem = CUDAResource::getCUDAData(value);
+                   CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
                     if (value && value->m_data && cudaMem)
                     {
                         // Okay copy the data back...
-                        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost));
+                        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
                     }
                 }
             }
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index 8965f5037..f1ca65502 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -8,22 +8,26 @@
 
 namespace renderer_test {
 
+
 struct CUDAComputeUtil
 {
+    // Define here, so we don't need to include the cude header
+    typedef size_t CUdeviceptr;
+
         /// NOTE! MUST match up to definitions in the CUDA prelude
     struct ByteAddressBuffer
     {
-        void* data;
+        CUdeviceptr data;
         size_t sizeInBytes;
     };
     struct StructuredBuffer
     {
-        void* data;
+        CUdeviceptr data;
         size_t count;
     };
     struct Array
     {
-        void* data;
+        CUdeviceptr data;
         size_t count;
     };