15 files changed, 397 insertions, 67 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md
index a9b35d735..7fbc8c135 100644
--- a/docs/cuda-target.md
+++ b/docs/cuda-target.md
@@ -16,13 +16,14 @@ These limitations apply to Slang transpiling to CUDA.
 
 * Only supports the 'texture object' style binding (The texture object API is only supported on devices of compute capability 3.0 or higher. )
 * Samplers are not separate objects in CUDA - they are combined into a single 'TextureObject'. So samplers are effectively ignored on CUDA targets. 
-* Whilst there is tex1Dfetch there are no equivalents for higher dimensions - so such accesses are not currently supported
 * When using a TextureArray (layered texture in CUDA) - the index will be treated as an int, as this is all CUDA allows
 * Care must be used in using `WaveGetLaneIndex` wave intrinsic - it will only give the right results for appopriate launches
+* Surfaces are used for textures which are read/write. CUDA does NOT do format conversion with surfaces.
 
-The following are a work in progress or not implmented but are planned to be so in the future
+The following are a work in progress or not implemented but are planned to be so in the future
 
-* Resource types including surfaces
+* Some resource types remain unsupported, and not all methods on types are supported
+* Some support for Wave intrinsics
 
 # How it works
 
@@ -96,6 +97,30 @@ The UniformState and UniformEntryPointParams struct typically vary by shader. Un
     size_t sizeInBytes;
 ```  
 
+## Texture
+
+Read only textures will be bound as the opaque CUDA type CUtexObject. This type is the combination of both a texture AND a sampler. This is somewhat different from HLSL, where there can be separate `SamplerState` variables. This allows access of a single texture binding with different types of sampling. 
+
+If code relies on this behavior it will be necessary to bind multiple CUtexObjects with different sampler settings, accessing the same texture data. 
+
+Slang has some preliminary support for the TextureSampler type - a combined Texture and SamplerState. Writing Slang code that targets CUDA and other platforms using this mechanism will expose the semantics appropriately within the source.  
+ 
+Load is only supported for Texture1D, and the mip map selection argument is ignored. This is because there is tex1Dfetch and no higher dimensional equivalents. CUDA also only allows such access if the backing array is linear memory - meaning the bound texture cannot have mip maps - thus making the mip map parameter superfluous anyway. RWTexture does allow Load on other texture types.  
+ 
+## RWTexture 
+ 
+RWTexture types are converted into CUsurfObject type. 
+
+In CUDA it is not possible to do a format conversion on an access to a CUsurfObject, so it must be backed by the same data format as is used within the Slang source code. 
+
+It is also worth noting that CUsurfObjects in CUDA are NOT allowed to have mip maps. 
+
+By default surface access uses cudaBoundaryModeZero. This can be changed by defining the macro SLANG_CUDA_BOUNDARY_MODE, which is used by the CUDA prelude.
+
+## Sampler
+
+Samplers are in effect ignored in CUDA output. Currently we do output a variable `SamplerState`, but this value is never accessed within the kernel and so can be ignored. More discussion on this behavior is in the `Texture` section.
+
 ## Unsized arrays
 
 Unsized arrays can be used, which are indicated by an array with no size as in `[]`. For example 
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index f62333f68..563b4b6e9 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -343,6 +343,21 @@ struct TextureCubeArray
     ITextureCubeArray* texture;              
 };
 
+/* !!!!!!!!!!!!!!!!!!!!!!!!!!! RWTexture !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
+
+struct IRWTexture1D // Type-erased backing interface that RWTexture1D<T> forwards Load to
+{
+    virtual void Load(int32_t loc, void* out) = 0; // out receives the element for loc; RWTexture1D<T> passes a pointer to a T
+};
+
+template <typename T>
+struct RWTexture1D
+{
+    T Load(int32_t loc) const { T out; texture->Load(loc, &out); return out; } // forwards to the type-erased interface
+    // Backing implementation that services Load; it is dereferenced without a null check
+    IRWTexture1D* texture;              
+};
+
 /* Varying input for Compute */
 
 /* Used when running a single thread */
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index b81acba1e..1938e3dc1 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -40,6 +40,16 @@
 #   define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); 
 #endif
 
+ // This macro selects how out-of-range surface coordinates are handled.
+ // It can equal
+ // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range
+ // cudaBoundaryModeZero, in which case out-of-range reads return zero and out-of-range writes are ignored
+ // cudaBoundaryModeTrap, in which case out-of-range accesses cause the kernel execution to fail. 
+ 
+#ifndef SLANG_CUDA_BOUNDARY_MODE
+#   define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero
+#endif
+
 template <typename T, size_t SIZE>
 struct FixedArray
 {
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index ec1a3ed0b..722629034 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -777,6 +777,67 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << ")$z\")\n";
 
                 }
+
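+                // CUDA (NOTE(review): CUDA docs state surf*read takes x as a byte offset - confirm whether the x coordinate needs scaling by the element size)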
+                if (isMultisample)
+                {
+                }
+                else
+                {
+                    if (access == SLANG_RESOURCE_ACCESS_READ_WRITE)
+                    {
+                        const int coordCount = kBaseTextureTypes[tt].coordCount;
+                        const int vecCount = coordCount + int(isArray);
+
+                        if( baseShape != TextureFlavor::Shape::ShapeCube )
+                        {
+                            sb << "__target_intrinsic(cuda, \"surf" << coordCount << "D";
+                            if (isArray)
+                            {
+                                sb << "Layered";
+                            }
+                            sb << "read";
+                            sb << "<$T0>($0";
+                            for (int i = 0; i < coordCount; ++i)
+                            {
+                                sb << ", ($1)";
+                                if (vecCount > 1)
+                                {
+                                    sb << '.' << char(i + 'x');
+                                }
+                            }
+                            if (isArray)
+                            {
+                                sb << ", int(($1)." << char(coordCount + 'x') << ")";
+                            }
+                            sb << ", SLANG_CUDA_BOUNDARY_MODE)\")\n";
+                        }
+                        else
+                        {
+                            sb << "__target_intrinsic(cuda, \"surfCubemap";
+                            if (isArray)
+                            {
+                                sb << "Layered";
+                            }
+                            sb << "read";
+                            sb << "<$T0>($0, ($1).x, ($1).y, ($1).z"; 
+                            if (isArray)
+                            {
+                                sb << ", int(($1).w)";
+                            }
+                            sb << ", SLANG_CUDA_BOUNDARY_MODE)\")\n";
+                        }
+                    }
+                    else if (access == SLANG_RESOURCE_ACCESS_READ)
+                    {
+                        // We can allow this on Texture1D
+                        if( baseShape == TextureFlavor::Shape::Shape1D && isArray == false)
+                        {
+                            sb << "__target_intrinsic(cuda, \"tex1Dfetch<$T0>($0, ($1).x)\")\n";
+                        }
+                    }
+                }
+
                 sb << "T Load(";
                 sb << "int" << loadCoordCount << " location";
                 if(isMultisample)
@@ -785,6 +846,7 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 }
                 sb << ");\n";
 
+                // GLSL
                 if (isMultisample)
                 {
                     sb << "__glsl_extension(GL_EXT_samplerless_texture_functions)";
@@ -804,6 +866,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     }
                     sb << ", $2)$z\")\n";
                 }
+
+
+
                 sb << "T Load(";
                 sb << "int" << loadCoordCount << " location";
                 if(isMultisample)
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index a8ad43965..ba960b1d1 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -798,6 +798,67 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << ")$z\")\n";
 
                 }
+
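+                // CUDA (NOTE(review): CUDA docs state surf*read takes x as a byte offset - confirm whether the x coordinate needs scaling by the element size)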
+                if (isMultisample)
+                {
+                }
+                else
+                {
+                    if (access == SLANG_RESOURCE_ACCESS_READ_WRITE)
+                    {
+                        const int coordCount = kBaseTextureTypes[tt].coordCount;
+                        const int vecCount = coordCount + int(isArray);
+
+                        if( baseShape != TextureFlavor::Shape::ShapeCube )
+                        {
+                            sb << "__target_intrinsic(cuda, \"surf" << coordCount << "D";
+                            if (isArray)
+                            {
+                                sb << "Layered";
+                            }
+                            sb << "read";
+                            sb << "<$T0>($0";
+                            for (int i = 0; i < coordCount; ++i)
+                            {
+                                sb << ", ($1)";
+                                if (vecCount > 1)
+                                {
+                                    sb << '.' << char(i + 'x');
+                                }
+                            }
+                            if (isArray)
+                            {
+                                sb << ", int(($1)." << char(coordCount + 'x') << ")";
+                            }
+                            sb << ", SLANG_CUDA_BOUNDARY_MODE)\")\n";
+                        }
+                        else
+                        {
+                            sb << "__target_intrinsic(cuda, \"surfCubemap";
+                            if (isArray)
+                            {
+                                sb << "Layered";
+                            }
+                            sb << "read";
+                            sb << "<$T0>($0, ($1).x, ($1).y, ($1).z"; 
+                            if (isArray)
+                            {
+                                sb << ", int(($1).w)";
+                            }
+                            sb << ", SLANG_CUDA_BOUNDARY_MODE)\")\n";
+                        }
+                    }
+                    else if (access == SLANG_RESOURCE_ACCESS_READ)
+                    {
+                        // We can allow this on Texture1D
+                        if( baseShape == TextureFlavor::Shape::Shape1D && isArray == false)
+                        {
+                            sb << "__target_intrinsic(cuda, \"tex1Dfetch<$T0>($0, ($1).x)\")\n";
+                        }
+                    }
+                }
+
                 sb << "T Load(";
                 sb << "int" << loadCoordCount << " location";
                 if(isMultisample)
@@ -806,6 +867,7 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 }
                 sb << ");\n";
 
+                // GLSL
                 if (isMultisample)
                 {
                     sb << "__glsl_extension(GL_EXT_samplerless_texture_functions)";
@@ -825,6 +887,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     }
                     sb << ", $2)$z\")\n";
                 }
+
+
+
                 sb << "T Load(";
                 sb << "int" << loadCoordCount << " location";
                 if(isMultisample)
@@ -1359,7 +1424,7 @@ for (auto op : binaryOps)
         sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
     }
 }
-SLANG_RAW("#line 1341 \"core.meta.slang\"")
+SLANG_RAW("#line 1406 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 988c6f69c..c3339cbb5 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1433,7 +1433,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,
 
 // TODO(JS): For CUDA the article claims mask has to be used carefully
 // https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
-// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()
+// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask()
 // seems to be appropriate.
 
 __target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 8614fd756..69349d9dc 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1509,7 +1509,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatc
 SLANG_RAW("\n")
 SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
 SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
-SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask()\n")
 SLANG_RAW("// seems to be appropriate.\n")
 SLANG_RAW("\n")
 SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
diff --git a/tests/compute/rw-texture-simple.slang b/tests/compute/rw-texture-simple.slang
new file mode 100644
index 000000000..dde0ecd4c
--- /dev/null
+++ b/tests/compute/rw-texture-simple.slang
@@ -0,0 +1,27 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+// Doesn't work on DX11 currently - locks up on binding
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil
+// TODO(JS): Doesn't work on vk currently, because createTextureView not implemented on vk renderer
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute 
+
+//TEST_INPUT: RWTexture1D(format=R_Float32, size=4, content = one):name rwt1D
+RWTexture1D<float> rwt1D;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(4, 4, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = dispatchThreadID.x;
+    // rwt1D is filled with 'one', so each Load below is expected to return 1.0f
+    
+    float val = 0.0f;
+ 
+    val += rwt1D.Load(idx);
+ 
+    outputBuffer[idx] = val;
+}
diff --git a/tests/compute/rw-texture-simple.slang.expected.txt b/tests/compute/rw-texture-simple.slang.expected.txt
new file mode 100644
index 000000000..cc5e55ab6
--- /dev/null
+++ b/tests/compute/rw-texture-simple.slang.expected.txt
@@ -0,0 +1,4 @@
+3F800000
+3F800000
+3F800000
+3F800000
diff --git a/tests/compute/texture-simple.slang b/tests/compute/texture-simple.slang
index 8e72250ff..df990ec7a 100644
--- a/tests/compute/texture-simple.slang
+++ b/tests/compute/texture-simple.slang
@@ -6,6 +6,10 @@
 //DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute 
 
+// Doesn't work on CUDA - likely because tex1Dfetch requires linear-memory backing while the test harness binds a CUDA array (TODO confirm)
+//DISABLE_TEST_INPUT: Texture1D(format=R_Float32, size=4, content = one, mipMaps=1):name tLoad1D
+//Texture1D<float> tLoad1D;
+
 //TEST_INPUT: Texture1D(size=4, content = one):name t1D
 Texture1D<float> t1D;
 //TEST_INPUT: Texture2D(size=4, content = one):name t2D
@@ -35,6 +39,7 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     float u = idx * (1.0f / 4);
     
     float val = 0.0f;
+   
     val += t1D.SampleLevel(samplerState, u, 0);
     val += t2D.SampleLevel(samplerState, float2(u, u), 0);
     val += t3D.SampleLevel(samplerState, float3(u, u, u), 0);
@@ -44,5 +49,7 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     val += t2DArray.SampleLevel(samplerState, float3(u, u, 0), 0);
     val += tCubeArray.SampleLevel(samplerState, float4(u, u, u, 0), 0);
  
+    //val += tLoad1D.Load(int2(idx, 0));
+ 
     outputBuffer[idx] = val;
 }
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp
index 608da9461..3826ccec1 100644
--- a/tools/render-test/cpu-compute-util.cpp
+++ b/tools/render-test/cpu-compute-util.cpp
@@ -247,19 +247,61 @@ struct ValueTextureCubeArray : public CPUComputeUtil::Resource, public CPPPrelud
     float m_value;
 };
 
-static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape shape, int elemCount, float value)
+
+template <int COUNT> // COUNT is the number of floats written per Load
+struct ValueRWTexture1D : public CPUComputeUtil::Resource, public CPPPrelude::IRWTexture1D // RW 1D texture whose every element is one constant value
+{
+    void set(void* out) // writes COUNT copies of m_value to out, which is treated as float[COUNT]
+    {
+        float* dst = (float*)out;
+        for (int i = 0; i < COUNT; ++i)
+        {
+            dst[i] = m_value;
+        }
+    }
+    // loc is ignored: the same constant is produced for every location
+    virtual void Load(int32_t loc, void* out) SLANG_OVERRIDE
+    {
+        set(out);
+    }
+    // Stores the constant and publishes this object's IRWTexture1D pointer through m_interface
+    ValueRWTexture1D(float value) :
+        m_value(value)
+    {
+        m_interface = static_cast<CPPPrelude::IRWTexture1D*>(this);
+    }
+
+    float m_value; // constant written to every component by set()
+};
+
+
+static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape shape, SlangResourceAccess access, Index elemCount, float value)
 {
     switch (shape)
     {
         case SLANG_TEXTURE_1D:
         {
-            switch (elemCount)
+            if (access == SLANG_RESOURCE_ACCESS_READ_WRITE)
             {
-                case 1: return new ValueTexture1D<1>(value);
-                case 2: return new ValueTexture1D<2>(value);
-                case 3: return new ValueTexture1D<3>(value);
-                case 4: return new ValueTexture1D<4>(value);
-                default: break;
+                switch (elemCount)
+                {
+                    case 1: return new ValueRWTexture1D<1>(value);
+                    case 2: return new ValueRWTexture1D<2>(value);
+                    case 3: return new ValueRWTexture1D<3>(value);
+                    case 4: return new ValueRWTexture1D<4>(value);
+                    default: break;
+                }
+            }
+            else
+            {
+                switch (elemCount)
+                {
+                    case 1: return new ValueTexture1D<1>(value);
+                    case 2: return new ValueTexture1D<2>(value);
+                    case 3: return new ValueTexture1D<3>(value);
+                    case 4: return new ValueTexture1D<4>(value);
+                    default: break;
+                }
             }
             break;
         }
@@ -388,7 +430,7 @@ static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape shape, int
                         auto type = typeLayout->getType();
                         auto shape = type->getResourceShape();
 
-                        //auto access = type->getResourceAccess();
+                        auto access = type->getResourceAccess();
 
                         auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
                         switch (baseShape)
@@ -407,22 +449,22 @@ static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape shape, int
 
                                 slang::TypeReflection* typeReflection = typeLayout->getResourceResultType();
 
-                                int count = 1;
+                                Index count = 1;
                                 if (typeReflection->getKind() == slang::TypeReflection::Kind::Vector)
                                 {
-                                    count = int(typeReflection->getElementCount());
+                                    count = Index(typeReflection->getElementCount());
                                 }
 
                                 switch (srcEntry.textureDesc.content)
                                 {
                                     case InputTextureContent::One:
                                     {
-                                        value->m_target = _newValueTexture(shape, count, 1.0f);
+                                        value->m_target = _newValueTexture(shape, access, count, 1.0f);
                                         break;                                        
                                     }
                                     case InputTextureContent::Zero:
                                     {
-                                        value->m_target = _newValueTexture(shape, count, 0.0f);
+                                        value->m_target = _newValueTexture(shape, access, count, 0.0f);
                                         break;
                                     }
                                     default: break;
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index bce98c1cb..b21b22b30 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -28,13 +28,13 @@ SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0
 
 #define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; }
 
-class CUDAResource : public CUDAComputeUtil::ResourceBase
+class MemoryCUDAResource : public CUDAResource
 {
 public:
-    typedef CUDAComputeUtil::ResourceBase Super;
+    typedef CUDAResource Super;
 
         /// Dtor
-    ~CUDAResource()
+    ~MemoryCUDAResource()
     {
         if (m_cudaMemory)
         {
@@ -42,27 +42,31 @@ public:
         }
     }
 
-    static CUDAResource* getCUDAResource(BindSet::Value* value)
+    static MemoryCUDAResource* asResource(BindSet::Value* value)
     {
-        return value ? dynamic_cast<CUDAResource*>(value->m_target.Ptr()) : nullptr;
+        return value ? dynamic_cast<MemoryCUDAResource*>(value->m_target.Ptr()) : nullptr;
     }
-        /// Helper function to get the cuda memory pointer when given a value
+        /// Helper function to get the CUDA memory pointer when given a value
     static CUdeviceptr getCUDAData(BindSet::Value* value)
     {
-        auto resource = getCUDAResource(value);
+        auto resource = asResource(value);
         return resource ? resource->m_cudaMemory : CUdeviceptr();
     }
 
     CUdeviceptr m_cudaMemory = CUdeviceptr();
 };
 
-class CUDATextureResource : public CUDAComputeUtil::ResourceBase
+class TextureCUDAResource : public CUDAResource
 {
 public:
-    typedef CUDAComputeUtil::ResourceBase Super;
+    typedef CUDAResource Super;
 
-    ~CUDATextureResource()
+    ~TextureCUDAResource()
     {
+        if (m_cudaSurfObj)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
+        }
         if (m_cudaTexObj)
         {
             SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
@@ -77,20 +81,30 @@ public:
         }
     }
 
-    static CUDATextureResource* getCUDATextureResource(BindSet::Value* value)
+    static TextureCUDAResource* asResource(BindSet::Value* value)
     {
-        return value ? dynamic_cast<CUDATextureResource*>(value->m_target.Ptr()) : nullptr;
+        return value ? dynamic_cast<TextureCUDAResource*>(value->m_target.Ptr()) : nullptr;
     }
 
-    static CUtexObject getCUDATexObject(BindSet::Value* value)
+    static CUtexObject getTexObject(BindSet::Value* value)
     {
-        auto resource = getCUDATextureResource(value);
+        auto resource = asResource(value);
         // It's an assumption here that 0 is okay for null. Seems to work...
         return resource ? resource->m_cudaTexObj : CUtexObject(0);
     }
 
-    // This is an opaque type, that's backed by a long long
+    static CUsurfObject getSurfObject(BindSet::Value* value) // surface handle for value, or CUsurfObject(0) when asResource yields null
+    {
+        auto resource = asResource(value);
+        return resource ? resource->m_cudaSurfObj : CUsurfObject(0);
+    }
+
+    // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by a long long
     CUtexObject m_cudaTexObj = CUtexObject();
+
+    // The surfObj is for reading/writing 'texture like' things, but not for sampling.
+    CUsurfObject m_cudaSurfObj = CUsurfObject();
+
     CUarray m_cudaArray = CUarray();
     CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
 };
@@ -335,20 +349,42 @@ public:
     return SLANG_SUCCEEDED(context.init(0));
 }
 
-/* static */SlangResult CUDAComputeUtil::createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<ResourceBase>& outResource)
+static bool _hasReadAccess(SlangResourceAccess access) // true for READ and READ_WRITE access
+{
+    return access == SLANG_RESOURCE_ACCESS_READ || access == SLANG_RESOURCE_ACCESS_READ_WRITE; // compare with ==, never assign
+}
+
+static bool _hasWriteAccess(SlangResourceAccess access) // true only for READ_WRITE access
+{
+    return access == SLANG_RESOURCE_ACCESS_READ_WRITE;
+}
+
+/* static */SlangResult CUDAComputeUtil::createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<CUDAResource>& outResource)
 {
     auto type = typeLayout->getType();
     auto shape = type->getResourceShape();
 
     auto access = type->getResourceAccess();
 
+    if (!(access == SLANG_RESOURCE_ACCESS_READ ||
+        access == SLANG_RESOURCE_ACCESS_READ_WRITE))
+    {
+        SLANG_ASSERT(!"Only read or read write currently supported");
+        return SLANG_FAIL;
+    }
+
     CUresourcetype resourceType = CU_RESOURCE_TYPE_ARRAY;
     auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
 
     slang::TypeReflection* typeReflection = typeLayout->getResourceResultType();
 
-    const auto& textureDesc = srcEntry.textureDesc;
+    InputTextureDesc textureDesc = srcEntry.textureDesc;
 
+    if (_hasWriteAccess(access))
+    {
+        textureDesc.mipMapCount = 1;
+    }
+    
     // CUDA wants the unused dimensions to be 0.
     // Might need to specially handle elsewhere
     int width = textureDesc.size;
@@ -384,13 +420,13 @@ public:
             return SLANG_FAIL;
         }
     }
-
+    
     TextureData texData;
     generateTextureData(texData, textureDesc);
 
     auto mipLevels = texData.mipLevels;
 
-    RefPtr<CUDATextureResource> tex = new CUDATextureResource;
+    RefPtr<TextureCUDAResource> tex = new TextureCUDAResource;
 
     size_t elementSize = 0;
 
@@ -486,6 +522,11 @@ public:
                 arrayDesc.Format = format;
                 arrayDesc.NumChannels = numChannels;
 
+                if (baseShape == SLANG_TEXTURE_CUBE)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                }
+
                 SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
             }
             else if (baseShape == SLANG_TEXTURE_3D || baseShape == SLANG_TEXTURE_CUBE)
@@ -553,7 +594,6 @@ public:
         }
         SLANG_ASSERT(dstArray);
 
-
         // Check using the desc to see if it's plausible
         {
             CUDA_ARRAY_DESCRIPTOR arrayDesc;
@@ -710,15 +750,25 @@ public:
             resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
         }
 
-        CUDA_TEXTURE_DESC texDesc;
-        memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
-        texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
-        texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
-        texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
-        texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
-        texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+        if (_hasWriteAccess(access))
+        {
+            // If has write access it's effectively UAV, and so doesn't have sampling available
+            SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+        }
+        else
+        {
+            // If read only it's a SRV and can sample, but cannot write
+            CUDA_TEXTURE_DESC texDesc;
+            memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+            texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+            texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+            SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+        }
 
-        SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
     }
 
     outResource = tex;
@@ -782,7 +832,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                     case slang::TypeReflection::Kind::ParameterBlock:
                     {
                         // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
-                        RefPtr<CUDAResource> resource = new CUDAResource;
+                        RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
                         SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
                         value->m_target = resource;
                         break;
@@ -801,7 +851,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_TEXTURE_3D:
                             case SLANG_TEXTURE_CUBE:
                             {
-                                RefPtr<CUDAComputeUtil::ResourceBase> resource;
+                                RefPtr<CUDAResource> resource;
                                 SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
                                 value->m_target = resource;
                                 break;
@@ -817,7 +867,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_STRUCTURED_BUFFER:
                             {
                                 // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
-                                RefPtr<CUDAResource> resource = new CUDAResource;
+                                RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
                                 SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
                                 value->m_target = resource;
                                 break;
@@ -853,7 +903,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         if (elementCount == 0)
                         {
                             CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
-                            auto resource = CUDAResource::getCUDAResource(value);
+                            auto resource = MemoryCUDAResource::asResource(value);
                             if (resource)
                             {
                                 array.data = resource->m_cudaMemory;
@@ -868,7 +918,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                     case slang::TypeReflection::Kind::ParameterBlock:
                     {
                         // These map down to just pointers
-                        *location.getUniform<CUdeviceptr>() = CUDAResource::getCUDAData(value);
+                        *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
                         break;
                     }
                     case slang::TypeReflection::Kind::Resource:
@@ -876,14 +926,14 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         auto type = typeLayout->getType();
                         auto shape = type->getResourceShape();
 
-                        //auto access = type->getResourceAccess();
+                        auto access = type->getResourceAccess();
 
                         switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
                         {
                             case SLANG_STRUCTURED_BUFFER:
                             {
                                 CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
-                                auto resource = CUDAResource::getCUDAResource(value);
+                                auto resource = MemoryCUDAResource::asResource(value);
                                 if (resource)
                                 {
                                     buffer.data = resource->m_cudaMemory;
@@ -897,7 +947,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             {
                                 CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
 
-                                auto resource = CUDAResource::getCUDAResource(value);
+                                auto resource = MemoryCUDAResource::asResource(value);
                                 if (resource)
                                 {
                                     buffer.data = resource->m_cudaMemory;
@@ -912,7 +962,14 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_TEXTURE_3D:
                             case SLANG_TEXTURE_CUBE:
                             {
-                                *location.getUniform<CUtexObject>() = CUDATextureResource::getCUDATexObject(value);
+                                if (_hasWriteAccess(access))
+                                {
+                                    *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
+                                }
+                                else
+                                {
+                                    *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
+                                }
                                 break;
                             }
 
@@ -929,7 +986,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
             const auto& values = bindSet.getValues();
             for (BindSet::Value* value : values)
             {
-                CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
+                CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
                 if (value && value->m_data && cudaMem)
                 {
                     // Okay copy the data over...
@@ -950,8 +1007,8 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
             SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
 
             // Work out the args
-            CUdeviceptr uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
-            CUdeviceptr entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+            CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
+            CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
 
             // NOTE! These are pointers to the cuda memory pointers
             void* args[] = { &entryPointCUDAData , &uniformCUDAData };
@@ -987,7 +1044,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                 if (entry.isOutput)
                 {
                     // Copy back to CPU memory
-                   CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
+                   CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
                     if (value && value->m_data && cudaMem)
                     {
                         // Okay copy the data back...
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index 58a5bba33..f15c9d4e3 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -8,10 +8,16 @@
 
 namespace renderer_test {
 
+// Empty, ref-counted (RefObject) base class for CUDA resources: textures as well as
+// memory allocations. Serves as the common handle type, e.g. RefPtr<CUDAResource>.
+class CUDAResource : public RefObject
+{
+public:
+};
 
 struct CUDAComputeUtil
 {
-    // Define here, so we don't need to include the cude header
+    // Define here, so we don't need to include the CUDA header
     typedef size_t CUdeviceptr;
 
         /// NOTE! MUST match up to definitions in the CUDA prelude
@@ -40,12 +46,7 @@ struct CUDAComputeUtil
         List<BindSet::Value*> m_buffers;
     };
 
-    class ResourceBase : public RefObject
-    {
-    public:
-    };
-
-    static SlangResult createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<ResourceBase>& outResource);
+    static SlangResult createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<CUDAResource>& outResource);
 
     static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext);
 
diff --git a/tools/render-test/shader-input-layout.cpp b/tools/render-test/shader-input-layout.cpp
index 108483a2a..f9d6a60e1 100644
--- a/tools/render-test/shader-input-layout.cpp
+++ b/tools/render-test/shader-input-layout.cpp
@@ -452,6 +452,12 @@ namespace renderer_test
                                     entry.textureDesc.format = format;
                                     entry.bufferDesc.format = format;
                                 }
+                                else if(word == "mipMaps")
+                                {
+                                    parser.Read("=");
+                                    entry.textureDesc.mipMapCount = int(parser.ReadInt());
+                                }
+
                                 if (parser.LookAhead(","))
                                     parser.Read(",");
                                 else
@@ -974,7 +980,12 @@ namespace renderer_test
             arraySize *= 6;
         output.arraySize = arraySize;
         output.textureSize = inputDesc.size;
-        output.mipLevels = Math::Log2Floor(output.textureSize) + 1;
+
+        const Index maxMipLevels = Math::Log2Floor(output.textureSize) + 1;
+        Index mipLevels = (inputDesc.mipMapCount <= 0) ? maxMipLevels : inputDesc.mipMapCount;
+        mipLevels = (mipLevels > maxMipLevels) ? maxMipLevels : mipLevels;
+
+        output.mipLevels = int(mipLevels); 
         output.dataBuffer.setCount(output.mipLevels * output.arraySize);
 
         int slice = 0;
diff --git a/tools/render-test/shader-input-layout.h b/tools/render-test/shader-input-layout.h
index a9d525d47..0831f73bb 100644
--- a/tools/render-test/shader-input-layout.h
+++ b/tools/render-test/shader-input-layout.h
@@ -33,6 +33,7 @@ struct InputTextureDesc
     bool isDepthTexture = false;
     bool isRWTexture = false;
     int size = 4;
+    int mipMapCount = 0;            ///< 0 means the maximum number of mips will be bound
 
     Format format = Format::RGBA_Unorm_UInt8;