From 0bc89f27b08fa7241a7be4f80c2161f25ea3bf78 Mon Sep 17 00:00:00 2001
From: Mukund Keshava <mkeshava@nvidia.com>
Date: Wed, 16 Jul 2025 20:46:06 +0530
Subject: Fix CUDA issues with texture reads and surface writes (#7780)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix 1D texture reads in CUDA target

Fixes #7570: 1D surface writes don't work

The issue was that the Load function for read-only textures (hlsl.meta.slang lines 3629-3656)
only supported 2D and 3D textures for CUDA targets, causing 1D texture reads to fall through
to <invalid intrinsic>. This affected the srcTexture[tid.x] read operation in the reproduction case.

Changes:
- Updated static_assert to include SLANG_TEXTURE_1D support
- Added tex1DArrayfetch_int<T> for 1D array texture reads
- Added tex1Dfetch_int<T> for regular 1D texture reads

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Mukund Keshava <mkeshavaNV@users.noreply.github.com>

* Add 1D texture read support for CUDA target

- Add tex1Dfetch_int template specializations for float2, float4, uint, uint2, uint4
- Remove TODO comment about 1D PTX not being supported
- Enable 1D texture test in texture-subscript-cuda.slang
- Fix assembly code issues in original template specializations

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Mukund Keshava <mkeshavaNV@users.noreply.github.com>

* Update slang-cuda-prelude.h

* Fix 3D surface write PTX: use sust.p.3d with a 4-element coordinate vector instead of sust.p.2d

* Undo 1D texture changes (the tex1Dfetch_int additions described above are not part of the final diff)

* Update hlsl.meta.slang

* Update hlsl.meta.slang

* Update hlsl.meta.slang

* Update hlsl.meta.slang

* Extend texture-subscript-cuda.slang test with uint and int format variants

Add test cases for newly supported texture formats in CUDA:
- 2D textures with uint, uint2, uint4
- 2D textures with int, int2, int4
- 3D textures with uint, uint2, uint4
- 3D textures with int, int2, int4

This ensures the texture subscript operations work correctly for all
the format variants added in the CUDA texture fixes.

Co-authored-by: Mukund Keshava <mkeshavaNV@users.noreply.github.com>

* Update expected file (texture-subscript-cuda.slang.expected.txt)

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Mukund Keshava <mkeshavaNV@users.noreply.github.com>
---
 prelude/slang-cuda-prelude.h                       | 237 ++++++++++++++++++++-
 source/slang/hlsl.meta.slang                       |   2 +-
 tests/compute/texture-subscript-cuda.slang         | 107 +++++++++-
 .../texture-subscript-cuda.slang.expected.txt      |   4 +
 4 files changed, 342 insertions(+), 8 deletions(-)
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index f0c053168..fd79b77aa 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1324,10 +1324,11 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(
     cudaSurfaceBoundaryMode boundaryMode)
 {
     asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4};}\n\t" ::"l"(surfObj),
+        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
         "r"(x),
         "r"(y),
         "r"(z),
+        "r"(0),
         "f"(v));
 }
 
@@ -1376,11 +1377,12 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(
 {
     const float vx = v.x, vy = v.y;
     asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3}], {%4,%5};}\n\t" ::"l"(surfObj),
+        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
         "r"(x),
         "r"(y),
         "r"(z),
+        "r"(0),
         "f"(vx),
         "f"(vy));
 }
@@ -1435,17 +1437,242 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(
 {
     const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
     asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3}], {%4,%5,%6,%7};}\n\t" ::"l"(surfObj),
+        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
         "r"(x),
         "r"(y),
         "r"(z),
+        "r"(0),
         "f"(vx),
         "f"(vy),
         "f"(vz),
         "f"(vw));
 }
 
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint>(
+    uint v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    asm volatile(
+        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(v));
+}
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint>(
+    uint v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    asm volatile(
+        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(v));
+}
+
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint2>(
+    uint2 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const uint vx = v.x, vy = v.y;
+    asm volatile(
+        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(vx),
+        "r"(vy));
+}
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint2>(
+    uint2 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const uint vx = v.x, vy = v.y;
+    asm volatile(
+        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(vx),
+        "r"(vy));
+}
+
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint4>(
+    uint4 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
+    asm volatile(
+        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(vx),
+        "r"(vy),
+        "r"(vz),
+        "r"(vw));
+}
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint4>(
+    uint4 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
+    asm volatile(
+        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(vx),
+        "r"(vy),
+        "r"(vz),
+        "r"(vw));
+}
+
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int>(
+    int v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    asm volatile(
+        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(v));
+}
+// Int2
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int2>(
+    int2 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const int vx = v.x, vy = v.y;
+    asm volatile(
+        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(vx),
+        "r"(vy));
+}
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int4>(
+    int4 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
+    asm volatile(
+        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(vx),
+        "r"(vy),
+        "r"(vz),
+        "r"(vw));
+}
+
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int>(
+    int v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    asm volatile(
+        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(v));
+}
+// Int2
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int2>(
+    int2 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const int vx = v.x, vy = v.y;
+    asm volatile(
+        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(vx),
+        "r"(vy));
+}
+// Int4
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int4>(
+    int4 v,
+    cudaSurfaceObject_t surfObj,
+    int x,
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode)
+{
+    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
+    asm volatile(
+        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
+        "r"(x),
+        "r"(y),
+        "r"(z),
+        "r"(0),
+        "r"(vx),
+        "r"(vy),
+        "r"(vz),
+        "r"(vw));
+}
+
 // ----------------------------- F32 -----------------------------------------
 
 // Unary
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 216dfc04a..264098bec 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -28388,4 +28388,4 @@ void InterlockedAddF16Emulated(half* dest, half value, out half originalValue)
 void InterlockedAddF16x2(half2* dest, half2 value, out half2 originalValue)
 {
     originalValue = __atomic_add(*dest, value);
-}
\ No newline at end of file
+}
diff --git a/tests/compute/texture-subscript-cuda.slang b/tests/compute/texture-subscript-cuda.slang
index 7c4a2cc78..26b1f9e09 100644
--- a/tests/compute/texture-subscript-cuda.slang
+++ b/tests/compute/texture-subscript-cuda.slang
@@ -24,10 +24,37 @@ Texture2DArray<float2> cudaT2DArray_f2;
 //TEST_INPUT: Texture2D(size=16, content = one, arrayLength=3):name cudaT2DArray_f4
 Texture2DArray<float4> cudaT2DArray_f4;
 
-//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0], stride=4):out,name cudaOutputBuffer
+// New texture declarations for uint and int variants
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_uint
+Texture2D<uint> cudaT2D_uint;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_uint2
+Texture2D<uint2> cudaT2D_uint2;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_uint4
+Texture2D<uint4> cudaT2D_uint4;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_int
+Texture2D<int> cudaT2D_int;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_int2
+Texture2D<int2> cudaT2D_int2;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D_int4
+Texture2D<int4> cudaT2D_int4;
+
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_uint
+Texture3D<uint> cudaT3D_uint;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_uint2
+Texture3D<uint2> cudaT3D_uint2;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_uint4
+Texture3D<uint4> cudaT3D_uint4;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_int
+Texture3D<int> cudaT3D_int;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_int2
+Texture3D<int2> cudaT3D_int2;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D_int4
+Texture3D<int4> cudaT3D_int4;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name cudaOutputBuffer
 RWStructuredBuffer<float> cudaOutputBuffer;
 
-[numthreads(7, 1, 1)]
+[numthreads(11, 1, 1)]
 [shader("compute")]
 void computeMain(int3 dispatchThreadID : SV_DispatchThreadID)
 {
@@ -97,5 +124,81 @@ void computeMain(int3 dispatchThreadID : SV_DispatchThreadID)
             cudaOutputBuffer[idx] += result4.w;
         }
     break;
+
+    case 5:
+        {
+            // Test 2D textures with uint variants
+            int2 var = int2(1, 2);
+            uint result = cudaT2D_uint[var];
+            cudaOutputBuffer[idx] = float(result);
+            
+            uint2 result2 = cudaT2D_uint2[var];
+            cudaOutputBuffer[idx] += float(result2.x);
+            cudaOutputBuffer[idx] += float(result2.y);
+
+            uint4 result4 = cudaT2D_uint4[var];
+            cudaOutputBuffer[idx] += float(result4.x);
+            cudaOutputBuffer[idx] += float(result4.y);
+            cudaOutputBuffer[idx] += float(result4.z);
+            cudaOutputBuffer[idx] += float(result4.w);
+        }
+    break;
+
+    case 6:
+        {
+            // Test 2D textures with int variants
+            int2 var = int2(1, 2);
+            int result = cudaT2D_int[var];
+            cudaOutputBuffer[idx] = float(result);
+            
+            int2 result2 = cudaT2D_int2[var];
+            cudaOutputBuffer[idx] += float(result2.x);
+            cudaOutputBuffer[idx] += float(result2.y);
+
+            int4 result4 = cudaT2D_int4[var];
+            cudaOutputBuffer[idx] += float(result4.x);
+            cudaOutputBuffer[idx] += float(result4.y);
+            cudaOutputBuffer[idx] += float(result4.z);
+            cudaOutputBuffer[idx] += float(result4.w);
+        }
+    break;
+
+    case 7:
+        {
+            // Test 3D textures with uint variants
+            int3 var = int3(1, 1, 1);
+            uint result = cudaT3D_uint[var];
+            cudaOutputBuffer[idx] = float(result);
+            
+            uint2 result2 = cudaT3D_uint2[var];
+            cudaOutputBuffer[idx] += float(result2.x);
+            cudaOutputBuffer[idx] += float(result2.y);
+
+            uint4 result4 = cudaT3D_uint4[var];
+            cudaOutputBuffer[idx] += float(result4.x);
+            cudaOutputBuffer[idx] += float(result4.y);
+            cudaOutputBuffer[idx] += float(result4.z);
+            cudaOutputBuffer[idx] += float(result4.w);
+        }
+    break;
+
+    case 8:
+        {
+            // Test 3D textures with int variants
+            int3 var = int3(1, 1, 1);
+            int result = cudaT3D_int[var];
+            cudaOutputBuffer[idx] = float(result);
+            
+            int2 result2 = cudaT3D_int2[var];
+            cudaOutputBuffer[idx] += float(result2.x);
+            cudaOutputBuffer[idx] += float(result2.y);
+
+            int4 result4 = cudaT3D_int4[var];
+            cudaOutputBuffer[idx] += float(result4.x);
+            cudaOutputBuffer[idx] += float(result4.y);
+            cudaOutputBuffer[idx] += float(result4.z);
+            cudaOutputBuffer[idx] += float(result4.w);
+        }
+    break;
     }
 } 
\ No newline at end of file
diff --git a/tests/compute/texture-subscript-cuda.slang.expected.txt b/tests/compute/texture-subscript-cuda.slang.expected.txt
index 698717361..ceb358832 100644
--- a/tests/compute/texture-subscript-cuda.slang.expected.txt
+++ b/tests/compute/texture-subscript-cuda.slang.expected.txt
@@ -3,5 +3,9 @@
 40E00000
 40E00000
 40E00000
+4FDE4000
+4FDE4000
+4FDE4000
+4FDE4000
 0
 0
\ No newline at end of file
-- 
cgit v1.2.3