Improve texture loads and stores on CUDA (#8644)

- fix handling layer and mip level - add support for 1D layered textures - reduce code by using macros - assert when trying to emit unsupported intrinsics There is a new set of unit tests in slang-rhi for exhaustive testing of shader loads/stores on textures. These fixes allow to enable most of these tests. Formatted loads/stores on surfaces are not supported in PTX ISA, so this would require codegen for the conversion which in theory should be possible but not as part of the CUDA prelude.
author: Simon Kallweit <64953474+skallweitNV@users.noreply.github.com> 2025-10-08 20:18:23 +0200
committer: GitHub <noreply@github.com> 2025-10-08 18:18:23 +0000
commit: 4b3fd0a4536f873649e6fcdefbaf2f8b6f0897e0 (patch)
tree: 1d6369221c8e1786f97de0d36fce3d9b955c4150
parent: e4d1200cb45260b2a114d6f4f8f8d0b389a7da56 (diff)
2 files changed, 407 insertions, 711 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 6c68cdb71..2c8faf922 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1323,419 +1323,237 @@ SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
 // For the _convert versions they are *not*. They don't need to be because sust.p does not require
 // it.
 
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode);
-
 // https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust
 
-// Float
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(
-    float v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(v));
-}
+// surf1Dwrite_convert
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(
-    float v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(v));
-}
+    cudaSurfaceBoundaryMode boundaryMode);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(
-    float v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(v));
-}
+#define SLANG_SURF1DWRITE_CONVERT_IMPL(T, c)                                                     \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T>(                              \
+        T v,                                                                                     \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        asm volatile(                                                                            \
+            "sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};" ::"l"(surfObj),        \
+            "r"(x),                                                                              \
+            c(v));                                                                               \
+    }                                                                                            \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##2>(                           \
+        T##2 v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const T vx = v.x, vy = v.y;                                                              \
+        asm volatile(                                                                            \
+            "sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2, %3};" ::"l"(surfObj), \
+            "r"(x),                                                                              \
+            c(vx),                                                                               \
+            c(vy));                                                                              \
+    }                                                                                            \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##4>(                           \
+        T##4 v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                                          \
+        asm volatile(                                                                            \
+            "sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE                                          \
+            " [%0, {%1}], {%2, %3, %4, %5};" ::"l"(surfObj),                                     \
+            "r"(x),                                                                              \
+            c(vx),                                                                               \
+            c(vy),                                                                               \
+            c(vz),                                                                               \
+            c(vw));                                                                              \
+    }
 
-// Float2
+SLANG_SURF1DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF1DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF1DWRITE_CONVERT_IMPL(int, "r")
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(
-    float2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(vx),
-        "f"(vy));
-}
+// surf1DLayeredwrite_convert (not supported)
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(
-    float2 v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1DLayeredwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
-    int y,
+    int layer,
     cudaSurfaceBoundaryMode boundaryMode)
 {
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(vx),
-        "f"(vy));
+    static_assert(false, "CUDA doesn't support formatted surface writes on 1D array surfaces");
 }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(
-    float2 v,
+// surf2Dwrite_convert
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(vx),
-        "f"(vy));
-}
+    cudaSurfaceBoundaryMode boundaryMode);
 
-// Float4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1}], {%2,%3,%4,%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+#define SLANG_SURF2DWRITE_CONVERT_IMPL(T, c)                                                  \
+    template<>                                                                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T>(                           \
+        T v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode)                                                 \
+    {                                                                                         \
+        asm volatile(                                                                         \
+            "sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1, %2}], {%3};" ::"l"(surfObj), \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(v));                                                                            \
+    }                                                                                         \
+    template<>                                                                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##2>(                        \
+        T##2 v,                                                                               \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode)                                                 \
+    {                                                                                         \
+        const T vx = v.x, vy = v.y;                                                           \
+        asm volatile(                                                                         \
+            "sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE                                       \
+            " [%0, {%1, %2}], {%3, %4};" ::"l"(surfObj),                                      \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(vx),                                                                            \
+            c(vy));                                                                           \
+    }                                                                                         \
+    template<>                                                                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##4>(                        \
+        T##4 v,                                                                               \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode)                                                 \
+    {                                                                                         \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                                       \
+        asm volatile(                                                                         \
+            "sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE                                       \
+            " [%0, {%1, %2}], {%3, %4, %5, %6};" ::"l"(surfObj),                              \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(vx),                                                                            \
+            c(vy),                                                                            \
+            c(vz),                                                                            \
+            c(vw));                                                                           \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+SLANG_SURF2DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF2DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF2DWRITE_CONVERT_IMPL(int, "r")
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+// surf2DLayeredwrite_convert (not supported)
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint>(
-    uint v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(v));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint>(
-    uint v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2DLayeredwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
-    int z,
+    int layer,
     cudaSurfaceBoundaryMode boundaryMode)
 {
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(v));
+    static_assert(false, "CUDA doesn't support formatted surface writes on 2D array surfaces");
 }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint2>(
-    uint2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint2>(
-    uint2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy));
-}
+// surf3Dwrite_convert
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint4>(
-    uint4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint4>(
-    uint4 v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
     int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+    cudaSurfaceBoundaryMode boundaryMode);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int>(
-    int v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int2>(
-    int2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int4>(
-    int4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+#define SLANG_SURF3DWRITE_CONVERT_IMPL(T, c)                             \
+    template<>                                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T>(      \
+        T v,                                                             \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode)                            \
+    {                                                                    \
+        asm volatile(                                                    \
+            "sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE                     \
+            " [%0, {%1, %2, %3, %4}], {%5};" ::"l"(surfObj),             \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0),                                                      \
+            c(v));                                                       \
+    }                                                                    \
+    template<>                                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##2>(   \
+        T##2 v,                                                          \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode)                            \
+    {                                                                    \
+        const T vx = v.x, vy = v.y;                                      \
+        asm volatile(                                                    \
+            "sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE                  \
+            " [%0, {%1, %2, %3, %4}], {%5, %6};" ::"l"(surfObj),         \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0),                                                      \
+            c(vx),                                                       \
+            c(vy));                                                      \
+    }                                                                    \
+    template<>                                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##4>(   \
+        T##4 v,                                                          \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode)                            \
+    {                                                                    \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                  \
+        asm volatile(                                                    \
+            "sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE                  \
+            " [%0, {%1, %2, %3, %4}], {%5, %6, %7, %8};" ::"l"(surfObj), \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0),                                                      \
+            c(vx),                                                       \
+            c(vy),                                                       \
+            c(vz),                                                       \
+            c(vw));                                                      \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int>(
-    int v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int2>(
-    int2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy));
-}
-// Int4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int4>(
-    int4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+SLANG_SURF3DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF3DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF3DWRITE_CONVERT_IMPL(int, "r")
 
 // ----------------------------- F32 -----------------------------------------
 
@@ -4413,337 +4231,218 @@ struct TensorView
 };
 
 // Implementations for texture fetch/load functions using tex PTX intrinsics
-// These are used for read-only texture access with integer coordinates
-// See #6781 for details.
+// These are used for read-only texture access with integer coordinates.
 
-// 1D is not supported via PTX. Keeping this placeholder in case it ever gets
-// supported.
+// 1D is not supported via PTX. Keeping the implementation below in case it ever gets supported.
 template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
-{
-    T result;
-    float stub;
-    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x));
-    return result;
-}
-
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    T result;
-    float stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result;
-    uint stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result;
-    int stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_int2(result_x, result_y);
-}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip)
+{
+    static_assert(false, "CUDA does not support fetching from 1D textures");
+}
+
+#if 0
+#define SLANG_TEX1DFETCH_INT_IMPL(T, dtype, c)                                                 \
+    template<>                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip)    \
+    {                                                                                          \
+        T result;                                                                              \
+        T stub;                                                                                \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result), c(stub), c(stub), c(stub)                                             \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return result;                                                                         \
+    }                                                                                          \
+    template<>                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+    {                                                                                          \
+        T result_x, result_y;                                                                  \
+        T stub;                                                                                \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                       \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return make_##T##2(result_x, result_y);                                                \
+    }                                                                                          \
+    template<>                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+    {                                                                                          \
+        T result_x, result_y, result_z, result_w;                                              \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                               \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return make_##T##4(result_x, result_y, result_z, result_w);                            \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX1DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX1DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX1DFETCH_INT_IMPL(int, "s32", "=r")
+#endif
 
 template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    T result;
-    float stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    // Note: The repeated z is a stub used as the fourth operand in ptx.
-    // From the docs:
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
-    // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
-    // textures; and is a four-element vector for 3d textures.
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result;
-    uint stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result;
-    int stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_int2(result_x, result_y);
-}
+#define SLANG_TEX2DFETCH_INT_IMPL(T, dtype, c)                                                     \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip) \
+    {                                                                                              \
+        T result;                                                                                  \
+        T stub;                                                                                    \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result), c(stub), c(stub), c(stub)                                                 \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return result;                                                                             \
+    }                                                                                              \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                             \
+        T##2 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip)                             \
+    {                                                                                              \
+        T result_x, result_y;                                                                      \
+        T stub;                                                                                    \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                           \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return make_##T##2(result_x, result_y);                                                    \
+    }                                                                                              \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                             \
+        T##4 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip)                             \
+    {                                                                                              \
+        T result_x, result_y, result_z, result_w;                                                  \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                                   \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return make_##T##4(result_x, result_y, result_z, result_w);                                \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX2DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX2DFETCH_INT_IMPL(int, "s32", "=r")
 
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
-{
-    T result;
-    float stub;
-    asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(layer));
-    return result;
-}
 
 template<typename T>
 SLANG_FORCE_INLINE SLANG_CUDA_CALL T
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    T result;
-    float stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result;
-    uint stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+#define SLANG_TEX3DFETCH_INT_IMPL(T, dtype, c)                                            \
+    template<>                                                                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                                  \
+    tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)                      \
+    {                                                                                     \
+        T result;                                                                         \
+        T stub;                                                                           \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result), c(stub), c(stub), c(stub)                                        \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return result;                                                                    \
+    }                                                                                     \
+    template<>                                                                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                    \
+        T##2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)             \
+    {                                                                                     \
+        T result_x, result_y;                                                             \
+        T stub;                                                                           \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result_x), c(result_y), c(stub), c(stub)                                  \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return make_##T##2(result_x, result_y);                                           \
+    }                                                                                     \
+    template<>                                                                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                    \
+        T##4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)             \
+    {                                                                                     \
+        T result_x, result_y, result_z, result_w;                                         \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                          \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return make_##T##4(result_x, result_y, result_z, result_w);                       \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_uint2(result_x, result_y);
-}
+SLANG_TEX3DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX3DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX3DFETCH_INT_IMPL(int, "s32", "=r")
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip);
+
+#define SLANG_TEX1DARRAYFETCH_INT_IMPL(T, dtype, c)                                \
+    template<>                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                           \
+    tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)             \
+    {                                                                              \
+        T result;                                                                  \
+        T stub;                                                                    \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result), c(stub), c(stub), c(stub)                                 \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return result;                                                             \
+    }                                                                              \
+    template<>                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                             \
+        T##2 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)    \
+    {                                                                              \
+        T result_x, result_y;                                                      \
+        T stub;                                                                    \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result_x), c(result_y), c(stub), c(stub)                           \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return make_##T##2(result_x, result_y);                                    \
+    }                                                                              \
+    template<>                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                             \
+        T##4 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)    \
+    {                                                                              \
+        T result_x, result_y, result_z, result_w;                                  \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                   \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return make_##T##4(result_x, result_y, result_z, result_w);                \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2DArrayfetch_int(
-    CUtexObject texObj,
-    int x,
-    int y,
-    int layer)
-{
-    int result;
-    int stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+SLANG_TEX1DARRAYFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX1DARRAYFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX1DARRAYFETCH_INT_IMPL(int, "s32", "=r")
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_int2(result_x, result_y);
-}
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip);
+
+#define SLANG_TEX2DARRAYFETCH_INT_IMPL(T, dtype, c)                                         \
+    template<>                                                                              \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                                    \
+    tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)               \
+    {                                                                                       \
+        T result;                                                                           \
+        T stub;                                                                             \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result), c(stub), c(stub), c(stub)                                          \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return result;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                      \
+        T##2 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)      \
+    {                                                                                       \
+        T result_x, result_y;                                                               \
+        T stub;                                                                             \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                    \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return make_##T##2(result_x, result_y);                                             \
+    }                                                                                       \
+    template<>                                                                              \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                      \
+        T##4 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)      \
+    {                                                                                       \
+        T result_x, result_y, result_z, result_w;                                           \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                            \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return make_##T##4(result_x, result_y, result_z, result_w);                         \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DARRAYFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX2DARRAYFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX2DARRAYFETCH_INT_IMPL(int, "s32", "=r")
 
 // Wave rotate helper functions - templated approach
 #define SLANG_WARP_FULL_MASK 0xFFFFFFFF
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index ff2762b23..26c2a43cb 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -3654,32 +3654,29 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
             __intrinsic_asm ".Load";
 		case cuda:
             if (isArray != 0)
+            {
+                switch(Shape.flavor)
                 {
-     				static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_3D),
-                              "Integer coordinates are supported for texture reads only for 2D and 3D textures and 2D array textures.");
-
-                    if (Shape.flavor == $(SLANG_TEXTURE_2D))
-                    {
-						__intrinsic_asm "tex2DArrayfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
-				    }
-                    else
-					{
-                        __intrinsic_asm "<invalid intrinsic>";
-                    }
+                case $(SLANG_TEXTURE_1D):
+                    __intrinsic_asm "tex1DArrayfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
+                case $(SLANG_TEXTURE_2D):
+                    __intrinsic_asm "tex2DArrayfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z, ($1).w)";
                 }
-                else
+            }
+            else
+            {
+                switch(Shape.flavor)
                 {
-                    switch(Shape.flavor)
-                    {
-                    case $(SLANG_TEXTURE_2D):
-                        __intrinsic_asm "tex2Dfetch_int<$T0>($0, ($1).x, ($1).y)";
-                    case $(SLANG_TEXTURE_3D):
-                        __intrinsic_asm "tex3Dfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
-					case $(SLANG_TEXTURE_CUBE):
-                    default:
-                        __intrinsic_asm "<invalid intrinsic>";
-                    }
+                case $(SLANG_TEXTURE_1D):
+                    __intrinsic_asm "tex1Dfetch_int<$T0>($0, ($1).x, ($1).y)";
+                case $(SLANG_TEXTURE_2D):
+                    __intrinsic_asm "tex2Dfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
+                case $(SLANG_TEXTURE_3D):
+                    __intrinsic_asm "tex3Dfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z, ($1).w)";
                 }
+            }
+            static_assert(false, "Unsupported 'Load' of 'texture' for 'cuda' target");
+            __intrinsic_asm "<invalid intrinsic>";
         case metal:
             switch (Shape.flavor)
             {
author	Simon Kallweit <64953474+skallweitNV@users.noreply.github.com>	2025-10-08 20:18:23 +0200
committer	GitHub <noreply@github.com>	2025-10-08 18:18:23 +0000
commit	4b3fd0a4536f873649e6fcdefbaf2f8b6f0897e0 (patch)
tree	1d6369221c8e1786f97de0d36fce3d9b955c4150
parent	e4d1200cb45260b2a114d6f4f8f8d0b389a7da56 (diff)