1 files changed, 388 insertions, 689 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 6c68cdb71..2c8faf922 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1323,419 +1323,237 @@ SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
 // For the _convert versions they are *not*. They don't need to be because sust.p does not require
 // it.
 
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
-    T,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode);
-
 // https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust
 
-// Float
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(
-    float v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(v));
-}
+// surf1Dwrite_convert
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(
-    float v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(v));
-}
+    cudaSurfaceBoundaryMode boundaryMode); // declaration only; per-type specializations follow
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(
-    float v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(v));
-}
+#define SLANG_SURF1DWRITE_CONVERT_IMPL(T, c) /* T: element type; c: its asm constraint */        \
+    template<> /* scalar T: stores one b32 value */                                              \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T>(                              \
+        T v,                                                                                     \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode) /* unused: mode is SLANG_PTX_BOUNDARY_MODE */      \
+    {                                                                                            \
+        asm volatile(                                                                            \
+            "sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};" ::"l"(surfObj),        \
+            "r"(x),                                                                              \
+            c(v));                                                                               \
+    }                                                                                            \
+    template<> /* T##2: stores two b32 values (.v2) */                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##2>(                           \
+        T##2 v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const T vx = v.x, vy = v.y;                                                              \
+        asm volatile(                                                                            \
+            "sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2, %3};" ::"l"(surfObj), \
+            "r"(x),                                                                              \
+            c(vx),                                                                               \
+            c(vy));                                                                              \
+    }                                                                                            \
+    template<> /* T##4: stores four b32 values (.v4) */                                          \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##4>(                           \
+        T##4 v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                             \
+        int x,                                                                                   \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                                          \
+        asm volatile(                                                                            \
+            "sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE                                          \
+            " [%0, {%1}], {%2, %3, %4, %5};" ::"l"(surfObj),                                     \
+            "r"(x),                                                                              \
+            c(vx),                                                                               \
+            c(vy),                                                                               \
+            c(vz),                                                                               \
+            c(vw));                                                                              \
+    }
 
-// Float2
+SLANG_SURF1DWRITE_CONVERT_IMPL(float, "f") // float, float2, float4
+SLANG_SURF1DWRITE_CONVERT_IMPL(uint, "r")  // uint, uint2, uint4
+SLANG_SURF1DWRITE_CONVERT_IMPL(int, "r")   // int, int2, int4
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(
-    float2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(vx),
-        "f"(vy));
-}
+// surf1DLayeredwrite_convert (not supported)
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(
-    float2 v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1DLayeredwrite_convert(
+    T v, // unused; the type-dependent static_assert below rejects every instantiation
     cudaSurfaceObject_t surfObj,
     int x,
-    int y,
+    int layer,
     cudaSurfaceBoundaryMode boundaryMode)
 {
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(vx),
-        "f"(vy));
+    static_assert(!sizeof(T), "CUDA doesn't support formatted surface writes on 1D array surfaces");
 }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(
-    float2 v,
+// surf2Dwrite_convert
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(vx),
-        "f"(vy));
-}
+    cudaSurfaceBoundaryMode boundaryMode); // declaration only; per-type specializations follow
 
-// Float4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1}], {%2,%3,%4,%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+#define SLANG_SURF2DWRITE_CONVERT_IMPL(T, c) /* T: element type; c: its asm constraint */     \
+    template<> /* scalar T: stores one b32 value */                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T>(                           \
+        T v,                                                                                  \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode) /* unused: mode is SLANG_PTX_BOUNDARY_MODE */   \
+    {                                                                                         \
+        asm volatile(                                                                         \
+            "sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1, %2}], {%3};" ::"l"(surfObj), \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(v));                                                                            \
+    }                                                                                         \
+    template<> /* T##2: stores two b32 values (.v2) */                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##2>(                        \
+        T##2 v,                                                                               \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode)                                                 \
+    {                                                                                         \
+        const T vx = v.x, vy = v.y;                                                           \
+        asm volatile(                                                                         \
+            "sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE                                       \
+            " [%0, {%1, %2}], {%3, %4};" ::"l"(surfObj),                                      \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(vx),                                                                            \
+            c(vy));                                                                           \
+    }                                                                                         \
+    template<> /* T##4: stores four b32 values (.v4) */                                       \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##4>(                        \
+        T##4 v,                                                                               \
+        cudaSurfaceObject_t surfObj,                                                          \
+        int x,                                                                                \
+        int y,                                                                                \
+        cudaSurfaceBoundaryMode boundaryMode)                                                 \
+    {                                                                                         \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                                       \
+        asm volatile(                                                                         \
+            "sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE                                       \
+            " [%0, {%1, %2}], {%3, %4, %5, %6};" ::"l"(surfObj),                              \
+            "r"(x),                                                                           \
+            "r"(y),                                                                           \
+            c(vx),                                                                            \
+            c(vy),                                                                            \
+            c(vz),                                                                            \
+            c(vw));                                                                           \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+SLANG_SURF2DWRITE_CONVERT_IMPL(float, "f") // float, float2, float4
+SLANG_SURF2DWRITE_CONVERT_IMPL(uint, "r")  // uint, uint2, uint4
+SLANG_SURF2DWRITE_CONVERT_IMPL(int, "r")   // int, int2, int4
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(
-    float4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "f"(vx),
-        "f"(vy),
-        "f"(vz),
-        "f"(vw));
-}
+// surf2DLayeredwrite_convert (not supported)
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint>(
-    uint v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(v));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint>(
-    uint v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2DLayeredwrite_convert(
+    T v, // unused; the type-dependent static_assert below rejects every instantiation
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
-    int z,
+    int layer,
     cudaSurfaceBoundaryMode boundaryMode)
 {
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(v));
+    static_assert(!sizeof(T), "CUDA doesn't support formatted surface writes on 2D array surfaces");
 }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint2>(
-    uint2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint2>(
-    uint2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy));
-}
+// surf3Dwrite_convert
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint4>(
-    uint4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint4>(
-    uint4 v,
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
+    T v,
     cudaSurfaceObject_t surfObj,
     int x,
     int y,
     int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+    cudaSurfaceBoundaryMode boundaryMode); // declaration only; per-type specializations follow
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int>(
-    int v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int2>(
-    int2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int4>(
-    int4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+#define SLANG_SURF3DWRITE_CONVERT_IMPL(T, c) /* c: asm constraint */     \
+    template<> /* scalar T: stores one b32 value */                      \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T>(      \
+        T v,                                                             \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode) /* unused */               \
+    {                                                                    \
+        asm volatile(                                                    \
+            "sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE                     \
+            " [%0, {%1, %2, %3, %4}], {%5};" ::"l"(surfObj),             \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0), /* 4th coordinate slot (%4) is always 0 */           \
+            c(v));                                                       \
+    }                                                                    \
+    template<> /* T##2: stores two b32 values (.v2) */                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##2>(   \
+        T##2 v,                                                          \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode)                            \
+    {                                                                    \
+        const T vx = v.x, vy = v.y;                                      \
+        asm volatile(                                                    \
+            "sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE                  \
+            " [%0, {%1, %2, %3, %4}], {%5, %6};" ::"l"(surfObj),         \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0),                                                      \
+            c(vx),                                                       \
+            c(vy));                                                      \
+    }                                                                    \
+    template<> /* T##4: stores four b32 values (.v4) */                  \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##4>(   \
+        T##4 v,                                                          \
+        cudaSurfaceObject_t surfObj,                                     \
+        int x,                                                           \
+        int y,                                                           \
+        int z,                                                           \
+        cudaSurfaceBoundaryMode boundaryMode)                            \
+    {                                                                    \
+        const T vx = v.x, vy = v.y, vz = v.z, vw = v.w;                  \
+        asm volatile(                                                    \
+            "sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE                  \
+            " [%0, {%1, %2, %3, %4}], {%5, %6, %7, %8};" ::"l"(surfObj), \
+            "r"(x),                                                      \
+            "r"(y),                                                      \
+            "r"(z),                                                      \
+            "r"(0),                                                      \
+            c(vx),                                                       \
+            c(vy),                                                       \
+            c(vz),                                                       \
+            c(vw));                                                      \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int>(
-    int v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile(
-        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int2>(
-    int2 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y;
-    asm volatile(
-        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy));
-}
-// Int4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int4>(
-    int4 v,
-    cudaSurfaceObject_t surfObj,
-    int x,
-    int y,
-    int z,
-    cudaSurfaceBoundaryMode boundaryMode)
-{
-    const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile(
-        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
-        " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
-        "r"(x),
-        "r"(y),
-        "r"(z),
-        "r"(0),
-        "r"(vx),
-        "r"(vy),
-        "r"(vz),
-        "r"(vw));
-}
+SLANG_SURF3DWRITE_CONVERT_IMPL(float, "f") // float, float2, float4
+SLANG_SURF3DWRITE_CONVERT_IMPL(uint, "r")  // uint, uint2, uint4
+SLANG_SURF3DWRITE_CONVERT_IMPL(int, "r")   // int, int2, int4
 
 // ----------------------------- F32 -----------------------------------------
 
@@ -4413,337 +4231,218 @@ struct TensorView
 };
 
 // Implementations for texture fetch/load functions using tex PTX intrinsics
-// These are used for read-only texture access with integer coordinates
-// See #6781 for details.
+// These are used for read-only texture access with integer coordinates.
 
-// 1D is not supported via PTX. Keeping this placeholder in case it ever gets
-// supported.
+// 1D is not supported via PTX. Keeping the implementation below in case it ever gets supported.
 template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
-{
-    T result;
-    float stub;
-    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x));
-    return result;
-}
-
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    T result;
-    float stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result;
-    uint stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result;
-    int stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_int2(result_x, result_y);
-}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip)
+{ // Fires only on instantiation: a non-dependent static_assert(false) can fail even unused.
+    static_assert(sizeof(T) == 0, "CUDA does not support fetching from 1D textures");
+}
+
+#if 0
+#define SLANG_TEX1DFETCH_INT_IMPL(T, dtype, c)                                                 \
+    template<> /* scalar T: keeps component 0 of the v4 fetch */                               \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip)    \
+    {                                                                                          \
+        T result;                                                                              \
+        T stub; /* sink for the discarded components */                                        \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result), c(stub), c(stub), c(stub)                                             \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return result;                                                                         \
+    }                                                                                          \
+    template<> /* T##2: keeps components 0 and 1 */                                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+    {                                                                                          \
+        T result_x, result_y;                                                                  \
+        T stub;                                                                                \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                       \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return make_##T##2(result_x, result_y);                                                \
+    }                                                                                          \
+    template<> /* T##4: keeps all four components */                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+    {                                                                                          \
+        T result_x, result_y, result_z, result_w;                                              \
+        asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;"                  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                               \
+            : "l"(texObj), "r"(x), "r"(mip));                                                  \
+        return make_##T##4(result_x, result_y, result_z, result_w);                            \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX1DFETCH_INT_IMPL(float, "f32", "=f") // float, float2, float4 (compiled out by #if 0)
+SLANG_TEX1DFETCH_INT_IMPL(uint, "u32", "=r")  // uint, uint2, uint4 (compiled out by #if 0)
+SLANG_TEX1DFETCH_INT_IMPL(int, "s32", "=r")   // int, int2, int4 (compiled out by #if 0)
+#endif
 
 template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    T result;
-    float stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    // Note: The repeated z is a stub used as the fourth operand in ptx.
-    // From the docs:
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
-    // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
-    // textures; and is a four-element vector for 3d textures.
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result;
-    uint stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result;
-    int stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_int2(result_x, result_y);
-}
+#define SLANG_TEX2DFETCH_INT_IMPL(T, dtype, c)                                                     \
+    template<> /* scalar T: keeps component 0 of the v4 fetch */                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip) \
+    {                                                                                              \
+        T result;                                                                                  \
+        T stub; /* sink for the discarded components */                                            \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result), c(stub), c(stub), c(stub)                                                 \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return result;                                                                             \
+    }                                                                                              \
+    template<> /* T##2: keeps components 0 and 1 */                                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                             \
+        T##2 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip)                             \
+    {                                                                                              \
+        T result_x, result_y;                                                                      \
+        T stub;                                                                                    \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                           \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return make_##T##2(result_x, result_y);                                                    \
+    }                                                                                              \
+    template<> /* T##4: keeps all four components */                                               \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                             \
+        T##4 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip)                             \
+    {                                                                                              \
+        T result_x, result_y, result_z, result_w;                                                  \
+        asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;"                  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                                   \
+            : "l"(texObj), "r"(x), "r"(y), "r"(mip));                                              \
+        return make_##T##4(result_x, result_y, result_z, result_w);                                \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DFETCH_INT_IMPL(float, "f32", "=f") // float, float2, float4
+SLANG_TEX2DFETCH_INT_IMPL(uint, "u32", "=r")  // uint, uint2, uint4
+SLANG_TEX2DFETCH_INT_IMPL(int, "s32", "=r")   // int, int2, int4
 
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
-{
-    T result;
-    float stub;
-    asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(layer));
-    return result;
-}
 
 template<typename T>
 SLANG_FORCE_INLINE SLANG_CUDA_CALL T
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    T result;
-    float stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip);
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    float result_x, result_y;
-    float stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    float result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result;
-    uint stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+#define SLANG_TEX3DFETCH_INT_IMPL(T, dtype, c)                                            \
+    template<> /* scalar T: keeps component 0 of the v4 fetch */                          \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                                  \
+    tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)                      \
+    {                                                                                     \
+        T result;                                                                         \
+        T stub; /* sink for the discarded components */                                   \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result), c(stub), c(stub), c(stub)                                        \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return result;                                                                    \
+    }                                                                                     \
+    template<> /* T##2: keeps components 0 and 1 */                                       \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                    \
+        T##2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)             \
+    {                                                                                     \
+        T result_x, result_y;                                                             \
+        T stub;                                                                           \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result_x), c(result_y), c(stub), c(stub)                                  \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return make_##T##2(result_x, result_y);                                           \
+    }                                                                                     \
+    template<> /* T##4: keeps all four components */                                      \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                    \
+        T##4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip)             \
+    {                                                                                     \
+        T result_x, result_y, result_z, result_w;                                         \
+        asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                          \
+            : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip));       \
+        return make_##T##4(result_x, result_y, result_z, result_w);                       \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result_x, result_y;
-    uint stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_uint2(result_x, result_y);
-}
+SLANG_TEX3DFETCH_INT_IMPL(float, "f32", "=f") // float, float2, float4
+SLANG_TEX3DFETCH_INT_IMPL(uint, "u32", "=r")  // uint, uint2, uint4
+SLANG_TEX3DFETCH_INT_IMPL(int, "s32", "=r")   // int, int2, int4
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    uint result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_uint4(result_x, result_y, result_z, result_w);
-}
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip);
+
+#define SLANG_TEX1DARRAYFETCH_INT_IMPL(T, dtype, c)                                \
+    template<> /* scalar T: keeps component 0 of the v4 fetch */                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                           \
+    tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)             \
+    { /* coordinate vector is passed layer-first */                                \
+        T result;                                                                  \
+        T stub; /* sink for the discarded components */                            \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result), c(stub), c(stub), c(stub)                                 \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return result;                                                             \
+    }                                                                              \
+    template<> /* T##2: keeps components 0 and 1 */                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                             \
+        T##2 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)    \
+    {                                                                              \
+        T result_x, result_y;                                                      \
+        T stub;                                                                    \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result_x), c(result_y), c(stub), c(stub)                           \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return make_##T##2(result_x, result_y);                                    \
+    }                                                                              \
+    template<> /* T##4: keeps all four components */                               \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                             \
+        T##4 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip)    \
+    {                                                                              \
+        T result_x, result_y, result_z, result_w;                                  \
+        asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                   \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(mip));                          \
+        return make_##T##4(result_x, result_y, result_z, result_w);                \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2DArrayfetch_int(
-    CUtexObject texObj,
-    int x,
-    int y,
-    int layer)
-{
-    int result;
-    int stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return result;
-}
+SLANG_TEX1DARRAYFETCH_INT_IMPL(float, "f32", "=f") // float, float2, float4
+SLANG_TEX1DARRAYFETCH_INT_IMPL(uint, "u32", "=r")  // uint, uint2, uint4
+SLANG_TEX1DARRAYFETCH_INT_IMPL(int, "s32", "=r")   // int, int2, int4
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    int result_x, result_y;
-    int stub;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_int2(result_x, result_y);
-}
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip);
+
+#define SLANG_TEX2DARRAYFETCH_INT_IMPL(T, dtype, c)                                         \
+    template<> /* scalar T: keeps component 0 of the v4 fetch */                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T                                                    \
+    tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)               \
+    { /* coordinate vector is passed layer-first */                                         \
+        T result;                                                                           \
+        T stub; /* sink for the discarded components */                                     \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result), c(stub), c(stub), c(stub)                                          \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return result;                                                                      \
+    }                                                                                       \
+    template<> /* T##2: keeps components 0 and 1 */                                         \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                      \
+        T##2 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)      \
+    {                                                                                       \
+        T result_x, result_y;                                                               \
+        T stub;                                                                             \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result_x), c(result_y), c(stub), c(stub)                                    \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return make_##T##2(result_x, result_y);                                             \
+    }                                                                                       \
+    template<> /* T##4: keeps all four components */                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL                                                      \
+        T##4 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip)      \
+    {                                                                                       \
+        T result_x, result_y, result_z, result_w;                                           \
+        asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"  \
+            : c(result_x), c(result_y), c(result_z), c(result_w)                            \
+            : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+        return make_##T##4(result_x, result_y, result_z, result_w);                         \
+    }
 
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
-    int result_x, result_y, result_z, result_w;
-    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
-        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
-        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
-    return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DARRAYFETCH_INT_IMPL(float, "f32", "=f") // float, float2, float4
+SLANG_TEX2DARRAYFETCH_INT_IMPL(uint, "u32", "=r")  // uint, uint2, uint4
+SLANG_TEX2DARRAYFETCH_INT_IMPL(int, "s32", "=r")   // int, int2, int4
 
 // Wave rotate helper functions - templated approach
 #define SLANG_WARP_FULL_MASK 0xFFFFFFFF