Add subscript operator support in cuda (#6830)

* cuda: Add support for subscript operator This CL adds support for the subscript operator for Read Only textures in cuda. Also adds a test for this. Fixes #6781 * format code * fix review comments * format code --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
author: Mukund Keshava <mkeshava@nvidia.com> 2025-04-30 16:07:02 +0530
committer: GitHub <noreply@github.com> 2025-04-30 10:37:02 +0000
commit: b0e150511a6a536c8ad9e74910b30ae179a10ec9 (patch)
tree: cb749d757e0e556d987d6a30020971ed5a6aa41d /prelude
parent: 41ac7a0d8b4e9c08eccc2153020900e0262cae84 (diff)
1 files changed, 67 insertions, 0 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 38e018e3e..738f2fa16 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -3366,3 +3366,70 @@ struct TensorView
         *reinterpret_cast<T*>(data + offset) = val;
     }
 };
+
+// Implementations for texture fetch/load functions using tex PTX intrinsics
+// These are used for read-only texture access with integer coordinates
+// See #6781 for details.
+
+// 1D is not supported via PTX. Keeping this placeholder in case it ever gets
+// supported.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
+{
+    float discarded; // sink for the 2nd-4th components of the v4 result
+    T texel;         // 1st component; NOTE(review): "=f" suggests T is float - confirm
+    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
+        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
+        : "l"(texObj), "r"(x)); // inputs: texture object handle, integer coordinate x
+    return texel;
+}
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    float discarded; // receives the three components of the v4 result that are dropped
+    T texel;         // receives the first component, which is the value returned
+    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
+        : "l"(texObj), "r"(x), "r"(y)); // inputs: texture object handle, then (x, y)
+    return texel;
+}
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    float discarded; // sink for the 2nd-4th components of the v4 result
+    T texel;         // 1st component of the fetched texel; this is what gets returned
+    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    // z is passed twice on purpose: the second copy only fills the fourth slot of
+    // the coordinate vector. Per the PTX documentation
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
+    // operand c is a scalar or singleton tuple for 1d textures, a two-element vector
+    // for 2d textures, and a four-element vector for 3d textures.
+    return texel;
+}
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
+{
+    T result;    // 1st component of the fetched texel; this is what gets returned
+    float dummy; // sink for the 2nd-4th components of the v4 result
+    asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+        : "l"(texObj), "r"(layer), "r"(x)); // PTX array operand c is {layer index, x}
+    return result;
+}
+
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    T result;    // 1st component of the fetched texel; this is what gets returned
+    float dummy; // sink for the 2nd-4th components of the v4 result
+    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+        : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(y)); // c = {layer, x, y, ignored}
+    return result;
+}
+\ No newline at end of file
author	Mukund Keshava <mkeshava@nvidia.com>	2025-04-30 16:07:02 +0530
committer	GitHub <noreply@github.com>	2025-04-30 10:37:02 +0000
commit	b0e150511a6a536c8ad9e74910b30ae179a10ec9 (patch)
tree	cb749d757e0e556d987d6a30020971ed5a6aa41d /prelude
parent	41ac7a0d8b4e9c08eccc2153020900e0262cae84 (diff)