summaryrefslogtreecommitdiff
path: root/prelude
diff options
context:
space:
mode:
authorMukund Keshava <mkeshava@nvidia.com>2025-05-12 11:09:41 +0530
committerGitHub <noreply@github.com>2025-05-12 11:09:41 +0530
commit03f9242489d5598c9c7594ac12e269f57a018cda (patch)
tree2315d185985aa9d65dd4e3db8e1bf8d7a79d77c8 /prelude
parentb46c342f47b61119a0dc517ce6eb75eab3398504 (diff)
cuda: Add more formats for texture read/write (#7012)
* WiP: Add more formats for texture reads * fix test * format code * add float2/float4 versions for 1D and 3D as well * fixed review comment * fix review comments --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h186
1 files changed, 175 insertions, 11 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 7c68384b3..a9e99683e 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -3408,9 +3408,9 @@ template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
{
T result;
- float dummy;
+ float stub;
asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
- : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+ : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
: "l"(texObj), "r"(x));
return result;
}
@@ -3419,22 +3419,75 @@ template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
{
T result;
- float dummy;
+ float stub;
asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+ : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
: "l"(texObj), "r"(x), "r"(y));
return result;
}
+// float2 specialization: reads the texel at integer coordinates (x, y) with a
+// four-component PTX `tex.2d` instruction and returns its first two components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    float texelX;
+    float texelY;
+    // The instruction always writes four registers; the last two are not returned.
+    float discard;
+    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(discard), "=f"(discard)
+        : "l"(texObj), "r"(x), "r"(y));
+    return make_float2(texelX, texelY);
+}
+
+// float4 specialization: reads the texel at integer coordinates (x, y) with a
+// four-component PTX `tex.2d` instruction and returns all four components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    float texelX;
+    float texelY;
+    float texelZ;
+    float texelW;
+    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(texelZ), "=f"(texelW)
+        : "l"(texObj), "r"(x), "r"(y));
+    return make_float4(texelX, texelY, texelZ, texelW);
+}
+
+// uint specialization: reads the texel at integer coordinates (x, y) with a
+// four-component PTX `tex.2d` instruction and returns its first component.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    uint result;
+    // The instruction always writes four registers; the last three are not returned.
+    uint stub;
+    asm("tex.2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y));
+    return result;
+}
+
+// uint2 specialization: reads the texel at integer coordinates (x, y) with a
+// four-component PTX `tex.2d` instruction and returns its first two components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    uint result_x, result_y;
+    // The instruction always writes four registers; the last two are not returned.
+    uint stub;
+    asm("tex.2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y));
+    return make_uint2(result_x, result_y);
+}
+
+// uint4 specialization: reads the texel at integer coordinates (x, y) with a
+// four-component PTX `tex.2d` instruction and returns all four components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    uint result_x, result_y, result_z, result_w;
+    asm("tex.2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
+        : "l"(texObj), "r"(x), "r"(y));
+    return make_uint4(result_x, result_y, result_z, result_w);
+}
+
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
{
T result;
- float dummy;
+ float stub;
asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+ : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
: "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- // Note: The repeated z is a dummy used as the fourth operand in ptx.
+ // Note: The repeated z is a stub that only fills the fourth coordinate operand required by PTX.
// From the docs:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
// Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
@@ -3442,13 +3495,66 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, i
return result;
}
+// float2 specialization: reads the texel at integer coordinates (x, y, z) with
+// a four-component PTX `tex.3d` instruction and returns its first two components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    float texelX;
+    float texelY;
+    // The instruction always writes four registers; the last two are not returned.
+    float discard;
+    // z is passed twice: the coordinate vector in the asm string has four slots.
+    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(discard), "=f"(discard)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return make_float2(texelX, texelY);
+}
+
+// float4 specialization: reads the texel at integer coordinates (x, y, z) with
+// a four-component PTX `tex.3d` instruction and returns all four components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    float texelX;
+    float texelY;
+    float texelZ;
+    float texelW;
+    // z is passed twice: the coordinate vector in the asm string has four slots.
+    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(texelZ), "=f"(texelW)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return make_float4(texelX, texelY, texelZ, texelW);
+}
+
+// uint specialization: reads the texel at integer coordinates (x, y, z) with a
+// four-component PTX `tex.3d` instruction and returns its first component.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    uint result;
+    // The instruction always writes four registers; the last three are not returned.
+    uint stub;
+    // z is passed twice: the coordinate vector in the asm string has four slots.
+    asm("tex.3d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return result;
+}
+
+// uint2 specialization: reads the texel at integer coordinates (x, y, z) with
+// a four-component PTX `tex.3d` instruction and returns its first two components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    uint result_x, result_y;
+    // The instruction always writes four registers; the last two are not returned.
+    uint stub;
+    // z is passed twice: the coordinate vector in the asm string has four slots.
+    asm("tex.3d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return make_uint2(result_x, result_y);
+}
+
+// uint4 specialization: reads the texel at integer coordinates (x, y, z) with
+// a four-component PTX `tex.3d` instruction and returns all four components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    uint result_x, result_y, result_z, result_w;
+    // z is passed twice: the coordinate vector in the asm string has four slots.
+    asm("tex.3d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return make_uint4(result_x, result_y, result_z, result_w);
+}
+
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
{
T result;
- float dummy;
+ float stub;
asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+ : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
: "l"(texObj), "r"(x), "r"(layer));
return result;
}
@@ -3458,9 +3564,67 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T
tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
{
T result;
- float dummy;
+ float stub;
asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+ : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
: "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
return result;
+}
+
+// float2 specialization: reads the texel at integer coordinates (x, y) of the
+// given array layer with a four-component PTX `tex.a2d` instruction and returns
+// its first two components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float2
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    float texelX;
+    float texelY;
+    // The instruction always writes four registers; the last two are not returned.
+    float discard;
+    // layer is passed twice, matching the four operand slots in the asm string.
+    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(discard), "=f"(discard)
+        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
+    return make_float2(texelX, texelY);
+}
+
+// float4 specialization: reads the texel at integer coordinates (x, y) of the
+// given array layer with a four-component PTX `tex.a2d` instruction and returns
+// all four components.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float4
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    float texelX;
+    float texelY;
+    float texelZ;
+    float texelW;
+    // layer is passed twice, matching the four operand slots in the asm string.
+    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texelX), "=f"(texelY), "=f"(texelZ), "=f"(texelW)
+        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
+    return make_float4(texelX, texelY, texelZ, texelW);
+}
+
+// uint specialization: reads the texel at integer coordinates (x, y) of the
+// given array layer with a four-component PTX `tex.a2d` instruction and returns
+// its first component.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    uint result;
+    // The instruction always writes four registers; the last three are not returned.
+    uint stub;
+    // layer is passed twice, matching the four operand slots in the asm string.
+    asm("tex.a2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
+    return result;
+}
+
+// uint2 specialization: reads the texel at integer coordinates (x, y) of the
+// given array layer with a four-component PTX `tex.a2d` instruction and returns
+// its first two components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    uint result_x, result_y;
+    // The instruction always writes four registers; the last two are not returned.
+    uint stub;
+    // layer is passed twice, matching the four operand slots in the asm string.
+    asm("tex.a2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
+        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
+    return make_uint2(result_x, result_y);
+}
+
+// uint4 specialization: reads the texel at integer coordinates (x, y) of the
+// given array layer with a four-component PTX `tex.a2d` instruction and returns
+// all four components.
+// The destination type is `.u32` so that the instruction type agrees with the
+// unsigned integer outputs bound through the "=r" constraint.
+template<>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    uint result_x, result_y, result_z, result_w;
+    // layer is passed twice, matching the four operand slots in the asm string.
+    asm("tex.a2d.v4.u32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
+        : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
+    return make_uint4(result_x, result_y, result_z, result_w);
} \ No newline at end of file