diff options
| author | Mukund Keshava <mkeshava@nvidia.com> | 2025-05-12 11:09:41 +0530 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-05-12 11:09:41 +0530 |
| commit | 03f9242489d5598c9c7594ac12e269f57a018cda (patch) | |
| tree | 2315d185985aa9d65dd4e3db8e1bf8d7a79d77c8 /prelude | |
| parent | b46c342f47b61119a0dc517ce6eb75eab3398504 (diff) | |
cuda: Add more formats for texture read/write (#7012)
* WIP: Add more formats for texture reads
* fix test
* format code
* add float2/float4 versions for 1D and 3D as well
* fixed review comment
* fix review comments
---------
Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 186 |
1 file changed, 175 insertions, 11 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 7c68384b3..a9e99683e 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -3408,9 +3408,9 @@ template<typename T> SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x) { T result; - float dummy; + float stub; asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];" - : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy) + : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub) : "l"(texObj), "r"(x)); return result; } @@ -3419,22 +3419,75 @@ template<typename T> SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y) { T result; - float dummy; + float stub; asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" - : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy) + : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub) : "l"(texObj), "r"(x), "r"(y)); return result; } +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex2Dfetch_int(CUtexObject texObj, int x, int y) +{ + float result_x, result_y; + float stub; + asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" + : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub) + : "l"(texObj), "r"(x), "r"(y)); + return make_float2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex2Dfetch_int(CUtexObject texObj, int x, int y) +{ + float result_x, result_y, result_z, result_w; + asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" + : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w) + : "l"(texObj), "r"(x), "r"(y)); + return make_float4(result_x, result_y, result_z, result_w); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex2Dfetch_int(CUtexObject texObj, int x, int y) +{ + uint result; + uint stub; + asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" + : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y)); + return result; +} + +template<> +SLANG_FORCE_INLINE 
SLANG_CUDA_CALL uint2 tex2Dfetch_int(CUtexObject texObj, int x, int y) +{ + uint result_x, result_y; + uint stub; + asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" + : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y)); + return make_uint2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex2Dfetch_int(CUtexObject texObj, int x, int y) +{ + uint result_x, result_y, result_z, result_w; + asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" + : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w) + : "l"(texObj), "r"(x), "r"(y)); + return make_uint4(result_x, result_y, result_z, result_w); +} + template<typename T> SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) { T result; - float dummy; + float stub; asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" - : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy) + : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub) : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); - // Note: The repeated z is a dummy used as the fourth operand in ptx. + // Note: The repeated z is a stub used as the fourth operand in ptx. 
// From the docs: // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d @@ -3442,13 +3495,66 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, i return result; } +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) +{ + float result_x, result_y; + float stub; + asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); + return make_float2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) +{ + float result_x, result_y, result_z, result_w; + asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w) + : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); + return make_float4(result_x, result_y, result_z, result_w); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) +{ + uint result; + uint stub; + asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); + return result; +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) +{ + uint result_x, result_y; + uint stub; + asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); + return make_uint2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z) +{ + uint result_x, result_y, result_z, result_w; + 
asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w) + : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z)); + return make_uint4(result_x, result_y, result_z, result_w); +} + template<typename T> SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer) { T result; - float dummy; + float stub; asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];" - : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy) + : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub) : "l"(texObj), "r"(x), "r"(layer)); return result; } @@ -3458,9 +3564,67 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) { T result; - float dummy; + float stub; asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" - : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy) + : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub) : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); return result; +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 +tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) +{ + float result_x, result_y; + float stub; + asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); + return make_float2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 +tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) +{ + float result_x, result_y, result_z, result_w; + asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w) + : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); + return make_float4(result_x, result_y, result_z, result_w); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint +tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) +{ 
+ uint result; + uint stub; + asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); + return result; +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 +tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) +{ + uint result_x, result_y; + uint stub; + asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub) + : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); + return make_uint2(result_x, result_y); +} + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 +tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) +{ + uint result_x, result_y, result_z, result_w; + asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];" + : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w) + : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); + return make_uint4(result_x, result_y, result_z, result_w); }
\ No newline at end of file |
