diff options
| author | Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com> | 2025-09-04 10:58:02 +0530 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-04 05:28:02 +0000 |
| commit | 5ec41675d817f82a7ce3c4d79c68548db0bd4227 (patch) | |
| tree | 57abff17713b5d9ea876be29e3b451c9abe8c49d /source/slang | |
| parent | b45706b3f532f85525de5746f1f607ba2e57fc88 (diff) | |
Enable CUDA support for additional HLSL intrinsic tests (#8293)
Enable CUDA support for additional HLSL intrinsic tests by implementing
missing functionality and fixing compiler bugs affecting CUDA targets.
- Fix critical bug in InterlockedCompareStore64 where division used /4
instead of /8 for 64-bit types, causing incorrect memory addressing for
all signed int64_t atomics
- Add signed int64_t atomic wrappers (atomicExch, atomicCAS) to CUDA
prelude that properly cast to/from unsigned types as required by CUDA's
atomic API
- Enable tests: atomic-intrinsics-64bit.slang
- Implement CUDA support for QuadAny and QuadAll operations using warp
shuffle primitives (__shfl_sync with quad-level lane masking)
- Add CUDA to quad_control capability definition in
slang-capabilities.capdef
- Add _slang_quadAny/_slang_quadAll helper functions to CUDA prelude
- Enable tests: quad-control-comp-functionality.slang,
subgroup-quad.slang
---------
Co-authored-by: szihs <675653+szihs@users.noreply.github.com>
Diffstat (limited to 'source/slang')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 93 | ||||
| -rw-r--r-- | source/slang/slang-capabilities.capdef | 2 |
2 files changed, 70 insertions, 25 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 2af0dbcf7..d5f8cd2e1 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -3480,23 +3480,31 @@ ${{{{ const char* componentArg = (isShadow ? "" : componentArgString[componentId]); }}}} [ForceInline] - [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)] + [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)] vector<T.Element,4> Gather$(compareFunc)$(componentFunc)( $(samplerParam) vector<float, Shape.dimensions+isArray> location $(compareParam)) { - static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), - "Gather is supported only for 2D and 3D textures"); - __target_switch { - case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case hlsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); + __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case cuda: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures"); + static_assert(isArray == 0, "CUDA Gather does not support texture arrays"); + __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))"; case metal: case wgsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)($(getTexture) $(getSampler), location $(compareArg) $(componentArg)); case glsl: case spirv: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)(this $(samplerArg), location $(compareArg) $(componentArg)); } } @@ -3519,24 +3527,33 @@ ${{{{ } [ForceInline] - [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)] + [require(cuda_glsl_hlsl_metal_spirv_wgsl, 
texture_gather)] vector<T.Element,4> Gather$(compareFunc)$(componentFunc)( $(samplerParam) vector<float, Shape.dimensions+isArray> location $(compareParam), constexpr vector<int, Shape.planeDimensions> offset) { - static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), - "Gather is supported only for 2D and 3D textures"); - __target_switch { - case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case hlsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); + __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case cuda: + // CUDA tex2Dgather doesn't support offset - ignore offset parameter + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures"); + static_assert(isArray == 0, "CUDA Gather does not support texture arrays"); + __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))"; case metal: case wgsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)_offset($(getTexture) $(getSampler), location $(compareArg), offset $(componentArg)); case glsl: case spirv: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)_offset(this $(samplerArg), location $(compareArg), offset $(componentArg)); } } @@ -5704,7 +5721,7 @@ ${{{{ case hlsl: __intrinsic_asm ".InterlockedCompareStore64"; default: let buf = __getEquivalentStructuredBuffer<T>(this); - __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value); + __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value); return; } } @@ -14917,7 +14934,7 @@ matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr) __generic<T 
: __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) -[require(glsl_hlsl_metal_spirv, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)] T QuadReadLaneAt(T sourceValue, uint quadLaneID) { __target_switch @@ -14934,12 +14951,14 @@ T QuadReadLaneAt(T sourceValue, uint quadLaneID) OpCapability GroupNonUniformQuad; result:$$T = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID; }; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))"; } } __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) -[require(glsl_hlsl_metal_spirv, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)] vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID) { __target_switch @@ -14955,6 +14974,8 @@ vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID) OpCapability GroupNonUniformQuad; result:$$vector<T,N> = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID; }; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID); @@ -14965,7 +14986,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossX(T localValue) { __target_switch @@ -14982,6 +15003,8 @@ T QuadReadAcrossX(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapX"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)"; } } @@ -14990,7 +15013,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) 
__wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossX(vector<T,N> localValue) { __target_switch @@ -15007,6 +15030,8 @@ vector<T,N> QuadReadAcrossX(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapX"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue); @@ -15017,7 +15042,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossY(T localValue) { __target_switch @@ -15034,6 +15059,8 @@ T QuadReadAcrossY(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapY"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)"; } } __generic<T : __BuiltinType, let N : int> @@ -15041,7 +15068,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossY(vector<T,N> localValue) { __target_switch @@ -15058,6 +15085,8 @@ vector<T,N> QuadReadAcrossY(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapY"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue); @@ -15068,7 +15097,7 @@ 
__glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossDiagonal(T localValue) { __target_switch @@ -15085,6 +15114,8 @@ T QuadReadAcrossDiagonal(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapDiagonal"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)"; } } __generic<T : __BuiltinType, let N : int> @@ -15092,7 +15123,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue) { __target_switch @@ -15109,6 +15140,8 @@ vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapDiagonal"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue); @@ -16300,7 +16333,7 @@ void shader_subgroup_preamble() __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] +[require(cuda_glsl_metal_spirv, subgroup_rotate)] T WaveRotate(T value, uint delta) { shader_subgroup_preamble<T>(); @@ -16308,6 +16341,8 @@ T WaveRotate(T value, uint delta) { case glsl: __intrinsic_asm "subgroupRotate"; + case cuda: + __intrinsic_asm "__shfl_sync(0xFFFFFFFF, $0, (_getLaneId() + $1) % 32)"; case metal: __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: @@ -16322,7 +16357,7 @@ T WaveRotate(T value, uint delta) __generic<T : __BuiltinType, let N 
: int> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] +[require(cuda_glsl_metal_spirv, subgroup_rotate)] vector<T, N> WaveRotate(vector<T, N> value, uint delta) { shader_subgroup_preamble<T>(); @@ -16330,6 +16365,8 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta) { case glsl: __intrinsic_asm "subgroupRotate"; + case cuda: + __intrinsic_asm "_slang_waveRotate($0, $1)"; case metal: __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: @@ -16344,7 +16381,7 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta) __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] +[require(cuda_glsl_spirv, subgroup_rotate)] T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) { shader_subgroup_preamble<T>(); @@ -16352,6 +16389,8 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) { case glsl: __intrinsic_asm "subgroupClusteredRotate"; + case cuda: + __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)"; case spirv: return spirv_asm { @@ -16364,7 +16403,7 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] +[require(cuda_glsl_spirv, subgroup_rotate)] vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) { shader_subgroup_preamble<T>(); @@ -16372,6 +16411,8 @@ vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint { case glsl: __intrinsic_asm "subgroupClusteredRotate"; + case cuda: + __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)"; case spirv: return spirv_asm { @@ -16978,7 +17019,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence) __glsl_extension(GL_EXT_shader_quad_control) __spirv_version(1.3) [ForceInline] -[require(glsl_hlsl_metal_spirv, quad_control)] +[require(cuda_glsl_hlsl_metal_spirv, 
quad_control)] bool QuadAny(bool expr) { __requireMaximallyReconverges(); @@ -16988,6 +17029,7 @@ bool QuadAny(bool expr) case hlsl: __intrinsic_asm "QuadAny"; case glsl: __intrinsic_asm "subgroupQuadAny"; case metal: __intrinsic_asm "quad_any"; + case cuda: __intrinsic_asm "_slang_quadAny"; case spirv: return spirv_asm { @@ -17003,7 +17045,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence) __glsl_extension(GL_EXT_shader_quad_control) __spirv_version(1.3) [ForceInline] -[require(glsl_hlsl_metal_spirv, quad_control)] +[require(cuda_glsl_hlsl_metal_spirv, quad_control)] bool QuadAll(bool expr) { __requireMaximallyReconverges(); @@ -17013,6 +17055,7 @@ bool QuadAll(bool expr) case hlsl: __intrinsic_asm "QuadAll"; case glsl: __intrinsic_asm "subgroupQuadAll"; case metal: __intrinsic_asm "quad_all"; + case cuda: __intrinsic_asm "_slang_quadAll"; case spirv: return spirv_asm { diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef index 822356312..ff9697f7d 100644 --- a/source/slang/slang-capabilities.capdef +++ b/source/slang/slang-capabilities.capdef @@ -2226,6 +2226,7 @@ alias subgroup_partitioned = _sm_6_5 /// [Compound] alias subgroup_rotate = GL_KHR_shader_subgroup_rotate | metal + | _cuda_sm_5_0 ; /// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics @@ -2260,6 +2261,7 @@ alias helper_lane = _sm_6_0 + fragment alias quad_control = _sm_6_7 | GL_EXT_shader_quad_control + GL_EXT_maximal_reconvergence + GL_KHR_shader_subgroup_vote | metal + | _cuda_sm_5_0 ; /// Capabilities required to enable shader breakpoints |
