diff options
| author | Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com> | 2025-09-04 10:58:02 +0530 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-04 05:28:02 +0000 |
| commit | 5ec41675d817f82a7ce3c4d79c68548db0bd4227 (patch) | |
| tree | 57abff17713b5d9ea876be29e3b451c9abe8c49d | |
| parent | b45706b3f532f85525de5746f1f607ba2e57fc88 (diff) | |
Enable CUDA support for additional HLSL intrinsic tests (#8293)
Enable CUDA support for additional HLSL intrinsic tests by implementing
missing functionality and fixing compiler bugs affecting CUDA targets.
- Fix critical bug in InterlockedCompareStore64 where division used /4
instead of /8 for 64-bit types, causing incorrect memory addressing for
all signed int 64_t atomics
- Add signed int64_t atomic wrappers (atomicExch, atomicCAS) to CUDA
prelu de that properly cast to/from unsigned types as required by CUDA's
atomic API
- Enable tests: atomic-intrinsics-64bit.slang
- Implement CUDA support for QuadAny and QuadAll operations using warp
shu ffle primitives (__shfl_sync with quad-level lane masking)
- Add CUDA to quad_control capability definition in
slang-capabilities.capdef
- Add _slang_quadAny/_slang_quadAll helper functions to CUDA prelude
- Enable tests: quad-control-comp-functionality.slang,
subgroup-quad.slang
---------
Co-authored-by: szihs <675653+szihs@users.noreply.github.com>
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 238 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 93 | ||||
| -rw-r--r-- | source/slang/slang-capabilities.capdef | 2 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang | 11 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang | 1 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/subgroup-quad.slang | 10 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/texture-2d-gather.slang | 58 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang | 1 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang | 1 |
9 files changed, 384 insertions, 31 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 44afd71b9..062e0ca6c 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -2402,7 +2402,32 @@ struct ByteAddressBuffer }; // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer -// Missing support for Atomic operations +// Atomic operations support + +// Signed 64-bit atomic wrappers +// CUDA only supports unsigned long long atomics, so we cast signed to unsigned +__device__ __forceinline__ long long atomicExch(long long* address, long long val) +{ + return (long long)atomicExch((unsigned long long*)address, (unsigned long long)val); +} + +__device__ __forceinline__ long long atomicCAS(long long* address, long long compare, long long val) +{ + return (long long)atomicCAS( + (unsigned long long*)address, + (unsigned long long)compare, + (unsigned long long)val); +} + +// Float bitwise atomic compare-and-swap +// Uses integer atomics to preserve exact float bit patterns +__device__ __forceinline__ float atomicCAS(float* address, float compare, float val) +{ + int* addr_as_int = (int*)address; + int old = atomicCAS(addr_as_int, __float_as_int(compare), __float_as_int(val)); + return __int_as_float(old); +} + // Missing support for Load with status struct RWByteAddressBuffer { @@ -4715,3 +4740,214 @@ tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer) : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer)); return make_int4(result_x, result_y, result_z, result_w); } + +// Wave rotate helper functions - templated approach +#define SLANG_WARP_FULL_MASK 0xFFFFFFFF + +// Macro-based wave rotate implementation following codebase patterns +#define SLANG_WAVE_ROTATE_IMPL(T) \ + __device__ __forceinline__ T##2 _slang_waveRotate(T##2 value, unsigned int delta) \ + { \ + return make_##T##2( \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.x, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.y, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \ + } \ + __device__ __forceinline__ T##3 _slang_waveRotate(T##3 value, unsigned int delta) \ + { \ + return make_##T##3( \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.x, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.y, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.z, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \ + } \ + __device__ __forceinline__ T##4 _slang_waveRotate(T##4 value, unsigned int delta) \ + { \ + return make_##T##4( \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.x, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.y, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.z, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \ + (T)__shfl_sync( \ + SLANG_WARP_FULL_MASK, \ + value.w, \ + (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \ + } + +// Generate wave rotate functions for all standard vector types +SLANG_WAVE_ROTATE_IMPL(uint) +SLANG_WAVE_ROTATE_IMPL(int) +SLANG_WAVE_ROTATE_IMPL(float) +SLANG_WAVE_ROTATE_IMPL(short) +SLANG_WAVE_ROTATE_IMPL(ushort) +SLANG_WAVE_ROTATE_IMPL(char) +SLANG_WAVE_ROTATE_IMPL(uchar) +SLANG_WAVE_ROTATE_IMPL(longlong) +SLANG_WAVE_ROTATE_IMPL(ulonglong) + +#ifdef SLANG_CUDA_ENABLE_HALF +SLANG_WAVE_ROTATE_IMPL(__half) +#endif + +// Special handling for boolean vectors (requires int conversion) +__device__ __forceinline__ bool2 _slang_waveRotate(bool2 value, unsigned int delta) +{ + int2 intValue = make_int2((int)value.x, (int)value.y); + int2 result = _slang_waveRotate(intValue, delta); + return make_bool2((bool)result.x, (bool)result.y); +} + +__device__ __forceinline__ bool3 _slang_waveRotate(bool3 value, unsigned int delta) +{ + int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z); + int3 result = _slang_waveRotate(intValue, delta); + return make_bool3((bool)result.x, (bool)result.y, (bool)result.z); +} + +__device__ __forceinline__ bool4 _slang_waveRotate(bool4 value, unsigned int delta) +{ + int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w); + int4 result = _slang_waveRotate(intValue, delta); + return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w); +} + +#undef SLANG_WAVE_ROTATE_IMPL + +// Quad control operations for CUDA +__device__ __forceinline__ bool _slang_quadAny(bool expr) +{ + // Get values from all 4 lanes in the quad + bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0); + bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1); + bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2); + bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3); + return v0 || v1 || v2 || v3; +} + +__device__ __forceinline__ bool _slang_quadAll(bool expr) +{ + // Get values from all 4 lanes in the quad + bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0); + bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1); + bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2); + bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3); + return v0 && v1 && v2 && v3; +} + +// Clustered wave rotate operations for CUDA +// Clustered rotate rotates values within clusters of specified size +#define SLANG_WAVE_CLUSTERED_ROTATE_IMPL(T) \ + __device__ __forceinline__ T \ + _slang_waveClusteredRotate(T value, unsigned int delta, unsigned int clusterSize) \ + { \ + unsigned int laneId = _getLaneId(); \ + unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \ + unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \ + return __shfl_sync(SLANG_WARP_FULL_MASK, value, targetLane); \ + } \ + __device__ __forceinline__ \ + T##2 _slang_waveClusteredRotate(T##2 value, unsigned int delta, unsigned int clusterSize) \ + { \ + unsigned int laneId = _getLaneId(); \ + unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \ + unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \ + return make_##T##2( \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane)); \ + } \ + __device__ __forceinline__ \ + T##3 _slang_waveClusteredRotate(T##3 value, unsigned int delta, unsigned int clusterSize) \ + { \ + unsigned int laneId = _getLaneId(); \ + unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \ + unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \ + return make_##T##3( \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane)); \ + } \ + __device__ __forceinline__ \ + T##4 _slang_waveClusteredRotate(T##4 value, unsigned int delta, unsigned int clusterSize) \ + { \ + unsigned int laneId = _getLaneId(); \ + unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \ + unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \ + return make_##T##4( \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane), \ + (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.w, targetLane)); \ + } + +// Generate clustered wave rotate functions for all standard types +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uint) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(int) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(float) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(short) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ushort) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(char) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uchar) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(longlong) +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ulonglong) + +#ifdef SLANG_CUDA_ENABLE_HALF +SLANG_WAVE_CLUSTERED_ROTATE_IMPL(__half) +#endif + +// Special handling for boolean clustered rotate +__device__ __forceinline__ bool _slang_waveClusteredRotate( + bool value, + unsigned int delta, + unsigned int clusterSize) +{ + int intValue = (int)value; + int result = _slang_waveClusteredRotate(intValue, delta, clusterSize); + return (bool)result; +} + +__device__ __forceinline__ bool2 +_slang_waveClusteredRotate(bool2 value, unsigned int delta, unsigned int clusterSize) +{ + int2 intValue = make_int2((int)value.x, (int)value.y); + int2 result = _slang_waveClusteredRotate(intValue, delta, clusterSize); + return make_bool2((bool)result.x, (bool)result.y); +} + +__device__ __forceinline__ bool3 +_slang_waveClusteredRotate(bool3 value, unsigned int delta, unsigned int clusterSize) +{ + int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z); + int3 result = _slang_waveClusteredRotate(intValue, delta, clusterSize); + return make_bool3((bool)result.x, (bool)result.y, (bool)result.z); +} + +__device__ __forceinline__ bool4 +_slang_waveClusteredRotate(bool4 value, unsigned int delta, unsigned int clusterSize) +{ + int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w); + int4 result = _slang_waveClusteredRotate(intValue, delta, clusterSize); + return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w); +} + +#undef SLANG_WAVE_CLUSTERED_ROTATE_IMPL diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 2af0dbcf7..d5f8cd2e1 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -3480,23 +3480,31 @@ ${{{{ const char* componentArg = (isShadow ? "" : componentArgString[componentId]); }}}} [ForceInline] - [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)] + [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)] vector<T.Element,4> Gather$(compareFunc)$(componentFunc)( $(samplerParam) vector<float, Shape.dimensions+isArray> location $(compareParam)) { - static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), - "Gather is supported only for 2D and 3D textures"); - __target_switch { - case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case hlsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); + __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case cuda: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures"); + static_assert(isArray == 0, "CUDA Gather does not support texture arrays"); + __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))"; case metal: case wgsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)($(getTexture) $(getSampler), location $(compareArg) $(componentArg)); case glsl: case spirv: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)(this $(samplerArg), location $(compareArg) $(componentArg)); } } @@ -3519,24 +3527,33 @@ ${{{{ } [ForceInline] - [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)] + [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)] vector<T.Element,4> Gather$(compareFunc)$(componentFunc)( $(samplerParam) vector<float, Shape.dimensions+isArray> location $(compareParam), constexpr vector<int, Shape.planeDimensions> offset) { - static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), - "Gather is supported only for 2D and 3D textures"); - __target_switch { - case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case hlsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); + __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)"; + case cuda: + // CUDA tex2Dgather doesn't support offset - ignore offset parameter + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures"); + static_assert(isArray == 0, "CUDA Gather does not support texture arrays"); + __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))"; case metal: case wgsl: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)_offset($(getTexture) $(getSampler), location $(compareArg), offset $(componentArg)); case glsl: case spirv: + static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE), + "Gather is supported only for 2D and 3D textures"); return __texture_gather$(compareFunc)_offset(this $(samplerArg), location $(compareArg), offset $(componentArg)); } } @@ -5704,7 +5721,7 @@ ${{{{ case hlsl: __intrinsic_asm ".InterlockedCompareStore64"; default: let buf = __getEquivalentStructuredBuffer<T>(this); - __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value); + __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value); return; } } @@ -14917,7 +14934,7 @@ matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr) __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) -[require(glsl_hlsl_metal_spirv, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)] T QuadReadLaneAt(T sourceValue, uint quadLaneID) { __target_switch @@ -14934,12 +14951,14 @@ T QuadReadLaneAt(T sourceValue, uint quadLaneID) OpCapability GroupNonUniformQuad; result:$$T = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID; }; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))"; } } __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) -[require(glsl_hlsl_metal_spirv, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)] vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID) { __target_switch @@ -14955,6 +14974,8 @@ vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID) OpCapability GroupNonUniformQuad; result:$$vector<T,N> = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID; }; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID); @@ -14965,7 +14986,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossX(T localValue) { __target_switch @@ -14982,6 +15003,8 @@ T QuadReadAcrossX(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapX"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)"; } } @@ -14990,7 +15013,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossX(vector<T,N> localValue) { __target_switch @@ -15007,6 +15030,8 @@ vector<T,N> QuadReadAcrossX(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapX"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue); @@ -15017,7 +15042,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossY(T localValue) { __target_switch @@ -15034,6 +15059,8 @@ T QuadReadAcrossY(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapY"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)"; } } __generic<T : __BuiltinType, let N : int> @@ -15041,7 +15068,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossY(vector<T,N> localValue) { __target_switch @@ -15058,6 +15085,8 @@ vector<T,N> QuadReadAcrossY(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapY"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue); @@ -15068,7 +15097,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] T QuadReadAcrossDiagonal(T localValue) { __target_switch @@ -15085,6 +15114,8 @@ T QuadReadAcrossDiagonal(T localValue) result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapDiagonal"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)"; } } __generic<T : __BuiltinType, let N : int> @@ -15092,7 +15123,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad) __spirv_version(1.3) __wgsl_extension(subgroups) [ForceInline] -[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] +[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)] vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue) { __target_switch @@ -15109,6 +15140,8 @@ vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue) result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction; }; case wgsl: __intrinsic_asm "quadSwapDiagonal"; + case cuda: + __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)"; } } __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue); @@ -16300,7 +16333,7 @@ void shader_subgroup_preamble() __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] +[require(cuda_glsl_metal_spirv, subgroup_rotate)] T WaveRotate(T value, uint delta) { shader_subgroup_preamble<T>(); @@ -16308,6 +16341,8 @@ T WaveRotate(T value, uint delta) { case glsl: __intrinsic_asm "subgroupRotate"; + case cuda: + __intrinsic_asm "__shfl_sync(0xFFFFFFFF, $0, (_getLaneId() + $1) % 32)"; case metal: __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: @@ -16322,7 +16357,7 @@ T WaveRotate(T value, uint delta) __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] +[require(cuda_glsl_metal_spirv, subgroup_rotate)] vector<T, N> WaveRotate(vector<T, N> value, uint delta) { shader_subgroup_preamble<T>(); @@ -16330,6 +16365,8 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta) { case glsl: __intrinsic_asm "subgroupRotate"; + case cuda: + __intrinsic_asm "_slang_waveRotate($0, $1)"; case metal: __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: @@ -16344,7 +16381,7 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta) __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] +[require(cuda_glsl_spirv, subgroup_rotate)] T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) { shader_subgroup_preamble<T>(); @@ -16352,6 +16389,8 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) { case glsl: __intrinsic_asm "subgroupClusteredRotate"; + case cuda: + __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)"; case spirv: return spirv_asm { @@ -16364,7 +16403,7 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] +[require(cuda_glsl_spirv, subgroup_rotate)] vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) { shader_subgroup_preamble<T>(); @@ -16372,6 +16411,8 @@ vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint { case glsl: __intrinsic_asm "subgroupClusteredRotate"; + case cuda: + __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)"; case spirv: return spirv_asm { @@ -16978,7 +17019,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence) __glsl_extension(GL_EXT_shader_quad_control) __spirv_version(1.3) [ForceInline] -[require(glsl_hlsl_metal_spirv, quad_control)] +[require(cuda_glsl_hlsl_metal_spirv, quad_control)] bool QuadAny(bool expr) { __requireMaximallyReconverges(); @@ -16988,6 +17029,7 @@ bool QuadAny(bool expr) case hlsl: __intrinsic_asm "QuadAny"; case glsl: __intrinsic_asm "subgroupQuadAny"; case metal: __intrinsic_asm "quad_any"; + case cuda: __intrinsic_asm "_slang_quadAny"; case spirv: return spirv_asm { @@ -17003,7 +17045,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence) __glsl_extension(GL_EXT_shader_quad_control) __spirv_version(1.3) [ForceInline] -[require(glsl_hlsl_metal_spirv, quad_control)] +[require(cuda_glsl_hlsl_metal_spirv, quad_control)] bool QuadAll(bool expr) { __requireMaximallyReconverges(); @@ -17013,6 +17055,7 @@ bool QuadAll(bool expr) case hlsl: __intrinsic_asm "QuadAll"; case glsl: __intrinsic_asm "subgroupQuadAll"; case metal: __intrinsic_asm "quad_all"; + case cuda: __intrinsic_asm "_slang_quadAll"; case spirv: return spirv_asm { diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef index 822356312..ff9697f7d 100644 --- a/source/slang/slang-capabilities.capdef +++ b/source/slang/slang-capabilities.capdef @@ -2226,6 +2226,7 @@ alias subgroup_partitioned = _sm_6_5 /// [Compound] alias subgroup_rotate = GL_KHR_shader_subgroup_rotate | metal + | _cuda_sm_5_0 ; /// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics @@ -2260,6 +2261,7 @@ alias helper_lane = _sm_6_0 + fragment alias quad_control = _sm_6_7 | GL_EXT_shader_quad_control + GL_EXT_maximal_reconvergence + GL_KHR_shader_subgroup_vote | metal + | _cuda_sm_5_0 ; /// Capabilities required to enable shader breakpoints diff --git a/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang b/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang index 355729d93..da5af8a5c 100644 --- a/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang +++ b/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang @@ -1,4 +1,5 @@ -//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=DX12):-slang -compute -dx12 -profile cs_6_6 -shaderobj -output-using-type +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_6 -shaderobj -output-using-type +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cuda -profile cs_6_6 -shaderobj -output-using-type // This is to support 64-bit `Interlocked*` functions defined for HLSL SM6.6 // https://microsoft.github.io/DirectX-Specs/d3d/HLSL_SM_6_6_Int64_and_Float_Atomics.html @@ -336,7 +337,7 @@ void computeMain(uint groupIndex : SV_GroupIndex, int3 dispatchThreadID: SV_Disp outputBuffer[idx] = int(result); } -// DX12: 1 -// DX12-NEXT: 1 -// DX12-NEXT: 1 -// DX12-NEXT: 1 +// CHK: 1 +// CHK-NEXT: 1 +// CHK-NEXT: 1 +// CHK-NEXT: 1 diff --git a/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang b/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang index 20c36c2be..6dfd1d883 100644 --- a/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang +++ b/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang @@ -1,6 +1,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -emit-spirv-directly //TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -emit-spirv-via-glsl //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_7 -dx12 -shaderobj -render-feature hardware-device +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -profile cs_6_7 -shaderobj -render-feature hardware-device //TEST(compute):COMPARE_COMPUTE_EX:-metal -compute -shaderobj -xslang -DMETAL //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer diff --git a/tests/hlsl-intrinsic/subgroup-quad.slang b/tests/hlsl-intrinsic/subgroup-quad.slang index 1cfbffb49..ec5a80e56 100644 --- a/tests/hlsl-intrinsic/subgroup-quad.slang +++ b/tests/hlsl-intrinsic/subgroup-quad.slang @@ -2,6 +2,7 @@ //TEST:SIMPLE(filecheck=SPIRV): -entry main -stage compute -target spirv -emit-spirv-directly //TEST:SIMPLE(filecheck=HLSL): -entry main -stage compute -target hlsl //TEST:SIMPLE(filecheck=METAL): -entry main -stage compute -target metal +//TEST:SIMPLE(filecheck=CUDA): -entry main -stage compute -target cuda RWStructuredBuffer<float> output; @@ -51,4 +52,13 @@ void main() // METAL: ^ 3 // METAL: quad_shuffle // METAL: quad_shuffle + + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, (_getLaneId() & 0xFFFFFFFC) | ((1U) & 3)) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, (_getLaneId() & 0xFFFFFFFC) | ((1U) & 3)) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 1) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 1) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 2) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 2) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 3) + // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 3) } diff --git a/tests/hlsl-intrinsic/texture-2d-gather.slang b/tests/hlsl-intrinsic/texture-2d-gather.slang new file mode 100644 index 000000000..329041f4d --- /dev/null +++ b/tests/hlsl-intrinsic/texture-2d-gather.slang @@ -0,0 +1,58 @@ +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -output-using-type +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-dx12 -compute -profile cs_6_0 -shaderobj -output-using-type + +// Test CUDA Gather runtime behavior - compare with known gather pattern +// tex2Dgather samples 4 texels in 2x2 pattern around coordinate + +//TEST_INPUT: Texture2D(size=4, content = one):name testTexture +// Create a 4x4 texture with 1.0 values - simple but non-zero to verify gather works +Texture2D<float4> testTexture; + +//TEST_INPUT: Sampler:name samplerState +SamplerState samplerState; + +//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<float> outputBuffer; + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + // Simple gather test - sample at center of 2x2 region + // This should gather from texels (0,0), (1,0), (0,1), (1,1) + float2 coords = float2(0.75, 0.75); // Between texels for gather + + // Test basic gather - should return 4 values in specific order + float4 gathered = testTexture.GatherRed(samplerState, coords); + + // Store the gathered values + outputBuffer[0] = gathered.x; // Should be consistent pattern + outputBuffer[1] = gathered.y; + outputBuffer[2] = gathered.z; + outputBuffer[3] = gathered.w; + + // Also test that gather actually works by using texture coordinates + // as the texture values (coord-based pattern) + int2 texelCoord = int2(dispatchThreadID.xy); + float coordValue = float(texelCoord.x + texelCoord.y * 4); // Create pattern: 0,1,2,3,4,5,6,7... + + // Store marker value like CUDA reference (42) + outputBuffer[4] = 42.0; // Marker to verify test is working + + // Test another gather position + float4 gathered2 = testTexture.GatherRed(samplerState, float2(1.25, 1.25)); + outputBuffer[5] = gathered2.x; + outputBuffer[6] = gathered2.y; + outputBuffer[7] = gathered2.z; + outputBuffer[8] = gathered2.w; +} + +// Test results - texture filled with 1.0 values +// CHECK: 1.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 42.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 1.0 +// CHECK-NEXT: 1.0 diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang index 81601e9be..a5be09b0b 100644 --- a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang +++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang @@ -1,5 +1,6 @@ //TEST_CATEGORY(wave, compute) //TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -profile sm_6_0 //TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -Xslang... -capability GL_KHR_shader_subgroup_rotate -X. //TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl //TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -allow-glsl -Xslang... -DUSE_GLSL_SYNTAX -capability GL_KHR_shader_subgroup_rotate -X. diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang index 353afbb35..f67005078 100644 --- a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang +++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang @@ -1,5 +1,6 @@ // TEST_CATEGORY(wave, compute) // TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -emit-spirv-directly // TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -Xslang... -capability GL_KHR_shader_subgroup_rotate -X. //TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL |
