diff options
| author | Darren Wihandi <65404740+fairywreath@users.noreply.github.com> | 2025-05-25 12:58:08 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-05-25 09:58:08 -0700 |
| commit | 0476b57faad96bee61f59f27ddd48c6cb067cfa2 (patch) | |
| tree | d3fe49cd906c29b03b2a840dd2c057ccc331b4f7 /source | |
| parent | 554be7a5f990df19a21db10b4e5dc0285cbe8168 (diff) | |
Add full support for SPV_NV_shader_subgroup_partitioned (#7103)
* Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics
* More partitioned intrinsics
* Add more partitioned intrinsics and clean up non-prefixed WaveMask* implementations
* Refactor HLSL WaveMultiPrefix* implementations
* Fix cap atoms
* Clean up implementation
* Add GLSL intrinsics and cleanup
* Add tests
* Fix affected capability test
* Update and fix tests
* Move expected.txt file
* Refactor WaveMask* to call WaveMulti*
* Refactor SPIRV/GLSL preamble code
* Enable emit-via-glsl tests
* Remove wave_multi_prefix capability in favor of subgroup_partitioned
* Update docs
* Update cap atoms doc
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/glsl.meta.slang | 351 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 1468 | ||||
| -rw-r--r-- | source/slang/slang-capabilities.capdef | 9 |
3 files changed, 1079 insertions, 749 deletions
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang index 588396251..88c90a777 100644 --- a/source/slang/glsl.meta.slang +++ b/source/slang/glsl.meta.slang @@ -8280,6 +8280,7 @@ public vector<T,N> subgroupQuadSwapDiagonal(vector<T,N> value) // GL_KHR_shader_subgroup_rotate __generic<T : __BuiltinType> +[ForceInline] [require(glsl_metal_spirv, subgroup_rotate)] public T subgroupRotate(T value, uint delta) { @@ -8287,6 +8288,7 @@ public T subgroupRotate(T value, uint delta) } __generic<T : __BuiltinType, let N : int> +[ForceInline] [require(glsl_metal_spirv, subgroup_rotate)] public vector<T, N> subgroupRotate(vector<T, N> value, uint delta) { @@ -8294,6 +8296,7 @@ public vector<T, N> subgroupRotate(vector<T, N> value, uint delta) } __generic<T : __BuiltinType> +[ForceInline] [require(glsl_spirv, subgroup_rotate)] public T subgroupClusteredRotate(T value, uint delta, constexpr uint clusterSize) { @@ -8302,12 +8305,360 @@ public T subgroupClusteredRotate(T value, uint delta, constexpr uint clusterSize } __generic<T : __BuiltinType, let N : int> +[ForceInline] [require(glsl_spirv, subgroup_rotate)] public vector<T, N> subgroupClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) { return WaveClusteredRotate(value, delta, clusterSize); } + +// GL_NV_shader_subgroup_partitioned + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedAddNV(T value, uvec4 ballot) +{ + return WaveMultiSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedAddNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedMulNV(T value, uvec4 ballot) +{ + return WaveMultiProduct(value, ballot); 
+} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedMulNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiProduct(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedMinNV(T value, uvec4 ballot) +{ + return WaveMultiMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N: int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedMinNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedMaxNV(T value, uvec4 ballot) +{ + return WaveMultiMax(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedMaxNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiMax(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedAndNV(T value, uvec4 ballot) +{ + return WaveMultiBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedAndNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedOrNV(T value, uvec4 ballot) +{ + return WaveMultiBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedOrNV(vector<T, N> value, 
uvec4 ballot) +{ + return WaveMultiBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedXorNV(T value, uvec4 ballot) +{ + return WaveMultiBitXor(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedXorNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiBitXor(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveAddNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveAddNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveMulNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveProduct(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveMulNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveProduct(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveMinNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveMinNV(vector<T, N> value, uvec4 ballot) +{ + return 
WaveMultiPrefixInclusiveMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveMaxNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveMax(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveMaxNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveMax(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveAndNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveAndNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveOrNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveOrNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedInclusiveXorNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitXor(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedInclusiveXorNV(vector<T, N> 
value, uvec4 ballot) +{ + return WaveMultiPrefixInclusiveBitXor(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveAddNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveAddNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveSum(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveMulNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveProduct(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveMulNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveProduct(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveMinNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveMinNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveMin(value, ballot); +} + +__generic<T : __BuiltinArithmeticType> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveMaxNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveMax(value, ballot); +} + +__generic<T : __BuiltinArithmeticType, let N : int> +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +public vector<T, N> 
subgroupPartitionedExclusiveMaxNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveMax(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveAndNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveAndNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitAnd(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveOrNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveOrNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitOr(value, ballot); +} + +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public T subgroupPartitionedExclusiveXorNV(T value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitXor(value, ballot); +} + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public vector<T, N> subgroupPartitionedExclusiveXorNV(vector<T, N> value, uvec4 ballot) +{ + return WaveMultiPrefixExclusiveBitXor(value, ballot); +} + +__generic<T : __BuiltinType> +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +public uvec4 subgroupPartitionNV(T value) +{ + return WaveMatch(value); +} + + //// GLSL atomic // The following type internally is a Shader Storage Buffer diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 87f98adaf..cb050dd51 
100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -14138,382 +14138,294 @@ uint WaveMaskPrefixCountBits(WaveMask mask, bool value) // Across lane ops -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskBitAnd(WaveMask mask, T expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupAnd($1)"; - case cuda: __intrinsic_asm "_waveAnd($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseAnd $$T result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitAnd($1)"; + default: + return WaveMultiBitAnd(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskBitAnd(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupAnd($1)"; - case cuda: __intrinsic_asm "_waveAndMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseAnd $$vector<T,N> result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitAnd($1)"; + default: + return WaveMultiBitAnd(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> 
WaveMaskBitAnd(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveAndMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)"; + case hlsl: + __intrinsic_asm "WaveActiveBitAnd($1)"; + default: + return WaveMultiBitAnd(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskBitOr(WaveMask mask, T expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupOr($1)"; - case cuda: __intrinsic_asm "_waveOr($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitOr($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseOr $$T result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitOr($1)"; + default: + return WaveMultiBitOr(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int> +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +[ForceInline] vector<T,N> WaveMaskBitOr(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupOr($1)"; - case cuda: __intrinsic_asm "_waveOrMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitOr($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseOr $$vector<T,N> result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitOr($1)"; + default: + return WaveMultiBitOr(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] + +__generic<T : __BuiltinLogicalType, let N 
: int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskBitOr(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveOrMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitOr($1)"; + case hlsl: + __intrinsic_asm "WaveActiveBitOr($1)"; + default: + return WaveMultiBitOr(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskBitXor(WaveMask mask, T expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupXor($1)"; - case cuda: __intrinsic_asm "_waveXor($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitXor($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseXor $$T result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitXor($1)"; + default: + return WaveMultiBitXor(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] + +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskBitXor(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupXor($1)"; - case cuda: __intrinsic_asm "_waveXorMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitXor($1)"; - case spirv: - return spirv_asm { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformBitwiseXor $$vector<T,N> result Subgroup 0 $expr - }; + case hlsl: + __intrinsic_asm "WaveActiveBitXor($1)"; + default: + return WaveMultiBitXor(expr, uint4(mask, 0, 0, 0)); } } -__generic<T : __BuiltinIntegerType, 
let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] + +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskBitXor(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveXorMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveBitXor($1)"; + case hlsl: + __intrinsic_asm "WaveActiveBitXor($1)"; + default: + return WaveMultiBitXor(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskMax(WaveMask mask, T expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupMax($1)"; - case cuda: __intrinsic_asm "_waveMax($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveMax($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMax $$T result Subgroup 0 $expr}; - else if (__isSignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMax $$T result Subgroup 0 $expr}; - else if (__isUnsignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMax $$T result Subgroup 0 $expr}; - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveMax($1)"; + default: + return WaveMultiMax(expr, uint4(mask, 0, 0, 0)); } } + __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskMax(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupMax($1)"; - case cuda: __intrinsic_asm "_waveMaxMultiple($0, $1)"; - case hlsl: 
__intrinsic_asm "WaveActiveMax($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMax $$vector<T,N> result Subgroup 0 $expr}; - else if (__isSignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMax $$vector<T,N> result Subgroup 0 $expr}; - else if (__isUnsignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMax $$vector<T,N> result Subgroup 0 $expr}; - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveMax($1)"; + default: + return WaveMultiMax(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskMax(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveMaxMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveMax($1)"; + case hlsl: + __intrinsic_asm "WaveActiveMax($1)"; + default: + return WaveMultiMax(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskMin(WaveMask mask, T expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupMin($1)"; - case cuda: __intrinsic_asm "_waveMin($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveMin($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMin $$T result Subgroup 0 $expr}; - else if (__isSignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMin $$T result Subgroup 0 $expr}; - else if (__isUnsignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMin $$T result Subgroup 0 
$expr}; - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveMin($1)"; + default: + return WaveMultiMin(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskMin(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupMin($1)"; - case cuda: __intrinsic_asm "_waveMinMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveMin($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMin $$vector<T,N> result Subgroup 0 $expr}; - else if (__isSignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMin $$vector<T,N> result Subgroup 0 $expr}; - else if (__isUnsignedInt<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMin $$vector<T,N> result Subgroup 0 $expr}; - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveMin($1)"; + default: + return WaveMultiMin(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskMin(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveMinMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveMin($1)"; + case hlsl: + __intrinsic_asm "WaveActiveMin($1)"; + default: + return WaveMultiMin(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskProduct(WaveMask mask, T expr) { 
__target_switch { - case glsl: __intrinsic_asm "subgroupMul($1)"; - case cuda: __intrinsic_asm "_waveProduct($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveProduct($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$T result Subgroup 0 $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIMul $$T result Subgroup 0 $expr; - }; - } - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveProduct($1)"; + default: + return WaveMultiProduct(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskProduct(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: __intrinsic_asm "subgroupMul($1)"; - case cuda: __intrinsic_asm "_waveProductMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveProduct($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$vector<T,N> result Subgroup 0 $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIMul $$vector<T,N> result Subgroup 0 $expr; - }; - } - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveProduct($1)"; + default: + return WaveMultiProduct(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskProduct(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveProductMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveProduct($1)"; + case hlsl: + __intrinsic_asm 
"WaveActiveProduct($1)"; + default: + return WaveMultiProduct(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskSum(WaveMask mask, T expr) { __target_switch { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupAdd($1)"; - case cuda: __intrinsic_asm "_waveSum($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveSum($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$T result Subgroup 0 $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIAdd $$T result Subgroup 0 $expr; - }; - } - else return expr; + case hlsl: + __intrinsic_asm "WaveActiveSum($1)"; + default: + return WaveMultiSum(expr, uint4(mask, 0, 0, 0)); } } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskSum(WaveMask mask, vector<T,N> expr) { __target_switch { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupAdd($1)"; - case cuda: __intrinsic_asm "_waveSumMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveSum($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$vector<T,N> result Subgroup 0 $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIAdd $$vector<T,N> result Subgroup 0 $expr; - }; - } - else return 
expr; + case hlsl: + __intrinsic_asm "WaveActiveSum($1)"; + default: + return WaveMultiSum(expr, uint4(mask, 0, 0, 0)); } } + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskSum(WaveMask mask, matrix<T,N,M> expr) { __target_switch { - case cuda: __intrinsic_asm "_waveSumMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveActiveSum($1)"; + case hlsl: + __intrinsic_asm "WaveActiveSum($1)"; + default: + return WaveMultiSum(expr, uint4(mask, 0, 0, 0)); } } @@ -14580,134 +14492,48 @@ bool WaveMaskAllEqual(WaveMask mask, matrix<T,N,M> value) // Prefix __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskPrefixProduct(WaveMask mask, T expr) { - __target_switch - { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupExclusiveMul($1)"; - case cuda: __intrinsic_asm "_wavePrefixProduct($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixProduct($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$T result Subgroup ExclusiveScan $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIMul $$T result Subgroup ExclusiveScan $expr; - }; - } - else return expr; - } + return WaveMultiPrefixProduct(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskPrefixProduct(WaveMask mask, 
vector<T,N> expr) { - __target_switch - { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupExclusiveMul($1)"; - case cuda: __intrinsic_asm "_wavePrefixProductMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixProduct($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$vector<T,N> result Subgroup ExclusiveScan $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - OpGroupNonUniformIMul $$vector<T,N> result Subgroup ExclusiveScan $expr; - }; - } - else return expr; - } + return WaveMultiPrefixProduct(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskPrefixProduct(WaveMask mask, matrix<T,N,M> expr) { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixProductMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixProduct($1)"; - } + return WaveMultiPrefixProduct(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskPrefixSum(WaveMask mask, T expr) { - __target_switch - { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupExclusiveAdd($1)"; - case cuda: __intrinsic_asm "_wavePrefixSum($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixSum($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$T result Subgroup ExclusiveScan $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - 
OpCapability GroupNonUniformArithmetic; - result:$$T = OpGroupNonUniformIAdd Subgroup ExclusiveScan $expr; - }; - } - else return expr; - } + return WaveMultiPrefixSum(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskPrefixSum(WaveMask mask, vector<T,N> expr) { - __target_switch - { - case glsl: - if (__isHalf<T>()) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - __intrinsic_asm "subgroupExclusiveAdd($1)"; - case cuda: __intrinsic_asm "_wavePrefixSumMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixSum($1)"; - case spirv: - if (__isFloat<T>()) - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$vector<T,N> result Subgroup ExclusiveScan $expr}; - else if (__isInt<T>()) - { - return spirv_asm - { - OpCapability GroupNonUniformArithmetic; - result:$$vector<T,N> = OpGroupNonUniformIAdd Subgroup ExclusiveScan $expr; - }; - } - else return expr; - } + return WaveMultiPrefixSum(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskPrefixSum(WaveMask mask, matrix<T,N,M> expr) { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixSumMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WavePrefixSum($1)"; - } + return WaveMultiPrefixSum(expr, uint4(mask, 0, 0, 0)); } __generic<T : __BuiltinType> @@ -14813,133 +14639,76 @@ WaveMask WaveMaskMatch(WaveMask mask, matrix<T,N,M> value) } } -__generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : 
__BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskPrefixBitAnd(WaveMask mask, T expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveAnd($1)"; - case cuda: __intrinsic_asm "_wavePrefixAnd($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseAnd $$T result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitAnd(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskPrefixBitAnd(WaveMask mask, vector<T,N> expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveAnd($1)"; - case cuda: __intrinsic_asm "_wavePrefixAndMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseAnd $$vector<T,N> result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitAnd(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskPrefixBitAnd(WaveMask mask, matrix<T,N,M> expr) { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))"; - } + return WaveMultiPrefixBitAnd(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType> 
-__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskPrefixBitOr(WaveMask mask, T expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveOr($1)"; - case cuda: __intrinsic_asm "_wavePrefixOr($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseAnd $$T result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitOr(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskPrefixBitOr(WaveMask mask, vector<T,N> expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveOr($1)"; - case cuda: __intrinsic_asm "_wavePrefixOrMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseOr $$vector<T,N> result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitOr(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskPrefixBitOr(WaveMask mask, matrix<T,N,M> expr) { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixOrMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))"; - } + return 
WaveMultiPrefixBitOr(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] T WaveMaskPrefixBitXor(WaveMask mask, T expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveXor($1)"; - case cuda: __intrinsic_asm "_wavePrefixXor($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseXor $$T result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitXor(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_arithmetic) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] vector<T,N> WaveMaskPrefixBitXor(WaveMask mask, vector<T,N> expr) { - __target_switch - { - case glsl: __intrinsic_asm "subgroupExclusiveXor($1)"; - case cuda: __intrinsic_asm "_wavePrefixXorMultiple($0, $1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))"; - case spirv: - return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseXor $$vector<T,N> result Subgroup ExclusiveScan $expr}; - } + return WaveMultiPrefixBitXor(expr, uint4(mask, 0, 0, 0)); } -__generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_hlsl, subgroup_arithmetic)] +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr) { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixXorMultiple($0, 
$1)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))"; - } + return WaveMultiPrefixBitXor(expr, uint4(mask, 0, 0, 0)); } //@public: @@ -15156,7 +14925,7 @@ const WaveActiveBitOpEntry kWaveActiveBitOpEntries[] = {{"BitAnd", "And", "Bitwi for (auto opName : kWaveActiveBitOpEntries) { }}}} /// @category wave -__generic<T : __BuiltinIntegerType> +__generic<T : __BuiltinLogicalType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __wgsl_extension(subgroups) @@ -15179,7 +14948,7 @@ T WaveActive$(opName.hlslName)(T expr) } } -__generic<T : __BuiltinIntegerType, let N : int> +__generic<T : __BuiltinLogicalType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __wgsl_extension(subgroups) @@ -15202,7 +14971,7 @@ vector<T, N> WaveActive$(opName.hlslName)(vector<T, N> expr) } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> +__generic<T : __BuiltinLogicalType, let N : int, let M : int> [require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_arithmetic)] matrix<T, N, M> WaveActive$(opName.hlslName)(matrix<T, N, M> expr) { @@ -16238,7 +16007,7 @@ uint4 WaveMatch(matrix<T,N,M> value) } /// @category wave -[require(cuda_hlsl, wave_multi_prefix)] +[require(cuda_hlsl, subgroup_partitioned)] uint WaveMultiPrefixCountBits(bool value, uint4 mask) { __target_switch @@ -16248,537 +16017,750 @@ uint WaveMultiPrefixCountBits(bool value, uint4 mask) } } -/// @category wave -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -T WaveMultiPrefixBitAnd(T expr, uint4 mask) +__glsl_extension(GL_EXT_demote_to_helper_invocation) +[ForceInline] +[require(glsl_hlsl_metal_spirv, helper_lane)] +bool IsHelperLane() { - __target_switch - { - case cuda: __intrinsic_asm "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; 
- case glsl: __intrinsic_asm "subgroupPartitionedExclusiveAndNV"; + __target_switch { + case hlsl: __intrinsic_asm "IsHelperLane()"; + case glsl: __intrinsic_asm "gl_HelperInvocation"; + case metal: __intrinsic_asm "simd_is_helper_thread()"; case spirv: - return spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - result:$$T = OpGroupNonUniformBitwiseAnd Subgroup PartitionedExclusiveScanNV $expr $mask + return spirv_asm { + OpExtension "SPV_EXT_demote_to_helper_invocation"; + OpCapability DemoteToHelperInvocationEXT; + result:$$bool = OpIsHelperInvocationEXT }; } } -__generic<T : __BuiltinIntegerType, let N : int> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask) +//@hidden: + +__generic<T : __BuiltinType> +[ForceInline] +[require(glsl)] +void __requireGLSLShaderSubgroupTypeExtension() { + // the following is a separate function call, since else the `__requireTargetExtension` and associated __intrinsic_asm is ignored if the calling function also calls an __intrinsic_asm + if (__type_equals<T, half>() + || __type_equals<T, float16_t>() + ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); + else if (__type_equals<T, uint8_t>() + || __type_equals<T, int8_t>() + ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8"); + else if (__type_equals<T, uint16_t>() + || __type_equals<T, int16_t>() + ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16"); + else if (__type_equals<T, uint64_t>() + || __type_equals<T, int64_t>() + ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64"); + + __intrinsic_asm ""; +} + +__generic<T : __BuiltinType> +[ForceInline] +[require(metal)] +void __checkMetalShaderSubgroupType() +{ + // These builtin types are not supported for Metal's `simd` operations. 
+ if (__type_equals<T, uint8_t>() + || __type_equals<T, int8_t>() + || __type_equals<T, uint64_t>() + || __type_equals<T, int64_t>() + || __isBool<T>() + ) + { + static_assert(false, "Unsupported type for subgroup operations in Metal. Valid types include scalars and vectors of uint/uint32_t, int/int32_t, uint16_t, int16_t, float, and half."); + } +} + +__generic<T : __BuiltinType> +[ForceInline] +void shader_subgroup_preamble() +{ + // checks needed for shader_subgroup functions; __requireTargetExtension does not work + // (does not add the ext specified correctly to the compile output; using extended type + // will result in error for using the type) __target_switch { - case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveAndNV"; - case spirv: - return spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - result:$$vector<T,N> = OpGroupNonUniformBitwiseAnd Subgroup PartitionedExclusiveScanNV $expr $mask - }; + case glsl: + __requireGLSLShaderSubgroupTypeExtension<T>(); + case metal: + __checkMetalShaderSubgroupType<T>(); + default: + return; } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask) +//@public: + +// +// Wave Rotate intrinsics. +// These are Slang specific intrinsics to rotate values within a subgroup. 
+// + +__generic<T : __BuiltinType> +__glsl_extension(GL_KHR_shader_subgroup_rotate) +[require(glsl_metal_spirv, subgroup_rotate)] +T WaveRotate(T value, uint delta) { + shader_subgroup_preamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd"; case glsl: + __intrinsic_asm "subgroupRotate"; + case metal: + __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: - matrix<T, N, M> result; - for (int i = 0; i < N; ++i) - result[i] = WaveMultiPrefixBitAnd(expr[i], mask); - return result; + return spirv_asm + { + OpExtension "SPV_KHR_subgroup_rotate"; + OpCapability GroupNonUniformRotateKHR; + result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta; + }; } } -/// @category wave -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -T WaveMultiPrefixBitOr(T expr, uint4 mask) +__generic<T : __BuiltinType, let N : int> +__glsl_extension(GL_KHR_shader_subgroup_rotate) +[require(glsl_metal_spirv, subgroup_rotate)] +vector<T, N> WaveRotate(vector<T, N> value, uint delta) { + shader_subgroup_preamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixOr(, _getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveOrNV"; + case glsl: + __intrinsic_asm "subgroupRotate"; + case metal: + __intrinsic_asm "simd_shuffle_rotate_down"; case spirv: return spirv_asm { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - result:$$T = OpGroupNonUniformBitwiseOr Subgroup PartitionedExclusiveScanNV $expr $mask + OpExtension "SPV_KHR_subgroup_rotate"; + OpCapability GroupNonUniformRotateKHR; + result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta; }; } } -__generic<T : __BuiltinIntegerType, let N : 
int> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask) +__generic<T : __BuiltinType> +__glsl_extension(GL_KHR_shader_subgroup_rotate) +[require(glsl_spirv, subgroup_rotate)] +T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) { + shader_subgroup_preamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveOrNV"; + case glsl: + __intrinsic_asm "subgroupClusteredRotate"; case spirv: return spirv_asm { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - result:$$vector<T,N> = OpGroupNonUniformBitwiseOr Subgroup PartitionedExclusiveScanNV $expr $mask + OpExtension "SPV_KHR_subgroup_rotate"; + OpCapability GroupNonUniformRotateKHR; + result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize; }; } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask) +__generic<T : __BuiltinType, let N : int> +__glsl_extension(GL_KHR_shader_subgroup_rotate) +[require(glsl_spirv, subgroup_rotate)] +vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) { + shader_subgroup_preamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr"; case glsl: + __intrinsic_asm "subgroupClusteredRotate"; case spirv: - matrix<T, N, M> result; - for (int i = 0; i < N; ++i) - result[i] = WaveMultiPrefixBitOr(expr[i], mask); - return result; + return spirv_asm + { + OpExtension "SPV_KHR_subgroup_rotate"; + OpCapability GroupNonUniformRotateKHR; + 
result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize; + }; } } -/// @category wave -__generic<T : __BuiltinIntegerType> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -__spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -T WaveMultiPrefixBitXor(T expr, uint4 mask) + +// +// WaveMulti intrinsics are subgroup operations that operate on a 128-bit `uint4` mask. +// They are equivalent to SPIRV/GLSL's subgroup partitioned operation and HLSL's `WaveMultiPrefix*` operations. +// +// SPIRV/GLSL natively supports masked subgroup operations for both reductions and exclusive/inclusive scans. +// HLSL only natively supports exclusive scans(prefix operations) on arithmetic operations. Inclusive scans +// are emulated by performing an additional operation to the exclusive scan result. Reductions are not supported. +// + +__generic<T : __BuiltinType> +[ForceInline] +void __shaderSubgroupPartitionedPreamble() { + shader_subgroup_preamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveXorNV"; + case glsl: + __requireTargetExtension("GL_NV_shader_subgroup_partitioned"); case spirv: - return spirv_asm + spirv_asm { OpExtension "SPV_NV_shader_subgroup_partitioned"; OpCapability GroupNonUniformPartitionedNV; - result:$$T = OpGroupNonUniformBitwiseXor Subgroup PartitionedExclusiveScanNV $expr $mask }; + default: + return; } } -__generic<T : __BuiltinIntegerType, let N : int> -__glsl_extension(GL_NV_shader_subgroup_partitioned) +// +// WaveMultiSum/WaveMultiProduct. 
+// +${{{{ +struct WaveMultiSumProductEntry { const char* name; const char* spirvName; }; +const WaveMultiSumProductEntry kWaveMultiSumProductNames[] = { {"Sum", "Add"}, {"Product", "Mul"} }; +for (auto opName : kWaveMultiSumProductNames) { +}}}} + +__generic<T : __BuiltinArithmeticType> __spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +T WaveMulti$(opName.name)(T value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveXorNV"; + case cuda: + __intrinsic_asm "_wave$(opName.name)($1.x, $0)"; + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.spirvName)NV"; case spirv: - return spirv_asm { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - result:$$vector<T,N> = OpGroupNonUniformBitwiseXor Subgroup PartitionedExclusiveScanNV $expr $mask - }; + if (__isFloat<T>()) + return spirv_asm { result:$$T = OpGroupNonUniformF$(opName.spirvName) Subgroup PartitionedReduceNV $value $mask }; + else + return spirv_asm { result:$$T = OpGroupNonUniformI$(opName.spirvName) Subgroup PartitionedReduceNV $value $mask }; + } } } -__generic<T : __BuiltinIntegerType, let N : int, let M : int> -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask) +__generic<T : __BuiltinArithmeticType, let N : int> +__spirv_version(1.3) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +vector<T, N> WaveMulti$(opName.name)(vector<T, N> value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), 
$0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor"; + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.spirvName)NV"; case spirv: + { + if (__isFloat<T>()) + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformF$(opName.spirvName) Subgroup PartitionedReduceNV $value $mask }; + else + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformI$(opName.spirvName) Subgroup PartitionedReduceNV $value $mask }; + } + } +} + +__generic<T : __BuiltinArithmeticType, let N : int, let M : int> +[require(cuda_glsl_spirv, subgroup_partitioned)] +matrix<T,N,M> WaveMulti$(opName.name)(matrix<T,N,M> value, uint4 mask) +{ + __target_switch + { + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; + default: matrix<T, N, M> result; for (int i = 0; i < N; ++i) - result[i] = WaveMultiPrefixBitXor(expr[i], mask); + result[i] = WaveMulti$(opName.name)(value[i], mask); return result; } } -/// @category wave +${{{{ +} // WaveMultiSum/WaveMultiProduct. +}}}} + + +// +// WaveMultiPrefixInclusiveSum/WaveMultiPrefixInclusiveProduct. +// WaveMultiPrefixExclusiveSum/WaveMultiPrefixExclusiveProduct. +// WaveMultiPrefixSum/WaveMultiPrefixProduct. +// +${{{{ +struct WaveMultiPrefixSumProductEntry +{ + const char* name; + const char* spirvName; + const char* spirvGroupOperation; + const char* glslName; + const char* hlslName; + const char* cudaName; + const char* cudaExtraOperation; + + // Inclusive operations are not implemented by the CUDA prelude functions. + // They are implemented here by calling the exclusive implementation and performing an additional operation + // with the current invocation's value. This works for all cases except for element-wise matrix multiplication. 
+ bool cudaMatrixVariantSupport; +}; + +const WaveMultiPrefixSumProductEntry kWaveMultiPrefixSumProductNames[] = +{ + // name spirvName spirvGroupOperation glslName hlslName cudaName cudaExtraOperation cudaMatrixVariantSupport + { "InclusiveSum", "Add", "PartitionedInclusiveScanNV", "InclusiveAdd", "Sum($0, $1) + $0", "Sum", "+ $0", false }, + { "InclusiveProduct", "Mul", "PartitionedInclusiveScanNV", "InclusiveMul", "Product($0, $1) * $0", "Product", "* $0", false }, + { "ExclusiveSum", "Add", "PartitionedExclusiveScanNV", "ExclusiveAdd", "Sum($0, $1)", "Sum", "", true }, + { "ExclusiveProduct", "Mul", "PartitionedExclusiveScanNV", "ExclusiveMul", "Product($0, $1)", "Product", "", true }, + + // These are HLSL SM 6.5 intrinsics and are equal to the exclusive variants. + { "Sum", "Add", "PartitionedExclusiveScanNV", "ExclusiveAdd", "Sum($0, $1)", "Sum", "", true }, + { "Product", "Mul", "PartitionedExclusiveScanNV", "ExclusiveMul", "Product($0, $1)", "Product", "", true }, +}; + +for (auto opName : kWaveMultiPrefixSumProductNames) { +}}}} + __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_NV_shader_subgroup_partitioned) __spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -T WaveMultiPrefixProduct(T value, uint4 mask) +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +T WaveMultiPrefix$(opName.name)(T value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveMulNV"; + case cuda: + __intrinsic_asm "_wavePrefix$(opName.cudaName)($1.x, $0) $(opName.cudaExtraOperation)"; + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case hlsl: + __intrinsic_asm "WaveMultiPrefix$(opName.hlslName)"; case spirv: { - spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - 
OpCapability GroupNonUniformPartitionedNV; - }; - if (__isFloat<T>()) - { - return spirv_asm - { - result:$$T = OpGroupNonUniformFMul Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$T = OpGroupNonUniformF$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; else - { - return spirv_asm - { - result:$$T = OpGroupNonUniformIMul Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$T = OpGroupNonUniformI$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; } } } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_NV_shader_subgroup_partitioned) __spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask) +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +vector<T, N> WaveMultiPrefix$(opName.name)(vector<T, N> value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveMulNV"; + case cuda: + __intrinsic_asm "_wavePrefix$(opName.cudaName)Multiple($1.x, $0) $(opName.cudaExtraOperation)"; + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case hlsl: + __intrinsic_asm "WaveMultiPrefix$(opName.hlslName)"; case spirv: { - spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - }; - if (__isFloat<T>()) - { - return spirv_asm - { - result:$$vector<T,N> = OpGroupNonUniformFMul Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformF$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; else - { - return spirv_asm - { - result:$$vector<T,N> = 
OpGroupNonUniformIMul Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformI$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; } } } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask) +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +matrix<T,N,M> WaveMultiPrefix$(opName.name)(matrix<T,N,M> value, uint4 mask) { __target_switch { - case cuda: __intrinsic_asm "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixProduct"; - case glsl: - case spirv: + ${{{{ if(opName.cudaMatrixVariantSupport) { }}}} + case cuda: + __intrinsic_asm "_wavePrefix$(opName.cudaName)Multiple($1.x, $0) $(opName.cudaExtraOperation)"; + ${{{{ } }}}} + default: matrix<T, N, M> result; for (int i = 0; i < N; ++i) - result[i] = WaveMultiPrefixProduct(value[i], mask); + result[i] = WaveMultiPrefix$(opName.name)(value[i], mask); return result; } } -/// @category wave +${{{{ +} +// WaveMultiPrefixInclusiveSum/WaveMultiPrefixInclusiveProduct. +// WaveMultiPrefixExclusiveSum/WaveMultiPrefixExclusiveProduct. +// WaveMultiPrefixSum/WaveMultiPrefixProduct. +}}}} + + +// +// WaveMultiMin/WaveMultiMax. 
+// +${{{{ +struct WaveMultiMinMaxEntry { const char* name; }; +const WaveMultiMinMaxEntry kWaveMultiMinMaxNames[] = { {"Min"}, {"Max"} }; +for (auto opName : kWaveMultiMinMaxNames) { +}}}} + __generic<T : __BuiltinArithmeticType> -__glsl_extension(GL_NV_shader_subgroup_partitioned) __spirv_version(1.3) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -T WaveMultiPrefixSum(T value, uint4 mask) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +T WaveMulti$(opName.name)(T value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveAddNV"; + case cuda: + __intrinsic_asm "_wave$(opName.name)($1.x, $0)"; + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.name)NV"; case spirv: { - spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - }; - if (__isFloat<T>()) - { - return spirv_asm - { - result:$$T = OpGroupNonUniformFAdd Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$T = OpGroupNonUniformF$(opName.name) Subgroup PartitionedReduceNV $value $mask }; + else if (__isUnsignedInt<T>()) + return spirv_asm { result:$$T = OpGroupNonUniformU$(opName.name) Subgroup PartitionedReduceNV $value $mask }; else - { - return spirv_asm - { - result:$$T = OpGroupNonUniformIAdd Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$T = OpGroupNonUniformS$(opName.name) Subgroup PartitionedReduceNV $value $mask }; } } } __generic<T : __BuiltinArithmeticType, let N : int> -__glsl_extension(GL_NV_shader_subgroup_partitioned) -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] __spirv_version(1.3) -vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] 
+vector<T, N> WaveMulti$(opName.name)(vector<T, N> value, uint4 mask) { + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { - case cuda: __intrinsic_asm "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )"; - case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; - case glsl: __intrinsic_asm "subgroupPartitionedExclusiveAddNV"; + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.name)NV"; case spirv: { - spirv_asm - { - OpExtension "SPV_NV_shader_subgroup_partitioned"; - OpCapability GroupNonUniformPartitionedNV; - }; - if (__isFloat<T>()) - { - return spirv_asm - { - result:$$vector<T,N> = OpGroupNonUniformFAdd Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformF$(opName.name) Subgroup PartitionedReduceNV $value $mask }; + else if (__isUnsignedInt<T>()) + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformU$(opName.name) Subgroup PartitionedReduceNV $value $mask }; else - { - return spirv_asm - { - result:$$vector<T,N> = OpGroupNonUniformIAdd Subgroup PartitionedExclusiveScanNV $value $mask - }; - } + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformS$(opName.name) Subgroup PartitionedReduceNV $value $mask }; } } } __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -[require(cuda_glsl_hlsl_spirv, wave_multi_prefix)] -matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask) +[require(cuda_glsl_spirv, subgroup_partitioned)] +matrix<T, N, M> WaveMulti$(opName.name)(matrix<T, N, M> value, uint4 mask) { __target_switch { - case cuda: __intrinsic_asm "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)"; - case hlsl: __intrinsic_asm "WaveMultiPrefixSum"; - case glsl: - case spirv: + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; + default: matrix<T, N, M> result; + [ForceUnroll] for (int i = 0; i < N; ++i) - result[i] = WaveMultiPrefixSum(value[i], 
mask); + result[i] = WaveMulti$(opName.name)(value[i], mask); return result; } } -__glsl_extension(GL_EXT_demote_to_helper_invocation) -[ForceInline] -[require(glsl_hlsl_metal_spirv, helper_lane)] -bool IsHelperLane() -{ - __target_switch { - case hlsl: __intrinsic_asm "IsHelperLane()"; - case glsl: __intrinsic_asm "gl_HelperInvocation"; - case metal: __intrinsic_asm "simd_is_helper_thread()"; - case spirv: - return spirv_asm { - OpExtension "SPV_EXT_demote_to_helper_invocation"; - OpCapability DemoteToHelperInvocationEXT; - result:$$bool = OpIsHelperInvocationEXT - }; - } -} +${{{{ +} // WaveMultiMin/WaveMultiMax. +}}}} -//@hidden: -__generic<T : __BuiltinType> -[ForceInline] -[require(glsl)] -void __requireGLSLShaderSubgroupTypeExtension() +// +// WaveMultiPrefixInclusiveMin/WaveMultiPrefixInclusiveMax. +// WaveMultiPrefixExclusiveMin/WaveMultiPrefixExclusiveMax. +// +${{{{ +struct WaveMultiPrefixMinMaxEntry { - // the following is a seperate function call, since else the `__requireTargetExtension` and associated __intrinsic_asm is ignored if the calling function also calls an __intrinsic_asm - if (__type_equals<T, half>() - || __type_equals<T, float16_t>() - ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16"); - else if (__type_equals<T, uint8_t>() - || __type_equals<T, int8_t>() - ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8"); - else if (__type_equals<T, uint16_t>() - || __type_equals<T, int16_t>() - ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16"); - else if (__type_equals<T, uint64_t>() - || __type_equals<T, int64_t>() - ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64"); + const char* name; + const char* spirvName; + const char* spirvGroupOperation; + const char* glslName; +}; - __intrinsic_asm ""; -} +const WaveMultiPrefixMinMaxEntry kWaveMultiPrefixMinMaxNames[] = +{ + // name spirvName spirvGroupOperation glslName + { "InclusiveMin", "Min", 
"PartitionedInclusiveScanNV", "InclusiveMin" }, + { "InclusiveMax", "Max", "PartitionedInclusiveScanNV", "InclusiveMax" }, + { "ExclusiveMin", "Min", "PartitionedExclusiveScanNV", "ExclusiveMin" }, + { "ExclusiveMax", "Max", "PartitionedExclusiveScanNV", "ExclusiveMax" }, +}; -__generic<T : __BuiltinType> +for (auto opName : kWaveMultiPrefixMinMaxNames) { +}}}} + +__generic<T : __BuiltinArithmeticType> +__spirv_version(1.3) [ForceInline] -[require(metal)] -void __checkMetalShaderSubgroupType() +[require(glsl_spirv, subgroup_partitioned)] +T WaveMultiPrefix$(opName.name)(T value, uint4 mask) { - // These builtin types are not supported for Metal's `simd` operations. - if (__type_equals<T, uint8_t>() - || __type_equals<T, int8_t>() - || __type_equals<T, uint64_t>() - || __type_equals<T, int64_t>() - || __isBool<T>() - ) + __shaderSubgroupPartitionedPreamble<T>(); + __target_switch { - static_assert(false, "Unsupported type for subgroup operations in Metal. Valid types include scalars and vectors of uint/uint32_t, int/int32_t, uint16_t, int16_t, float, and half."); + case glsl: + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case spirv: + { + if (__isFloat<T>()) + return spirv_asm { result:$$T = OpGroupNonUniformF$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + else if (__isUnsignedInt<T>()) + return spirv_asm { result:$$T = OpGroupNonUniformU$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + else + return spirv_asm { result:$$T = OpGroupNonUniformS$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + } } } -__generic<T : __BuiltinType> -void shader_subgroup_preamble() +__generic<T : __BuiltinArithmeticType, let N : int> +__spirv_version(1.3) +[ForceInline] +[require(glsl_spirv, subgroup_partitioned)] +vector<T, N> WaveMultiPrefix$(opName.name)(vector<T, N> value, uint4 mask) { - // checks needed for shader_subgroup functions; __requireTargetExtension does not work - // (does 
not add the ext specified correctly to the compile output; using extended type - // will result in error for using the type) + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { case glsl: - __requireGLSLShaderSubgroupTypeExtension<T>(); - case metal: - __checkMetalShaderSubgroupType<T>(); - default: - return; + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case spirv: + { + if (__isFloat<T>()) + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformF$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + else if (__isUnsignedInt<T>()) + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformU$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + else + return spirv_asm { result:$$vector<T,N> = OpGroupNonUniformS$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask }; + } } } -//@public: +__generic<T : __BuiltinArithmeticType, let N : int, let M : int> +[require(glsl_spirv, subgroup_partitioned)] +matrix<T, N, M> WaveMultiPrefix$(opName.name)(matrix<T, N, M> value, uint4 mask) +{ + matrix<T, N, M> result; + [ForceUnroll] + for (int i = 0; i < N; ++i) + result[i] = WaveMultiPrefix$(opName.name)(value[i], mask); + return result; +} + +${{{{ +} +// WaveMultiPrefixInclusiveMin/WaveMultiPrefixInclusiveMax. +// WaveMultiPrefixExclusiveMin/WaveMultiPrefixExclusiveMax. +}}}} + // -// Wave Rotate intrinsics. -// These are Slang specific intrinsics to rotate values within a subgroup. +// WaveMultiBitAnd/WaveMultiBitOr/WaveMultiBitXor. 
// +${{{{ +struct WaveMultiBitsEntry { const char* name; }; +const WaveMultiBitsEntry kWaveMultiBitsNames[] = { {"And"}, {"Or"} , {"Xor"} }; +for (auto opName : kWaveMultiBitsNames) { +}}}} -__generic<T : __BuiltinType> -__glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] -T WaveRotate(T value, uint delta) +__generic<T : __BuiltinLogicalType> +__spirv_version(1.3) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +T WaveMultiBit$(opName.name)(T value, uint4 mask) { - shader_subgroup_preamble<T>(); + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { + case cuda: + __intrinsic_asm "_wave$(opName.name)($1.x, $0)"; case glsl: - __intrinsic_asm "subgroupRotate"; - case metal: - __intrinsic_asm "simd_shuffle_rotate_down"; + __intrinsic_asm "subgroupPartitioned$(opName.name)NV"; case spirv: return spirv_asm { - OpExtension "SPV_KHR_subgroup_rotate"; - OpCapability GroupNonUniformRotateKHR; - result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta; + result:$$T = OpGroupNonUniformBitwise$(opName.name) Subgroup PartitionedReduceNV $value $mask; }; } } -__generic<T : __BuiltinType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_metal_spirv, subgroup_rotate)] -vector<T, N> WaveRotate(vector<T, N> value, uint delta) +__generic<T : __BuiltinLogicalType, let N : int> +__spirv_version(1.3) +[ForceInline] +[require(cuda_glsl_spirv, subgroup_partitioned)] +vector<T, N> WaveMultiBit$(opName.name)(vector<T, N> value, uint4 mask) { - shader_subgroup_preamble<T>(); + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; case glsl: - __intrinsic_asm "subgroupRotate"; - case metal: - __intrinsic_asm "simd_shuffle_rotate_down"; + __intrinsic_asm "subgroupPartitioned$(opName.name)NV"; case spirv: return spirv_asm { - OpExtension "SPV_KHR_subgroup_rotate"; - OpCapability GroupNonUniformRotateKHR; - 
result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta; + result:$$vector<T,N> = OpGroupNonUniformBitwise$(opName.name) Subgroup PartitionedReduceNV $value $mask; }; } } -__generic<T : __BuiltinType> -__glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] -T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[require(cuda_glsl_spirv, subgroup_partitioned)] +matrix<T, N, M> WaveMultiBit$(opName.name)(matrix<T, N, M> value, uint4 mask) { - shader_subgroup_preamble<T>(); __target_switch { + case cuda: + __intrinsic_asm "_wave$(opName.name)Multiple($1.x, $0)"; + default: + matrix<T,N,M> result; + [ForceUnroll] + for (int i = 0; i < N; ++i) + result[i] = WaveMultiBit$(opName.name)(value[i], mask); + return result; + } +} + +${{{{ +} // WaveMultiBitAnd/WaveMultiBitOr/WaveMultiBitXor. +}}}} + + +// +// WaveMultiPrefixInclusiveBitAnd/WaveMultiPrefixInclusiveBitOr/WaveMultiPrefixInclusiveBitXor. +// WaveMultiPrefixExclusiveBitAnd/WaveMultiPrefixExclusiveBitOr/WaveMultiPrefixExclusiveBitXor. +// WaveMultiPrefixBitAnd/WaveMultiPrefixBitOr/WaveMultiPrefixBitXor. 
+// +${{{{ +struct WaveMultiPrefixBitwiseEntry +{ + const char* name; + const char* spirvName; + const char* spirvGroupOperation; + const char* glslName; + const char* hlslName; + const char* cudaExtraOperation; + + bool cudaMatrixVariantSupport; +}; + +const WaveMultiPrefixBitwiseEntry kWaveMultiPrefixBitwiseNames[] = +{ + // name spirvName spirvGroupOperation glslName hlslName cudaExtraOperation cudaMatrixVariantSupport + { "InclusiveBitAnd", "And", "PartitionedInclusiveScanNV", "InclusiveAnd", "And($0, $1) & $0", "& $0", false }, + { "InclusiveBitOr", "Or", "PartitionedInclusiveScanNV", "InclusiveOr", "Or($0, $1) | $0", "| $0", false }, + { "InclusiveBitXor", "Xor", "PartitionedInclusiveScanNV", "InclusiveXor", "Xor($0, $1) ^ $0", "^ $0", false }, + { "ExclusiveBitAnd", "And", "PartitionedExclusiveScanNV", "ExclusiveAnd", "And", "", true }, + { "ExclusiveBitOr", "Or", "PartitionedExclusiveScanNV", "ExclusiveOr", "Or", "", true }, + { "ExclusiveBitXor", "Xor", "PartitionedExclusiveScanNV", "ExclusiveXor", "Xor", "", true }, + + // These are HLSL SM 6.5 intrinsics and are equal to the exclusive variants. 
+ { "BitAnd", "And", "PartitionedExclusiveScanNV", "ExclusiveAnd", "And", "", true }, + { "BitOr", "Or", "PartitionedExclusiveScanNV", "ExclusiveOr", "Or", "", true }, + { "BitXor", "Xor", "PartitionedExclusiveScanNV", "ExclusiveXor", "Xor", "", true }, +}; + +for (auto opName : kWaveMultiPrefixBitwiseNames) { +}}}} + +__generic<T : __BuiltinLogicalType> +__spirv_version(1.3) +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +T WaveMultiPrefix$(opName.name)(T value, uint4 mask) +{ + __shaderSubgroupPartitionedPreamble<T>(); + __target_switch + { + case cuda: + __intrinsic_asm "_wavePrefix$(opName.spirvName)($1.x, $0) $(opName.cudaExtraOperation)"; case glsl: - __intrinsic_asm "subgroupClusteredRotate"; + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case hlsl: + __intrinsic_asm "WaveMultiPrefixBit$(opName.hlslName)"; case spirv: return spirv_asm { - OpExtension "SPV_KHR_subgroup_rotate"; - OpCapability GroupNonUniformRotateKHR; - result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize; + result:$$T = OpGroupNonUniformBitwise$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask; }; } } -__generic<T : __BuiltinType, let N : int> -__glsl_extension(GL_KHR_shader_subgroup_rotate) -[require(glsl_spirv, subgroup_rotate)] -vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) +__generic<T : __BuiltinLogicalType, let N : int> +__spirv_version(1.3) +[ForceInline] +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +vector<T, N> WaveMultiPrefix$(opName.name)(vector<T, N> value, uint4 mask) { - shader_subgroup_preamble<T>(); + __shaderSubgroupPartitionedPreamble<T>(); __target_switch { + case cuda: + __intrinsic_asm "_wavePrefix$(opName.spirvName)Multiple($1.x, $0) $(opName.cudaExtraOperation)"; case glsl: - __intrinsic_asm "subgroupClusteredRotate"; + __intrinsic_asm "subgroupPartitioned$(opName.glslName)NV"; + case hlsl: + __intrinsic_asm 
"WaveMultiPrefixBit$(opName.hlslName)"; case spirv: return spirv_asm { - OpExtension "SPV_KHR_subgroup_rotate"; - OpCapability GroupNonUniformRotateKHR; - result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize; + result:$$vector<T,N> = OpGroupNonUniformBitwise$(opName.spirvName) Subgroup $(opName.spirvGroupOperation) $value $mask; }; } } +__generic<T : __BuiltinLogicalType, let N : int, let M : int> +[require(cuda_glsl_hlsl_spirv, subgroup_partitioned)] +matrix<T, N, M> WaveMultiPrefix$(opName.name)(matrix<T, N, M> value, uint4 mask) +{ + __target_switch + { +${{{{ + if (opName.cudaMatrixVariantSupport) { +}}}} + case cuda: + __intrinsic_asm "_wavePrefix$(opName.spirvName)Multiple($1.x, $0) $(opName.cudaExtraOperation)"; +${{{{ + } +}}}} + default: + matrix<T,N,M> result; + [ForceUnroll] + for (int i = 0; i < N; ++i) + result[i] = WaveMultiPrefix$(opName.name)(value[i], mask); + return result; + } +} +${{{{ +} +// WaveMultiPrefixInclusiveBitAnd/WaveMultiPrefixInclusiveBitOr/WaveMultiPrefixInclusiveBitXor. +// WaveMultiPrefixExclusiveBitAnd/WaveMultiPrefixExclusiveBitOr/WaveMultiPrefixExclusiveBitXor. +// WaveMultiPrefixBitAnd/WaveMultiPrefixBitOr/WaveMultiPrefixBitXor. 
+}}}} + + // // Quad Control intrinsics // diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef index 1799d4bfc..48617c54d 100644 --- a/source/slang/slang-capabilities.capdef +++ b/source/slang/slang-capabilities.capdef @@ -1157,11 +1157,6 @@ alias fragmentshaderbarycentric = GL_EXT_fragment_shader_barycentric | _sm_6_1; /// (gfx targets) Capabilities needed to use memory barriers /// [Compound] alias shadermemorycontrol = glsl | _spirv_1_0 | _sm_5_0; -/// Capabilities needed to use HLSL tier wave operations -/// [Compound] -alias wave_multi_prefix = _sm_6_5 - | _cuda_sm_7_0 - | GL_KHR_shader_subgroup_ballot + GL_KHR_shader_subgroup_arithmetic + GL_NV_shader_subgroup_partitioned; /// Capabilities needed to use GLSL buffer-reference's /// [Compound] alias bufferreference = GL_EXT_buffer_reference; @@ -2186,7 +2181,9 @@ alias subgroup_quad = GL_KHR_shader_subgroup_quad ; /// Capabilities required to use GLSL-style subgroup operations 'subgroup_partitioned' /// [Compound] -alias subgroup_partitioned = GL_NV_shader_subgroup_partitioned + subgroup_ballot_activemask | _sm_6_5 | _cuda_sm_7_0; +alias subgroup_partitioned = _sm_6_5 + | _cuda_sm_7_0 + | GL_KHR_shader_subgroup_ballot + GL_KHR_shader_subgroup_arithmetic + GL_NV_shader_subgroup_partitioned; /// Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate' |
