diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-04-15 14:14:58 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-04-15 14:14:58 -0400 |
| commit | d5d32221daf950b2f923122a179e791572dd6cb6 (patch) | |
| tree | 0f4bd215c11abc98d0e1f9b3da920838e6e5862b /source | |
| parent | fbac017938343724407ab036abd736c942b4e187 (diff) | |
First support for 'WaveMask' intrinsics (#1321)
* WIP tests to confirm divergence on CUDA.
* Added wave.slang test that uses masks.
Made all CUDA intrinsic impls take a mask explicitly.
Added initial WaveMaskXXX intrinsics.
* Added WaveMaskSharedSync.
* Improvements around WaveMaskSharedSync/WaveMaskSync
* Remove tabs.
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 259 |
1 files changed, 202 insertions, 57 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 4279e4a4e..f096a125e 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2479,6 +2479,151 @@ matrix<T, N, M> trunc(matrix<T, N, M> x) MATRIX_MAP_UNARY(T, N, M, trunc, x); } +// Slang Specific Mask Wave Intrinsics + +typedef uint WaveMask; + +__target_intrinsic(cuda, "__activemask()") +WaveMask WaveGetActiveMask() { return 0xffffffff; } + +__glsl_extension(GL_KHR_shader_subgroup_vote) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAll($1)") +__target_intrinsic(cuda, "(__all_sync($0, $1) != 0)") +__target_intrinsic(hlsl, "WaveActiveAllTrue($1)") +bool WaveMaskAllTrue(WaveMask mask, bool condition); + +__glsl_extension(GL_KHR_shader_subgroup_vote) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAny($1)") +__target_intrinsic(cuda, "(__any_sync($0, $1) != 0)") +__target_intrinsic(hlsl, "WaveActiveAnyTrue($1)") +bool WaveMaskAnyTrue(WaveMask mask, bool condition); + +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBallot($1).x") +__target_intrinsic(cuda, "__ballot_sync($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBallot($1)") +WaveMask WaveMaskBallot(WaveMask mask, bool condition); + +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "bitCount(subgroupBallot($1))") +__target_intrinsic(cuda, "__popc(__ballot_sync($0, $1))") +__target_intrinsic(hlsl, "WaveActiveCountBits($1)") +WaveMask WaveMaskCountBits(WaveMask mask, bool value); + +// Waits until all warp lanes named in mask have executed a WaveMaskSharedSync (with the same mask) +// before resuming execution. Guarantees memory ordering in shared memory among threads participating +// in the barrier. +// +// The CUDA intrinsic says it orders *all* memory accesses, which appears to match most closely subgroupBarrier. +// +// TODO(JS): +// For HLSL it's not clear what to do. 
There is no explicit mechanism to 'reconverge' threads. In the docs it describes +// behavior as +// "These intrinsics are dependent on active lanes and therefore flow control. In the model of this document, implementations +// must enforce that the number of active lanes exactly corresponds to the programmer’s view of flow control." +// +// It seems this can only mean the active threads are the "threads the program flow would lead to". This implies a lockstep +// "straight SIMD" style interpretation. That being the case this op on HLSL is just a memory barrier without any Sync. + +__target_intrinsic(cuda, "__syncwarp($0)") +__glsl_extension(GL_KHR_shader_subgroup_basic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBarrier()") +__target_intrinsic(hlsl, "AllMemoryBarrier()") +void WaveMaskSync(WaveMask mask); + +// On GLSL, it appears we can't use subgroupMemoryBarrierShared, because it only implies a memory ordering, it does not +// imply convergence. For subgroupBarrier we have from the docs.. +// "The function subgroupBarrier() enforces that all active invocations within a subgroup must execute this function before any +// are allowed to continue their execution" + +__target_intrinsic(cuda, "__syncwarp($0)") +__glsl_extension(GL_KHR_shader_subgroup_basic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBarrier()") +__target_intrinsic(hlsl, "GroupMemoryBarrier()") +void WaveMaskSharedSync(WaveMask mask); + +// NOTE! WaveMaskBroadcastLaneAt is *NOT* standard HLSL +// It is provided as access to subgroupBroadcast which can only take a +// constexpr laneId. +// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt +// Versions SPIR-V greater than 1.4 loosen this restriction, and allow 'dynamic uniform' index +// If that's the behavior required then client code should use WaveReadLaneAt which works this way. 
+ +__generic<T : __BuiltinType> +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcast($1, $2)") +__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +T WaveMaskBroadcastLaneAt(WaveMask mask, T value, constexpr int lane); +__generic<T : __BuiltinType, let N : int> +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcast($1, $2)") +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +vector<T,N> WaveMaskBroadcastLaneAt(WaveMask mask, vector<T,N> value, constexpr int lane); +__generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +matrix<T,N,M> WaveMaskBroadcastLaneAt(WaveMask mask, matrix<T,N,M> value, constexpr int lane); + +// TODO(JS): If it can be determines that the `laneId` is constExpr, then subgroupBroadcast +// could be used on GLSL. 
For now we just use subgroupShuffle +__generic<T : __BuiltinType> +__glsl_extension(GL_KHR_shader_subgroup_shuffle) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupShuffle($1, $2)") +__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +T WaveMaskReadLaneAt(WaveMask mask, T value, int lane); +__generic<T : __BuiltinType, let N : int> +__spirv_version(1.3) +__glsl_extension(GL_KHR_shader_subgroup_shuffle) +__target_intrinsic(glsl, "subgroupShuffle($1, $2)") +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +vector<T,N> WaveMaskReadLaneAt(WaveMask mask, vector<T,N> value, int lane); +__generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +matrix<T,N,M> WaveMaskReadLaneAt(WaveMask mask, matrix<T,N,M> value, int lane); + +// NOTE! WaveMaskShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL +// which means it will only work on hardware which allows arbitrary laneIds which is not true +// in general because it breaks the HLSL standard, which requires it's 'dynamically uniform' across the Wave. 
+__generic<T : __BuiltinType> +__glsl_extension(GL_KHR_shader_subgroup_shuffle) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupShuffle($1, $2)") +__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +T WaveMaskShuffle(WaveMask mask, T value, int lane); +__generic<T : __BuiltinType, let N : int> +__glsl_extension(GL_KHR_shader_subgroup_shuffle) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupShuffle($1, $2)") +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +vector<T,N> WaveMaskShuffle(WaveMask mask, vector<T,N> value, int lane); +__generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") +__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") +matrix<T,N,M> WaveMaskShuffle(WaveMask mask, matrix<T,N,M> value, int lane); + +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($1))") +__target_intrinsic(cuda, "__popc(__ballot_sync($0, $1) & _getLaneLtMask())") +__target_intrinsic(hlsl, "WavePrefixCountBits($1)") +uint WaveMaskPrefixCountBits(WaveMask mask, bool value); + // Shader model 6.0 stuff // Information for GLSL wave/subgroup support @@ -2504,112 +2649,112 @@ __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAnd($0)") +__target_intrinsic(cuda, "_waveAnd(_getActiveMask(), $0)") T WaveActiveBitAnd(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAndMultiple($0)") +__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveBitAnd(vector<T,N> expr); __generic<T : 
__BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveAndMultiple($0)") +__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr); __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOr($0)") +__target_intrinsic(cuda, "_waveOr(_getActiveMask(), $0)") T WaveActiveBitOr(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOrMultiple($0)") +__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveBitOr(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveOrMultiple($0)") +__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr); __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXor($0)") +__target_intrinsic(cuda, "_waveXor(_getActiveMask(), $0)") T WaveActiveBitXor(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXorMultiple($0)") +__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveBitXor(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveXorMultiple($0)") +__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) 
__spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMax($0)") +__target_intrinsic(cuda, "_waveMax(_getActiveMask(), $0)") T WaveActiveMax(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMaxMultiple($0)") +__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveMax(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveMaxMultiple($0)") +__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMin($0)") +__target_intrinsic(cuda, "_waveMin(_getActiveMask(), $0)") T WaveActiveMin(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMinMultiple($0)") +__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveMin(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveMinMultiple($0)") +__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProduct($0)") +__target_intrinsic(cuda, "_waveProduct(_getActiveMask(), $0)") T WaveActiveProduct(T expr); __generic<T : __BuiltinArithmeticType, let N : int> 
__glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProductMultiple($0)") +__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveProduct(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveProductMultiple($0)") +__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSum($0)") +__target_intrinsic(cuda, "_waveSum(_getActiveMask(), $0)") T WaveActiveSum(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSumMultiple($0)") +__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)") vector<T,N> WaveActiveSum(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveSumMultiple($0)") +__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr); __generic<T : __BuiltinType> @@ -2617,18 +2762,18 @@ __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqual($0)") +__target_intrinsic(cuda, "_waveAllEqual(_getActiveMask(), $0)") bool WaveActiveAllEqual(T value); __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") +__target_intrinsic(cuda, 
"_waveAllEqualMultiple(_getActiveMask(), $0)") bool WaveActiveAllEqual(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") +__target_intrinsic(cuda, "_waveAllEqualMultiple(_getActiveMask(), $0)") bool WaveActiveAllEqual(matrix<T,N,M> value); @@ -2679,48 +2824,48 @@ __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveMul($0)") -__target_intrinsic(cuda, "_wavePrefixProduct($0)") +__target_intrinsic(cuda, "_wavePrefixProduct(_getActiveMask(), $0)") T WavePrefixProduct(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveMul($0)") -__target_intrinsic(cuda, "_wavePrefixProductMultiple($0)") +__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)") vector<T,N> WavePrefixProduct(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_wavePrefixProductMultiple($0)") +__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)") matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAdd($0)") -__target_intrinsic(cuda, "_wavePrefixSum($0)") +__target_intrinsic(cuda, "_wavePrefixSum(_getActiveMask(), $0)") T WavePrefixSum(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAdd($0)") -__target_intrinsic(cuda, "_wavePrefixSumMultiple($0)") +__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)") vector<T,N> WavePrefixSum(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, 
let N : int, let M : int> -__target_intrinsic(cuda, "_wavePrefixSumMultiple($0)") +__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)") matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr); __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirst($0)") +__target_intrinsic(cuda, "_waveReadFirst(_getActiveMask(), $0)") T WaveReadLaneFirst(T expr); __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") +__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)") vector<T,N> WaveReadLaneFirst(vector<T,N> expr); __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") +__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)") matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr); // NOTE! 
WaveBroadcastLaneAt is *NOT* standard HLSL @@ -2740,11 +2885,11 @@ __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane); __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") matrix<T,N,M> WaveBroadcastLaneAt(matrix<T,N,M> value, constexpr int lane); @@ -2760,10 +2905,10 @@ __generic<T : __BuiltinType, let N : int> __spirv_version(1.3) __glsl_extension(GL_KHR_shader_subgroup_shuffle) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane); __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane); // NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! 
It will map to WaveReadLaneAt on HLSL @@ -2780,11 +2925,11 @@ __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_shuffle) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") vector<T,N> WaveShuffle(vector<T,N> value, int lane); __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") matrix<T,N,M> WaveShuffle(matrix<T,N,M> value, int lane); @@ -2800,17 +2945,17 @@ uint WavePrefixCountBits(bool value); __generic<T : __BuiltinType> __target_intrinsic(hlsl) __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchScalar($0)") +__target_intrinsic(cuda, "_waveMatchScalar(_getActiveMask(), $0)") uint4 WaveMatch(T value); __generic<T : __BuiltinType, let N : int> __target_intrinsic(hlsl) __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchMultiple($0)") +__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)") uint4 WaveMatch(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(hlsl) __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchMultiple($0)") +__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)") uint4 WaveMatch(matrix<T,N,M> value); __target_intrinsic(hlsl) @@ -2822,18 +2967,18 @@ __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) //__target_intrinsic(glsl, "subgroupExclusiveAnd($0)") -__target_intrinsic(cuda, "_wavePrefixAnd($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitAnd(T expr, uint4 mask); __target_intrinsic(hlsl) 
__glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAnd($0)") -__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask); __generic<T : __BuiltinArithmeticType> @@ -2841,18 +2986,18 @@ __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) //__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -__target_intrinsic(cuda, "_wavePrefixOr($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixOr(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitOr(T expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) //__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask); __generic<T : __BuiltinArithmeticType> @@ -2860,44 +3005,44 @@ 
__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") -__target_intrinsic(cuda, "_wavePrefixXor($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitXor(T expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") -__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask); __generic<T : __BuiltinArithmeticType> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProduct($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixProduct(T value, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") matrix<T,N,M> 
WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask); __generic<T : __BuiltinArithmeticType> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSum($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixSum(T value, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )") vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) -__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))") +__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)") matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask); // `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points |
