diff options
| author | Tim Foley <tfoleyNV@users.noreply.github.com> | 2020-05-04 09:06:55 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-05-04 09:06:55 -0700 |
| commit | 5d3a737e75346b6ced204829a60be2837589e9ad (patch) | |
| tree | f60964d5f9c4042b49416082e81c35eef11b42fa | |
| parent | c697fe50db0a74d76c6922ee24eb1f8d87def5f8 (diff) | |
Make stdlib WaveActive* call WaveMask* (#1336)
This change gives the various `WaveActive*()` functions default implementations that call the corresponding `WaveMask*()` function, passing `WaveGetActiveMask()` as the mask. The new definitions will be used during CUDA code generation, which reduces some of the duplication that was occurring in the `__target_intrinsic` modifiers.
This change does *not* add logic to make the computation of `WaveGetActiveMask()` correct on CUDA, so these functions will still fail to provide the behavior that users need/expect. A future change will need to add logic to synthesize the value of `WaveGetActiveMask()` automatically.
| -rw-r--r-- | source/slang/hlsl.meta.slang | 399 |
1 files changed, 289 insertions, 110 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index a84e88ca8..c0dba51e3 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2531,6 +2531,7 @@ __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBallot(true).x") __target_intrinsic(hlsl, "WaveActiveBallot(true).x") +__target_intrinsic(cuda, "__activemask()") // Note: semantically incorrect, but best we can do for now. WaveMask WaveGetActiveMask(); __glsl_extension(GL_KHR_shader_subgroup_basic) @@ -2920,7 +2921,7 @@ matrix<T,N,M> WaveMaskReadLaneFirst(WaveMask mask, matrix<T,N,M> expr); __generic<T : __BuiltinType> __target_intrinsic(hlsl, "WaveMatch($1).x") __cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchScalar($0, $1)") +__target_intrinsic(cuda, "_waveMatchScalar($0, $1).x") WaveMask WaveMaskMatch(WaveMask mask, T value); __generic<T : __BuiltinType, let N : int> __target_intrinsic(hlsl, "WaveMatch($1).x") @@ -3016,158 +3017,253 @@ __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAnd(_getActiveMask(), $0)") -T WaveActiveBitAnd(T expr); +__target_intrinsic(hlsl) +T WaveActiveBitAnd(T expr) +{ + return WaveMaskBitAnd(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveBitAnd(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T, N> WaveActiveBitAnd(vector<T, N> expr) +{ + return WaveMaskBitAnd(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr); +__target_intrinsic(hlsl) 
+matrix<T, N, M> WaveActiveBitAnd(matrix<T, N, M> expr) +{ + return WaveMaskBitAnd(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOr(_getActiveMask(), $0)") -T WaveActiveBitOr(T expr); +__target_intrinsic(hlsl) +T WaveActiveBitOr(T expr) +{ + return WaveMaskBitOr(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveBitOr(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WaveActiveBitOr(vector<T,N> expr) +{ + return WaveMaskBitOr(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveActiveBitOr(matrix<T, N, M> expr) +{ + return WaveMaskBitOr(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXor(_getActiveMask(), $0)") -T WaveActiveBitXor(T expr); +__target_intrinsic(hlsl) +T WaveActiveBitXor(T expr) +{ + return WaveMaskBitXor(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveBitXor(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WaveActiveBitXor(vector<T,N> expr) +{ + return WaveMaskBitXor(WaveGetActiveMask(), expr); +} + __generic<T : 
__BuiltinIntegerType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveActiveBitXor(matrix<T, N, M> expr) +{ + return WaveMaskBitXor(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMax(_getActiveMask(), $0)") -T WaveActiveMax(T expr); +__target_intrinsic(hlsl) +T WaveActiveMax(T expr) +{ + return WaveMaskMax(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveMax(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T, N> WaveActiveMax(vector<T, N> expr) +{ + return WaveMaskMax(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveActiveMax(matrix<T, N, M> expr) +{ + return WaveMaskMax(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMin(_getActiveMask(), $0)") -T WaveActiveMin(T expr); +__target_intrinsic(hlsl) +T WaveActiveMin(T expr) +{ + return WaveMaskMin(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)") -vector<T,N> 
WaveActiveMin(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T, N> WaveActiveMin(vector<T, N> expr) +{ + return WaveMaskMin(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveActiveMin(matrix<T, N, M> expr) +{ + return WaveMaskMin(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProduct(_getActiveMask(), $0)") -T WaveActiveProduct(T expr); +__target_intrinsic(hlsl) +T WaveActiveProduct(T expr) +{ + return WaveMaskProduct(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveProduct(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WaveActiveProduct(vector<T,N> expr) +{ + return WaveMaskProduct(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveActiveProduct(matrix<T, N, M> expr) +{ + return WaveMaskProduct(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSum(_getActiveMask(), $0)") -T WaveActiveSum(T expr); +__target_intrinsic(hlsl) +T WaveActiveSum(T expr) +{ + return WaveMaskSum(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let 
N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)") -vector<T,N> WaveActiveSum(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WaveActiveSum(vector<T,N> expr) +{ + return WaveMaskSum(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr) +{ + return WaveMaskSum(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") -__cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqual(_getActiveMask(), $0)") -bool WaveActiveAllEqual(T value); +__target_intrinsic(hlsl) +bool WaveActiveAllEqual(T value) +{ + return WaveMaskAllEqual(WaveGetActiveMask(), value); +} + __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") -__cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqualMultiple(_getActiveMask(), $0)") -bool WaveActiveAllEqual(vector<T,N> value); -__generic<T : __BuiltinType, let N : int, let M : int> -__cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveAllEqualMultiple(_getActiveMask(), $0)") -bool WaveActiveAllEqual(matrix<T,N,M> value); - +__target_intrinsic(hlsl) +bool WaveActiveAllEqual(vector<T,N> value) +{ + return WaveMaskAllEqual(WaveGetActiveMask(), value); +} +__generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(hlsl) +bool WaveActiveAllEqual(matrix<T, N, M> value) +{ + return WaveMaskAllEqual(WaveGetActiveMask(), value); +} __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, 
"subgroupAll($0)") -__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") -bool WaveActiveAllTrue(bool condition); +__target_intrinsic(hlsl) +bool WaveActiveAllTrue(bool condition) +{ + return WaveMaskAllTrue(WaveGetActiveMask(), condition); +} __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAny($0)") -__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)") -bool WaveActiveAnyTrue(bool condition); +__target_intrinsic(hlsl) +bool WaveActiveAnyTrue(bool condition) +{ + return WaveMaskAnyTrue(WaveGetActiveMask(), condition); +} __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBallot($0)") -__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") -uint4 WaveActiveBallot(bool condition); +__target_intrinsic(hlsl) +uint4 WaveActiveBallot(bool condition) +{ + return WaveMaskBallot(WaveGetActiveMask(), condition); +} __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "bitCount(subgroupBallot($0))") -__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))") -uint WaveActiveCountBits(bool value); +__target_intrinsic(hlsl) +uint WaveActiveCountBits(bool value) +{ + return WaveMaskCountBits(WaveGetActiveMask(), value); +} __glsl_extension(GL_KHR_shader_subgroup_basic) __spirv_version(1.3) @@ -3184,8 +3280,11 @@ uint WaveGetLaneIndex(); __glsl_extension(GL_KHR_shader_subgroup_basic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupElect()") -__target_intrinsic(cuda, "_waveIsFirstLane()") -bool WaveIsFirstLane(); +__target_intrinsic(hlsl) +bool WaveIsFirstLane() +{ + return WaveMaskIsFirstLane(WaveGetActiveMask()); +} // Prefix @@ -3193,49 +3292,83 @@ __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveMul($0)") -__target_intrinsic(cuda, 
"_wavePrefixProduct(_getActiveMask(), $0)") -T WavePrefixProduct(T expr); +__target_intrinsic(hlsl) +T WavePrefixProduct(T expr) +{ + return WaveMaskPrefixProduct(WaveGetActiveMask(), expr); +} + + __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveMul($0)") -__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)") -vector<T,N> WavePrefixProduct(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WavePrefixProduct(vector<T,N> expr) +{ + return WaveMaskPrefixProduct(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T, N, M> WavePrefixProduct(matrix<T, N, M> expr) +{ + return WaveMaskPrefixProduct(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAdd($0)") -__target_intrinsic(cuda, "_wavePrefixSum(_getActiveMask(), $0)") -T WavePrefixSum(T expr); +__target_intrinsic(hlsl) +T WavePrefixSum(T expr) +{ + return WaveMaskPrefixSum(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAdd($0)") -__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)") -vector<T,N> WavePrefixSum(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WavePrefixSum(vector<T,N> expr) +{ + return WaveMaskPrefixSum(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr); 
+__target_intrinsic(hlsl) +matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr) +{ + return WaveMaskPrefixSum(WaveGetActiveMask(), expr); +} __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirst(_getActiveMask(), $0)") -T WaveReadLaneFirst(T expr); +__target_intrinsic(hlsl) +T WaveReadLaneFirst(T expr) +{ + return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)") -vector<T,N> WaveReadLaneFirst(vector<T,N> expr); +__target_intrinsic(hlsl) +vector<T,N> WaveReadLaneFirst(vector<T,N> expr) +{ + return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr); +} + __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)") -matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr) +{ + return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr); +} // NOTE! 
WaveBroadcastLaneAt is *NOT* standard HLSL // It is provided as access to subgroupBroadcast which can only take a @@ -3247,20 +3380,29 @@ __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -T WaveBroadcastLaneAt(T value, constexpr int lane); +T WaveBroadcastLaneAt(T value, constexpr int lane) +{ + return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane); +vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane) +{ + return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -matrix<T,N,M> WaveBroadcastLaneAt(matrix<T,N,M> value, constexpr int lane); +matrix<T, N, M> WaveBroadcastLaneAt(matrix<T, N, M> value, constexpr int lane) +{ + return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane); +} // TODO(JS): If it can be determines that the `laneId` is constExpr, then subgroupBroadcast // could be used on GLSL. 
For now we just use subgroupShuffle @@ -3268,17 +3410,29 @@ __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_shuffle) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") -T WaveReadLaneAt(T value, int lane); +__target_intrinsic(hlsl) +T WaveReadLaneAt(T value, int lane) +{ + return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int> __spirv_version(1.3) __glsl_extension(GL_KHR_shader_subgroup_shuffle) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") -vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane); +__target_intrinsic(hlsl) +vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane) +{ + return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") -matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane); +__target_intrinsic(hlsl) +matrix<T, N, M> WaveReadLaneAt(matrix<T, N, M> value, int lane) +{ + return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane); +} // NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! 
It will map to WaveReadLaneAt on HLSL // which means it will only work on hardware which allows arbitrary laneIds which is not true @@ -3287,27 +3441,37 @@ __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_shuffle) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -T WaveShuffle(T value, int lane); +T WaveShuffle(T value, int lane) +{ + return WaveMaskShuffle(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_shuffle) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupShuffle($0, $1)") -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -vector<T,N> WaveShuffle(vector<T,N> value, int lane); +vector<T,N> WaveShuffle(vector<T,N> value, int lane) +{ + return WaveMaskShuffle(WaveGetActiveMask(), value, lane); +} + __generic<T : __BuiltinType, let N : int, let M : int> -__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)") __target_intrinsic(hlsl, "WaveReadLaneAt") -matrix<T,N,M> WaveShuffle(matrix<T,N,M> value, int lane); +matrix<T, N, M> WaveShuffle(matrix<T, N, M> value, int lane) +{ + return WaveMaskShuffle(WaveGetActiveMask(), value, lane); +} __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))") -__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())") -uint WavePrefixCountBits(bool value); - +__target_intrinsic(hlsl) +uint WavePrefixCountBits(bool value) +{ + return WaveMaskPrefixCountBits(WaveGetActiveMask(), value); +} __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) @@ -3327,19 +3491,24 @@ uint4 WaveGetActiveMulti(); __generic<T : __BuiltinType> __target_intrinsic(hlsl) -__cuda_sm_version(7.0) 
-__target_intrinsic(cuda, "_waveMatchScalar(_getActiveMask(), $0)") -uint4 WaveMatch(T value); +uint4 WaveMatch(T value) +{ + return WaveMaskMatch(WaveGetActiveMask(), value); +} + __generic<T : __BuiltinType, let N : int> __target_intrinsic(hlsl) -__cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)") -uint4 WaveMatch(vector<T,N> value); +uint4 WaveMatch(vector<T,N> value) +{ + return WaveMaskMatch(WaveGetActiveMask(), value); +} + __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(hlsl) -__cuda_sm_version(7.0) -__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)") -uint4 WaveMatch(matrix<T,N,M> value); +uint4 WaveMatch(matrix<T,N,M> value) +{ + return WaveMaskMatch(WaveGetActiveMask(), value); +} __target_intrinsic(hlsl) __target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())") @@ -3352,6 +3521,7 @@ __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAnd($0)") __target_intrinsic(cuda, "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitAnd(T expr, uint4 mask); + __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -3359,6 +3529,7 @@ __target_intrinsic(glsl, "subgroupExclusiveAnd($0)") __target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)") @@ -3371,6 +3542,7 @@ __spirv_version(1.3) //__target_intrinsic(glsl, "subgroupExclusiveOr($0)") __target_intrinsic(cuda, "_wavePrefixOr(, _getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitOr(T expr, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) 
__glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -3378,6 +3550,7 @@ __spirv_version(1.3) //__target_intrinsic(glsl, "subgroupExclusiveOr($0)") __target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)") @@ -3390,6 +3563,7 @@ __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") __target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixBitXor(T expr, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -3397,6 +3571,7 @@ __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") __target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)") @@ -3406,10 +3581,12 @@ __generic<T : __BuiltinArithmeticType> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixProduct(T value, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)") @@ -3419,10 +3596,12 @@ __generic<T : __BuiltinArithmeticType> __target_intrinsic(hlsl) __target_intrinsic(cuda, 
"_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)") T WaveMultiPrefixSum(T value, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )") vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask); + __generic<T : __BuiltinArithmeticType, let N : int, let M : int> __target_intrinsic(hlsl) __target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)") |
