summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTim Foley <tfoleyNV@users.noreply.github.com>2020-05-04 09:06:55 -0700
committerGitHub <noreply@github.com>2020-05-04 09:06:55 -0700
commit5d3a737e75346b6ced204829a60be2837589e9ad (patch)
treef60964d5f9c4042b49416082e81c35eef11b42fa
parentc697fe50db0a74d76c6922ee24eb1f8d87def5f8 (diff)
Make stdlib WaveActive* call WaveMask* (#1336)
This change makes the various `WaveActive*()` functions have default implementations that call the corresponding `WaveMask*()` function, passing `WaveGetActiveMask()` as the mask. The new definitions will be used during CUDA code generation, which removes some of the duplication that was occurring in the `__target_intrinsic` modifiers. This change does *not* add logic to make the computation of `WaveGetActiveMask()` correct on CUDA, so these functions will still fail to provide the behavior that users need/expect. A future change will need to add logic to synthesize the value of `WaveGetActiveMask()` automatically.
-rw-r--r--source/slang/hlsl.meta.slang399
1 files changed, 289 insertions, 110 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index a84e88ca8..c0dba51e3 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2531,6 +2531,7 @@ __glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true).x")
__target_intrinsic(hlsl, "WaveActiveBallot(true).x")
+__target_intrinsic(cuda, "__activemask()") // Note: semantically incorrect, but best we can do for now.
WaveMask WaveGetActiveMask();
__glsl_extension(GL_KHR_shader_subgroup_basic)
@@ -2920,7 +2921,7 @@ matrix<T,N,M> WaveMaskReadLaneFirst(WaveMask mask, matrix<T,N,M> expr);
__generic<T : __BuiltinType>
__target_intrinsic(hlsl, "WaveMatch($1).x")
__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveMatchScalar($0, $1)")
+__target_intrinsic(cuda, "_waveMatchScalar($0, $1).x")
WaveMask WaveMaskMatch(WaveMask mask, T value);
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl, "WaveMatch($1).x")
@@ -3016,158 +3017,253 @@ __generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($0)")
-__target_intrinsic(cuda, "_waveAnd(_getActiveMask(), $0)")
-T WaveActiveBitAnd(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveBitAnd(T expr)
+{
+ return WaveMaskBitAnd(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($0)")
-__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveBitAnd(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T, N> WaveActiveBitAnd(vector<T, N> expr)
+{
+ return WaveMaskBitAnd(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveAndMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveBitAnd(matrix<T, N, M> expr)
+{
+ return WaveMaskBitAnd(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($0)")
-__target_intrinsic(cuda, "_waveOr(_getActiveMask(), $0)")
-T WaveActiveBitOr(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveBitOr(T expr)
+{
+ return WaveMaskBitOr(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($0)")
-__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveBitOr(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WaveActiveBitOr(vector<T,N> expr)
+{
+ return WaveMaskBitOr(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveOrMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveBitOr(matrix<T, N, M> expr)
+{
+ return WaveMaskBitOr(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($0)")
-__target_intrinsic(cuda, "_waveXor(_getActiveMask(), $0)")
-T WaveActiveBitXor(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveBitXor(T expr)
+{
+ return WaveMaskBitXor(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($0)")
-__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveBitXor(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WaveActiveBitXor(vector<T,N> expr)
+{
+ return WaveMaskBitXor(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveXorMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveBitXor(matrix<T, N, M> expr)
+{
+ return WaveMaskBitXor(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($0)")
-__target_intrinsic(cuda, "_waveMax(_getActiveMask(), $0)")
-T WaveActiveMax(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveMax(T expr)
+{
+ return WaveMaskMax(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($0)")
-__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveMax(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T, N> WaveActiveMax(vector<T, N> expr)
+{
+ return WaveMaskMax(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveMaxMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveMax(matrix<T, N, M> expr)
+{
+ return WaveMaskMax(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($0)")
-__target_intrinsic(cuda, "_waveMin(_getActiveMask(), $0)")
-T WaveActiveMin(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveMin(T expr)
+{
+ return WaveMaskMin(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($0)")
-__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveMin(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T, N> WaveActiveMin(vector<T, N> expr)
+{
+ return WaveMaskMin(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveMinMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveMin(matrix<T, N, M> expr)
+{
+ return WaveMaskMin(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($0)")
-__target_intrinsic(cuda, "_waveProduct(_getActiveMask(), $0)")
-T WaveActiveProduct(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveProduct(T expr)
+{
+ return WaveMaskProduct(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($0)")
-__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveProduct(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WaveActiveProduct(vector<T,N> expr)
+{
+ return WaveMaskProduct(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveProductMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveActiveProduct(matrix<T, N, M> expr)
+{
+ return WaveMaskProduct(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($0)")
-__target_intrinsic(cuda, "_waveSum(_getActiveMask(), $0)")
-T WaveActiveSum(T expr);
+__target_intrinsic(hlsl)
+T WaveActiveSum(T expr)
+{
+ return WaveMaskSum(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($0)")
-__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveActiveSum(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WaveActiveSum(vector<T,N> expr)
+{
+ return WaveMaskSum(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveSumMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr)
+{
+ return WaveMaskSum(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($0)")
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveAllEqual(_getActiveMask(), $0)")
-bool WaveActiveAllEqual(T value);
+__target_intrinsic(hlsl)
+bool WaveActiveAllEqual(T value)
+{
+ return WaveMaskAllEqual(WaveGetActiveMask(), value);
+}
+
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($0)")
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveAllEqualMultiple(_getActiveMask(), $0)")
-bool WaveActiveAllEqual(vector<T,N> value);
-__generic<T : __BuiltinType, let N : int, let M : int>
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveAllEqualMultiple(_getActiveMask(), $0)")
-bool WaveActiveAllEqual(matrix<T,N,M> value);
-
+__target_intrinsic(hlsl)
+bool WaveActiveAllEqual(vector<T,N> value)
+{
+ return WaveMaskAllEqual(WaveGetActiveMask(), value);
+}
+__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+bool WaveActiveAllEqual(matrix<T, N, M> value)
+{
+ return WaveMaskAllEqual(WaveGetActiveMask(), value);
+}
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAll($0)")
-__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
-bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(hlsl)
+bool WaveActiveAllTrue(bool condition)
+{
+ return WaveMaskAllTrue(WaveGetActiveMask(), condition);
+}
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAny($0)")
-__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
-bool WaveActiveAnyTrue(bool condition);
+__target_intrinsic(hlsl)
+bool WaveActiveAnyTrue(bool condition)
+{
+ return WaveMaskAnyTrue(WaveGetActiveMask(), condition);
+}
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot($0)")
-__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
-uint4 WaveActiveBallot(bool condition);
+__target_intrinsic(hlsl)
+uint4 WaveActiveBallot(bool condition)
+{
+ return WaveMaskBallot(WaveGetActiveMask(), condition);
+}
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "bitCount(subgroupBallot($0))")
-__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
-uint WaveActiveCountBits(bool value);
+__target_intrinsic(hlsl)
+uint WaveActiveCountBits(bool value)
+{
+ return WaveMaskCountBits(WaveGetActiveMask(), value);
+}
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
@@ -3184,8 +3280,11 @@ uint WaveGetLaneIndex();
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupElect()")
-__target_intrinsic(cuda, "_waveIsFirstLane()")
-bool WaveIsFirstLane();
+__target_intrinsic(hlsl)
+bool WaveIsFirstLane()
+{
+ return WaveMaskIsFirstLane(WaveGetActiveMask());
+}
// Prefix
@@ -3193,49 +3292,83 @@ __generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($0)")
-__target_intrinsic(cuda, "_wavePrefixProduct(_getActiveMask(), $0)")
-T WavePrefixProduct(T expr);
+__target_intrinsic(hlsl)
+T WavePrefixProduct(T expr)
+{
+ return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
+}
+
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($0)")
-__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)")
-vector<T,N> WavePrefixProduct(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WavePrefixProduct(vector<T,N> expr)
+{
+ return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WavePrefixProduct(matrix<T, N, M> expr)
+{
+ return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($0)")
-__target_intrinsic(cuda, "_wavePrefixSum(_getActiveMask(), $0)")
-T WavePrefixSum(T expr);
+__target_intrinsic(hlsl)
+T WavePrefixSum(T expr)
+{
+ return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($0)")
-__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)")
-vector<T,N> WavePrefixSum(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WavePrefixSum(vector<T,N> expr)
+{
+ return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr)
+{
+ return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
+}
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
-__target_intrinsic(cuda, "_waveReadFirst(_getActiveMask(), $0)")
-T WaveReadLaneFirst(T expr);
+__target_intrinsic(hlsl)
+T WaveReadLaneFirst(T expr)
+{
+ return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
-__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)")
-vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
+__target_intrinsic(hlsl)
+vector<T,N> WaveReadLaneFirst(vector<T,N> expr)
+{
+ return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
+}
+
__generic<T : __BuiltinType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveReadFirstMultiple(_getActiveMask(), $0)")
-matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr)
+{
+ return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
+}
// NOTE! WaveBroadcastLaneAt is *NOT* standard HLSL
// It is provided as access to subgroupBroadcast which can only take a
@@ -3247,20 +3380,29 @@ __generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-T WaveBroadcastLaneAt(T value, constexpr int lane);
+T WaveBroadcastLaneAt(T value, constexpr int lane)
+{
+ return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane);
+vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane)
+{
+ return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-matrix<T,N,M> WaveBroadcastLaneAt(matrix<T,N,M> value, constexpr int lane);
+matrix<T, N, M> WaveBroadcastLaneAt(matrix<T, N, M> value, constexpr int lane)
+{
+ return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
+}
// TODO(JS): If it can be determined that the `laneId` is constexpr, then subgroupBroadcast
// could be used on GLSL. For now we just use subgroupShuffle
@@ -3268,17 +3410,29 @@ __generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
-T WaveReadLaneAt(T value, int lane);
+__target_intrinsic(hlsl)
+T WaveReadLaneAt(T value, int lane)
+{
+ return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int>
__spirv_version(1.3)
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
-__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
-vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
+__target_intrinsic(hlsl)
+vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane)
+{
+ return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
-matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
+__target_intrinsic(hlsl)
+matrix<T, N, M> WaveReadLaneAt(matrix<T, N, M> value, int lane)
+{
+ return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
+}
// NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL
// which means it will only work on hardware which allows arbitrary laneIds which is not true
@@ -3287,27 +3441,37 @@ __generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-T WaveShuffle(T value, int lane);
+T WaveShuffle(T value, int lane)
+{
+ return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
-__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-vector<T,N> WaveShuffle(vector<T,N> value, int lane);
+vector<T,N> WaveShuffle(vector<T,N> value, int lane)
+{
+ return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
+}
+
__generic<T : __BuiltinType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
-matrix<T,N,M> WaveShuffle(matrix<T,N,M> value, int lane);
+matrix<T, N, M> WaveShuffle(matrix<T, N, M> value, int lane)
+{
+ return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
+}
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))")
-__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())")
-uint WavePrefixCountBits(bool value);
-
+__target_intrinsic(hlsl)
+uint WavePrefixCountBits(bool value)
+{
+ return WaveMaskPrefixCountBits(WaveGetActiveMask(), value);
+}
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
@@ -3327,19 +3491,24 @@ uint4 WaveGetActiveMulti();
__generic<T : __BuiltinType>
__target_intrinsic(hlsl)
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveMatchScalar(_getActiveMask(), $0)")
-uint4 WaveMatch(T value);
+uint4 WaveMatch(T value)
+{
+ return WaveMaskMatch(WaveGetActiveMask(), value);
+}
+
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl)
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)")
-uint4 WaveMatch(vector<T,N> value);
+uint4 WaveMatch(vector<T,N> value)
+{
+ return WaveMaskMatch(WaveGetActiveMask(), value);
+}
+
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
-__cuda_sm_version(7.0)
-__target_intrinsic(cuda, "_waveMatchMultiple(_getActiveMask(), $0)")
-uint4 WaveMatch(matrix<T,N,M> value);
+uint4 WaveMatch(matrix<T,N,M> value)
+{
+ return WaveMaskMatch(WaveGetActiveMask(), value);
+}
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())")
@@ -3352,6 +3521,7 @@ __spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitAnd(T expr, uint4 mask);
+
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
@@ -3359,6 +3529,7 @@ __target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
__generic<T : __BuiltinArithmeticType, let N : int>
vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
@@ -3371,6 +3542,7 @@ __spirv_version(1.3)
//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOr(_getMultiPrefixMask(($1).x), $0)") // Arguments are (mask, value), same shape as the _wavePrefixAnd/_wavePrefixXor intrinsics; a stray leading comma previously produced an ill-formed call.
T WaveMultiPrefixBitOr(T expr, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
@@ -3378,6 +3550,7 @@ __spirv_version(1.3)
//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
@@ -3390,6 +3563,7 @@ __spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitXor(T expr, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
@@ -3397,6 +3571,7 @@ __spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
@@ -3406,10 +3581,12 @@ __generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixProduct(T value, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
@@ -3419,10 +3596,12 @@ __generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixSum(T value, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )")
vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);
+
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)")