summaryrefslogtreecommitdiffstats
path: root/source
diff options
context:
space:
mode:
author: jsmall-nvidia <jsmall@nvidia.com> 2020-03-16 15:01:21 -0400
committer: GitHub <noreply@github.com> 2020-03-16 15:01:21 -0400
commit: 76b9ff6e65b4bd2be04a5bab0eb1464455c4b3ff (patch)
tree: a4aa6e8560984cbcbfd2c33df666b144b93418e8 /source
parent: 256a20a163ef6ee93a817472adcb24c076b0c0dc (diff)
CUDA support of MultiPrefix Wave intrinsics. (#1275)
Support for cs_6_5 and cs_6_4 in profile. Added wave-multi-prefix.slang test.
Diffstat (limited to 'source')
-rw-r--r-- source/slang/hlsl.meta.slang | 103
-rw-r--r-- source/slang/slang-profile-defs.h | 2
2 files changed, 75 insertions, 30 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 30c86b3eb..6ac1038f8 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2628,9 +2628,7 @@ __generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
bool WaveActiveAllEqual(matrix<T,N,M> value);
-__generic<T : __BuiltinType> uint4 WaveMatch(T value);
-__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
-__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
+
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
@@ -2650,11 +2648,9 @@ __target_intrinsic(glsl, "subgroupBallot($0)")
__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
uint4 WaveActiveBallot(bool condition);
-// TODO(JS):
-// subgroupBallotBitCount seems to take a uint4 parameter.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
-__target_intrinsic(glsl, "subgroupBallotBitCount($0)")
+__target_intrinsic(glsl, "bitCount(subgroupBallot($0))")
__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
uint WaveActiveCountBits(bool value);
@@ -2751,58 +2747,105 @@ uint WavePrefixCountBits(bool value);
// https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md
// TODO(JS): Looks like they need a mask parameter
+__generic<T : __BuiltinType>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_waveMatchScalar($0)")
+uint4 WaveMatch(T value);
+__generic<T : __BuiltinType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_waveMatchMultiple($0)")
+uint4 WaveMatch(vector<T,N> value);
+__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_waveMatchMultiple($0)")
+uint4 WaveMatch(matrix<T,N,M> value);
+
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())")
+uint WaveMultiPrefixCountBits(bool value, uint4 mask);
+
__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
-__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
-__target_intrinsic(cuda, "_wavePrefixAnd($0)")
-T WaveMultiPrefixBitAnd(T expr);
+//__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
+__target_intrinsic(cuda, "_wavePrefixAnd($0, _getMultiPrefixMask(($1).x))")
+T WaveMultiPrefixBitAnd(T expr, uint4 mask);
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
+__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))")
__generic<T : __BuiltinArithmeticType, let N : int>
-vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr);
+vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))")
+matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask);
__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
-__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
-__target_intrinsic(cuda, "_wavePrefixOr($0)")
-T WaveMultiPrefixBitOr(T expr);
+//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
+__target_intrinsic(cuda, "_wavePrefixOr($0, _getMultiPrefixMask(($1).x))")
+T WaveMultiPrefixBitOr(T expr, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int>
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
-__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
-vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);
+//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
+__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))")
+vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))")
+matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask);
__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
-__target_intrinsic(cuda, "_wavePrefixXor($0)")
-T WaveMultiPrefixBitXor(T expr);
+__target_intrinsic(cuda, "_wavePrefixXor($0, _getMultiPrefixMask(($1).x))")
+T WaveMultiPrefixBitXor(T expr, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int>
+__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
-vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);
+__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))")
+vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);
-
-
-uint WaveMultiPrefixCountBits(bool value, uint4 mask);
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))")
+matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask);
-__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);
-__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask);
-__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixProduct($0, _getMultiPrefixMask(($1).x))")
+T WaveMultiPrefixProduct(T value, uint4 mask);
+__generic<T : __BuiltinArithmeticType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))")
+vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask);
+__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))")
+matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask);
-__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, uint4 mask);
-__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);
-__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixSum($0, _getMultiPrefixMask(($1).x))")
+T WaveMultiPrefixSum(T value, uint4 mask);
+__generic<T : __BuiltinArithmeticType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))")
+vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);
+__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))")
+matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);
// `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points
typedef Texture2D texture2D;
diff --git a/source/slang/slang-profile-defs.h b/source/slang/slang-profile-defs.h
index fc2722160..7066b5942 100644
--- a/source/slang/slang-profile-defs.h
+++ b/source/slang/slang-profile-defs.h
@@ -129,6 +129,8 @@ PROFILE(DX_Compute_6_0, cs_6_0, Compute, DX_6_0)
PROFILE(DX_Compute_6_1, cs_6_1, Compute, DX_6_1)
PROFILE(DX_Compute_6_2, cs_6_2, Compute, DX_6_2)
PROFILE(DX_Compute_6_3, cs_6_3, Compute, DX_6_3)
+PROFILE(DX_Compute_6_4, cs_6_4, Compute, DX_6_4)
+PROFILE(DX_Compute_6_5, cs_6_5, Compute, DX_6_5)
PROFILE(DX_Domain_5_0, ds_5_0, Domain, DX_5_0)
PROFILE(DX_Domain_5_1, ds_5_1, Domain, DX_5_1)