diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-03-16 15:01:21 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-16 15:01:21 -0400 |
| commit | 76b9ff6e65b4bd2be04a5bab0eb1464455c4b3ff (patch) | |
| tree | a4aa6e8560984cbcbfd2c33df666b144b93418e8 /source | |
| parent | 256a20a163ef6ee93a817472adcb24c076b0c0dc (diff) | |
CUDA support of MultiPrefix Wave intrinsics. (#1275)
Support for cs_6_5 and cs_6_4 in profile
Added wave-multi-prefix.slang test
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 103 | ||||
| -rw-r--r-- | source/slang/slang-profile-defs.h | 2 |
2 files changed, 75 insertions, 30 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 30c86b3eb..6ac1038f8 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2628,9 +2628,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(cuda, "_waveAllEqualMultiple($0)") bool WaveActiveAllEqual(matrix<T,N,M> value); -__generic<T : __BuiltinType> uint4 WaveMatch(T value); -__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value); -__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value); + __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) @@ -2650,11 +2648,9 @@ __target_intrinsic(glsl, "subgroupBallot($0)") __target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") uint4 WaveActiveBallot(bool condition); -// TODO(JS): -// subgroupBallotBitCount seems to take a uint4 parameter. __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBallotBitCount($0)") +__target_intrinsic(glsl, "bitCount(subgroupBallot($0))") __target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))") uint WaveActiveCountBits(bool value); @@ -2751,58 +2747,105 @@ uint WavePrefixCountBits(bool value); // https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md // TODO(JS): Looks like they need a mask parameter +__generic<T : __BuiltinType> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_waveMatchScalar($0)") +uint4 WaveMatch(T value); +__generic<T : __BuiltinType, let N : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_waveMatchMultiple($0)") +uint4 WaveMatch(vector<T,N> value); +__generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_waveMatchMultiple($0)") +uint4 WaveMatch(matrix<T,N,M> value); + +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())") +uint 
WaveMultiPrefixCountBits(bool value, uint4 mask); + __generic<T : __BuiltinArithmeticType> +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveAnd($0)") -__target_intrinsic(cuda, "_wavePrefixAnd($0)") -T WaveMultiPrefixBitAnd(T expr); +//__target_intrinsic(glsl, "subgroupExclusiveAnd($0)") +__target_intrinsic(cuda, "_wavePrefixAnd($0, _getMultiPrefixMask(($1).x))") +T WaveMultiPrefixBitAnd(T expr, uint4 mask); +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAnd($0)") +__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))") __generic<T : __BuiltinArithmeticType, let N : int> -vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr); +vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, _getMultiPrefixMask(($1).x))") +matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask); __generic<T : __BuiltinArithmeticType> +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -__target_intrinsic(cuda, "_wavePrefixOr($0)") -T WaveMultiPrefixBitOr(T expr); +//__target_intrinsic(glsl, "subgroupExclusiveOr($0)") +__target_intrinsic(cuda, "_wavePrefixOr($0, _getMultiPrefixMask(($1).x))") +T WaveMultiPrefixBitOr(T expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -__target_intrinsic(glsl, "subgroupExclusiveOr($0)") -vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr); +//__target_intrinsic(glsl, "subgroupExclusiveOr($0)") 
+__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))") +vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr); +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, _getMultiPrefixMask(($1).x))") +matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask); __generic<T : __BuiltinArithmeticType> +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") -__target_intrinsic(cuda, "_wavePrefixXor($0)") -T WaveMultiPrefixBitXor(T expr); +__target_intrinsic(cuda, "_wavePrefixXor($0, _getMultiPrefixMask(($1).x))") +T WaveMultiPrefixBitXor(T expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int> +__target_intrinsic(hlsl) __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") -vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr); +__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))") +vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> -matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr); - - -uint WaveMultiPrefixCountBits(bool value, uint4 mask); +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, _getMultiPrefixMask(($1).x))") +matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask); -__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask); -__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask); -__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask); +__generic<T : 
__BuiltinArithmeticType> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixProduct($0, _getMultiPrefixMask(($1).x))") +T WaveMultiPrefixProduct(T value, uint4 mask); +__generic<T : __BuiltinArithmeticType, let N : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))") +vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask); +__generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, _getMultiPrefixMask(($1).x))") +matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask); -__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, uint4 mask); -__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask); -__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask); +__generic<T : __BuiltinArithmeticType> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixSum($0, _getMultiPrefixMask(($1).x))") +T WaveMultiPrefixSum(T value, uint4 mask); +__generic<T : __BuiltinArithmeticType, let N : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))") +vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask); +__generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, _getMultiPrefixMask(($1).x))") +matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask); // `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points typedef Texture2D texture2D; diff --git a/source/slang/slang-profile-defs.h b/source/slang/slang-profile-defs.h index fc2722160..7066b5942 100644 --- a/source/slang/slang-profile-defs.h +++ b/source/slang/slang-profile-defs.h @@ -129,6 
+129,8 @@ PROFILE(DX_Compute_6_0, cs_6_0, Compute, DX_6_0) PROFILE(DX_Compute_6_1, cs_6_1, Compute, DX_6_1) PROFILE(DX_Compute_6_2, cs_6_2, Compute, DX_6_2) PROFILE(DX_Compute_6_3, cs_6_3, Compute, DX_6_3) +PROFILE(DX_Compute_6_4, cs_6_4, Compute, DX_6_4) +PROFILE(DX_Compute_6_5, cs_6_5, Compute, DX_6_5) PROFILE(DX_Domain_5_0, ds_5_0, Domain, DX_5_0) PROFILE(DX_Domain_5_1, ds_5_1, Domain, DX_5_1) |
