summaryrefslogtreecommitdiff
path: root/source/slang/hlsl.meta.slang
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-03-02 17:22:03 -0500
committerGitHub <noreply@github.com>2020-03-02 17:22:03 -0500
commitdbd8e8dc0847338a2a93d35385f48b5ce5671dd6 (patch)
tree415b0fed637de144bf7385269efe0d8e0781ed98 /source/slang/hlsl.meta.slang
parent8899c149b05def1cce626ea649012c4c974861de (diff)
Feature/glsl wave intrinsic (#1253)
* Test for some wave intrinsics. More wave intrinsic support on CUDA. * Use shfl_xor_sync. * Improvements around wave intrinsics. Fix built-in integer types so they belong to __BuiltinIntegerType. * Improvements and fixes around Wave intrinsics. * Added WaveIsFirstLane test. No longer use __wavemask_lt, as it appears not to be available as an intrinsic. * Small fixes to CUDA prelude. * Add wave-active-product test. Handle the special case for arbitrary sums. * Used macro to implement CUDA wave intrinsics. * First pass at GLSL wave intrinsics. Doesn't work in practice because it requires a mechanism to set the SPIR-V version. Replace use of _lanemask_lt() for CUDA.
Diffstat (limited to 'source/slang/hlsl.meta.slang')
-rw-r--r--source/slang/hlsl.meta.slang50
1 files changed, 48 insertions, 2 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index edb678ad6..417f4594d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1379,6 +1379,9 @@ __generic<T : __BuiltinFloatingPointType, let N : int, let M : int> matrix<T,N,M
// Shader model 6.0 stuff
+// Information for GLSL wave/subgroup support
+// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
+
// QuadReadLaneAt overloads for scalar, vector and matrix builtin types. Each takes a
// source value plus a quad lane ID; no target intrinsics are attached in this hunk.
__generic<T : __BuiltinType> T QuadReadLaneAt(T sourceValue, uint quadLaneID);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID);
@@ -1396,48 +1399,64 @@ __generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vec
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);
// WaveActiveBitAnd (integer types only). The scalar overload maps to GLSL subgroupAnd
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveAnd applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinIntegerType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupAnd($0)")
__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)")
T WaveActiveBitAnd(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);
// WaveActiveBitOr (integer types only). The scalar overload maps to GLSL subgroupOr
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveOr applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinIntegerType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupOr($0)")
__target_intrinsic(cuda, "_waveOr(__activemask(), $0)")
T WaveActiveBitOr(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);
// WaveActiveBitXor (integer types only). The scalar overload maps to GLSL subgroupXor
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveXor applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinIntegerType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupXor($0)")
__target_intrinsic(cuda, "_waveXor(__activemask(), $0)")
T WaveActiveBitXor(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);
// WaveActiveMax (arithmetic types). The scalar overload maps to GLSL subgroupMax
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveMax applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupMax($0)")
__target_intrinsic(cuda, "_waveMax(__activemask(), $0)")
T WaveActiveMax(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);
// WaveActiveMin (arithmetic types). The scalar overload maps to GLSL subgroupMin
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveMin applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupMin($0)")
__target_intrinsic(cuda, "_waveMin(__activemask(), $0)")
T WaveActiveMin(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);
// WaveActiveProduct (arithmetic types). The scalar overload maps to GLSL subgroupMul
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveProduct applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupMul($0)")
__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)")
T WaveActiveProduct(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);
// WaveActiveSum (arithmetic types). The scalar overload maps to GLSL subgroupAdd
// (needs GL_KHR_shader_subgroup_arithmetic) and to the CUDA helper _waveSum applied
// over __activemask(). The vector and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupAdd($0)")
__target_intrinsic(cuda, "_waveSum(__activemask(), $0)")
T WaveActiveSum(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);
// WaveActiveAllEqual. The scalar overload returns a single bool and maps to GLSL
// subgroupAllEqual (needs GL_KHR_shader_subgroup_vote) and to the CUDA helper
// _waveAllEqual applied over __activemask(). The vector overload returns a
// vector<bool,N> and carries no target intrinsic here.
__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_vote)
+__target_intrinsic(glsl, "subgroupAllEqual($0)")
__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)")
bool WaveActiveAllEqual(T value);
__generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
@@ -1452,24 +1471,40 @@ __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,
// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask()
// seems to be appropriate.
// WaveActiveAllTrue maps to GLSL subgroupAll (needs GL_KHR_shader_subgroup_vote).
// On CUDA the result of __all_sync over __activemask() is compared against 0 so the
// expression has boolean type.
+__glsl_extension(GL_KHR_shader_subgroup_vote)
+__target_intrinsic(glsl, "subgroupAll($0)")
__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
bool WaveActiveAllTrue(bool condition);
+
// WaveActiveAnyTrue maps to GLSL subgroupAny (needs GL_KHR_shader_subgroup_vote).
// On CUDA the result of __any_sync over __activemask() is compared against 0 so the
// expression has boolean type.
+__glsl_extension(GL_KHR_shader_subgroup_vote)
+__target_intrinsic(glsl, "subgroupAny($0)")
__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
bool WaveActiveAnyTrue(bool condition);
// WaveActiveBallot maps to GLSL subgroupBallot (needs GL_KHR_shader_subgroup_ballot).
// On CUDA the single ballot word from __ballot_sync goes in the first component of the
// uint4 and the remaining three components are set to 0.
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
+__target_intrinsic(glsl, "subgroupBallot($0)")
__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
uint4 WaveActiveBallot(bool condition);
+// WaveActiveCountBits counts the lanes for which `value` is true.
+// GLSL's subgroupBallotBitCount takes a uvec4 ballot rather than a bool, so the
+// condition is first converted to a ballot with subgroupBallot. The CUDA path does
+// the equivalent: it takes the ballot over __activemask() and pop-counts it.
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
+__target_intrinsic(glsl, "subgroupBallotBitCount(subgroupBallot($0))")
__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
uint WaveActiveCountBits(bool value);
// WaveGetLaneCount maps to the GLSL built-in gl_SubgroupSize (needs
// GL_KHR_shader_subgroup_basic) and to warpSize on CUDA.
+__glsl_extension(GL_KHR_shader_subgroup_basic)
+__target_intrinsic(glsl, "gl_SubgroupSize")
__target_intrinsic(cuda, "(warpSize)")
uint WaveGetLaneCount();
// WaveGetLaneIndex maps to the GLSL built-in gl_SubgroupInvocationID (needs
// GL_KHR_shader_subgroup_basic) and to the CUDA helper _getLaneId().
+__glsl_extension(GL_KHR_shader_subgroup_basic)
+__target_intrinsic(glsl, "gl_SubgroupInvocationID")
__target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();
-// If there are no *active* lanes less than this one, we must be the lowest lane
// WaveIsFirstLane maps to GLSL subgroupElect() (needs GL_KHR_shader_subgroup_basic)
// and to the CUDA helper _waveIsFirstLane().
+__glsl_extension(GL_KHR_shader_subgroup_basic)
+__target_intrinsic(glsl, "subgroupElect()")
__target_intrinsic(cuda, "_waveIsFirstLane()")
bool WaveIsFirstLane();
@@ -1477,11 +1512,15 @@ bool WaveIsFirstLane();
// that would mean different lanes having a different mask, and they all have to have the same mask.
+// WavePrefixProduct (arithmetic types). The scalar overload maps to GLSL
+// subgroupExclusiveMul (needs GL_KHR_shader_subgroup_arithmetic). No CUDA intrinsic
+// is attached here, and the vector and matrix overloads carry no target intrinsic.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupExclusiveMul($0)")
T WavePrefixProduct(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);
+// WavePrefixSum (arithmetic types). The scalar overload maps to GLSL
+// subgroupExclusiveAdd (needs GL_KHR_shader_subgroup_arithmetic). No CUDA intrinsic
+// is attached here, and the vector and matrix overloads carry no target intrinsic.
__generic<T : __BuiltinArithmeticType>
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__target_intrinsic(glsl, "subgroupExclusiveAdd($0)")
T WavePrefixSum(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);
@@ -1499,7 +1538,10 @@ T WaveMultiPrefixBitXor(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);
-__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())")
+// WavePrefixCountBits counts the lanes below the current one for which `value` is true.
+// GLSL's subgroupBallotExclusiveBitCount takes a uvec4 ballot rather than a bool, so
+// the condition is first converted to a ballot with subgroupBallot. The CUDA path
+// masks the ballot with _getLaneLtMask() before pop-counting it.
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
+__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))")
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())")
uint WavePrefixCountBits(bool value);
// Variant of the prefix bit count that takes an explicit uint4 mask argument.
// No target intrinsic is attached to this declaration in this hunk.
uint WaveMultiPrefixCountBits(bool value, uint4 mask);
@@ -1513,12 +1555,16 @@ __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixS
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);
+// WaveReadLaneFirst. The scalar overload maps to GLSL subgroupBroadcastFirst (needs
+// GL_KHR_shader_subgroup_ballot) and to the CUDA helper _waveReadFirst. The vector
+// and matrix overloads carry no target intrinsic here.
__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
+__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
__target_intrinsic(cuda, "_waveReadFirst($0)")
T WaveReadLaneFirst(T expr);
__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
// WaveReadLaneAt. The scalar overload maps to GLSL subgroupBroadcast(value, lane)
// (needs GL_KHR_shader_subgroup_ballot) and to CUDA __shfl_sync.
// NOTE(review): the CUDA path passes SLANG_CUDA_WARP_MASK, whereas the other wave
// intrinsics in this hunk pass __activemask() - confirm the difference is intentional.
__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
+__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)")
T WaveReadLaneAt(T value, int lane);
__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);