From 7de5f63225cde20401da7c1c69b00d0b7dc8d89f Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Tue, 21 Apr 2020 09:32:21 -0400 Subject: WaveMask remaining intrinsics and tests (#1327) * Fix issues in wave-mask/wave.slang tests. WaveGetActiveMask -> WaveGetConvergedMask. Update target-compatibility.md * First pass at wave-intrinsics.md documentation. Write up around WaveMaskSharedSync. * Added more of the Wave intrinsics as WaveMask intrinsics. Improvements to documentation around wave-intrinsics. * Add the Wave intrinsics for SM6.5 for WaveMask Expand WaveMask intrinsics Improve WaveMask documentation * Added WaveMaskIsFirstLane. Co-authored-by: Tim Foley --- source/slang/hlsl.meta.slang | 88 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index e9da539bf..55c66ffc0 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2485,13 +2485,20 @@ matrix trunc(matrix x) MATRIX_MAP_UNARY(T, N, M, trunc, x); } -// Slang Specific Mask Wave Intrinsics +// Slang Specific 'Mask' Wave Intrinsics typedef uint WaveMask; __target_intrinsic(cuda, "__activemask()") WaveMask WaveGetConvergedMask() { return 0xffffffff; } +__glsl_extension(GL_KHR_shader_subgroup_basic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupElect()") +__target_intrinsic(cuda, "(($0 & -$0) == (WarpMask(1) << _getLaneId()))") +__target_intrinsic(hlsl, "WaveIsFirstLane()") +bool WaveMaskIsFirstLane(WaveMask mask); + __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAll($1)") @@ -2564,7 +2571,6 @@ __target_intrinsic(glsl, "subgroupBarrier()") __target_intrinsic(hlsl, "GroupMemoryBarrier()") void GroupMemoryBarrierWithWaveMaskSync(WaveMask mask); - __glsl_extension(GL_KHR_shader_subgroup_basic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBarrier()") @@ -2620,7 +2626,7 @@ 
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") __target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") vector WaveMaskReadLaneAt(WaveMask mask, vector value, int lane); __generic -__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)") +__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)") __target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)") matrix WaveMaskReadLaneAt(WaveMask mask, matrix value, int lane); @@ -2689,7 +2695,7 @@ __target_intrinsic(cuda, "_waveOrMultiple($0, $1)") __target_intrinsic(hlsl, "WaveActiveBitOr($1)") vector WaveMaskBitOr(WaveMask mask, vector expr); __generic -__target_intrinsic(cuda, "_waveOrMultiple(_$0, $1)") +__target_intrinsic(cuda, "_waveOrMultiple($0, $1)") __target_intrinsic(hlsl, "WaveActiveBitOr($1)") matrix WaveMaskBitOr(WaveMask mask, matrix expr); @@ -2866,8 +2872,82 @@ __generic __target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)") matrix WaveMaskReadLaneFirst(WaveMask mask, matrix expr); +// WaveMask SM6.5 like intrinsics +// TODO(JS): On HLSL it only works for 32 bits or less +__generic +__target_intrinsic(hlsl, "WaveMatch($1).x") +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveMatchScalar($0, $1)") +WaveMask WaveMaskMatch(WaveMask mask, T value); +__generic +__target_intrinsic(hlsl, "WaveMatch($1).x") +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)") +WaveMask WaveMaskMatch(WaveMask mask, vector value); +__generic +__target_intrinsic(hlsl, "WaveMatch($1).x") +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)") +WaveMask WaveMaskMatch(WaveMask mask, matrix value); + +__generic +__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))") +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +//__target_intrinsic(glsl, "subgroupExclusiveAnd($1)") +__target_intrinsic(cuda, "_wavePrefixAnd($0, $1)") +T WaveMaskPrefixBitAnd(WaveMask mask, T expr); +__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 
0, 0, 0))")
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupExclusiveAnd($1)")
+__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, $1)")
+__generic
+vector WaveMaskPrefixBitAnd(WaveMask mask, vector expr);
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))")
+__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, $1)")
+matrix WaveMaskPrefixBitAnd(WaveMask mask, matrix expr);
+
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__spirv_version(1.3)
+//__target_intrinsic(glsl, "subgroupExclusiveOr($1)")
+__target_intrinsic(cuda, "_wavePrefixOr($0, $1)")
+T WaveMaskPrefixBitOr(WaveMask mask, T expr);
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__spirv_version(1.3)
+//__target_intrinsic(glsl, "subgroupExclusiveOr($1)")
+__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)")
+vector WaveMaskPrefixBitOr(WaveMask mask, vector expr);
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
+__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)")
+matrix WaveMaskPrefixBitOr(WaveMask mask, matrix expr);
+
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupExclusiveXor($1)")
+__target_intrinsic(cuda, "_wavePrefixXor($0, $1)")
+T WaveMaskPrefixBitXor(WaveMask mask, T expr);
+__generic
+__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
+__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupExclusiveXor($1)")
+__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)")
+vector WaveMaskPrefixBitXor(WaveMask mask, vector expr);
+__generic +__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))") +__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)") +matrix WaveMaskPrefixBitXor(WaveMask mask, matrix expr); // Shader model 6.0 stuff -- cgit v1.2.3