From c4441d804aaa97bad7ff01bef505491d30bbc046 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Mon, 20 Apr 2020 13:03:18 -0400 Subject: Feature/wave mask review (#1325) * Fix issues in wave-mask/wave.slang tests. WaveGetActiveMask -> WaveGetConvergedMask. Update target-compatibility.md * First pass at wave-intrinsics.md documentation. Write up around WaveMaskSharedSync. * Added more of the Wave intrinsics as WaveMask intrinsics. Improvements to documentation around wave-intrinsics. --- source/slang/hlsl.meta.slang | 250 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 246 insertions(+), 4 deletions(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index f096a125e..73cda7761 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2484,7 +2484,7 @@ matrix trunc(matrix x) typedef uint WaveMask; __target_intrinsic(cuda, "__activemask()") -WaveMask WaveGetActiveMask() { return 0xffffffff; } +WaveMask WaveGetConvergedMask() { return 0xffffffff; } __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) @@ -2534,19 +2534,42 @@ __glsl_extension(GL_KHR_shader_subgroup_basic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBarrier()") __target_intrinsic(hlsl, "AllMemoryBarrier()") -void WaveMaskSync(WaveMask mask); +void AllMemoryBarrierWithWaveMaskSync(WaveMask mask); // On GLSL, it appears we can't use subgroupMemoryBarrierShared, because it only implies a memory ordering, it does not // imply convergence. For subgroupBarrier we have from the docs.. // "The function subgroupBarrier() enforces that all active invocations within a subgroup must execute this function before any // are allowed to continue their execution" - +// TODO(JS): +// It's not entirely clear what to do here on HLSL. +// Reading the dxc wiki (https://github.com/Microsoft/DirectXShaderCompiler/wiki/Wave-Intrinsics), we have statements like: +// ... 
these intrinsics enable the elimination of barrier constructs when the scope of synchronization is within the width of the SIMD processor. +// Wave: A set of lanes executed simultaneously in the processor. No explicit barriers are required to guarantee that they execute in parallel. +// Which seems to imply that at least some memory barriers, like Shared, might not be needed. +// +// The barrier is left here though, because not only does the barrier make writes made before it across the wave visible to other lanes afterwards, it's +// also there to inform the compiler about the order in which reads and writes can take place. This might seem silly because the 'Active' lanes +// aspect of HLSL seems to make everything run in lock step - but that's not quite so: it only has to appear that way as far as the programmer's +// model is concerned - divergence could still happen. __target_intrinsic(cuda, "__syncwarp($0)") __glsl_extension(GL_KHR_shader_subgroup_basic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBarrier()") __target_intrinsic(hlsl, "GroupMemoryBarrier()") -void WaveMaskSharedSync(WaveMask mask); +void GroupMemoryBarrierWithWaveMaskSync(WaveMask mask); + + +__glsl_extension(GL_KHR_shader_subgroup_basic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBarrier()") +__target_intrinsic(hlsl, "AllMemoryBarrier()") +void AllMemoryBarrierWithWaveSync(); + +__glsl_extension(GL_KHR_shader_subgroup_basic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBarrier()") +__target_intrinsic(hlsl, "GroupMemoryBarrier()") +void GroupMemoryBarrierWithWaveSync(); // NOTE! 
WaveMaskBroadcastLaneAt is *NOT* standard HLSL // It is provided as access to subgroupBroadcast which can only take a @@ -2624,6 +2647,222 @@ __target_intrinsic(cuda, "__popc(__ballot_sync($0, $1) & _getLaneLtMask())") __target_intrinsic(hlsl, "WavePrefixCountBits($1)") uint WaveMaskPrefixCountBits(WaveMask mask, bool value); +// Across lane ops + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAnd($1)") +__target_intrinsic(cuda, "_waveAnd($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitAnd($1)") +T WaveMaskBitAnd(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAnd($1)") +__target_intrinsic(cuda, "_waveAndMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitAnd($1)") +vector WaveMaskBitAnd(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveAndMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitAnd($1)") +matrix WaveMaskBitAnd(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupOr($1)") +__target_intrinsic(cuda, "_waveOr($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitOr($1)") +T WaveMaskBitOr(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupOr($1)") +__target_intrinsic(cuda, "_waveOrMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitOr($1)") +vector WaveMaskBitOr(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveOrMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitOr($1)") +matrix WaveMaskBitOr(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupXor($1)") +__target_intrinsic(cuda, "_waveXor($0, $1)") +__target_intrinsic(hlsl, 
"WaveActiveBitXor($1)") +T WaveMaskBitXor(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupXor($1)") +__target_intrinsic(cuda, "_waveXorMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitXor($1)") +vector WaveMaskBitXor(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveXorMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveBitXor($1)") +matrix WaveMaskBitXor(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMax($1)") +__target_intrinsic(cuda, "_waveMax($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMax($1)") +T WaveMaskMax(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMax($1)") +__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMax($1)") +vector WaveMaskMax(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMax($1)") +matrix WaveMaskMax(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMin($1)") +__target_intrinsic(cuda, "_waveMin($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMin($1)") +T WaveMaskMin(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMin($1)") +__target_intrinsic(cuda, "_waveMinMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMin($1)") +vector WaveMaskMin(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveMinMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveMin($1)") +matrix WaveMaskMin(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) 
+__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMul($1)") +__target_intrinsic(cuda, "_waveProduct($0, $1)") +__target_intrinsic(hlsl, "WaveActiveProduct($1)") +T WaveMaskProduct(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupMul($1)") +__target_intrinsic(cuda, "_waveProductMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveProduct($1)") +vector WaveMaskProduct(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveProductMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveProduct($1)") +matrix WaveMaskProduct(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAdd($1)") +__target_intrinsic(cuda, "_waveSum($0, $1)") +__target_intrinsic(hlsl, "WaveActiveSum($1)") +T WaveMaskSum(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAdd($1)") +__target_intrinsic(cuda, "_waveSumMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveSum($1)") +vector WaveMaskSum(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveSumMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveSum($1)") +matrix WaveMaskSum(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_vote) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAllEqual($1)") +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveAllEqual($0, $1)") +__target_intrinsic(hlsl, "WaveActiveAllEqual($1)") +bool WaveMaskAllEqual(WaveMask mask, T value); +__generic +__glsl_extension(GL_KHR_shader_subgroup_vote) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupAllEqual($1)") +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveAllEqual($1)") +bool WaveMaskAllEqual(WaveMask mask, 
vector value); +__generic +__cuda_sm_version(7.0) +__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)") +__target_intrinsic(hlsl, "WaveActiveAllEqual($1)") +bool WaveMaskAllEqual(WaveMask mask, matrix value); + +// Prefix + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupExclusiveMul($1)") +__target_intrinsic(cuda, "_wavePrefixProduct($0, $1)") +__target_intrinsic(hlsl, "WavePrefixProduct($1)") +T WaveMaskPrefixProduct(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupExclusiveMul($1)") +__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)") +__target_intrinsic(hlsl, "WavePrefixProduct($1)") +vector WaveMaskPrefixProduct(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)") +__target_intrinsic(hlsl, "WavePrefixProduct($1)") +matrix WaveMaskPrefixProduct(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupExclusiveAdd($1)") +__target_intrinsic(cuda, "_wavePrefixSum($0, $1)") +__target_intrinsic(hlsl, "WavePrefixSum($1)") +T WaveMaskPrefixSum(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupExclusiveAdd($1)") +__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)") +__target_intrinsic(hlsl, "WavePrefixSum($1)") +vector WaveMaskPrefixSum(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)") +__target_intrinsic(hlsl, "WavePrefixSum($1)") +matrix WaveMaskPrefixSum(WaveMask mask, matrix expr); + +__generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcastFirst($1)") +__target_intrinsic(cuda, "_waveReadFirst($0, $1)") +T 
WaveMaskReadLaneFirst(WaveMask mask, T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcastFirst($1)") +__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)") +vector WaveMaskReadLaneFirst(WaveMask mask, vector expr); +__generic +__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)") +matrix WaveMaskReadLaneFirst(WaveMask mask, matrix expr); + + + + // Shader model 6.0 stuff // Information for GLSL wave/subgroup support @@ -2645,6 +2884,7 @@ __generic T QuadReadAcrossDiagonal(T localValue); __generic vector QuadReadAcrossDiagonal(vector localValue); __generic matrix QuadReadAcrossDiagonal(matrix localValue); + __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -2820,6 +3060,8 @@ __target_intrinsic(glsl, "subgroupElect()") __target_intrinsic(cuda, "_waveIsFirstLane()") bool WaveIsFirstLane(); +// Prefix + __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) -- cgit v1.2.3