diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-08-11 18:19:11 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-11 18:19:11 -0400 |
| commit | ec095a3a736f88f095fc947659da1793c8ccc54a (patch) | |
| tree | 4a4466c4fd49e7eef5f1e69480fd1185ba5ec1dd /source/slang | |
| parent | cc64c619baf9ca5cdeda3f2ec83f613e753a5934 (diff) | |
Bugfix: WaveActiveCountBits on glsl output. (#1488)
* Fix WaveActiveCountBits on glsl output.
* Fix the glslang warning `could not be inlined because the return instruction is not at the end of the function. This could be fixed by running merge-return before inlining.` It occurred because the default optimization did not include CreateMergeReturnPass, which InlineExhaustivePass assumes has already run.
* Keep WaveActiveCountBits using the default WaveMask implementation.
* Fix WaveCountBits calculation.
Use WaveActiveBallot instead of the _WaveActiveBallot.
Diffstat (limited to 'source/slang')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 27 |
1 files changed, 21 insertions, 6 deletions
```diff
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 01fb17851..62a548555 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2569,11 +2569,12 @@ __target_intrinsic(hlsl, "WaveActiveBallot($1)")
 WaveMask WaveMaskBallot(WaveMask mask, bool condition);
 
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
-__spirv_version(1.3)
-__target_intrinsic(glsl, "bitCount(subgroupBallot($1))")
 __target_intrinsic(cuda, "__popc(__ballot_sync($0, $1))")
 __target_intrinsic(hlsl, "WaveActiveCountBits($1)")
-WaveMask WaveMaskCountBits(WaveMask mask, bool value);
+uint WaveMaskCountBits(WaveMask mask, bool value)
+{
+    return _WaveCountBits(WaveActiveBallot(value));
+}
 
 // Waits until all warp lanes named in mask have executed a WaveMaskSharedSync (with the same mask)
 // before resuming execution. Guarantees memory ordering in shared memory among threads participating
@@ -3262,9 +3263,6 @@ uint4 WaveActiveBallot(bool condition)
     return WaveMaskBallot(WaveGetActiveMask(), condition);
 }
 
-__glsl_extension(GL_KHR_shader_subgroup_ballot)
-__spirv_version(1.3)
-__target_intrinsic(glsl, "bitCount(subgroupBallot($0))")
 __target_intrinsic(hlsl)
 uint WaveActiveCountBits(bool value)
 {
@@ -3292,6 +3290,23 @@ bool WaveIsFirstLane()
     return WaveMaskIsFirstLane(WaveGetActiveMask());
 }
 
+// It's useful to have a wave uint4 version of countbits, because some wave functions return uint4.
+// This implementation tries to limit the amount of work required by the actual lane count.
+uint _WaveCountBits(uint4 value)
+{
+    // Assume since WaveGetLaneCount should be known at compile time, the branches will hopefully boil away
+    const uint waveLaneCount = WaveGetLaneCount();
+    switch ((waveLaneCount - 1) / 32)
+    {
+        default:
+        case 0: return countbits(value.x);
+        case 1: return countbits(value.x) + countbits(value.y);
+        case 2: return countbits(value.x) + countbits(value.y) + countbits(value.z);
+        case 3: return countbits(value.x) + countbits(value.y) + countbits(value.z) + countbits(value.w);
+    }
+}
+
+
 // Prefix
 
 __generic<T : __BuiltinArithmeticType>
```
