diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-08-11 18:19:11 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-11 18:19:11 -0400 |
| commit | ec095a3a736f88f095fc947659da1793c8ccc54a (patch) | |
| tree | 4a4466c4fd49e7eef5f1e69480fd1185ba5ec1dd /source | |
| parent | cc64c619baf9ca5cdeda3f2ec83f613e753a5934 (diff) | |
Bugfix: WaveActiveCountBits on glsl output. (#1488)
* Fix WaveActiveCountBits on glsl output.
* Fix the glslang warning `could not be inlined because the return instruction is not at the end of the function. This could be fixed by running merge-return before inlining.` The warning was emitted because CreateMergeReturnPass was not included in the default optimization level, and InlineExhaustivePass assumes it has already run.
* Keep WaveActiveCountBits using the default WaveMask implementation.
* Fix WaveCountBits calculation.
Use WaveActiveBallot instead of _WaveActiveBallot.
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang-glslang/slang-glslang.cpp | 2 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 27 |
2 files changed, 23 insertions, 6 deletions
diff --git a/source/slang-glslang/slang-glslang.cpp b/source/slang-glslang/slang-glslang.cpp index 1c756cb6c..80087997c 100644 --- a/source/slang-glslang/slang-glslang.cpp +++ b/source/slang-glslang/slang-glslang.cpp @@ -168,6 +168,8 @@ static void glslang_optimizeSPIRV(std::vector<unsigned int>& spirv, spv_target_e break; case SLANG_OPTIMIZATION_LEVEL_DEFAULT: // Use a minimal set of performance settings + // If we run CreateInlineExhaustivePass, We need to run CreateMergeReturnPass first. + optimizer.RegisterPass(spvtools::CreateMergeReturnPass()); optimizer.RegisterPass(spvtools::CreateInlineExhaustivePass()); optimizer.RegisterPass(spvtools::CreateAggressiveDCEPass()); optimizer.RegisterPass(spvtools::CreatePrivateToLocalPass()); diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 01fb17851..62a548555 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2569,11 +2569,12 @@ __target_intrinsic(hlsl, "WaveActiveBallot($1)") WaveMask WaveMaskBallot(WaveMask mask, bool condition); __glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "bitCount(subgroupBallot($1))") __target_intrinsic(cuda, "__popc(__ballot_sync($0, $1))") __target_intrinsic(hlsl, "WaveActiveCountBits($1)") -WaveMask WaveMaskCountBits(WaveMask mask, bool value); +uint WaveMaskCountBits(WaveMask mask, bool value) +{ + return _WaveCountBits(WaveActiveBallot(value)); +} // Waits until all warp lanes named in mask have executed a WaveMaskSharedSync (with the same mask) // before resuming execution. 
Guarantees memory ordering in shared memory among threads participating @@ -3262,9 +3263,6 @@ uint4 WaveActiveBallot(bool condition) return WaveMaskBallot(WaveGetActiveMask(), condition); } -__glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "bitCount(subgroupBallot($0))") __target_intrinsic(hlsl) uint WaveActiveCountBits(bool value) { @@ -3292,6 +3290,23 @@ bool WaveIsFirstLane() return WaveMaskIsFirstLane(WaveGetActiveMask()); } +// It's useful to have a wave uint4 version of countbits, because some wave functions return uint4. +// This implementation tries to limit the amount of work required by the actual lane count. +uint _WaveCountBits(uint4 value) +{ + // Assume since WaveGetLaneCount should be known at compile time, the branches will hopefully boil away + const uint waveLaneCount = WaveGetLaneCount(); + switch ((waveLaneCount - 1) / 32) + { + default: + case 0: return countbits(value.x); + case 1: return countbits(value.x) + countbits(value.y); + case 2: return countbits(value.x) + countbits(value.y) + countbits(value.z); + case 3: return countbits(value.x) + countbits(value.y) + countbits(value.z) + countbits(value.w); + } +} + + // Prefix __generic<T : __BuiltinArithmeticType> |
