From a6deb5ed82cb8fc6b4f4c5c5fee264e09f97ff89 Mon Sep 17 00:00:00 2001 From: Yong He Date: Mon, 29 Sep 2025 17:45:08 -0700 Subject: Rewriting the lower-buffer-element-type pass to avoid unnecessary packing/unpacking. (#8526) Part of the effort to improve the performance of generated SPIRV code. The existing lower-buffer-element-type pass works by loading the entire buffer element content from memory, and translating it to the logical type stored in a local variable at the earliest reference of a buffer handle. This means that it can generate inefficient code that reads more than necessary. Consider this example: ``` struct BigStruct { bool values[1024]; } ConstantBuffer<BigStruct> cb; void test(BigStruct v) { if (v.values[0]) { printf("ok"); } } [numthreads(1,1,1)] void computeMain() { test(cb); } ``` In IR, the `computeMain` function before the lower-buffer-element-type pass is something like the following: ``` func test: %v = param : BigStruct %barr = fieldExtract(%v, "values") %element = elementExtract(%barr, 0) ... // uses %element func computeMain: %v = load(cb) call %test %v ``` The existing lower-buffer-element-type pass will rewrite the bool array in `BigStruct` into an `int` array so it is legal in SPIRV. However, it does so by inserting the translation on the first `load` of the constant buffer: ``` struct BigStruct_std430 { int values[1024]; } var cb : ConstantBuffer<BigStruct_std430>; func computeMain: %tmpVar : var<BigStruct> call %unpackStorage(%tmpVar, cb) %v : BigStruct = load %tmpVar call %test %v ``` This means that the entire array will be loaded and translated to int, before calling `test`, which only uses one element. It turns out that the downstream compiler isn't always able to optimize out this inefficient translation/copy. This PR completely rewrites the way buffer-element-type lowering is handled to avoid producing this inefficient code. 
It works in two parts: first we turn on the `transformParamsToConstRef` pass for the SPIRV target as well, so we will translate the `test` function to take the `v` parameter as `constref`. The second part is a redesigned buffer-element-type pass that defers the storage-type to logical-type translation until a value is actually used by a `load` instruction. In this example, after `transformParamsToConstRef`, the IR is: ``` func test: %v = param : ConstRef<BigStruct> %barr = fieldAddr(%v, "values") %elementPtr = elementAddr(%barr, 0) %element = load(%elementPtr) ... // uses %element func computeMain: call %test %cb ``` The new `buffer-element-type-lowering` pass will take this IR, insert the translation at the latest possible time across the entire call graph, and translate the IR into: ``` func test: %v = param : ConstRef<BigStruct> %barr = fieldAddr(%v, "values") %elementPtr : ptr<int> = elementAddr(%barr, 0) %element_int = load(%elementPtr) %element = cast(%element_int) : %bool ... // uses %element func computeMain: call %test %cb ``` In this new IR, there is no longer a load and conversion of the entire array. See the new comment in `slang-ir-lower-buffer-element-type.cpp` for more details of how the pass works. This PR also addresses many other issues surfaced by turning on the `transformParamsToConstRef` pass on the SPIRV backend. 
--------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> --- source/slang/slang-emit.cpp | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'source/slang/slang-emit.cpp') diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index e1689ccfc..f1cc6090d 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -97,6 +97,7 @@ #include "slang-ir-restructure.h" #include "slang-ir-sccp.h" #include "slang-ir-simplify-for-emit.h" +#include "slang-ir-specialize-address-space.h" #include "slang-ir-specialize-arrays.h" #include "slang-ir-specialize-buffer-load-arg.h" #include "slang-ir-specialize-matrix-layout.h" @@ -1715,6 +1716,7 @@ Result linkAndOptimizeIR( if (targetProgram->getOptionSet().getBoolOption( CompilerOptionName::EnableExperimentalPasses)) introduceExplicitGlobalContext(irModule, target); + transformParamsToConstRef(irModule, codeGenContext->getSink()); #if 0 dumpIRIfEnabled(codeGenContext, irModule, "EXPLICIT GLOBAL CONTEXT INTRODUCED"); #endif @@ -1812,11 +1814,11 @@ Result linkAndOptimizeIR( if (requiredLoweringPassSet.meshOutput) legalizeMeshOutputTypes(irModule); - BufferElementTypeLoweringOptions bufferElementTypeLoweringOptions; - bufferElementTypeLoweringOptions.use16ByteArrayElementForConstantBuffer = - isWGPUTarget(targetRequest); - lowerBufferElementTypeToStorageType(targetProgram, irModule, bufferElementTypeLoweringOptions); - performForceInlining(irModule); + + // Lower all bit_cast operations on complex types into leaf-level + // bit_cast on basic types. + if (requiredLoweringPassSet.bitcast) + lowerBitCast(targetProgram, irModule, sink); // Rewrite functions that return arrays to return them via `out` parameter, // since our target languages doesn't allow returning arrays. 
@@ -1832,13 +1834,28 @@ Result linkAndOptimizeIR( rcpWOfPositionInput(irModule); } - // Lower all bit_cast operations on complex types into leaf-level - // bit_cast on basic types. - if (requiredLoweringPassSet.bitcast) - lowerBitCast(targetProgram, irModule, sink); - bool emitSpirvDirectly = targetProgram->shouldEmitSPIRVDirectly(); + BufferElementTypeLoweringOptions bufferElementTypeLoweringOptions; + bufferElementTypeLoweringOptions.use16ByteArrayElementForConstantBuffer = + isWGPUTarget(targetRequest); + lowerBufferElementTypeToStorageType(targetProgram, irModule, bufferElementTypeLoweringOptions); + + // If we are generating code for glsl or metal, perform address space propagation now. + // For SPIRV, we will do that during spirv legalization that happens after + // `linkAndOptimizeIR`. + if (target == CodeGenTarget::GLSL) + { + NoOpInitialAddressSpaceAssigner addrSpaceAssigner; + specializeAddressSpace(irModule, &addrSpaceAssigner); + } + else if (isMetalTarget(targetRequest)) + { + specializeAddressSpaceForMetal(irModule); + } + + performForceInlining(irModule); + if (emitSpirvDirectly) { performIntrinsicFunctionInlining(irModule); -- cgit v1.2.3