From a6deb5ed82cb8fc6b4f4c5c5fee264e09f97ff89 Mon Sep 17 00:00:00 2001 From: Yong He Date: Mon, 29 Sep 2025 17:45:08 -0700 Subject: Rewriting the lower-buffer-element-type pass to avoid unnecessary packing/unpacking. (#8526) Part of the effort to improve the performance of generated SPIRV code. The existing lower-buffer-element-type pass works by loading the entire buffer element content from memory, and translate it to logical type stored in a local variable at the earliest reference of a buffer handle. This means that is can generate inefficient code that reads more than necessary. Consider this example: ``` struct BigStruct { bool values[1024]; } ConstantBuffer cb; void test(BigStruct v) { if (v.values[0]) { printf("ok"); } } [numthreads(1,1,1)] void computeMain() { test(cb); } ``` In IR, the `computeMain` function before lower-buffer-element-type pass is something like following: ``` func test: %v = param : BigStruct %barr = fieldExtract(%v, "values") %element = elementExtract(%barr, 0) ... // uses %element func computeMain: %v = load(cb) call %test %v ``` The existing lower-buffer-element-type pass will rewrite the bool array in `BigStruct` into `int` array so it is legal in SPIRV. However, it does so by inserting the translation on the first `load` of the constant buffer: ``` struct BigStruct_std430 { int values[1024]; } var cb : ConstantBuffer; func computeMain: %tmpVar : var call %unpackStorage(%tmpVar, cb) %v : BigStruct = load %tmpVar call %test %v ``` This means that the entire array will be loaded and translated to int, before calling `test`, which only uses one element. It turns out that the downstream compiler isn't always able to optimize out this inefficient translation/copy. This PR completely rewrites the way buffer-element-type lowering is handled to avoid producing this inefficient code. It works in two parts: first we turn on the `transformParamsToConstRef` pass for SPIRV target as well, so we will translate the `test` function to take the `v` parameter as `constref`. The second part is a redesigned buffer-element-type pass that defers the storage-type to logical-type translation until a value is actually used by a `load` instruction. In this example, after `transformParamsToConstRef`, the IR is: ``` func test: %v = param : ConstRef %barr = fieldAddr(%v, "values") %elementPtr = elementAddr(%barr, 0) %element = load(%elementPtr) ... // uses %element func computeMain: call %test %cb ``` The new `buffer-element-type-lowering` pass will take this IR, and insert translation at latest possible time across the entire call graph, and translate the IR into: ``` func test: %v = param : ConstRef %barr = fieldAddr(%v, "values") %elementPtr : ptr = elementAddr(%barr, 0) %element_int = load(%elementPtr) %element = cast(%element_int) : %bool ... // uses %element func computeMain: call %test %cb ``` In this new IR, there is no longer a load and conversion of the entire array. See new comment in `slang-ir-lower-buffer-element-type.cpp` for more details of how the pass works. This PR also address many other issues surfaced by turning on `transformParamsToConstRef` pass on SPIRV backend. --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> --- tests/spirv/buffer-pointer-matrix-layout.slang | 69 ++++++++++++++++---------- 1 file changed, 43 insertions(+), 26 deletions(-) (limited to 'tests/spirv/buffer-pointer-matrix-layout.slang') diff --git a/tests/spirv/buffer-pointer-matrix-layout.slang b/tests/spirv/buffer-pointer-matrix-layout.slang index cbb8f2857..4d80419e4 100644 --- a/tests/spirv/buffer-pointer-matrix-layout.slang +++ b/tests/spirv/buffer-pointer-matrix-layout.slang @@ -1,33 +1,50 @@ -//TEST:SIMPLE(filecheck=CHECK): -target spirv -emit-spirv-directly -stage compute -entry main -matrix-layout-column-major +//TEST:COMPARE_COMPUTE(filecheck-buffer=CHECK): -vk -output-using-type -xslang -matrix-layout-column-major -emit-spirv-directly -// CHECK: OpLoad {{.*}} Aligned 4 +// TEST_INPUT: set ptr = ubuffer(data=[1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0],stride=4) +uniform float3x4 *ptr; -struct Push -{ - float3x4* ptr; -}; +// TEST_INPUT: set outputBuffer = out ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],stride=4) +RWStructuredBuffer outputBuffer; -[[vk::push_constant]] Push push; [shader("compute")] [numthreads(1, 1, 1)] -void main(uint3 dtid : SV_DispatchThreadID) -{ +void computeMain(uint3 dtid: SV_DispatchThreadID) +{ // This matrix is in memry column major. Slang respects this here and load it properly! - float3x4 correctly_read_matrix = *push.ptr; - printf("(%f,%f,%f,%f)\n(%f,%f,%f,%f)\n", - correctly_read_matrix[0][0], correctly_read_matrix[0][1], correctly_read_matrix[0][2], correctly_read_matrix[0][3], - correctly_read_matrix[1][0], correctly_read_matrix[1][1], correctly_read_matrix[1][2], correctly_read_matrix[1][3] - ); - printf("(%f,%f,%f,%f)\n\n", - correctly_read_matrix[2][0], correctly_read_matrix[2][1], correctly_read_matrix[2][2], correctly_read_matrix[2][3] - ); - // With this syntax however, Slang ignores the column major setting and loads it as it it was row major! - float3x4 broken_matrix = push.ptr[0]; - printf("(%f,%f,%f,%f)\n(%f,%f,%f,%f)\n", - broken_matrix[0][0], broken_matrix[0][1], broken_matrix[0][2], broken_matrix[0][3], - broken_matrix[1][0], broken_matrix[1][1], broken_matrix[1][2], broken_matrix[1][3] - ); - printf("(%f,%f,%f,%f)\n\n", - broken_matrix[2][0], broken_matrix[2][1], broken_matrix[2][2], broken_matrix[2][3] - ); + float3x4 correctly_read_matrix = *ptr; + outputBuffer[0] = correctly_read_matrix[0][0]; + outputBuffer[1] = correctly_read_matrix[0][1]; + outputBuffer[2] = correctly_read_matrix[0][2]; + outputBuffer[3] = correctly_read_matrix[0][3]; + outputBuffer[4] = correctly_read_matrix[1][0]; + outputBuffer[5] = correctly_read_matrix[1][1]; + outputBuffer[6] = correctly_read_matrix[1][2]; + outputBuffer[7] = correctly_read_matrix[1][3]; + // CHECK: 1.0 + // CHECK: 4.0 + // CHECK: 7.0 + // CHECK: 10.0 + // CHECK: 2.0 + // CHECK: 5.0 + // CHECK: 8.0 + // CHECK: 11.0 + + // With this syntax however, Slang was ignoring the column major setting and loads it as it it was row major! + float3x4 broken_matrix = ptr[0]; + outputBuffer[8] = broken_matrix[0][0]; + outputBuffer[9] = broken_matrix[0][1]; + outputBuffer[10] = broken_matrix[0][2]; + outputBuffer[11] = broken_matrix[0][3]; + outputBuffer[12] = broken_matrix[1][0]; + outputBuffer[13] = broken_matrix[1][1]; + outputBuffer[14] = broken_matrix[1][2]; + outputBuffer[15] = broken_matrix[1][3]; + // CHECK: 1.0 + // CHECK: 4.0 + // CHECK: 7.0 + // CHECK: 10.0 + // CHECK: 2.0 + // CHECK: 5.0 + // CHECK: 8.0 + // CHECK: 11.0 } \ No newline at end of file -- cgit v1.2.3