Enhance buffer load specialization pass to specialize past field extracts. (#8547)

This allows us to specialize functions whose argument is a sub element of a constant buffer, instead of being only applicable to entire buffer element. Closes #8421. This change also implements a proper heuristic to determine when to specialize the calls and defer the buffer loads. This PR addresses a pathological case exposed in `slangpy\slangpy\benchmarks\test_benchmark_tensor.py`, which used to take 27ms to finish, and now takes 1.25ms. For example, given: ``` struct Bottom { float bigArray[1024]; [mutating] void setVal(int index, float value) { bigArray[index] = value; } } struct Root { Bottom top[2]; [mutating] void setTopVal(int x, int y, float value) { top[x].setVal(y, value); } } RWStructuredBuffer<Root> sb; [shader("compute")] [numthreads(1, 1, 1)] void compute_main(uint3 tid: SV_DispatchThreadID) { sb[0].setTopVal(1, 2, 100.0f); } ``` We are now able to specialize the call to `setTopVal` into: ``` void compute_main(uint3 tid: SV_DispatchThreadID) { setTopVal_specialized(0, 1, 2, 100.0f); } void setTopVal_specialized(int sbIdx, int x, int y, float value) { Bottom_setVal_specialized(sbIdx, x, y, value); } void Bottom_setVal_specialized(int sbIdx, int x, int y, float value) { sb[sbIdx].top[x].bigArray[y] = value; } ``` And get rid of all unnecessary loads. Achieving this requires a combination of function call specialization and buffer-load-defer pass. The buffer-load-defer pass has been completely rewritten to be more correct and avoid introducing redundant loads. This PR also adds tests to make sure pointers, bindless handles, and loads from structured buffer or constant buffers works as expected.
author: Yong He <yonghe@outlook.com> 2025-09-30 19:08:23 -0700
committer: GitHub <noreply@github.com> 2025-09-30 19:08:23 -0700
commit: e4611e2e30a3e5969d402f5ed7e72706a0e3b024 (patch)
tree: 0f4240ccf8c4f0786949ab33adb0fcc332890d11 /tests/optimization
parent: b6422e50cb19f7f790f29678ba22f31b0b305511 (diff)
8 files changed, 366 insertions, 0 deletions
diff --git a/tests/optimization/buffer-load-defer-aliasing-1.slang b/tests/optimization/buffer-load-defer-aliasing-1.slang
new file mode 100644
index 000000000..f50d5306c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing-1.slang
@@ -0,0 +1,45 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we can defer buffer loads by ruling out potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // RWStructuredBuffer is considered to not alias with anything else.
+        // this write should not prevent deferring loading bigArray.
+        gOther[0] = 100;
+        // this write should not prevent deferring loading bigArray.
+        gSharedVar = 1;
+        // this write should not prevent deferring loading bigArray.
+        gStaticVar = 2;
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform RWStructuredBuffer<int> gOther;
+static int gStaticVar;
+groupshared int gSharedVar;
+
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpEntryPoint
+    // SPV-NOT: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-aliasing.slang b/tests/optimization/buffer-load-defer-aliasing.slang
new file mode 100644
index 000000000..a0240cc40
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we are not deferring buffer loads due to potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // this write may cause data stored at gRoot to be modified,
+        // thus bigArray[index] may be different from what it was before the call to
+        // bottomGetValue. So we should not defer loading bigArray until after this write.
+        *gOther = 100; 
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform int* gOther;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-bindless.slang b/tests/optimization/buffer-load-defer-bindless.slang
new file mode 100644
index 000000000..2108d562c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-bindless.slang
@@ -0,0 +1,58 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Check that we can specialize buffer loads through bindless handles, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `StructuredBuffer<Middle>.Handle` and an `int`
+    // index, and recursively specialize `bottomGetValue` to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpLoad %Middle
+// SPV: %[[REG:[A-Za-z0-9_]+]] = OpLoad %float
+// SPV: OpStore {{.*}} %[[REG]]
+
+// Check that the generated CUDA code contains a specialized `bottomGetValue` function that has
+// the complete parameter list to access the `bigArray` element directly, without needing to load
+// the entire `Bottom` struct from the caller.
+//
+// CUDA-DAG: __device__ float Bottom_bottomGetValue{{.*}}(StructuredBuffer<Middle{{.*}}> {{.*}}, int {{.*}}, int {{.*}})
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-user-pointer.slang b/tests/optimization/buffer-load-defer-user-pointer.slang
new file mode 100644
index 000000000..58e6386f9
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-user-pointer.slang
@@ -0,0 +1,63 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+// Check that we can specialize buffer loads through user pointers, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle*>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `Middle*` and recursively specialize `bottomGetValue`
+    // to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that the generated CUDA code never loads a `Middle` or `Bottom` struct into a local var.
+// CUDA-NOT: Middle{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Bottom{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Top{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+
+// Check that the generated CUDA code can be compiled by nvrtc correctly into PTX.
+// PTX: compute_main
+
+// Check that the generated (unoptimized) SPIR-V contains a specialized Bottom_bottomGetValue function
+// that takes in a Bottom* and use access chain to load the required array element directly, without 
+// needing to load the entire Bottom struct.
+// SPV: %Bottom_bottomGetValue = OpFunction %float None
+// SPV: OpFunctionParameter %_ptr_PhysicalStorageBuffer_Middle_natural
+// SPV: %[[INDEX:[A-Za-z0-9_]+]] = OpFunctionParameter %int
+// SPV: %[[PTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_float %{{.*}} %[[INDEX]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[PTR]]
+// SPV: OpReturnValue %[[VALUE]]
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer.slang b/tests/optimization/buffer-load-defer.slang
new file mode 100644
index 000000000..b2df43c13
--- /dev/null
+++ b/tests/optimization/buffer-load-defer.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    Middle middle;
+    float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-specialize-1.slang b/tests/optimization/buffer-load-specialize-1.slang
new file mode 100644
index 000000000..55f2df473
--- /dev/null
+++ b/tests/optimization/buffer-load-specialize-1.slang
@@ -0,0 +1,35 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+struct Bottom
+{
+    float bigArray[1024];
+    // SPV: %Bottom_bottomGetValue = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    // SPV-NOT: OpCompositeConstruct
+    // SPV: OpFunctionEnd
+
+    // SPV: %Bottom_bottomGetValue_0 = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.bottom1.bottomGetValue(0);
+    outputBuffer[1] = cb.bottom2.bottomGetValue(1);
+    outputBuffer[2] = cb.bottom2.bottomGetValue(2);
+}
diff --git a/tests/optimization/buffer-store-defer.slang b/tests/optimization/buffer-store-defer.slang
new file mode 100644
index 000000000..14362a477
--- /dev/null
+++ b/tests/optimization/buffer-store-defer.slang
@@ -0,0 +1,51 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=PTX): -target ptx -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    [mutating]
+    void setVal(int index, float value) { bigArray[index] = value; }
+}
+
+struct Root
+{
+    Bottom top[2];
+    [mutating]
+    void setTopVal(int x, int y, float value)
+    {
+        top[x].setVal(y, value);
+    }
+}
+
+RWStructuredBuffer<Root> sb;
+
+// Check that we don't load the entire `Root` struct, modify it, and then write it back.
+// Instead we should generate a single store instruction to write the single float value
+// directly to the buffer.
+
+// SPV: OpEntryPoint
+// SPV: OpLabel
+// SPV-NEXT: OpAccessChain
+// SPV-NOT: OpCompositeInsert
+// SPV-NOT: OpLoad
+// SPV: OpStore
+// SPV-NOT: OpLoad
+// SPV-NOT: OpCompositeInsert
+// SPV: OpStore
+// SPV: OpReturn
+
+// CUDA: __device__ void Bottom_setVal_0(int [[INDEX0:[A-Za-z0-9_]+]], int [[INDEX1:[A-Za-z0-9_]+]], int [[INDEX2:[A-Za-z0-9_]+]], float [[VAL:[A-Za-z0-9_]+]])
+// CUDA:  (&(&(globalParams{{.*}}->sb{{.*}}){{\[}}[[INDEX0]]{{\]}})->top{{.*}}{{\[}}[[INDEX1]]{{\]}})->bigArray{{.*}}{{\[}}[[INDEX2]]{{\]}} = [[VAL]];
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    sb[0].setTopVal(1, 2, 100.0f);
+
+    sb[3].top[1].setVal(8, 200.0f);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
new file mode 100644
index 000000000..f7f9b1888
--- /dev/null
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Test that we can defer loading big structured buffer elements.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom;
+}
+
+StructuredBuffer<Root> sb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that we don't load the entire `Root` struct and then do ElementExtract to get to `bigArray[0]`.
+// Instead we use access chain all the way to point to the required array element, and load just a single float.
+
+// SPV: OpEntryPoint
+// SPV: %[[SBPTRARRAY:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer__arr_float_int_1024
+// SPV: %[[SBPTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer_float %[[SBPTRARRAY]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[SBPTR]]
+// SPV: OpStore %{{.*}} %[[VALUE]]
+
+// CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
+// CUDA:  return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = sb[tid.x].bottom.bottomGetValue(0);
+}
author	Yong He <yonghe@outlook.com>	2025-09-30 19:08:23 -0700
committer	GitHub <noreply@github.com>	2025-09-30 19:08:23 -0700
commit	e4611e2e30a3e5969d402f5ed7e72706a0e3b024 (patch)
tree	0f4240ccf8c4f0786949ab33adb0fcc332890d11 /tests/optimization
parent	b6422e50cb19f7f790f29678ba22f31b0b305511 (diff)