From e4611e2e30a3e5969d402f5ed7e72706a0e3b024 Mon Sep 17 00:00:00 2001 From: Yong He Date: Tue, 30 Sep 2025 19:08:23 -0700 Subject: Enhance buffer load specialization pass to specialize past field extracts. (#8547) This allows us to specialize functions whose argument is a sub-element of a constant buffer, instead of being applicable only to an entire buffer element. Closes #8421. This change also implements a proper heuristic to determine when to specialize the calls and defer the buffer loads. This PR addresses a pathological case exposed in `slangpy\slangpy\benchmarks\test_benchmark_tensor.py`, which used to take 27ms to finish, and now takes 1.25ms. For example, given: ``` struct Bottom { float bigArray[1024]; [mutating] void setVal(int index, float value) { bigArray[index] = value; } } struct Root { Bottom top[2]; [mutating] void setTopVal(int x, int y, float value) { top[x].setVal(y, value); } } RWStructuredBuffer<Root> sb; [shader("compute")] [numthreads(1, 1, 1)] void compute_main(uint3 tid: SV_DispatchThreadID) { sb[0].setTopVal(1, 2, 100.0f); } ``` We are now able to specialize the call to `setTopVal` into: ``` void compute_main(uint3 tid: SV_DispatchThreadID) { setTopVal_specialized(0, 1, 2, 100.0f); } void setTopVal_specialized(int sbIdx, int x, int y, float value) { Bottom_setVal_specialized(sbIdx, x, y, value); } void Bottom_setVal_specialized(int sbIdx, int x, int y, float value) { sb[sbIdx].top[x].bigArray[y] = value; } ``` And get rid of all unnecessary loads. Achieving this requires a combination of function call specialization and the buffer-load-defer pass. The buffer-load-defer pass has been completely rewritten to be more correct and avoid introducing redundant loads. This PR also adds tests to make sure pointers, bindless handles, and loads from structured buffers or constant buffers work as expected. 
--- tests/cuda/copy-elision-this-1.slang | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tests/cuda') diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang index 295b45c73..273e6dc58 100644 --- a/tests/cuda/copy-elision-this-1.slang +++ b/tests/cuda/copy-elision-this-1.slang @@ -1,4 +1,6 @@ -//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda +//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -line-directive-mode none +//TEST:SIMPLE(filecheck=PTX): -stage compute -entry computeMain -target cuda + struct Data { StructuredBuffer input[2]; RWStructuredBuffer output; @@ -6,7 +8,9 @@ struct Data { StructuredBuffer index_buffer; uint index_count; - // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this + // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}}) + // CUDA-NEXT: { + // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load float fetch(int buffer, int index) { return input[buffer][index]; @@ -15,6 +19,8 @@ struct Data { ParameterBlock data; +// PTX: computeMain + [shader("compute")] [numthreads(8, 8, 1)] void computeMain(uint3 tid: SV_DispatchThreadID) -- cgit v1.2.3