1 files changed, 58 insertions, 0 deletions
diff --git a/tests/optimization/buffer-load-defer-bindless.slang b/tests/optimization/buffer-load-defer-bindless.slang
new file mode 100644
index 000000000..2108d562c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-bindless.slang
@@ -0,0 +1,58 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Check that we can specialize buffer loads through bindless handles, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom // Innermost level: owns the large array that should never be loaded as a whole.
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; } // Reads exactly one element.
+}
+
+struct Middle // Intermediate level: element type of the buffer referenced by `Top.middle`.
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); } // Forwards to `Bottom`.
+}
+
+struct Top // Outermost level: reaches `Middle` only through a buffer handle.
+{
+    StructuredBuffer<Middle>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not cause the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `StructuredBuffer<Middle>.Handle` and an `int`
+    // index, and recursively specialize `bottomGetValue` to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root // Constant-buffer layout: wraps `Top`, so the handle is reached as `cb.top.middle`.
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb; // Supplies the `Top` instance, and with it the buffer handle.
+
+RWStructuredBuffer<float> outputBuffer; // Receives the single float written by `compute_main`.
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpLoad %Middle
+// SPV: %[[REG:[A-Za-z0-9_]+]] = OpLoad %float
+// SPV: OpStore {{.*}} %[[REG]]
+
+// Check that the generated CUDA code contains a specialized `bottomGetValue` function that has
+// the complete parameter list to access the `bigArray` element directly, without needing to load
+// the entire `Bottom` struct from the caller.
+//
+// CUDA-DAG: __device__ float Bottom_bottomGetValue{{.*}}(StructuredBuffer<Middle{{.*}}> {{.*}}, int {{.*}}, int {{.*}})
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID) // `tid` is not used by the body.
+{
+    outputBuffer[0] = cb.top.topGetValue(0); // Follows Top -> Middle -> Bottom to fetch `bigArray[0]`.
+}