13 files changed, 399 insertions, 23 deletions
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
index 295b45c73..273e6dc58 100644
--- a/tests/cuda/copy-elision-this-1.slang
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -1,4 +1,6 @@
-//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda
+//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=PTX): -stage compute -entry computeMain -target cuda
+
 struct Data {
     StructuredBuffer<float> input[2];
     RWStructuredBuffer<float> output;
@@ -6,7 +8,9 @@ struct Data {
     StructuredBuffer<uint> index_buffer;
     uint index_count;
 
-    // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this
+    // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}})
+    // CUDA-NEXT: {
+    // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load
     float fetch(int buffer, int index)
     {
         return input[buffer][index];
@@ -15,6 +19,8 @@ struct Data {
 
 ParameterBlock<Data> data;
 
+// PTX: computeMain
+
 [shader("compute")]
 [numthreads(8, 8, 1)]
 void computeMain(uint3 tid: SV_DispatchThreadID)
diff --git a/tests/glsl/global-uniform-with-varyings.slang b/tests/glsl/global-uniform-with-varyings.slang
index 678855dbf..174560840 100644
--- a/tests/glsl/global-uniform-with-varyings.slang
+++ b/tests/glsl/global-uniform-with-varyings.slang
@@ -3,9 +3,9 @@
 
 // CHECK_SPIRV: OpEntryPoint
 // CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 0
-// CHECK_SPIRV: OpVariable {{.*}} Uniform
-// CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 1
-// CHECK_SPIRV: OpVariable {{.*}} Output {{.*}} Location 0
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Uniform
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Input {{.*}} Location 1
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Output {{.*}} Location 0
 
 // CHECK_GLSL: layout(location = 0)
 // CHECK_GLSL-NEXT: in
diff --git a/tests/metal/out-param.slang b/tests/metal/out-param.slang
index e488f8844..68b6e2b62 100644
--- a/tests/metal/out-param.slang
+++ b/tests/metal/out-param.slang
@@ -9,13 +9,20 @@
 //TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
 
-// METAL: void Test_out_param{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
 
-// METAL: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
 
 void Test_out_param(out int value)
 {
diff --git a/tests/optimization/buffer-load-defer-aliasing-1.slang b/tests/optimization/buffer-load-defer-aliasing-1.slang
new file mode 100644
index 000000000..f50d5306c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing-1.slang
@@ -0,0 +1,45 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we can defer buffer loads by ruling out potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // RWStructuredBuffer is considered to not alias with anything else.
+        // this write should not prevent deferring loading bigArray.
+        gOther[0] = 100;
+        // this write should not prevent deferring loading bigArray.
+        gSharedVar = 1;
+        // this write should not prevent deferring loading bigArray.
+        gStaticVar = 2;
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform RWStructuredBuffer<int> gOther;
+static int gStaticVar;
+groupshared int gSharedVar;
+
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpEntryPoint
+    // SPV-NOT: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-aliasing.slang b/tests/optimization/buffer-load-defer-aliasing.slang
new file mode 100644
index 000000000..a0240cc40
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we are not deferring buffer loads due to potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // this write may cause data stored at gRoot to be modified,
+        // thus bigArray[index] may be different from what it was before the call to
+        // bottomGetValue. So we should not defer loading bigArray until after this write.
+        *gOther = 100; 
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform int* gOther;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-bindless.slang b/tests/optimization/buffer-load-defer-bindless.slang
new file mode 100644
index 000000000..2108d562c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-bindless.slang
@@ -0,0 +1,58 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Check that we can specialize buffer loads through bindless handles, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `StructuredBuffer<Middle>.Handle` and an `int`
+    // index, and recursively specialize `bottomGetValue` to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpLoad %Middle
+// SPV: %[[REG:[A-Za-z0-9_]+]] = OpLoad %float
+// SPV: OpStore {{.*}} %[[REG]]
+
+// Check that the generated CUDA code contains a specialized `bottomGetValue` function that has
+// the complete parameter list to access the `bigArray` element directly, without needing to load
+// the entire `Bottom` struct from the caller.
+//
+// CUDA-DAG: __device__ float Bottom_bottomGetValue{{.*}}(StructuredBuffer<Middle{{.*}}> {{.*}}, int {{.*}}, int {{.*}})
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-user-pointer.slang b/tests/optimization/buffer-load-defer-user-pointer.slang
new file mode 100644
index 000000000..58e6386f9
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-user-pointer.slang
@@ -0,0 +1,63 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+// Check that we can specialize buffer loads through user pointers, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle*>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `Middle*` and recursively specialize `bottomGetValue`
+    // to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that the generated CUDA code never loads a `Middle` or `Bottom` struct into a local var.
+// CUDA-NOT: Middle{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Bottom{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Top{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+
+// Check that the generated CUDA code can be compiled by nvrtc correctly into PTX.
+// PTX: compute_main
+
+// Check that the generated (unoptimized) SPIR-V contains a specialized Bottom_bottomGetValue function
+// that takes in a Bottom* and use access chain to load the required array element directly, without 
+// needing to load the entire Bottom struct.
+// SPV: %Bottom_bottomGetValue = OpFunction %float None
+// SPV: OpFunctionParameter %_ptr_PhysicalStorageBuffer_Middle_natural
+// SPV: %[[INDEX:[A-Za-z0-9_]+]] = OpFunctionParameter %int
+// SPV: %[[PTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_float %{{.*}} %[[INDEX]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[PTR]]
+// SPV: OpReturnValue %[[VALUE]]
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer.slang b/tests/optimization/buffer-load-defer.slang
new file mode 100644
index 000000000..b2df43c13
--- /dev/null
+++ b/tests/optimization/buffer-load-defer.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    Middle middle;
+    float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-specialize-1.slang b/tests/optimization/buffer-load-specialize-1.slang
new file mode 100644
index 000000000..55f2df473
--- /dev/null
+++ b/tests/optimization/buffer-load-specialize-1.slang
@@ -0,0 +1,35 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+struct Bottom
+{
+    float bigArray[1024];
+    // SPV: %Bottom_bottomGetValue = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    // SPV-NOT: OpCompositeConstruct
+    // SPV: OpFunctionEnd
+
+    // SPV: %Bottom_bottomGetValue_0 = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.bottom1.bottomGetValue(0);
+    outputBuffer[1] = cb.bottom2.bottomGetValue(1);
+    outputBuffer[2] = cb.bottom2.bottomGetValue(2);
+}
diff --git a/tests/optimization/buffer-store-defer.slang b/tests/optimization/buffer-store-defer.slang
new file mode 100644
index 000000000..14362a477
--- /dev/null
+++ b/tests/optimization/buffer-store-defer.slang
@@ -0,0 +1,51 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=PTX): -target ptx -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    [mutating]
+    void setVal(int index, float value) { bigArray[index] = value; }
+}
+
+struct Root
+{
+    Bottom top[2];
+    [mutating]
+    void setTopVal(int x, int y, float value)
+    {
+        top[x].setVal(y, value);
+    }
+}
+
+RWStructuredBuffer<Root> sb;
+
+// Check that we don't load the entire `Root` struct, modify it, and then write it back.
+// Instead we should generate a single store instruction to write the single float value
+// directly to the buffer.
+
+// SPV: OpEntryPoint
+// SPV: OpLabel
+// SPV-NEXT: OpAccessChain
+// SPV-NOT: OpCompositeInsert
+// SPV-NOT: OpLoad
+// SPV: OpStore
+// SPV-NOT: OpLoad
+// SPV-NOT: OpCompositeInsert
+// SPV: OpStore
+// SPV: OpReturn
+
+// CUDA: __device__ void Bottom_setVal_0(int [[INDEX0:[A-Za-z0-9_]+]], int [[INDEX1:[A-Za-z0-9_]+]], int [[INDEX2:[A-Za-z0-9_]+]], float [[VAL:[A-Za-z0-9_]+]])
+// CUDA:  (&(&(globalParams{{.*}}->sb{{.*}}){{\[}}[[INDEX0]]{{\]}})->top{{.*}}{{\[}}[[INDEX1]]{{\]}})->bigArray{{.*}}{{\[}}[[INDEX2]]{{\]}} = [[VAL]];
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    sb[0].setTopVal(1, 2, 100.0f);
+
+    sb[3].top[1].setVal(8, 200.0f);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
new file mode 100644
index 000000000..f7f9b1888
--- /dev/null
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Test that we can defer loading big structured buffer elements.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom;
+}
+
+StructuredBuffer<Root> sb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that we don't load the entire `Root` struct and then do ElementExtract to get to `bigArray[0]`.
+// Instead we use access chain all the way to point to the required array element, and load just a single float.
+
+// SPV: OpEntryPoint
+// SPV: %[[SBPTRARRAY:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer__arr_float_int_1024
+// SPV: %[[SBPTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer_float %[[SBPTRARRAY]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[SBPTR]]
+// SPV: OpStore %{{.*}} %[[VALUE]]
+
+// CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
+// CUDA:  return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = sb[tid.x].bottom.bottomGetValue(0);
+}
diff --git a/tests/vkray/raygen-trace-ray-param-non-struct.slang b/tests/vkray/raygen-trace-ray-param-non-struct.slang
index b0a129761..72d85ed02 100644
--- a/tests/vkray/raygen-trace-ray-param-non-struct.slang
+++ b/tests/vkray/raygen-trace-ray-param-non-struct.slang
@@ -28,7 +28,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: TraceRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     TraceRay(as,
              1,
              0xff,
@@ -39,9 +39,9 @@ void main()
              someInData1);
     outputBuffer1[0] = outputBuffer1[0]+someInData1;
 
-    // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
+    // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = rayPayload{{.*}}.data{{.*}};
     // CHECK: TraceMotionRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     TraceMotionRay(as,
             1,
             0xff,
@@ -55,7 +55,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: NvTraceRayHitObject(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject::TraceRay(as,
              1,
              0xff,
@@ -68,7 +68,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: TraceMotionRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject::TraceMotionRay(as,
             1,
             0xff,
@@ -82,7 +82,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: NvInvokeHitObject(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject hitObject_HitOrMiss;
     HitObject::Invoke(
         as,
diff --git a/tests/wgsl/switch-case.slang b/tests/wgsl/switch-case.slang
index c4ff0996e..fc24bd67a 100644
--- a/tests/wgsl/switch-case.slang
+++ b/tests/wgsl/switch-case.slang
@@ -70,17 +70,14 @@ func fs_main(VertexOutput input)->FragmentOutput
     return output;
 }
 
-//WGSL: fn _S9( _S10 : Tuple_0) -> f32
-//WGSL-NEXT: {
-//WGSL-NEXT:     switch(_S10.value1_0.x)
+//WGSL:     switch({{.*}})
 //WGSL-NEXT:     {
 //WGSL-NEXT:     case u32(0):
 //WGSL-NEXT:         {
-//WGSL-NEXT:             return Circle_getArea_0(unpackAnyValue16_0(_S10.value2_0));
+//WGSL-NEXT:             return Circle_getArea_0
 //WGSL-NEXT:         }
 //WGSL-NEXT:     default :
 //WGSL-NEXT:         {
-//WGSL-NEXT:             return Rectangle_getArea_0(unpackAnyValue16_1(_S10.value2_0));
+//WGSL-NEXT:             return Rectangle_getArea_0
 //WGSL-NEXT:         }
 //WGSL-NEXT:     }
-//WGSL-NEXT: }