summaryrefslogtreecommitdiffstats
path: root/tests/cuda
diff options
context:
space:
mode:
authorArielG-NV <159081215+ArielG-NV@users.noreply.github.com>2025-08-07 00:22:22 -0700
committerGitHub <noreply@github.com>2025-08-07 07:22:22 +0000
commit063cbeaaea2fb00a10c6058ea4a9632092772ea5 (patch)
treeb4412347d6c264c3b1a84ec971921a5e2fe76134 /tests/cuda
parent9e2685853033f4286feaf22d04a755a7395d95ce (diff)
Initial copy elision pass (#8042)
Fixes #7574 Changes: * Add an initial (fairly simple) optimization pass which is able to eliminate redundant copies. * Our current existing optimizer passes remove redundant load/store very robustly, this pass will focus on other cases of copy elimination * Primary approach is to make all functions which are `in T` and `T` is trivial to copy into a `__constref T`. We then (depending on scenario) manually insert a variable+load if a pass-by-reference is not possible; otherwise we pass by `constref`. * Added optimizations to eliminate redundant code which causes `constref` to fail to compile --------- Co-authored-by: Harsh Aggarwal <haaggarwal@nvidia.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: slangbot <ellieh+slangbot@nvidia.com> Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
Diffstat (limited to 'tests/cuda')
-rw-r--r--tests/cuda/copy-elision-this-1.slang28
-rw-r--r--tests/cuda/copy-elision-this-2.slang141
2 files changed, 169 insertions, 0 deletions
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
new file mode 100644
index 000000000..295b45c73
--- /dev/null
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -0,0 +1,28 @@
+//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda
+struct Data {
+ StructuredBuffer<float> input[2];
+ RWStructuredBuffer<float> output;
+ uint input_tensor_count;
+ StructuredBuffer<uint> index_buffer;
+ uint index_count;
+
+ // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this
+ float fetch(int buffer, int index)
+ {
+ return input[buffer][index];
+ }
+};
+
+ParameterBlock<Data> data;
+
+[shader("compute")]
+[numthreads(8, 8, 1)]
+void computeMain(uint3 tid: SV_DispatchThreadID)
+{
+ float result = 0.0;
+ for (int i = 0; i < data.index_count; ++i) {
+ uint buffer = data.index_buffer[i];
+ result += data.fetch(buffer, tid.x * 1024 + tid.y);
+ }
+ data.output[tid.x * 1024 + tid.y] = result;
+}
diff --git a/tests/cuda/copy-elision-this-2.slang b/tests/cuda/copy-elision-this-2.slang
new file mode 100644
index 000000000..60bb948c9
--- /dev/null
+++ b/tests/cuda/copy-elision-this-2.slang
@@ -0,0 +1,141 @@
+//TEST:COMPARE_COMPUTE(filecheck-buffer=BUF): -cuda -compute
+//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -O3
+
+struct Data
+{
+ int val;
+
+
+ __init(int val)
+ {
+ this.val = val;
+ }
+};
+
+struct DataWrapped
+{
+ Data field;
+ Data element[2];
+
+ __init(int val)
+ {
+ field.val = val;
+ element[0].val = val;
+ element[1].val = val;
+ }
+}
+
+//TEST_INPUT:uniform(data=[1]):name=globalData
+uniform Data globalData;
+
+//TEST_INPUT: set input = ubuffer(data=[1 2 3 4], stride=4)
+RWStructuredBuffer<int> input;
+
+//TEST_INPUT: set output = out ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0], stride=4)
+RWStructuredBuffer<int> output;
+
+// CUDA: addCopyElision{{.*}}Data{{.*}}*{{.*}}data
+int addCopyElision(Data data, int val)
+{
+ // ensure we do not introduce a temporary
+ // CUDA-NOT: Data{{.*}};
+ return data.val + val;
+}
+
+// CUDA: nested{{.*}}Data{{.*}}*{{.*}}data
+int nested(Data data, int val)
+{
+
+ return addCopyElision(data, val);
+}
+
+// CUDA: addCopyElision{{.*}}FixedArray{{.*}}*{{.*}}data
+int addCopyElision(int data[10], int val)
+{
+// ensure we do not introduce a temporary
+// CUDA-NOT: {{.*}}FixedArray{{.*}};
+ return data[1] + val;
+}
+
+// CUDA: nested{{.*}}Array{{.*}}*{{.*}}data
+int nested(int data[10], int val)
+{
+ return addCopyElision(data, val);
+}
+
+void modify(inout int data[10])
+{
+ data[1] = input[0];
+}
+// CUDA: notDirectlyUsingParam{{.*}}Array{{.*}}*{{.*}}data
+int notDirectlyUsingParam(int data[10], int val)
+{
+// ensure we create a temporary for the array
+// CUDA: FixedArray{{.*}};
+ modify(data);
+ return data[1] + val;
+}
+
+
+// CUDA:computeMain
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void computeMain()
+{
+
+ // struct
+ Data data = Data(input[0]);
+ int structVal = addCopyElision(data, input[1]);
+
+ // struct which is globalParam
+ int globalParamStructVal = addCopyElision(globalData, input[1]);
+
+ // passing nested struct
+ int nestedStructVal = nested(data, input[1]);
+
+ // field
+ DataWrapped dataWrapped = DataWrapped(input[0]);
+ int fieldVal = addCopyElision(dataWrapped.field, input[1]);
+
+ // element
+ int elementVal = addCopyElision(dataWrapped.element[0], input[1]);
+
+ // A non-variable
+ int nonVariableVal = addCopyElision(Data(input[0]), input[1]);
+
+ // array
+ int val[10];
+ val[1] = input[0];
+ int arrayVal = addCopyElision(val, input[1]);
+
+ // passing nested array
+ int nestedArrayVal = nested(val, input[1]);
+
+ // not directly using param
+ int notDirectlyUsingParamVal = notDirectlyUsingParam(val, input[1]);
+
+ output[0] =
+ structVal == 3 &&
+ globalParamStructVal == 3 &&
+ nestedStructVal == 3 &&
+ fieldVal == 3 &&
+ elementVal == 3 &&
+ nonVariableVal == 3 &&
+ arrayVal == 3 &&
+ nestedArrayVal == 3 &&
+ notDirectlyUsingParamVal == 3
+ ? 1 : 0;
+
+ // For debugging
+ //output[1] = structVal;
+ //output[2] = globalParamStructVal;
+ //output[3] = nestedStructVal;
+ //output[4] = fieldVal;
+ //output[5] = elementVal;
+ //output[6] = nonVariableVal;
+ //output[7] = arrayVal;
+ //output[8] = nestedArrayVal;
+ //output[9] = notDirectlyUsingParamVal;
+}
+
+//BUF: 1 \ No newline at end of file