diff options
| author | ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> | 2025-08-07 00:22:22 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-07 07:22:22 +0000 |
| commit | 063cbeaaea2fb00a10c6058ea4a9632092772ea5 (patch) | |
| tree | b4412347d6c264c3b1a84ec971921a5e2fe76134 /tests/cuda | |
| parent | 9e2685853033f4286feaf22d04a755a7395d95ce (diff) | |
Initial copy elision pass (#8042)
Fixes #7574
Changes:
* Add an initial (fairly simple) optimization pass which is able to
eliminate redundant copies.
* Our current existing optimizer passes remove redundant load/store very
robustly, this pass will focus on other cases of copy elimination
* Primary approach is to make all functions which are `in T` and `T` is
trivial to copy into a `__constref T`. We then (depending on scenario)
manually insert a variable+load if a pass-by-reference is not possible;
otherwise we pass by `constref`.
* Added optimizations to eliminate redundant code which causes
`constref` to fail to compile
---------
Co-authored-by: Harsh Aggarwal <haaggarwal@nvidia.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: slangbot <ellieh+slangbot@nvidia.com>
Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
Diffstat (limited to 'tests/cuda')
| -rw-r--r-- | tests/cuda/copy-elision-this-1.slang | 28 | ||||
| -rw-r--r-- | tests/cuda/copy-elision-this-2.slang | 141 |
2 files changed, 169 insertions, 0 deletions
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang new file mode 100644 index 000000000..295b45c73 --- /dev/null +++ b/tests/cuda/copy-elision-this-1.slang @@ -0,0 +1,28 @@ +//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda +struct Data { + StructuredBuffer<float> input[2]; + RWStructuredBuffer<float> output; + uint input_tensor_count; + StructuredBuffer<uint> index_buffer; + uint index_count; + + // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this + float fetch(int buffer, int index) + { + return input[buffer][index]; + } +}; + +ParameterBlock<Data> data; + +[shader("compute")] +[numthreads(8, 8, 1)] +void computeMain(uint3 tid: SV_DispatchThreadID) +{ + float result = 0.0; + for (int i = 0; i < data.index_count; ++i) { + uint buffer = data.index_buffer[i]; + result += data.fetch(buffer, tid.x * 1024 + tid.y); + } + data.output[tid.x * 1024 + tid.y] = result; +} diff --git a/tests/cuda/copy-elision-this-2.slang b/tests/cuda/copy-elision-this-2.slang new file mode 100644 index 000000000..60bb948c9 --- /dev/null +++ b/tests/cuda/copy-elision-this-2.slang @@ -0,0 +1,141 @@ +//TEST:COMPARE_COMPUTE(filecheck-buffer=BUF): -cuda -compute +//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -O3 + +struct Data +{ + int val; + + + __init(int val) + { + this.val = val; + } +}; + +struct DataWrapped +{ + Data field; + Data element[2]; + + __init(int val) + { + field.val = val; + element[0].val = val; + element[1].val = val; + } +} + +//TEST_INPUT:uniform(data=[1]):name=globalData +uniform Data globalData; + +//TEST_INPUT: set input = ubuffer(data=[1 2 3 4], stride=4) +RWStructuredBuffer<int> input; + +//TEST_INPUT: set output = out ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0], stride=4) +RWStructuredBuffer<int> output; + +// CUDA: addCopyElision{{.*}}Data{{.*}}*{{.*}}data +int addCopyElision(Data data, int val) +{ + // ensure we do not introduce a temporary + // CUDA-NOT: Data{{.*}}; + return data.val + val; +} + +// CUDA: nested{{.*}}Data{{.*}}*{{.*}}data +int nested(Data data, int val) +{ + + return addCopyElision(data, val); +} + +// CUDA: addCopyElision{{.*}}FixedArray{{.*}}*{{.*}}data +int addCopyElision(int data[10], int val) +{ +// ensure we do not introduce a temporary +// CUDA-NOT: {{.*}}FixedArray{{.*}}; + return data[1] + val; +} + +// CUDA: nested{{.*}}Array{{.*}}*{{.*}}data +int nested(int data[10], int val) +{ + return addCopyElision(data, val); +} + +void modify(inout int data[10]) +{ + data[1] = input[0]; +} +// CUDA: notDirectlyUsingParam{{.*}}Array{{.*}}*{{.*}}data +int notDirectlyUsingParam(int data[10], int val) +{ +// ensure we create a temporary for the array +// CUDA: FixedArray{{.*}}; + modify(data); + return data[1] + val; +} + + +// CUDA:computeMain +[shader("compute")] +[numthreads(1, 1, 1)] +void computeMain() +{ + + // struct + Data data = Data(input[0]); + int structVal = addCopyElision(data, input[1]); + + // struct which is globalParam + int globalParamStructVal = addCopyElision(globalData, input[1]); + + // passing nested struct + int nestedStructVal = nested(data, input[1]); + + // field + DataWrapped dataWrapped = DataWrapped(input[0]); + int fieldVal = addCopyElision(dataWrapped.field, input[1]); + + // element + int elementVal = addCopyElision(dataWrapped.element[0], input[1]); + + // A non-variable + int nonVariableVal = addCopyElision(Data(input[0]), input[1]); + + // array + int val[10]; + val[1] = input[0]; + int arrayVal = addCopyElision(val, input[1]); + + // passing nested array + int nestedArrayVal = nested(val, input[1]); + + // not directly using param + int notDirectlyUsingParamVal = notDirectlyUsingParam(val, input[1]); + + output[0] = + structVal == 3 && + globalParamStructVal == 3 && + nestedStructVal == 3 && + fieldVal == 3 && + elementVal == 3 && + nonVariableVal == 3 && + arrayVal == 3 && + nestedArrayVal == 3 && + notDirectlyUsingParamVal == 3 + ? 1 : 0; + + // For debugging + //output[1] = structVal; + //output[2] = globalParamStructVal; + //output[3] = nestedStructVal; + //output[4] = fieldVal; + //output[5] = elementVal; + //output[6] = nonVariableVal; + //output[7] = arrayVal; + //output[8] = nestedArrayVal; + //output[9] = notDirectlyUsingParamVal; +} + +//BUF: 1
\ No newline at end of file |
