From 063cbeaaea2fb00a10c6058ea4a9632092772ea5 Mon Sep 17 00:00:00 2001 From: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> Date: Thu, 7 Aug 2025 00:22:22 -0700 Subject: Initial copy elision pass (#8042) Fixes #7574 Changes: * Add an initial (fairly simple) optimization pass which is able to eliminate redundant copies. * Our current existing optimizer passes remove redundant load/store very robustly, this pass will focus on other cases of copy elimination * Primary approach is to make all functions which are `in T` and `T` is trivial to copy into a `__constref T`. We then (depending on scenario) manually insert a variable+load if a pass-by-reference is not possible; otherwise we pass by `constref`. * Added optimizations to eliminate redundant code which causes `constref` to fail to compile --------- Co-authored-by: Harsh Aggarwal Co-authored-by: Claude Co-authored-by: slangbot Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> --- tests/cuda/copy-elision-this-1.slang | 28 +++++ tests/cuda/copy-elision-this-2.slang | 141 +++++++++++++++++++++++++ tests/language-feature/pointer/const-ref.slang | 8 +- 3 files changed, 173 insertions(+), 4 deletions(-) create mode 100644 tests/cuda/copy-elision-this-1.slang create mode 100644 tests/cuda/copy-elision-this-2.slang (limited to 'tests') diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang new file mode 100644 index 000000000..295b45c73 --- /dev/null +++ b/tests/cuda/copy-elision-this-1.slang @@ -0,0 +1,28 @@ +//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda +struct Data { + StructuredBuffer input[2]; + RWStructuredBuffer output; + uint input_tensor_count; + StructuredBuffer index_buffer; + uint index_count; + + // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this + float fetch(int buffer, int index) + { + return input[buffer][index]; + } +}; + +ParameterBlock data; + +[shader("compute")] +[numthreads(8, 8, 1)] +void computeMain(uint3 tid: SV_DispatchThreadID) +{ + float result = 0.0; + for (int i = 0; i < data.index_count; ++i) { + uint buffer = data.index_buffer[i]; + result += data.fetch(buffer, tid.x * 1024 + tid.y); + } + data.output[tid.x * 1024 + tid.y] = result; +} diff --git a/tests/cuda/copy-elision-this-2.slang b/tests/cuda/copy-elision-this-2.slang new file mode 100644 index 000000000..60bb948c9 --- /dev/null +++ b/tests/cuda/copy-elision-this-2.slang @@ -0,0 +1,141 @@ +//TEST:COMPARE_COMPUTE(filecheck-buffer=BUF): -cuda -compute +//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -O3 + +struct Data +{ + int val; + + + __init(int val) + { + this.val = val; + } +}; + +struct DataWrapped +{ + Data field; + Data element[2]; + + __init(int val) + { + field.val = val; + element[0].val = val; + element[1].val = val; + } +} + +//TEST_INPUT:uniform(data=[1]):name=globalData +uniform Data globalData; + +//TEST_INPUT: set input = ubuffer(data=[1 2 3 4], stride=4) +RWStructuredBuffer input; + +//TEST_INPUT: set output = out ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0], stride=4) +RWStructuredBuffer output; + +// CUDA: addCopyElision{{.*}}Data{{.*}}*{{.*}}data +int addCopyElision(Data data, int val) +{ + // ensure we do not introduce a temporary + // CUDA-NOT: Data{{.*}}; + return data.val + val; +} + +// CUDA: nested{{.*}}Data{{.*}}*{{.*}}data +int nested(Data data, int val) +{ + + return addCopyElision(data, val); +} + +// CUDA: addCopyElision{{.*}}FixedArray{{.*}}*{{.*}}data +int addCopyElision(int data[10], int val) +{ +// ensure we do not introduce a temporary +// CUDA-NOT: {{.*}}FixedArray{{.*}}; + return data[1] + val; +} + +// CUDA: nested{{.*}}Array{{.*}}*{{.*}}data +int nested(int data[10], int val) +{ + return addCopyElision(data, val); +} + +void modify(inout int data[10]) +{ + data[1] = input[0]; +} +// CUDA: notDirectlyUsingParam{{.*}}Array{{.*}}*{{.*}}data +int notDirectlyUsingParam(int data[10], int val) +{ +// ensure we create a temporary for the array +// CUDA: FixedArray{{.*}}; + modify(data); + return data[1] + val; +} + + +// CUDA:computeMain +[shader("compute")] +[numthreads(1, 1, 1)] +void computeMain() +{ + + // struct + Data data = Data(input[0]); + int structVal = addCopyElision(data, input[1]); + + // struct which is globalParam + int globalParamStructVal = addCopyElision(globalData, input[1]); + + // passing nested struct + int nestedStructVal = nested(data, input[1]); + + // field + DataWrapped dataWrapped = DataWrapped(input[0]); + int fieldVal = addCopyElision(dataWrapped.field, input[1]); + + // element + int elementVal = addCopyElision(dataWrapped.element[0], input[1]); + + // A non-variable + int nonVariableVal = addCopyElision(Data(input[0]), input[1]); + + // array + int val[10]; + val[1] = input[0]; + int arrayVal = addCopyElision(val, input[1]); + + // passing nested array + int nestedArrayVal = nested(val, input[1]); + + // not directly using param + int notDirectlyUsingParamVal = notDirectlyUsingParam(val, input[1]); + + output[0] = + structVal == 3 && + globalParamStructVal == 3 && + nestedStructVal == 3 && + fieldVal == 3 && + elementVal == 3 && + nonVariableVal == 3 && + arrayVal == 3 && + nestedArrayVal == 3 && + notDirectlyUsingParamVal == 3 + ? 1 : 0; + + // For debugging + //output[1] = structVal; + //output[2] = globalParamStructVal; + //output[3] = nestedStructVal; + //output[4] = fieldVal; + //output[5] = elementVal; + //output[6] = nonVariableVal; + //output[7] = arrayVal; + //output[8] = nestedArrayVal; + //output[9] = notDirectlyUsingParamVal; +} + +//BUF: 1 \ No newline at end of file diff --git a/tests/language-feature/pointer/const-ref.slang b/tests/language-feature/pointer/const-ref.slang index f62fda697..06bb9dc07 100644 --- a/tests/language-feature/pointer/const-ref.slang +++ b/tests/language-feature/pointer/const-ref.slang @@ -3,7 +3,7 @@ //TEST:SIMPLE(filecheck=CHECK): -target cuda -entry computeMain -stage compute //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUFFER): -slang -compute -output-using-type -shaderobj //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUFFER): -vk -compute -output-using-type -shaderobj - +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUFFER): -cuda -compute -output-using-type -shaderobj //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer RWStructuredBuffer outputBuffer; @@ -14,8 +14,8 @@ struct Thing int bigArray[128]; // Check that we are not inserting local variables that are copies of `this` parameter. - - // CHECK: __device__ int Thing_getSum{{.*}}(Thing{{.*}} * this{{.*}}) + + // CHECK: __device__ int Thing_getSum{{.*}}Thing{{.*}}*{{.*}}this{{.*}}) // CHECK-NOT: Thing{{[a-zA-Z0-9_]*}} {{[a-zA-Z0-9_]+}} // CHECK: } [constref] @@ -32,7 +32,7 @@ struct Thing // Check that we are not inserting local variables that are copies of `thing` parameter. -// CHECK: __device__ int test{{.*}}(Thing{{.*}} * thing{{.*}}) +// CHECK: __device__ int test{{.*}}Thing{{.*}}*{{.*}}thing{{.*}}) // CHECK-NOT: Thing{{[a-zA-Z0-9_]*}} {{[a-zA-Z0-9_]+}} // CHECK: } -- cgit v1.2.3