From 487ae034e2b03ddd67945132c8fecbd937952705 Mon Sep 17 00:00:00 2001 From: Sriram Murali <85252063+sriramm-nv@users.noreply.github.com> Date: Mon, 13 May 2024 23:57:57 -0700 Subject: Add LoadAligned and StoreAligned methods to ByteAddressBuffers (#4066) Fixes #4062 This change enables wide load/stores for byte-address-buffer backed resources, when the data is accessed at an offset that is aligned. **Goals** - Improve performance by issuing wider instructions instead of sequence of scalar instructions, for load and stores of byte-address buffers. - Reduce code-size and readability of the generated shaders. - Help naive users as well as ninja programmers, generate optimal code. **Non Goals** - Help with Structured buffers, or other resources. - Target compilation time improvements. **Key changes** Adds 2 new overloads for Load and Store operations on ByteAddress Buffers. 1. Load / Store with an extra alignment parameter ``` resource.Load(offset, alignment); resource.Store(offset, value, alignment); ``` 2. LoadAligned / StoreAligned with no extra parameter, with the same signature as orignial Load / Store. ``` resource.LoadAligned(offset); resource.StoreAligned(offset, value); ``` - This overload will implicitly identify the alignment value, from the base type T of the elementary unit of the resource. **Supported resources** 1. Vectors This can be upto 4 elements, i.e. float -- float4. 2. Arrays This does not have a limit on number of elements, but on a conservative estimate, we can limit to few hundreds. 3. Structures This is used to group a resource of a single type. ``` struct { float4 x; } ``` **Code updates** - Modified byte-address-ir legalize to handle struct, array and vector kinds of load or store access - Added custom hlsl stdlib functions to implement all the overloads for Load, Store etc. - Added C-like emitter, SPIR-V emitter for handling ByteAddressBuffers. - Added a new core stdlib function intrinsic to wrap around alignOf(). - Added a new peephole optimization entry to identify the equivalent IntLiteral value from the alignOf() inst. - Added tests to check explicit, and implicit aligned Load and Store operations. --- tests/compute/byte-address-buffer-array.slang | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/compute/byte-address-buffer-array.slang (limited to 'tests/compute/byte-address-buffer-array.slang') diff --git a/tests/compute/byte-address-buffer-array.slang b/tests/compute/byte-address-buffer-array.slang new file mode 100644 index 000000000..1a23821a1 --- /dev/null +++ b/tests/compute/byte-address-buffer-array.slang @@ -0,0 +1,77 @@ +// byte-address-buffer-array.slang +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -d3d12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//DISABLED_TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -output-using-type + +//TEST:SIMPLE(filecheck=CHECK1):-target glsl -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK2):-target hlsl -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK3):-target spirv -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK3):-target spirv -emit-spirv-directly -entry computeMain -stage compute + +// Confirm compilation of `(RW)ByteAddressBuffer` with aligned load / stores to wider data types. + +[vk::binding(2, 3)] RWByteAddressBuffer buffer; +struct Block { + float4 val[2]; +}; +[shader("compute")] +[numthreads(1,1,1)] +void computeMain(uint3 threadId : SV_DispatchThreadID) +{ + // CHECK-NOT: warning + // CHECK1: _Array_std430_vector{{.*}} _data[] + // CHECK1: _Array_std430_vector{{.*}} packStorage_0(vec4 {{.*}}[2]) + // CHECK1: vec4 {{.*}}[2] = { {{.*}}[0], {{.*}}[1] }; + // CHECK1: _Array_std430_vector{{.*}} {{.*}} = { {{.*}} }; + // CHECK1: void unpackStorage_0(_Array_std430_vector{{.*}}, out vec4 {{.*}}[2]) + // CHECK1: {{.*}}[0] = {{.*}}.data_0[0]; + // CHECK1: {{.*}}[1] = {{.*}}.data_0[1]; + // CHECK1: vec4 {{.*}}[2]; + // CHECK1: unpackStorage_0(buffer_0._data[0], {{.*}}); + // CHECK1: vec4 {{.*}}[2] = buffer_0._data[0] = packStorage_0({{.*}}); + // CHECK1: vec4 {{.*}} = buffer_2._data[1]; + // CHECK1: vec4 {{.*}} = buffer_2._data[1] = vec4(buffer_1._data[1], buffer_1._data[2], buffer_1._data[3], buffer_1._data[4]); + + // CHECK2: float4 {{.*}}[int(2)] = (buffer_0).Load(int(0)); + // CHECK2: buffer_0.Store(int(0),{{.*}}); + // CHECK2: float {{.*}} = (buffer_0).Load(int(4)); + // CHECK2: float {{.*}} = (buffer_0).Load(int(8)); + // CHECK2: float {{.*}} = (buffer_0).Load(int(12)); + // CHECK2: float {{.*}} = (buffer_0).Load(int(16)); + // CHECK2: float4 {{.*}} = float4({{.*}}, {{.*}}, {{.*}}, {{.*}}); + // CHECK2: float4 {{.*}} = (buffer_0).Load(int(20)); + // CHECK2: buffer_0.Store(int(16),{{.*}}); + // CHECK2: buffer_0.Store(int(32),{{.*}}); + + // CHECK3-DAG: %[[ARRV4_2:[a-zA-Z0-9_]+]] = OpTypeArray %v4float %int_2 + // CHECK3-DAG: %[[SARRV4_2:[a-zA-Z0-9_]+]] = OpTypeStruct %[[ARRV4_2]] + // CHECK3-DAG: %[[SBA:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[SARRV4_2]] + // CHECK3-DAG: %[[SBF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %float + // CHECK3-DAG: %[[SBVF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %v4float + // CHECK3-DAG: %[[V0:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}} + // CHECK3-DAG: %[[V1:[a-zA-Z0-9_]+]] = OpLoad %[[SARRV4_2]] %[[V0]] + // CHECK3-DAG: %[[V2:[a-zA-Z0-9_]+]] = OpCompositeExtract %[[ARRV4_2]] %[[V1]] 0 + // CHECK3-DAG: %[[V3:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 0 + // CHECK3-DAG: %[[V4:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 1 + // CHECK3-DAG: %[[V5:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[ARRV4_2]] %[[V3]] %[[V4]] + // CHECK3-DAG: %[[V6:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[SARRV4_2]] %[[V5]] + // CHECK3-DAG: %[[V7:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}} + // CHECK3-DAG: OpStore %[[V7]] %[[V6]] + // CHECK3-DAG: %[[V8:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}} + // CHECK3-DAG: %[[V9:[a-zA-Z0-9_]+]] = OpLoad %float %[[V8]] + // CHECK3-DAG: %[[V10:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}} + // CHECK3-DAG: %[[V11:[a-zA-Z0-9_]+]] = OpLoad %float %[[V10]] + // CHECK3-DAG: %[[V12:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}} + // CHECK3-DAG: %[[V13:[a-zA-Z0-9_]+]] = OpLoad %float %[[V12]] + // CHECK3-DAG: %[[V14:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}} + // CHECK3-DAG: %[[V15:[a-zA-Z0-9_]+]] = OpLoad %float %[[V14]] + // CHECK3-DAG: %[[V16:[a-zA-Z0-9_]+]] = OpCompositeConstruct %v4float %[[V9]] %[[V11]] %[[V13]] %[[V15]] + // CHECK3-DAG: %[[V17:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}} + // CHECK3-DAG: %[[V18:[a-zA-Z0-9_]+]] = OpLoad %v4float %[[V17]] + // CHECK3-DAG: %[[V19:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}} + // CHECK3-DAG: OpStore %[[V19]] %[[V16]] + // CHECK3-DAG: %[[V20:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}} + // CHECK3-DAG: OpStore %[[V20]] %[[V18]] + buffer.Store(0, buffer.LoadAligned(0)); + buffer.StoreAligned(16, buffer.Load(4, 16)); +} + -- cgit v1.2.3