diff options
| author | Sriram Murali <85252063+sriramm-nv@users.noreply.github.com> | 2024-04-30 12:20:16 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-30 12:20:16 -0700 |
| commit | 70111daf43c87e182695666c34345e061e114a68 (patch) | |
| tree | 5510ea669f9c5e7853810f84dfe9e01358ab77c6 | |
| parent | 95ca2aa72cca9968dabd6dfa1a4dcf7de3909035 (diff) | |
Generate vectorized version of byteaddress load/store methods (#4036)
Fixes #3533
- Add logic to perform aligned memory operations for loading from and storing to composite resources, like vectors within the ByteAddress legalize pass.
- Checks
Added a new test for byte address with/without alignment.
---------
Co-authored-by: Yong He <yonghe@outlook.com>
| -rw-r--r-- | source/slang/hlsl.meta.slang | 6 | ||||
| -rw-r--r-- | source/slang/slang-emit.cpp | 15 | ||||
| -rw-r--r-- | source/slang/slang-ir-byte-address-legalize.cpp | 63 | ||||
| -rw-r--r-- | tests/compute/byte-address-buffer-aligned.slang | 116 |
4 files changed, 176 insertions, 24 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 284cb2abc..8593dc268 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2801,6 +2801,7 @@ struct $(item.name) } [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint Load(int location) { @@ -2816,6 +2817,7 @@ struct $(item.name) uint Load(int location, out uint status); [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint2 Load2(int location) { @@ -2831,6 +2833,7 @@ struct $(item.name) uint2 Load2(int location, out uint status); [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint3 Load3(int location) { @@ -2846,6 +2849,7 @@ struct $(item.name) uint3 Load3(int location, out uint status); [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] uint4 Load4(int location) { @@ -2861,6 +2865,7 @@ struct $(item.name) uint4 Load4(int location, out uint status); [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] T Load<T>(int location) { @@ -3764,6 +3769,7 @@ ${{{{ } } + [ForceInline] void Store<T>(int offset, T value) { __byteAddressBufferStore(this, offset, value); diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 87f0911e7..1fa04b4be 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -716,14 +716,13 @@ Result linkAndOptimizeIR( // of a buffer need not be more than 4-byte aligned, and loads // of vectors need only be aligned based on their element type). // - // TODO: We should consider having an extended variant of `Load<T>` - // on byte-address buffers which expresses a programmer's knowledge - // that the load will have greater alignment than required by D3D. - // That could either come as an explicit guaranteed-alignment - // operand, or instead as something like a `Load4Aligned<T>` operation - // that returns a `vector<4,T>` and assumes `4*sizeof(T)` alignemtn. - // - byteAddressBufferOptions.scalarizeVectorLoadStore = true; + // Slang IR supports a variant of `Load<T>` on byte-address buffers + // that will have greater alignment than required by D3D. The + // alignment information is inferred from the operation like a + // `Load4Aligned<T>` that returns a `vector<4,T>` that assumes a + // `4*sizeof(T)` alignment. We may choose to disable that in favor + // of byte-address indexing by setting this flag to true. + byteAddressBufferOptions.scalarizeVectorLoadStore = false; // For GLSL targets, there really isn't a low-level concept // of a byte-address buffer at all, and the standard "shader storage diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp index a089ca8ff..35040da64 100644 --- a/source/slang/slang-ir-byte-address-legalize.cpp +++ b/source/slang/slang-ir-byte-address-legalize.cpp @@ -205,6 +205,19 @@ struct ByteAddressBufferLegalizationContext return false; } + bool checkUnaligned(IRInst* baseOffset, IRIntegerValue immediateOffset, IRType* elementType, IRIntegerValue elementCount) + { + // Check whether the given composite resource type is aligned to the baseOffset + IRSizeAndAlignment elementLayout; + SLANG_RETURN_FALSE_ON_FAIL(getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), elementType, &elementLayout)); + IRIntegerValue elementStride = elementLayout.getStride(); + bool isUnaligned = true; + if (auto baseOffsetVal = as<IRIntLit>(baseOffset)) { + isUnaligned = ((baseOffsetVal->getValue() + immediateOffset) % (elementStride * elementCount)) != 0; + } + return isUnaligned; + } + SlangResult getOffset(TargetProgram* target, IRStructField* field, IRIntegerValue* outOffset) { if (target->getHLSLToVulkanLayoutOptions() && target->getHLSLToVulkanLayoutOptions()->shouldUseGLLayout()) @@ -299,7 +312,7 @@ struct ByteAddressBufferLegalizationContext // return m_builder.emitMakeStruct(type, fieldVals); } - else if( auto arrayType = as<IRArrayTypeBase>(type) ) + else if (auto arrayType = as<IRArrayTypeBase>(type)) { // Loading a value of array type amounts to loading each // of its elements. There is shared logic between the @@ -311,8 +324,7 @@ struct ByteAddressBufferLegalizationContext // legalization if the array type isn't in the right form // for us to proceed. // - auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()); - if( elementCountInst ) + if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount())) { return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue()); } @@ -361,17 +373,25 @@ struct ByteAddressBufferLegalizationContext return m_builder.emitMakeMatrix(matType, (UInt)args.getCount(), args.getBuffer()); } } - else if( auto vecType = as<IRVectorType>(type) ) + else if (auto vecType = as<IRVectorType>(type)) { // One of the options that can vary per-target is whether to // scalarize vetor load/store operations. When that option // is turned on, we can treat a vector load just like an // array load. // - auto elementCountInst = as<IRIntLit>(vecType->getElementCount()); - if( m_options.scalarizeVectorLoadStore && elementCountInst) + if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount())) { - return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue()); + // Emit an aligned vector load operation when the data (elementCount * elementSize) is divisible + // by the offset. Else, fallback to scalarizing the loads. + if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue())) + { + return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue()); + } + else + { + return emitSimpleLoad(type, buffer, baseOffset, immediateOffset); + } } // If we aren't scalarizing a vetor load then we next need @@ -649,6 +669,10 @@ struct ByteAddressBufferLegalizationContext IRInst* getEquivalentStructuredBuffer(IRType* elementType, IRInst* byteAddressBuffer) { + if (!elementType) + { + return nullptr; + } // The simple case for replacement is when the byte-address buffer to // be replaced is a global shader parameter. That path will get its // own routine. @@ -869,13 +893,12 @@ struct ByteAddressBufferLegalizationContext } return SLANG_OK; } - else if( auto arrayType = as<IRArrayTypeBase>(type) ) + else if (auto arrayType = as<IRArrayTypeBase>(type)) { // Arrays and other sequences bottleneck through a helper // function, which we will cover later. // - auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()); - if( elementCountInst ) + if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount())) { return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue()); } @@ -918,12 +941,20 @@ struct ByteAddressBufferLegalizationContext return SLANG_OK; } } - else if( auto vecType = as<IRVectorType>(type) ) + else if (auto vecType = as<IRVectorType>(type)) { - auto elementCountInst = as<IRIntLit>(vecType->getElementCount()); - if( m_options.scalarizeVectorLoadStore && elementCountInst) + if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount())) { - return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue()); + // Emit an aligned vector store operation when the data (elementCount * elementSize) is divisible + // by the offset. Else, fallback to scalarizing the stores. + if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue())) + { + return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue()); + } + else + { + return emitSimpleStore(value->getDataType(), buffer, baseOffset, immediateOffset, value); + } } if(m_options.useBitCastFromUInt) @@ -956,9 +987,9 @@ struct ByteAddressBufferLegalizationContext return emitSimpleStore(type, buffer, baseOffset, immediateOffset, value); } - Result emitSimpleStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOfset, IRInst* value) + Result emitSimpleStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value) { - IRInst* offset = emitOffsetAddIfNeeded(baseOffset, immediateOfset); + IRInst* offset = emitOffsetAddIfNeeded(baseOffset, immediateOffset); if( m_options.translateToStructuredBufferOps ) { if( auto structuredBuffer = getEquivalentStructuredBuffer(type, buffer) ) diff --git a/tests/compute/byte-address-buffer-aligned.slang b/tests/compute/byte-address-buffer-aligned.slang new file mode 100644 index 000000000..5024987aa --- /dev/null +++ b/tests/compute/byte-address-buffer-aligned.slang @@ -0,0 +1,116 @@ +// byte-address-buffer-aligned.slang +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -d3d12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//DISABLED_TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -output-using-type + +//TEST:SIMPLE(filecheck=CHECK1):-target glsl -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK2):-target hlsl -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK3):-target spirv -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=CHECK3):-target spirv -emit-spirv-directly -entry computeMain -stage compute + +// Confirm compilation of `(RW)ByteAddressBuffer` with aligned load / stores to wider data types. + +[vk::binding(2, 3)] RWByteAddressBuffer buffer0; + +[shader("compute")] +[numthreads(1,1,1)] +void computeMain(uint3 threadId : SV_DispatchThreadID) +{ + // CHECK-NOT: warning + + // CHECK1: vec4 {{.*}} = buffer0_{{.*}}.{{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: vec4 {{.*}} = buffer0_{{.*}}.{{.*}} = vec4(buffer0_{{.*}}.{{.*}}, buffer0_{{.*}}.{{.*}}, buffer0_{{.*}}.{{.*}}, buffer0_{{.*}}.{{.*}}); + // CHECK1: vec4 {{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = _S3[{{.*}}]; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = _S3[{{.*}}]; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = _S3[{{.*}}]; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = _S3[{{.*}}]; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = buffer0_{{.*}}.{{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = {{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = {{.*}}; + // CHECK1: float {{.*}} = buffer0_{{.*}}.{{.*}} = {{.*}}; + + // CHECK2: float4 {{.*}} = (buffer0_0).Load<float4 >(int(32)); + // CHECK2: buffer0_0.Store(int(32),{{.*}}); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(8)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(12)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(16)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(20)); + // CHECK2: buffer0_0.Store(int(32),float4({{.*}}, {{.*}}, {{.*}}, {{.*}})); + // CHECK2: float4 {{.*}} = (buffer0_0).Load<float4 >(int(32)); + // CHECK2: buffer0_0.Store(int(8),{{.*}}[int(0)]); + // CHECK2: buffer0_0.Store(int(12),{{.*}}[int(1)]); + // CHECK2: buffer0_0.Store(int(16),{{.*}}[int(2)]); + // CHECK2: buffer0_0.Store(int(20),{{.*}}[int(3)]); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(8)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(12)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(16)); + // CHECK2: float {{.*}} = (buffer0_0).Load<float >(int(20)); + // CHECK2: buffer0_0.Store(int(8),{{.*}}); + // CHECK2: buffer0_0.Store(int(12),{{.*}}); + // CHECK2: buffer0_0.Store(int(16),{{.*}}); + // CHECK2: buffer0_0.Store(int(20),{{.*}}); + + // CHECK3-DAG: %[[v4f:[a-zA-Z0-9_]+]] = OpTypeVector %float 4 + // CHECK3-DAG: %[[SBv4f:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[v4f]] + // CHECK3-DAG: %[[RTv4f:[a-zA-Z0-9_]+]] = OpTypeRuntimeArray %[[v4f]] + // CHECK3-DAG: %[[RWBv4f:[a-zA-Z0-9_]+]] = OpTypeStruct %[[RTv4f]] + // CHECK3-DAG: %[[SBRW:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[RWBv4f]] + // CHECK3-DAG: %[[RTf:[a-zA-Z0-9_]+]] = OpTypeRuntimeArray %float + // CHECK3-DAG: %[[RWBf:[a-zA-Z0-9_]+]] = OpTypeStruct %[[RTf]] + // CHECK3-DAG: %[[SBRW0:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[RWBf]] + // CHECK3-DAG: %[[SBf:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %float + // CHECK3-DAG: %[[BUF0:[a-zA-Z0-9_]+]] = OpVariable %[[SBRW]] StorageBuffer + // CHECK3-DAG: %[[BUF00:[a-zA-Z0-9_]+]] = OpVariable %[[SBRW0]] StorageBuffer + // CHECK3-DAG: %[[V0:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBv4f]] %[[BUF0]] %{{.*}} + // CHECK3-DAG: %[[V1:[a-zA-Z0-9_]+]] = OpLoad %[[v4f]] %[[V0]] + // CHECK3-DAG: %[[V2:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBv4f]] %[[BUF0]] %{{.*}} + // CHECK3-DAG: OpStore %[[V2]] %[[V1]] + // CHECK3-DAG: %[[V3:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V4:[a-zA-Z0-9_]+]] = OpLoad %float %[[V3]] + // CHECK3-DAG: %[[V5:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V6:[a-zA-Z0-9_]+]] = OpLoad %float %[[V5]] + // CHECK3-DAG: %[[V7:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V8:[a-zA-Z0-9_]+]] = OpLoad %float %[[V7]] + // CHECK3-DAG: %[[V9:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V10:[a-zA-Z0-9_]+]] = OpLoad %float %[[V9]] + // CHECK3-DAG: %[[V11:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[v4f]] %[[V4]] %[[V6]] %[[V8]] %[[V10]] + // CHECK3-DAG: %[[V12:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBv4f]] %[[BUF0]] + // CHECK3-DAG: OpStore %[[V12]] %[[V11]] + // CHECK3-DAG: %[[V13:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBv4f]] %[[BUF0]] + // CHECK3-DAG: %[[V14:[a-zA-Z0-9_]+]] = OpLoad %[[v4f]] %[[V13]] + // CHECK3-DAG: %[[V15:[a-zA-Z0-9_]+]] = OpCompositeExtract %float %[[V14]] 0 + // CHECK3-DAG: %[[V16:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V16]] %[[V15]] + // CHECK3-DAG: %[[V17:[a-zA-Z0-9_]+]] = OpCompositeExtract %float %[[V14]] 1 + // CHECK3-DAG: %[[V18:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V18]] %[[V17]] + // CHECK3-DAG: %[[V19:[a-zA-Z0-9_]+]] = OpCompositeExtract %float %[[V14]] 2 + // CHECK3-DAG: %[[V20:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V20]] %[[V19]] + // CHECK3-DAG: %[[V21:[a-zA-Z0-9_]+]] = OpCompositeExtract %float %[[V14]] 3 + // CHECK3-DAG: %[[V22:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V22]] %[[V21]] + // CHECK3-DAG: %[[V23:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V24:[a-zA-Z0-9_]+]] = OpLoad %float %[[V23]] + // CHECK3-DAG: %[[V25:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V26:[a-zA-Z0-9_]+]] = OpLoad %float %[[V25]] + // CHECK3-DAG: %[[V27:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V28:[a-zA-Z0-9_]+]] = OpLoad %float %[[V27]] + // CHECK3-DAG: %[[V29:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: %[[V30:[a-zA-Z0-9_]+]] = OpLoad %float %[[V29]] + // CHECK3-DAG: %[[V31:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V31]] %[[V24]] + // CHECK3-DAG: %[[V32:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V32]] %[[V26]] + // CHECK3-DAG: %[[V33:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V33]] %[[V28]] + // CHECK3-DAG: %[[V34:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]] + // CHECK3-DAG: OpStore %[[V34]] %[[V30]] + buffer0.Store(32, buffer0.Load<float4>(32)); + buffer0.Store(32, buffer0.Load<float4>(8)); + buffer0.Store(8, buffer0.Load<float4>(32)); + buffer0.Store(8, buffer0.Load<float4>(8)); +} |
