diff options
| author | Sriram Murali <85252063+sriramm-nv@users.noreply.github.com> | 2024-05-13 23:57:57 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-13 23:57:57 -0700 |
| commit | 487ae034e2b03ddd67945132c8fecbd937952705 (patch) | |
| tree | 036d318a64385151ad9d5e7275c2e387fdca6cee /source/slang | |
| parent | 9f23046138629f78995d54a7722ad6749bd84db9 (diff) | |
Add LoadAligned and StoreAligned methods to ByteAddressBuffers (#4066)
Fixes #4062
This change enables wide load/stores for byte-address-buffer backed
resources, when the data is accessed at an offset that is aligned.
**Goals**
- Improve performance by issuing wider instructions instead of a sequence
of scalar instructions, for loads and stores of byte-address buffers.
- Reduce code size and improve readability of the generated shaders.
- Help naive users as well as ninja programmers generate optimal code.
**Non Goals**
- Help with Structured buffers, or other resources.
- Target compilation time improvements.
**Key changes**
Adds 2 new overloads for Load and Store operations on ByteAddress Buffers.
1. Load / Store with an extra alignment parameter
```
resource.Load<T>(offset, alignment);
resource.Store<T>(offset, value, alignment);
```
2. LoadAligned / StoreAligned with no extra parameter,
with the same signature as the original Load / Store.
```
resource.LoadAligned<T>(offset);
resource.StoreAligned<T>(offset, value);
```
- This overload will implicitly identify the alignment value,
from the base type T of the elementary unit of the resource.
**Supported resources**
1. Vectors
This can be up to 4 elements, i.e. float through float4.
2. Arrays
This does not have a limit on the number of elements, but as a
conservative estimate, we can limit it to a few hundred.
3. Structures
This is used to group a resource of a single type.
```
struct {
float4 x;
}
```
**Code updates**
- Modified byte-address-ir legalize to handle struct, array and vector
kinds of load or store access
- Added custom hlsl stdlib functions to implement all the overloads for Load,
Store etc.
- Added C-like emitter, SPIR-V emitter for handling ByteAddressBuffers.
- Added a new core stdlib function intrinsic to wrap around alignOf<T>().
- Added a new peephole optimization entry to identify the equivalent
IntLiteral value from the alignOf<T>() inst.
- Added tests to check explicit, and implicit aligned Load and Store
operations.
Diffstat (limited to 'source/slang')
| -rw-r--r-- | source/slang/core.meta.slang | 9 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 327 | ||||
| -rw-r--r-- | source/slang/slang-diagnostic-defs.h | 1 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.cpp | 22 | ||||
| -rw-r--r-- | source/slang/slang-emit.cpp | 2 | ||||
| -rw-r--r-- | source/slang/slang-ir-byte-address-legalize.cpp | 140 | ||||
| -rw-r--r-- | source/slang/slang-ir-byte-address-legalize.h | 2 | ||||
| -rw-r--r-- | source/slang/slang-ir-inst-defs.h | 10 | ||||
| -rw-r--r-- | source/slang/slang-ir-insts.h | 4 | ||||
| -rw-r--r-- | source/slang/slang-ir-layout.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-ir-peephole.cpp | 24 |
11 files changed, 456 insertions, 86 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index bde943972..63bc2571b 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -2412,6 +2412,15 @@ int __naturalStrideOf() return __naturalStrideOf_impl(__declVal<T>()); } +__intrinsic_op($(kIROp_AlignOf)) +int __alignOf_intrinsic_impl<T>(T t); + +[ForceInline] +int __alignOf_intrinsic<T>() +{ + return __alignOf_intrinsic_impl<T>(__default<T>()); +} + __intrinsic_op($(kIROp_TreatAsDynamicUniform)) T asDynamicUniform<T>(T v); diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 303d18771..95ca03beb 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -108,7 +108,7 @@ struct ByteAddressBuffer { case hlsl: __intrinsic_asm ".Load"; default: - return __byteAddressBufferLoad<uint>(this, location); + return __byteAddressBufferLoad<uint>(this, location, 0); } } @@ -124,7 +124,33 @@ struct ByteAddressBuffer { case hlsl: __intrinsic_asm ".Load2"; default: - return __byteAddressBufferLoad<uint2>(this, location); + return __byteAddressBufferLoad<uint2>(this, location, 0); + } + } + + [__readNone] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint2 Load2(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad<uint2>(this, location, alignment); + } + } + + [__readNone] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint2 Load2Aligned(int location) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>()); } } @@ -140,7 +166,33 @@ struct ByteAddressBuffer { case hlsl: __intrinsic_asm ".Load3"; default: - return __byteAddressBufferLoad<uint3>(this, location); + return __byteAddressBufferLoad<uint3>(this, location, 0); + } + } + + [__readNone] + [ForceInline] + 
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint3 Load3(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad<uint3>(this, location, alignment); + } + } + + [__readNone] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint3 Load3Aligned(int location) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>()); } } @@ -156,7 +208,33 @@ struct ByteAddressBuffer { case hlsl: __intrinsic_asm ".Load4"; default: - return __byteAddressBufferLoad<uint4>(this, location); + return __byteAddressBufferLoad<uint4>(this, location, 0); + } + } + + [__readNone] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint4 Load4(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad<uint4>(this, location, alignment); + } + } + + [__readNone] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] + uint4 Load4Aligned(int location) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>()); } } @@ -167,7 +245,21 @@ struct ByteAddressBuffer [ForceInline] T Load<T>(int location) { - return __byteAddressBufferLoad<T>(this, location); + return __byteAddressBufferLoad<T>(this, location, 0); + } + + [__readNone] + [ForceInline] + T Load<T>(int location, int alignment) + { + return __byteAddressBufferLoad<T>(this, location, alignment); + } + + [__readNone] + [ForceInline] + T LoadAligned<T>(int location) + { + return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>()); } }; @@ -2765,23 +2857,23 @@ uint64_t __asuint64(uint2 i) __intrinsic_op($(kIROp_ByteAddressBufferLoad)) 
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)] -T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset); +T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset, int alignment); __intrinsic_op($(kIROp_ByteAddressBufferLoad)) [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset); +T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset, int alignment); __intrinsic_op($(kIROp_ByteAddressBufferLoad)) [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset); +T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment); __intrinsic_op($(kIROp_ByteAddressBufferStore)) [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, T value); +void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, int alignment, T value); __intrinsic_op($(kIROp_ByteAddressBufferStore)) [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, T value); +void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment, T value); __generic<T, L:IBufferDataLayout=DefaultDataLayout> __magic_type(HLSLStructuredBufferType) @@ -2898,7 +2990,7 @@ struct $(item.name) { case hlsl: __intrinsic_asm ".Load"; default: - return __byteAddressBufferLoad<uint>(this, location); + return __byteAddressBufferLoad<uint>(this, location, 0); } } @@ -2914,7 +3006,33 @@ struct $(item.name) { case hlsl: __intrinsic_asm ".Load2"; default: - return __byteAddressBufferLoad<uint2>(this, location); + return __byteAddressBufferLoad<uint2>(this, location, 0); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, 
byteaddressbuffer_rw)] + uint2 Load2(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad<uint2>(this, location, alignment); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + uint2 Load2Aligned(int location) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>()); } } @@ -2930,7 +3048,33 @@ struct $(item.name) { case hlsl: __intrinsic_asm ".Load3"; default: - return __byteAddressBufferLoad<uint3>(this, location); + return __byteAddressBufferLoad<uint3>(this, location, 0); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + uint3 Load3(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad<uint3>(this, location, alignment); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + uint3 Load3Aligned(int location) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>()); } } @@ -2946,7 +3090,33 @@ struct $(item.name) { case hlsl: __intrinsic_asm ".Load4"; default: - return __byteAddressBufferLoad<uint4>(this, location); + return __byteAddressBufferLoad<uint4>(this, location, 0); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + uint4 Load4(int location, int alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad<uint4>(this, location, alignment); + } + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + uint4 Load4Aligned(int location) + { + __target_switch + { + case 
hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>()); } } @@ -2958,8 +3128,25 @@ struct $(item.name) [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] T Load<T>(int location) { - return __byteAddressBufferLoad<T>(this, location); + return __byteAddressBufferLoad<T>(this, location, 0); } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + T Load<T>(int location, int alignment) + { + return __byteAddressBufferLoad<T>(this, location, alignment); + } + + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + T LoadAligned<T>(int location) + { + return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>()); + } + ${{{{ if (item.op == kIROp_HLSLRWByteAddressBufferType) { @@ -3806,18 +3993,17 @@ ${{{{ [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] - void Store( - uint address, - uint value) + void Store(uint address, uint value) { __target_switch { case hlsl: __intrinsic_asm ".Store"; default: - __byteAddressBufferStore(this, address, value); + __byteAddressBufferStore(this, address, 0, value); } } + [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] void Store2(uint address, uint2 value) @@ -3826,42 +4012,125 @@ ${{{{ { case hlsl: __intrinsic_asm ".Store2"; default: - __byteAddressBufferStore(this, address, value); + __byteAddressBufferStore(this, address, 0, value); + } + } + + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store2(uint address, uint2 value, uint alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, alignment, value); } } [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] - void Store3( - uint address, - uint3 value) + void Store2Aligned(uint address, uint2 value) + { + __target_switch + { + case 
hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint2>(), value); + } + } + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store3(uint address, uint3 value) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, 0, value); + } + } + + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store3(uint address, uint3 value, uint alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, alignment, value); + } + } + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store3Aligned(uint address, uint3 value) { __target_switch { case hlsl: __intrinsic_asm ".Store3"; default: - __byteAddressBufferStore(this, address, value); + __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint3>(), value); + } + } + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store4(uint address, uint4 value) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, 0, value); + } + } + + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store4(uint address, uint4 value, uint alignment) + { + __target_switch + { + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, alignment, value); } } [ForceInline] [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] - void Store4( - uint address, - uint4 value) + void Store4Aligned(uint address, uint4 value) { __target_switch { case hlsl: __intrinsic_asm ".Store4"; default: - __byteAddressBufferStore(this, address, value); + __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint4>(), value); } } [ForceInline] void Store<T>(int offset, T value) { - __byteAddressBufferStore(this, offset, 
value); + __byteAddressBufferStore(this, offset, 0, value); + } + + [ForceInline] + void Store<T>(int offset, T value, uint alignment) + { + __byteAddressBufferStore(this, offset, alignment, value); + } + + [ForceInline] + void StoreAligned<T>(int offset, T value) + { + __byteAddressBufferStore(this, offset, __alignOf_intrinsic<T>(), value); } }; diff --git a/source/slang/slang-diagnostic-defs.h b/source/slang/slang-diagnostic-defs.h index eb131df21..c2c4953e0 100644 --- a/source/slang/slang-diagnostic-defs.h +++ b/source/slang/slang-diagnostic-defs.h @@ -753,6 +753,7 @@ DIAGNOSTIC(41201, Warning, expectDynamicUniformValue, "value stored at this loca DIAGNOSTIC(41202, Error, notEqualBitCastSize, "invalid to bit_cast differently sized types: '$0' with size '$1' casted into '$2' with size '$3'") DIAGNOSTIC(41203, Warning, notEqualReinterpretCastSize, "reinterpret<> into not equally sized types: '$0' with size '$1' casted into '$2' with size '$3'") +DIAGNOSTIC(41300, Error, byteAddressBufferUnaligned, "invalid alignment `$0` specified for the byte address buffer resource with the element size of `$1`") // // 5xxxx - Target code generation. 
// diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index b44fa677c..19a7930f6 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -2674,6 +2674,7 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO break; case kIROp_ByteAddressBufferLoad: + { m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(").Load<"); @@ -2682,20 +2683,21 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(")"); break; + } case kIROp_ByteAddressBufferStore: - { - auto prec = getInfo(EmitOp::Postfix); - needClose = maybeEmitParens(outerPrec, prec); + { + auto prec = getInfo(EmitOp::Postfix); + needClose = maybeEmitParens(outerPrec, prec); - emitOperand(inst->getOperand(0), leftSide(outerPrec, prec)); - m_writer->emit(".Store("); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(","); - emitOperand(inst->getOperand(2), getInfo(EmitOp::General)); - m_writer->emit(")"); - } + emitOperand(inst->getOperand(0), leftSide(outerPrec, prec)); + m_writer->emit(".Store("); + emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + m_writer->emit(","); + emitOperand(inst->getOperand(inst->getOperandCount() - 1), getInfo(EmitOp::General)); + m_writer->emit(")"); break; + } case kIROp_PackAnyValue: { m_writer->emit("packAnyValue<"); diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 39ebaa64d..ab22e1f90 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -777,7 +777,7 @@ Result linkAndOptimizeIR( break; } - legalizeByteAddressBufferOps(session, targetProgram, irModule, byteAddressBufferOptions); + legalizeByteAddressBufferOps(session, targetProgram, irModule, codeGenContext->getSink(), byteAddressBufferOptions); } // For CUDA targets only, we will need to turn operations 
diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp index 35040da64..0561d8744 100644 --- a/source/slang/slang-ir-byte-address-legalize.cpp +++ b/source/slang/slang-ir-byte-address-legalize.cpp @@ -28,6 +28,7 @@ struct ByteAddressBufferLegalizationContext TargetRequest* m_target = nullptr; ByteAddressBufferLegalizationOptions m_options; + DiagnosticSink* m_sink = nullptr; // We will also use a central IR builder when generating new // code as part of legalization (rather than create/destroy // IR builders on the fly). @@ -124,14 +125,15 @@ struct ByteAddressBufferLegalizationContext // auto buffer = load->getOperand(0); auto offset = load->getOperand(1); - auto legalLoad = emitLegalLoad(type, buffer, offset, 0); + auto alignment = load->getOperand(2); + auto legalLoad = emitLegalLoad(type, buffer, offset, 0, alignment); // If it currently possible for the legalization // to fail (perhaps because of something else that // is invalid in the IR), so we will defensively // leave the code along in that case. // - if(!legalLoad) + if (!legalLoad) return; // If we were able to generate a legal load operation, @@ -154,21 +156,21 @@ struct ByteAddressBufferLegalizationContext // operations, then that means *no* type is // legal for byte-address load/store. // - if(m_options.translateToStructuredBufferOps) + if (m_options.translateToStructuredBufferOps) return false; // Basic types are usually legal to load/store // on all targets. // - if( auto basicType = as<IRBasicType>(type) ) + if (auto basicType = as<IRBasicType>(type)) { // On targets that require translation to // make all load/store use `uint` values, // any scalar type that isn't `uint` is // illegal. 
// - if( m_options.useBitCastFromUInt - && basicType->getBaseType() != BaseType::UInt ) + if (m_options.useBitCastFromUInt + && basicType->getBaseType() != BaseType::UInt) { return false; } @@ -181,13 +183,13 @@ struct ByteAddressBufferLegalizationContext // Vector types also depend on the options. // - if( as<IRVectorType>(type) ) + if (as<IRVectorType>(type)) { // If we've been asked to scalarize all // vector load/store, then we need to // tread them as illegal. // - if(m_options.scalarizeVectorLoadStore) + if (m_options.scalarizeVectorLoadStore) return false; } @@ -205,17 +207,35 @@ struct ByteAddressBufferLegalizationContext return false; } - bool checkUnaligned(IRInst* baseOffset, IRIntegerValue immediateOffset, IRType* elementType, IRIntegerValue elementCount) + // Helper function to check if the alignment value passed is + // divisible by the offset at which the resource is indexed into + // in order to ensure if the load or store can be vectorized. + bool isAligned(IRInst* offset, IRInst* unknownOffsetAlignment, IRIntegerValue alignmentVal) { - // Check whether the given composite resource type is aligned to the baseOffset - IRSizeAndAlignment elementLayout; - SLANG_RETURN_FALSE_ON_FAIL(getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), elementType, &elementLayout)); - IRIntegerValue elementStride = elementLayout.getStride(); - bool isUnaligned = true; - if (auto baseOffsetVal = as<IRIntLit>(baseOffset)) { - isUnaligned = ((baseOffsetVal->getValue() + immediateOffset) % (elementStride * elementCount)) != 0; + if (auto baseOffsetVal = as<IRIntLit>(offset)) + { + // If the offset is a constant known at compile time, simply check if it aligned to + // the elementsize of the underlying resource. 
+ return (baseOffsetVal->getValue() % alignmentVal) == 0; + } + else if (auto alignInst = as<IRIntLit>(unknownOffsetAlignment)) + { + // If the offset is not known during compile time, use the explicit align + // field of the overloaded `Load` or `Store` operation or vi `LoadAligned` + // or `StoreAligned` function. + // + // Unaligned `Load`s or `Store`s are identified with 0 alignment, to prevent + // accidentally issuing a wide vectorized operations. + if (!alignInst->getValue()) + return false; + + if ((alignInst->getValue() % alignmentVal) == 0) + { + return true; + } + m_sink->diagnose(offset->sourceLoc, Slang::Diagnostics::byteAddressBufferUnaligned, alignInst->getValue(), alignmentVal); } - return isUnaligned; + return false; } SlangResult getOffset(TargetProgram* target, IRStructField* field, IRIntegerValue* outOffset) @@ -241,7 +261,7 @@ struct ByteAddressBufferLegalizationContext // given `type` from the given `buffer` at the required `baseOffset` // plus the `immediateOffset` if any. // - IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset) + IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment) { // The right way to load a value depends primarily // on the type, and secondarily on the options @@ -299,7 +319,7 @@ struct ByteAddressBufferLegalizationContext // for earlier fields will be left behind but can be eliminated // as dead code. // - auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset); + auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, alignment); if(!fieldVal) return nullptr; @@ -324,9 +344,23 @@ struct ByteAddressBufferLegalizationContext // legalization if the array type isn't in the right form // for us to proceed. 
// + if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount())) { - return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue()); + // Emit an aligned load operation on an array when using a LoadAligned inst. + // Else, fallback to scalarizing the loads. + IRSizeAndAlignment elementLayout; + SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout)); + IRIntegerValue elementStride = elementLayout.getStride(); + auto alignmentVal = elementStride * elementCountInst->getValue(); + if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal)) + { + return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue(), alignment); + } + else + { + return emitSimpleLoad(type, buffer, baseOffset, immediateOffset); + } } } else if( auto matType = as<IRMatrixType>(type) ) @@ -341,7 +375,7 @@ struct ByteAddressBufferLegalizationContext if( rowCountInst ) { auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount()); - return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue()); + return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue(), alignment); } } else @@ -354,7 +388,7 @@ struct ByteAddressBufferLegalizationContext getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment); for (Index c = 0; c < colCount; c++) { - auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset); + auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset, alignment); for (Index r = 0; r < rowCount; r++) { elements.add(m_builder.emitElementExtract(colVector, (IRIntegerValue)r)); @@ -382,11 
+416,15 @@ struct ByteAddressBufferLegalizationContext // if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount())) { - // Emit an aligned vector load operation when the data (elementCount * elementSize) is divisible - // by the offset. Else, fallback to scalarizing the loads. - if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue())) + // Emit an aligned vector load operation when using a LoadAligned inst. + // Else, fallback to scalarizing the loads. + IRSizeAndAlignment elementLayout; + SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout)); + IRIntegerValue elementStride = elementLayout.getStride(); + auto alignmentVal = elementStride * elementCountInst->getValue(); + if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal)) { - return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue()); + return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue(), alignment); } else { @@ -464,7 +502,7 @@ struct ByteAddressBufferLegalizationContext // Loading of sequences for arrays, matrices, and vectors is // bottlenecked through a single function. // - IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount) + IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment) { // Or goal here is to produce a value of the given `type`, loaded from `buffer` // at `baseOffset` plus `immediateOffset`. 
@@ -486,7 +524,7 @@ struct ByteAddressBufferLegalizationContext List<IRInst*> elementVals; for( IRIntegerValue ii = 0; ii < elementCount; ++ii ) { - auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride); + auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment); if(!elementVal) return nullptr; @@ -844,7 +882,7 @@ struct ByteAddressBufferLegalizationContext // the type of the store operation, but instead the operand // that represents the value to be stored. // - auto value = store->getOperand(2); + auto value = store->getOperand(3); auto type = value->getDataType(); // Types that are already legal to use don't require any processing. @@ -863,14 +901,14 @@ struct ByteAddressBufferLegalizationContext // performance issue, but we should still consider trying to // tighten this up and make all uhandled cases be hard errors). // - auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, value); + auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, store->getOperand(2), value); if(SLANG_FAILED(result)) return; store->removeAndDeallocate(); } - Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value) + Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment, IRInst* value) { // The flow for emitting a legal store is very similar to that for // legal loads; we will recurse on the structure of `type` and @@ -889,7 +927,7 @@ struct ByteAddressBufferLegalizationContext SLANG_RETURN_ON_FAIL(getOffset(m_targetProgram, field, &fieldOffset)); auto fieldVal = m_builder.emitFieldExtract(fieldType, value, field->getKey()); - SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, fieldVal)); + SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, 
immediateOffset + fieldOffset, alignment, fieldVal)); } return SLANG_OK; } @@ -900,7 +938,20 @@ struct ByteAddressBufferLegalizationContext // if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount())) { - return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue()); + // Emit an aligned store operation on an array when using a StoreAligned inst. + // Else, fallback to scalarizing the stores. + IRSizeAndAlignment elementLayout; + SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout)); + IRIntegerValue elementStride = elementLayout.getStride(); + auto alignmentVal = elementStride * elementCountInst->getValue(); + if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal)) + { + return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue(), alignment); + } + else + { + return emitSimpleStore(value->getDataType(), buffer, baseOffset, immediateOffset, value); + } } } else if( auto matType = as<IRMatrixType>(type) ) @@ -912,7 +963,7 @@ struct ByteAddressBufferLegalizationContext if( rowCountInst ) { auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount()); - return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue()); + return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue(), alignment); } } else @@ -935,7 +986,7 @@ struct ByteAddressBufferLegalizationContext auto colVector = m_builder.emitMakeVector(colVectorType, colVectorArgs); IRSizeAndAlignment colVectorSizeAlignment; getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment); - emitLegalStore(colVectorType, buffer, baseOffset, immediateOffset, colVector); + emitLegalStore(colVectorType, buffer, baseOffset, 
immediateOffset, alignment, colVector); immediateOffset += colVectorSizeAlignment.getStride(); } return SLANG_OK; @@ -945,11 +996,16 @@ struct ByteAddressBufferLegalizationContext { if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount())) { - // Emit an aligned vector store operation when the data (elementCount * elementSize) is divisible - // by the offset. Else, fallback to scalarizing the stores. - if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue())) + // Emit an aligned vector store operation when using a StoreAligned inst. + // Else, fallback to scalarizing the stores. + + IRSizeAndAlignment elementLayout; + SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout)); + IRIntegerValue elementStride = elementLayout.getStride(); + auto alignmentVal = elementStride * elementCountInst->getValue(); + if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal)) { - return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue()); + return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue(), alignment); } else { @@ -1023,7 +1079,7 @@ struct ByteAddressBufferLegalizationContext } } - Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount) + Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment) { // The store case for sequences is similar to the load case. 
// @@ -1038,7 +1094,7 @@ struct ByteAddressBufferLegalizationContext { auto elementIndex = m_builder.getIntValue(indexType, ii); auto elementVal = m_builder.emitElementExtract(elementType, value, elementIndex); - SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, elementVal)); + SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment, elementVal)); } return SLANG_OK; @@ -1050,6 +1106,7 @@ void legalizeByteAddressBufferOps( Session* session, TargetProgram* program, IRModule* module, + DiagnosticSink* sink, ByteAddressBufferLegalizationOptions const& options) { ByteAddressBufferLegalizationContext context; @@ -1057,6 +1114,7 @@ void legalizeByteAddressBufferOps( context.m_target = program->getTargetReq(); context.m_options = options; context.m_targetProgram = program; + context.m_sink = sink; context.processModule(module); } diff --git a/source/slang/slang-ir-byte-address-legalize.h b/source/slang/slang-ir-byte-address-legalize.h index 996a93c73..71ab8a4e1 100644 --- a/source/slang/slang-ir-byte-address-legalize.h +++ b/source/slang/slang-ir-byte-address-legalize.h @@ -6,6 +6,7 @@ namespace Slang class Session; class TargetProgram; struct IRModule; +class DiagnosticSink; struct ByteAddressBufferLegalizationOptions { @@ -24,6 +25,7 @@ void legalizeByteAddressBufferOps( Session* session, TargetProgram* target, IRModule* module, + DiagnosticSink* sink, ByteAddressBufferLegalizationOptions const& options); } diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h index f4954375d..4bad614b3 100644 --- a/source/slang/slang-ir-inst-defs.h +++ b/source/slang/slang-ir-inst-defs.h @@ -428,25 +428,27 @@ INST(ImageStore, imageStore, 3, 0) // Load (almost) arbitrary-type data from a byte-address buffer // -// %dst = byteAddressBufferLoad(%buffer, %offset) +// %dst = byteAddressBufferLoad(%buffer, %offset, %alignment) // // where // - `buffer` 
is a value of some `ByteAddressBufferTypeBase` type // - `offset` is an `int` +// - `alignment` is an `int` // - `dst` is a value of some type containing only ordinary data // -INST(ByteAddressBufferLoad, byteAddressBufferLoad, 2, 0) +INST(ByteAddressBufferLoad, byteAddressBufferLoad, 3, 0) // Store (almost) arbitrary-type data to a byte-address buffer // -// byteAddressBufferLoad(%buffer, %offset, %src) +// byteAddressBufferLoad(%buffer, %offset, %alignment, %src) // // where // - `buffer` is a value of some `ByteAddressBufferTypeBase` type // - `offset` is an `int` +// - `alignment` is an `int` // - `src` is a value of some type containing only ordinary data // -INST(ByteAddressBufferStore, byteAddressBufferStore, 3, 0) +INST(ByteAddressBufferStore, byteAddressBufferStore, 4, 0) // Load data from a structured buffer // diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h index 5c4f01ae7..f0613dfa5 100644 --- a/source/slang/slang-ir-insts.h +++ b/source/slang/slang-ir-insts.h @@ -2252,6 +2252,10 @@ struct IRLayoutDecoration : IRDecoration }; // +struct IRAlignOf : IRInst +{ + IRInst* getBaseOp() { return getOperand(0); } +}; struct IRCall : IRInst { diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp index f35fa6750..6a4e9360a 100644 --- a/source/slang/slang-ir-layout.cpp +++ b/source/slang/slang-ir-layout.cpp @@ -497,7 +497,6 @@ struct Std140LayoutRules : IRTypeLayoutRules Result getNaturalSizeAndAlignment(CompilerOptionSet& optionSet, IRType* type, IRSizeAndAlignment* outSizeAndAlignment) { return getSizeAndAlignment(optionSet, IRTypeLayoutRules::getNatural(), type, outSizeAndAlignment); - } Result getNaturalOffset(CompilerOptionSet& optionSet, IRStructField* field, IRIntegerValue* outOffset) diff --git a/source/slang/slang-ir-peephole.cpp b/source/slang/slang-ir-peephole.cpp index 88b26fbd3..16e440b32 100644 --- a/source/slang/slang-ir-peephole.cpp +++ b/source/slang/slang-ir-peephole.cpp @@ -250,6 +250,30 @@ 
struct PeepholeContext : InstPassBase switch (inst->getOp()) { + case kIROp_AlignOf: + // Fold all calls to alignOf<T>() that returns a simple integer value. + if (inst->getDataType()->getOp() == kIROp_IntType) + { + if (!targetProgram) + break; + + // Save the alignment information and exit early if it is invalid + IRSizeAndAlignment sizeAlignment; + auto alignOfInst = as<IRAlignOf>(inst); + auto baseType = alignOfInst->getBaseOp()->getDataType(); + if (SLANG_FAILED(getNaturalSizeAndAlignment(targetProgram->getOptionSet(), baseType, &sizeAlignment))) + break; + if (sizeAlignment.size == 0) + break; + + IRBuilder builder(module); + builder.setInsertBefore(inst); + auto stride = builder.getIntValue(inst->getDataType(), sizeAlignment.getStride()); + inst->replaceUsesWith(stride); + maybeRemoveOldInst(inst); + changed = true; + } + break; case kIROp_GetResultError: if (inst->getOperand(0)->getOp() == kIROp_MakeResultError) { |
