summaryrefslogtreecommitdiff
path: root/source/slang
diff options
context:
space:
mode:
authorSriram Murali <85252063+sriramm-nv@users.noreply.github.com>2024-04-30 12:20:16 -0700
committerGitHub <noreply@github.com>2024-04-30 12:20:16 -0700
commit70111daf43c87e182695666c34345e061e114a68 (patch)
tree5510ea669f9c5e7853810f84dfe9e01358ab77c6 /source/slang
parent95ca2aa72cca9968dabd6dfa1a4dcf7de3909035 (diff)
Generate vectorized version of byteaddress load/store methods (#4036)
Fixes #3533 - Add logic to perform aligned memory operations for loading from and storing to composite resources, like vectors within the ByteAddress legalize pass. - Checks Added a new test for byte address with/without alignment. --------- Co-authored-by: Yong He <yonghe@outlook.com>
Diffstat (limited to 'source/slang')
-rw-r--r--source/slang/hlsl.meta.slang6
-rw-r--r--source/slang/slang-emit.cpp15
-rw-r--r--source/slang/slang-ir-byte-address-legalize.cpp63
3 files changed, 60 insertions, 24 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 284cb2abc..8593dc268 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2801,6 +2801,7 @@ struct $(item.name)
}
[__NoSideEffect]
+ [ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
uint Load(int location)
{
@@ -2816,6 +2817,7 @@ struct $(item.name)
uint Load(int location, out uint status);
[__NoSideEffect]
+ [ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
uint2 Load2(int location)
{
@@ -2831,6 +2833,7 @@ struct $(item.name)
uint2 Load2(int location, out uint status);
[__NoSideEffect]
+ [ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
uint3 Load3(int location)
{
@@ -2846,6 +2849,7 @@ struct $(item.name)
uint3 Load3(int location, out uint status);
[__NoSideEffect]
+ [ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
uint4 Load4(int location)
{
@@ -2861,6 +2865,7 @@ struct $(item.name)
uint4 Load4(int location, out uint status);
[__NoSideEffect]
+ [ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
T Load<T>(int location)
{
@@ -3764,6 +3769,7 @@ ${{{{
}
}
+ [ForceInline]
void Store<T>(int offset, T value)
{
__byteAddressBufferStore(this, offset, value);
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 87f0911e7..1fa04b4be 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -716,14 +716,13 @@ Result linkAndOptimizeIR(
// of a buffer need not be more than 4-byte aligned, and loads
// of vectors need only be aligned based on their element type).
//
- // TODO: We should consider having an extended variant of `Load<T>`
- // on byte-address buffers which expresses a programmer's knowledge
- // that the load will have greater alignment than required by D3D.
- // That could either come as an explicit guaranteed-alignment
- // operand, or instead as something like a `Load4Aligned<T>` operation
- // that returns a `vector<4,T>` and assumes `4*sizeof(T)` alignemtn.
- //
- byteAddressBufferOptions.scalarizeVectorLoadStore = true;
+ // Slang IR supports a variant of `Load<T>` on byte-address buffers
+ // that will have greater alignment than required by D3D. The
+ // alignment information is inferred from the operation like a
+ // `Load4Aligned<T>` that returns a `vector<4,T>` that assumes a
+ // `4*sizeof(T)` alignment. We may choose to disable that in favor
+ // of byte-address indexing by setting this flag to true.
+ byteAddressBufferOptions.scalarizeVectorLoadStore = false;
// For GLSL targets, there really isn't a low-level concept
// of a byte-address buffer at all, and the standard "shader storage
diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp
index a089ca8ff..35040da64 100644
--- a/source/slang/slang-ir-byte-address-legalize.cpp
+++ b/source/slang/slang-ir-byte-address-legalize.cpp
@@ -205,6 +205,19 @@ struct ByteAddressBufferLegalizationContext
return false;
}
+ bool checkUnaligned(IRInst* baseOffset, IRIntegerValue immediateOffset, IRType* elementType, IRIntegerValue elementCount)
+ {
+ // Check whether the given composite resource type is aligned to the baseOffset
+ IRSizeAndAlignment elementLayout;
+ SLANG_RETURN_FALSE_ON_FAIL(getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), elementType, &elementLayout));
+ IRIntegerValue elementStride = elementLayout.getStride();
+ bool isUnaligned = true;
+ if (auto baseOffsetVal = as<IRIntLit>(baseOffset)) {
+ isUnaligned = ((baseOffsetVal->getValue() + immediateOffset) % (elementStride * elementCount)) != 0;
+ }
+ return isUnaligned;
+ }
+
SlangResult getOffset(TargetProgram* target, IRStructField* field, IRIntegerValue* outOffset)
{
if (target->getHLSLToVulkanLayoutOptions() && target->getHLSLToVulkanLayoutOptions()->shouldUseGLLayout())
@@ -299,7 +312,7 @@ struct ByteAddressBufferLegalizationContext
//
return m_builder.emitMakeStruct(type, fieldVals);
}
- else if( auto arrayType = as<IRArrayTypeBase>(type) )
+ else if (auto arrayType = as<IRArrayTypeBase>(type))
{
// Loading a value of array type amounts to loading each
// of its elements. There is shared logic between the
@@ -311,8 +324,7 @@ struct ByteAddressBufferLegalizationContext
// legalization if the array type isn't in the right form
// for us to proceed.
//
- auto elementCountInst = as<IRIntLit>(arrayType->getElementCount());
- if( elementCountInst )
+ if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
{
return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue());
}
@@ -361,17 +373,25 @@ struct ByteAddressBufferLegalizationContext
return m_builder.emitMakeMatrix(matType, (UInt)args.getCount(), args.getBuffer());
}
}
- else if( auto vecType = as<IRVectorType>(type) )
+ else if (auto vecType = as<IRVectorType>(type))
{
// One of the options that can vary per-target is whether to
// scalarize vetor load/store operations. When that option
// is turned on, we can treat a vector load just like an
// array load.
//
- auto elementCountInst = as<IRIntLit>(vecType->getElementCount());
- if( m_options.scalarizeVectorLoadStore && elementCountInst)
+ if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
{
- return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue());
+ // Emit an aligned vector load operation when the data (elementCount * elementSize) is divisible
+ // by the offset. Else, fallback to scalarizing the loads.
+ if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+ {
+ return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue());
+ }
+ else
+ {
+ return emitSimpleLoad(type, buffer, baseOffset, immediateOffset);
+ }
}
// If we aren't scalarizing a vetor load then we next need
@@ -649,6 +669,10 @@ struct ByteAddressBufferLegalizationContext
IRInst* getEquivalentStructuredBuffer(IRType* elementType, IRInst* byteAddressBuffer)
{
+ if (!elementType)
+ {
+ return nullptr;
+ }
// The simple case for replacement is when the byte-address buffer to
// be replaced is a global shader parameter. That path will get its
// own routine.
@@ -869,13 +893,12 @@ struct ByteAddressBufferLegalizationContext
}
return SLANG_OK;
}
- else if( auto arrayType = as<IRArrayTypeBase>(type) )
+ else if (auto arrayType = as<IRArrayTypeBase>(type))
{
// Arrays and other sequences bottleneck through a helper
// function, which we will cover later.
//
- auto elementCountInst = as<IRIntLit>(arrayType->getElementCount());
- if( elementCountInst )
+ if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
{
return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue());
}
@@ -918,12 +941,20 @@ struct ByteAddressBufferLegalizationContext
return SLANG_OK;
}
}
- else if( auto vecType = as<IRVectorType>(type) )
+ else if (auto vecType = as<IRVectorType>(type))
{
- auto elementCountInst = as<IRIntLit>(vecType->getElementCount());
- if( m_options.scalarizeVectorLoadStore && elementCountInst)
+ if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
{
- return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue());
+ // Emit an aligned vector store operation when the data (elementCount * elementSize) is divisible
+ // by the offset. Else, fallback to scalarizing the stores.
+ if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+ {
+ return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue());
+ }
+ else
+ {
+ return emitSimpleStore(value->getDataType(), buffer, baseOffset, immediateOffset, value);
+ }
}
if(m_options.useBitCastFromUInt)
@@ -956,9 +987,9 @@ struct ByteAddressBufferLegalizationContext
return emitSimpleStore(type, buffer, baseOffset, immediateOffset, value);
}
- Result emitSimpleStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOfset, IRInst* value)
+ Result emitSimpleStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value)
{
- IRInst* offset = emitOffsetAddIfNeeded(baseOffset, immediateOfset);
+ IRInst* offset = emitOffsetAddIfNeeded(baseOffset, immediateOffset);
if( m_options.translateToStructuredBufferOps )
{
if( auto structuredBuffer = getEquivalentStructuredBuffer(type, buffer) )