summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSriram Murali <85252063+sriramm-nv@users.noreply.github.com>2024-05-13 23:57:57 -0700
committerGitHub <noreply@github.com>2024-05-13 23:57:57 -0700
commit487ae034e2b03ddd67945132c8fecbd937952705 (patch)
tree036d318a64385151ad9d5e7275c2e387fdca6cee
parent9f23046138629f78995d54a7722ad6749bd84db9 (diff)
Add LoadAligned and StoreAligned methods to ByteAddressBuffers (#4066)
Fixes #4062 This change enables wide loads/stores for byte-address-buffer backed resources, when the data is accessed at an offset that is aligned. **Goals** - Improve performance by issuing wider instructions instead of a sequence of scalar instructions, for loads and stores of byte-address buffers. - Reduce code size and improve readability of the generated shaders. - Help naive users as well as ninja programmers generate optimal code. **Non Goals** - Help with Structured buffers, or other resources. - Target compilation time improvements. **Key changes** Adds 2 new overloads for Load and Store operations on ByteAddress Buffers. 1. Load / Store with an extra alignment parameter ``` resource.Load<T>(offset, alignment); resource.Store<T>(offset, value, alignment); ``` 2. LoadAligned / StoreAligned with no extra parameter, with the same signature as the original Load / Store. ``` resource.LoadAligned<T>(offset); resource.StoreAligned<T>(offset, value); ``` - This overload will implicitly identify the alignment value, from the base type T of the elementary unit of the resource. **Supported resources** 1. Vectors This can be up to 4 elements, i.e. float -- float4. 2. Arrays This does not have a limit on the number of elements, but as a conservative estimate, we can limit it to a few hundred. 3. Structures This is used to group a resource of a single type. ``` struct { float4 x; } ``` **Code updates** - Modified byte-address-ir legalize to handle struct, array and vector kinds of load or store access. - Added custom hlsl stdlib functions to implement all the overloads for Load, Store etc. - Added C-like emitter, SPIR-V emitter for handling ByteAddressBuffers. - Added a new core stdlib function intrinsic to wrap around alignOf<T>(). - Added a new peephole optimization entry to identify the equivalent IntLiteral value from the alignOf<T>() inst. - Added tests to check explicit and implicit aligned Load and Store operations.
-rw-r--r--source/slang/core.meta.slang9
-rw-r--r--source/slang/hlsl.meta.slang327
-rw-r--r--source/slang/slang-diagnostic-defs.h1
-rw-r--r--source/slang/slang-emit-c-like.cpp22
-rw-r--r--source/slang/slang-emit.cpp2
-rw-r--r--source/slang/slang-ir-byte-address-legalize.cpp140
-rw-r--r--source/slang/slang-ir-byte-address-legalize.h2
-rw-r--r--source/slang/slang-ir-inst-defs.h10
-rw-r--r--source/slang/slang-ir-insts.h4
-rw-r--r--source/slang/slang-ir-layout.cpp1
-rw-r--r--source/slang/slang-ir-peephole.cpp24
-rw-r--r--tests/compute/byte-address-buffer-align-error.slang24
-rw-r--r--tests/compute/byte-address-buffer-aligned.slang8
-rw-r--r--tests/compute/byte-address-buffer-array.slang77
14 files changed, 561 insertions, 90 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index bde943972..63bc2571b 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -2412,6 +2412,15 @@ int __naturalStrideOf()
return __naturalStrideOf_impl(__declVal<T>());
}
+__intrinsic_op($(kIROp_AlignOf))
+int __alignOf_intrinsic_impl<T>(T t);
+
+[ForceInline]
+int __alignOf_intrinsic<T>()
+{
+ return __alignOf_intrinsic_impl<T>(__default<T>());
+}
+
__intrinsic_op($(kIROp_TreatAsDynamicUniform))
T asDynamicUniform<T>(T v);
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 303d18771..95ca03beb 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -108,7 +108,7 @@ struct ByteAddressBuffer
{
case hlsl: __intrinsic_asm ".Load";
default:
- return __byteAddressBufferLoad<uint>(this, location);
+ return __byteAddressBufferLoad<uint>(this, location, 0);
}
}
@@ -124,7 +124,33 @@ struct ByteAddressBuffer
{
case hlsl: __intrinsic_asm ".Load2";
default:
- return __byteAddressBufferLoad<uint2>(this, location);
+ return __byteAddressBufferLoad<uint2>(this, location, 0);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint2 Load2(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load2";
+ default:
+ return __byteAddressBufferLoad<uint2>(this, location, alignment);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint2 Load2Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load2";
+ default:
+ return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>());
}
}
@@ -140,7 +166,33 @@ struct ByteAddressBuffer
{
case hlsl: __intrinsic_asm ".Load3";
default:
- return __byteAddressBufferLoad<uint3>(this, location);
+ return __byteAddressBufferLoad<uint3>(this, location, 0);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint3 Load3(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load3";
+ default:
+ return __byteAddressBufferLoad<uint3>(this, location, alignment);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint3 Load3Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load3";
+ default:
+ return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>());
}
}
@@ -156,7 +208,33 @@ struct ByteAddressBuffer
{
case hlsl: __intrinsic_asm ".Load4";
default:
- return __byteAddressBufferLoad<uint4>(this, location);
+ return __byteAddressBufferLoad<uint4>(this, location, 0);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint4 Load4(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load4";
+ default:
+ return __byteAddressBufferLoad<uint4>(this, location, alignment);
+ }
+ }
+
+ [__readNone]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+ uint4 Load4Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load4";
+ default:
+ return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>());
}
}
@@ -167,7 +245,21 @@ struct ByteAddressBuffer
[ForceInline]
T Load<T>(int location)
{
- return __byteAddressBufferLoad<T>(this, location);
+ return __byteAddressBufferLoad<T>(this, location, 0);
+ }
+
+ [__readNone]
+ [ForceInline]
+ T Load<T>(int location, int alignment)
+ {
+ return __byteAddressBufferLoad<T>(this, location, alignment);
+ }
+
+ [__readNone]
+ [ForceInline]
+ T LoadAligned<T>(int location)
+ {
+ return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>());
}
};
@@ -2765,23 +2857,23 @@ uint64_t __asuint64(uint2 i)
__intrinsic_op($(kIROp_ByteAddressBufferLoad))
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
-T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset, int alignment);
__intrinsic_op($(kIROp_ByteAddressBufferLoad))
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset, int alignment);
__intrinsic_op($(kIROp_ByteAddressBufferLoad))
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment);
__intrinsic_op($(kIROp_ByteAddressBufferStore))
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, T value);
+void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, int alignment, T value);
__intrinsic_op($(kIROp_ByteAddressBufferStore))
[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, T value);
+void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment, T value);
__generic<T, L:IBufferDataLayout=DefaultDataLayout>
__magic_type(HLSLStructuredBufferType)
@@ -2898,7 +2990,7 @@ struct $(item.name)
{
case hlsl: __intrinsic_asm ".Load";
default:
- return __byteAddressBufferLoad<uint>(this, location);
+ return __byteAddressBufferLoad<uint>(this, location, 0);
}
}
@@ -2914,7 +3006,33 @@ struct $(item.name)
{
case hlsl: __intrinsic_asm ".Load2";
default:
- return __byteAddressBufferLoad<uint2>(this, location);
+ return __byteAddressBufferLoad<uint2>(this, location, 0);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint2 Load2(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load2";
+ default:
+ return __byteAddressBufferLoad<uint2>(this, location, alignment);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint2 Load2Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load2";
+ default:
+ return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>());
}
}
@@ -2930,7 +3048,33 @@ struct $(item.name)
{
case hlsl: __intrinsic_asm ".Load3";
default:
- return __byteAddressBufferLoad<uint3>(this, location);
+ return __byteAddressBufferLoad<uint3>(this, location, 0);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint3 Load3(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load3";
+ default:
+ return __byteAddressBufferLoad<uint3>(this, location, alignment);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint3 Load3Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load3";
+ default:
+ return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>());
}
}
@@ -2946,7 +3090,33 @@ struct $(item.name)
{
case hlsl: __intrinsic_asm ".Load4";
default:
- return __byteAddressBufferLoad<uint4>(this, location);
+ return __byteAddressBufferLoad<uint4>(this, location, 0);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint4 Load4(int location, int alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load4";
+ default:
+ return __byteAddressBufferLoad<uint4>(this, location, alignment);
+ }
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ uint4 Load4Aligned(int location)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Load4";
+ default:
+ return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>());
}
}
@@ -2958,8 +3128,25 @@ struct $(item.name)
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
T Load<T>(int location)
{
- return __byteAddressBufferLoad<T>(this, location);
+ return __byteAddressBufferLoad<T>(this, location, 0);
}
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ T Load<T>(int location, int alignment)
+ {
+ return __byteAddressBufferLoad<T>(this, location, alignment);
+ }
+
+ [__NoSideEffect]
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ T LoadAligned<T>(int location)
+ {
+ return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>());
+ }
+
${{{{
if (item.op == kIROp_HLSLRWByteAddressBufferType)
{
@@ -3806,18 +3993,17 @@ ${{{{
[ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
- void Store(
- uint address,
- uint value)
+ void Store(uint address, uint value)
{
__target_switch
{
case hlsl: __intrinsic_asm ".Store";
default:
- __byteAddressBufferStore(this, address, value);
+ __byteAddressBufferStore(this, address, 0, value);
}
}
+
[ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
void Store2(uint address, uint2 value)
@@ -3826,42 +4012,125 @@ ${{{{
{
case hlsl: __intrinsic_asm ".Store2";
default:
- __byteAddressBufferStore(this, address, value);
+ __byteAddressBufferStore(this, address, 0, value);
+ }
+ }
+
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store2(uint address, uint2 value, uint alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store2";
+ default:
+ __byteAddressBufferStore(this, address, alignment, value);
}
}
[ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
- void Store3(
- uint address,
- uint3 value)
+ void Store2Aligned(uint address, uint2 value)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store2";
+ default:
+ __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint2>(), value);
+ }
+ }
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store3(uint address, uint3 value)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store3";
+ default:
+ __byteAddressBufferStore(this, address, 0, value);
+ }
+ }
+
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store3(uint address, uint3 value, uint alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store3";
+ default:
+ __byteAddressBufferStore(this, address, alignment, value);
+ }
+ }
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store3Aligned(uint address, uint3 value)
{
__target_switch
{
case hlsl: __intrinsic_asm ".Store3";
default:
- __byteAddressBufferStore(this, address, value);
+ __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint3>(), value);
+ }
+ }
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store4(uint address, uint4 value)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store4";
+ default:
+ __byteAddressBufferStore(this, address, 0, value);
+ }
+ }
+
+
+ [ForceInline]
+ [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ void Store4(uint address, uint4 value, uint alignment)
+ {
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm ".Store4";
+ default:
+ __byteAddressBufferStore(this, address, alignment, value);
}
}
[ForceInline]
[require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
- void Store4(
- uint address,
- uint4 value)
+ void Store4Aligned(uint address, uint4 value)
{
__target_switch
{
case hlsl: __intrinsic_asm ".Store4";
default:
- __byteAddressBufferStore(this, address, value);
+ __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint4>(), value);
}
}
[ForceInline]
void Store<T>(int offset, T value)
{
- __byteAddressBufferStore(this, offset, value);
+ __byteAddressBufferStore(this, offset, 0, value);
+ }
+
+ [ForceInline]
+ void Store<T>(int offset, T value, uint alignment)
+ {
+ __byteAddressBufferStore(this, offset, alignment, value);
+ }
+
+ [ForceInline]
+ void StoreAligned<T>(int offset, T value)
+ {
+ __byteAddressBufferStore(this, offset, __alignOf_intrinsic<T>(), value);
}
};
diff --git a/source/slang/slang-diagnostic-defs.h b/source/slang/slang-diagnostic-defs.h
index eb131df21..c2c4953e0 100644
--- a/source/slang/slang-diagnostic-defs.h
+++ b/source/slang/slang-diagnostic-defs.h
@@ -753,6 +753,7 @@ DIAGNOSTIC(41201, Warning, expectDynamicUniformValue, "value stored at this loca
DIAGNOSTIC(41202, Error, notEqualBitCastSize, "invalid to bit_cast differently sized types: '$0' with size '$1' casted into '$2' with size '$3'")
DIAGNOSTIC(41203, Warning, notEqualReinterpretCastSize, "reinterpret<> into not equally sized types: '$0' with size '$1' casted into '$2' with size '$3'")
+DIAGNOSTIC(41300, Error, byteAddressBufferUnaligned, "invalid alignment `$0` specified for the byte address buffer resource with the element size of `$1`")
//
// 5xxxx - Target code generation.
//
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index b44fa677c..19a7930f6 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -2674,6 +2674,7 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
break;
case kIROp_ByteAddressBufferLoad:
+ {
m_writer->emit("(");
emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
m_writer->emit(").Load<");
@@ -2682,20 +2683,21 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
m_writer->emit(")");
break;
+ }
case kIROp_ByteAddressBufferStore:
- {
- auto prec = getInfo(EmitOp::Postfix);
- needClose = maybeEmitParens(outerPrec, prec);
+ {
+ auto prec = getInfo(EmitOp::Postfix);
+ needClose = maybeEmitParens(outerPrec, prec);
- emitOperand(inst->getOperand(0), leftSide(outerPrec, prec));
- m_writer->emit(".Store(");
- emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
- m_writer->emit(",");
- emitOperand(inst->getOperand(2), getInfo(EmitOp::General));
- m_writer->emit(")");
- }
+ emitOperand(inst->getOperand(0), leftSide(outerPrec, prec));
+ m_writer->emit(".Store(");
+ emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+ m_writer->emit(",");
+ emitOperand(inst->getOperand(inst->getOperandCount() - 1), getInfo(EmitOp::General));
+ m_writer->emit(")");
break;
+ }
case kIROp_PackAnyValue:
{
m_writer->emit("packAnyValue<");
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 39ebaa64d..ab22e1f90 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -777,7 +777,7 @@ Result linkAndOptimizeIR(
break;
}
- legalizeByteAddressBufferOps(session, targetProgram, irModule, byteAddressBufferOptions);
+ legalizeByteAddressBufferOps(session, targetProgram, irModule, codeGenContext->getSink(), byteAddressBufferOptions);
}
// For CUDA targets only, we will need to turn operations
diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp
index 35040da64..0561d8744 100644
--- a/source/slang/slang-ir-byte-address-legalize.cpp
+++ b/source/slang/slang-ir-byte-address-legalize.cpp
@@ -28,6 +28,7 @@ struct ByteAddressBufferLegalizationContext
TargetRequest* m_target = nullptr;
ByteAddressBufferLegalizationOptions m_options;
+ DiagnosticSink* m_sink = nullptr;
// We will also use a central IR builder when generating new
// code as part of legalization (rather than create/destroy
// IR builders on the fly).
@@ -124,14 +125,15 @@ struct ByteAddressBufferLegalizationContext
//
auto buffer = load->getOperand(0);
auto offset = load->getOperand(1);
- auto legalLoad = emitLegalLoad(type, buffer, offset, 0);
+ auto alignment = load->getOperand(2);
+ auto legalLoad = emitLegalLoad(type, buffer, offset, 0, alignment);
// If it currently possible for the legalization
// to fail (perhaps because of something else that
// is invalid in the IR), so we will defensively
// leave the code along in that case.
//
- if(!legalLoad)
+ if (!legalLoad)
return;
// If we were able to generate a legal load operation,
@@ -154,21 +156,21 @@ struct ByteAddressBufferLegalizationContext
// operations, then that means *no* type is
// legal for byte-address load/store.
//
- if(m_options.translateToStructuredBufferOps)
+ if (m_options.translateToStructuredBufferOps)
return false;
// Basic types are usually legal to load/store
// on all targets.
//
- if( auto basicType = as<IRBasicType>(type) )
+ if (auto basicType = as<IRBasicType>(type))
{
// On targets that require translation to
// make all load/store use `uint` values,
// any scalar type that isn't `uint` is
// illegal.
//
- if( m_options.useBitCastFromUInt
- && basicType->getBaseType() != BaseType::UInt )
+ if (m_options.useBitCastFromUInt
+ && basicType->getBaseType() != BaseType::UInt)
{
return false;
}
@@ -181,13 +183,13 @@ struct ByteAddressBufferLegalizationContext
// Vector types also depend on the options.
//
- if( as<IRVectorType>(type) )
+ if (as<IRVectorType>(type))
{
// If we've been asked to scalarize all
// vector load/store, then we need to
// tread them as illegal.
//
- if(m_options.scalarizeVectorLoadStore)
+ if (m_options.scalarizeVectorLoadStore)
return false;
}
@@ -205,17 +207,35 @@ struct ByteAddressBufferLegalizationContext
return false;
}
- bool checkUnaligned(IRInst* baseOffset, IRIntegerValue immediateOffset, IRType* elementType, IRIntegerValue elementCount)
+ // Helper function to check whether an access is aligned to `alignmentVal`:
+ // either the compile-time constant offset or, failing that, the caller-supplied
+ // alignment must be a multiple of it for the load or store to be vectorized.
+ bool isAligned(IRInst* offset, IRInst* unknownOffsetAlignment, IRIntegerValue alignmentVal)
{
- // Check whether the given composite resource type is aligned to the baseOffset
- IRSizeAndAlignment elementLayout;
- SLANG_RETURN_FALSE_ON_FAIL(getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), elementType, &elementLayout));
- IRIntegerValue elementStride = elementLayout.getStride();
- bool isUnaligned = true;
- if (auto baseOffsetVal = as<IRIntLit>(baseOffset)) {
- isUnaligned = ((baseOffsetVal->getValue() + immediateOffset) % (elementStride * elementCount)) != 0;
+ if (auto baseOffsetVal = as<IRIntLit>(offset))
+ {
+ // If the offset is a constant known at compile time, simply check if it is aligned to
+ // the element size of the underlying resource.
+ return (baseOffsetVal->getValue() % alignmentVal) == 0;
+ }
+ else if (auto alignInst = as<IRIntLit>(unknownOffsetAlignment))
+ {
+ // If the offset is not known at compile time, use the explicit alignment
+ // argument of the overloaded `Load` or `Store` operation, or the one supplied via the
+ // `LoadAligned` or `StoreAligned` function.
+ //
+ // Unaligned `Load`s or `Store`s are identified with 0 alignment, to prevent
+ // accidentally issuing wide vectorized operations.
+ if (!alignInst->getValue())
+ return false;
+
+ if ((alignInst->getValue() % alignmentVal) == 0)
+ {
+ return true;
+ }
+ m_sink->diagnose(offset->sourceLoc, Slang::Diagnostics::byteAddressBufferUnaligned, alignInst->getValue(), alignmentVal);
}
- return isUnaligned;
+ return false;
}
SlangResult getOffset(TargetProgram* target, IRStructField* field, IRIntegerValue* outOffset)
@@ -241,7 +261,7 @@ struct ByteAddressBufferLegalizationContext
// given `type` from the given `buffer` at the required `baseOffset`
// plus the `immediateOffset` if any.
//
- IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset)
+ IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment)
{
// The right way to load a value depends primarily
// on the type, and secondarily on the options
@@ -299,7 +319,7 @@ struct ByteAddressBufferLegalizationContext
// for earlier fields will be left behind but can be eliminated
// as dead code.
//
- auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset);
+ auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, alignment);
if(!fieldVal)
return nullptr;
@@ -324,9 +344,23 @@ struct ByteAddressBufferLegalizationContext
// legalization if the array type isn't in the right form
// for us to proceed.
//
+
if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
{
- return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue());
+ // Emit an aligned load operation on an array when using a LoadAligned inst.
+ // Else, fallback to scalarizing the loads.
+ IRSizeAndAlignment elementLayout;
+ SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout));
+ IRIntegerValue elementStride = elementLayout.getStride();
+ auto alignmentVal = elementStride * elementCountInst->getValue();
+ if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
+ {
+ return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue(), alignment);
+ }
+ else
+ {
+ return emitSimpleLoad(type, buffer, baseOffset, immediateOffset);
+ }
}
}
else if( auto matType = as<IRMatrixType>(type) )
@@ -341,7 +375,7 @@ struct ByteAddressBufferLegalizationContext
if( rowCountInst )
{
auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount());
- return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue());
+ return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue(), alignment);
}
}
else
@@ -354,7 +388,7 @@ struct ByteAddressBufferLegalizationContext
getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment);
for (Index c = 0; c < colCount; c++)
{
- auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset);
+ auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset, alignment);
for (Index r = 0; r < rowCount; r++)
{
elements.add(m_builder.emitElementExtract(colVector, (IRIntegerValue)r));
@@ -382,11 +416,15 @@ struct ByteAddressBufferLegalizationContext
//
if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
{
- // Emit an aligned vector load operation when the data (elementCount * elementSize) is divisible
- // by the offset. Else, fallback to scalarizing the loads.
- if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+ // Emit an aligned vector load operation when using a LoadAligned inst.
+ // Else, fallback to scalarizing the loads.
+ IRSizeAndAlignment elementLayout;
+ SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout));
+ IRIntegerValue elementStride = elementLayout.getStride();
+ auto alignmentVal = elementStride * elementCountInst->getValue();
+ if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
{
- return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue());
+ return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue(), alignment);
}
else
{
@@ -464,7 +502,7 @@ struct ByteAddressBufferLegalizationContext
// Loading of sequences for arrays, matrices, and vectors is
// bottlenecked through a single function.
//
- IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount)
+ IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment)
{
// Or goal here is to produce a value of the given `type`, loaded from `buffer`
// at `baseOffset` plus `immediateOffset`.
@@ -486,7 +524,7 @@ struct ByteAddressBufferLegalizationContext
List<IRInst*> elementVals;
for( IRIntegerValue ii = 0; ii < elementCount; ++ii )
{
- auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride);
+ auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment);
if(!elementVal)
return nullptr;
@@ -844,7 +882,7 @@ struct ByteAddressBufferLegalizationContext
// the type of the store operation, but instead the operand
// that represents the value to be stored.
//
- auto value = store->getOperand(2);
+ auto value = store->getOperand(3);
auto type = value->getDataType();
// Types that are already legal to use don't require any processing.
@@ -863,14 +901,14 @@ struct ByteAddressBufferLegalizationContext
// performance issue, but we should still consider trying to
// tighten this up and make all uhandled cases be hard errors).
//
- auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, value);
+ auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, store->getOperand(2), value);
if(SLANG_FAILED(result))
return;
store->removeAndDeallocate();
}
- Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value)
+ Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment, IRInst* value)
{
// The flow for emitting a legal store is very similar to that for
// legal loads; we will recurse on the structure of `type` and
@@ -889,7 +927,7 @@ struct ByteAddressBufferLegalizationContext
SLANG_RETURN_ON_FAIL(getOffset(m_targetProgram, field, &fieldOffset));
auto fieldVal = m_builder.emitFieldExtract(fieldType, value, field->getKey());
- SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, fieldVal));
+ SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, alignment, fieldVal));
}
return SLANG_OK;
}
@@ -900,7 +938,20 @@ struct ByteAddressBufferLegalizationContext
//
if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
{
- return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue());
+ // Emit an aligned store operation on an array when using a StoreAligned inst.
+ // Else, fallback to scalarizing the stores.
+ IRSizeAndAlignment elementLayout;
+ SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout));
+ IRIntegerValue elementStride = elementLayout.getStride();
+ auto alignmentVal = elementStride * elementCountInst->getValue();
+ if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
+ {
+ return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue(), alignment);
+ }
+ else
+ {
+ return emitSimpleStore(value->getDataType(), buffer, baseOffset, immediateOffset, value);
+ }
}
}
else if( auto matType = as<IRMatrixType>(type) )
@@ -912,7 +963,7 @@ struct ByteAddressBufferLegalizationContext
if( rowCountInst )
{
auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount());
- return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue());
+ return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue(), alignment);
}
}
else
@@ -935,7 +986,7 @@ struct ByteAddressBufferLegalizationContext
auto colVector = m_builder.emitMakeVector(colVectorType, colVectorArgs);
IRSizeAndAlignment colVectorSizeAlignment;
getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment);
- emitLegalStore(colVectorType, buffer, baseOffset, immediateOffset, colVector);
+ emitLegalStore(colVectorType, buffer, baseOffset, immediateOffset, alignment, colVector);
immediateOffset += colVectorSizeAlignment.getStride();
}
return SLANG_OK;
@@ -945,11 +996,16 @@ struct ByteAddressBufferLegalizationContext
{
if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
{
- // Emit an aligned vector store operation when the data (elementCount * elementSize) is divisible
- // by the offset. Else, fallback to scalarizing the stores.
- if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+ // Emit an aligned vector store operation when using a StoreAligned inst.
+ // Otherwise, fall back to scalarizing the stores.
+
+ IRSizeAndAlignment elementLayout;
+ SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout));
+ IRIntegerValue elementStride = elementLayout.getStride();
+ auto alignmentVal = elementStride * elementCountInst->getValue();
+ if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
{
- return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue());
+ return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue(), alignment);
}
else
{
@@ -1023,7 +1079,7 @@ struct ByteAddressBufferLegalizationContext
}
}
- Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount)
+ Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment)
{
// The store case for sequences is similar to the load case.
//
@@ -1038,7 +1094,7 @@ struct ByteAddressBufferLegalizationContext
{
auto elementIndex = m_builder.getIntValue(indexType, ii);
auto elementVal = m_builder.emitElementExtract(elementType, value, elementIndex);
- SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, elementVal));
+ SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment, elementVal));
}
return SLANG_OK;
@@ -1050,6 +1106,7 @@ void legalizeByteAddressBufferOps(
Session* session,
TargetProgram* program,
IRModule* module,
+ DiagnosticSink* sink,
ByteAddressBufferLegalizationOptions const& options)
{
ByteAddressBufferLegalizationContext context;
@@ -1057,6 +1114,7 @@ void legalizeByteAddressBufferOps(
context.m_target = program->getTargetReq();
context.m_options = options;
context.m_targetProgram = program;
+ context.m_sink = sink;
context.processModule(module);
}
diff --git a/source/slang/slang-ir-byte-address-legalize.h b/source/slang/slang-ir-byte-address-legalize.h
index 996a93c73..71ab8a4e1 100644
--- a/source/slang/slang-ir-byte-address-legalize.h
+++ b/source/slang/slang-ir-byte-address-legalize.h
@@ -6,6 +6,7 @@ namespace Slang
class Session;
class TargetProgram;
struct IRModule;
+class DiagnosticSink;
struct ByteAddressBufferLegalizationOptions
{
@@ -24,6 +25,7 @@ void legalizeByteAddressBufferOps(
Session* session,
TargetProgram* target,
IRModule* module,
+ DiagnosticSink* sink,
ByteAddressBufferLegalizationOptions const& options);
}
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index f4954375d..4bad614b3 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -428,25 +428,27 @@ INST(ImageStore, imageStore, 3, 0)
// Load (almost) arbitrary-type data from a byte-address buffer
//
-// %dst = byteAddressBufferLoad(%buffer, %offset)
+// %dst = byteAddressBufferLoad(%buffer, %offset, %alignment)
//
// where
// - `buffer` is a value of some `ByteAddressBufferTypeBase` type
// - `offset` is an `int`
+// - `alignment` is an `int`
// - `dst` is a value of some type containing only ordinary data
//
-INST(ByteAddressBufferLoad, byteAddressBufferLoad, 2, 0)
+INST(ByteAddressBufferLoad, byteAddressBufferLoad, 3, 0)
// Store (almost) arbitrary-type data to a byte-address buffer
//
-// byteAddressBufferLoad(%buffer, %offset, %src)
+// byteAddressBufferStore(%buffer, %offset, %alignment, %src)
//
// where
// - `buffer` is a value of some `ByteAddressBufferTypeBase` type
// - `offset` is an `int`
+// - `alignment` is an `int`
// - `src` is a value of some type containing only ordinary data
//
-INST(ByteAddressBufferStore, byteAddressBufferStore, 3, 0)
+INST(ByteAddressBufferStore, byteAddressBufferStore, 4, 0)
// Load data from a structured buffer
//
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index 5c4f01ae7..f0613dfa5 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -2252,6 +2252,10 @@ struct IRLayoutDecoration : IRDecoration
};
//
+struct IRAlignOf : IRInst
+{
+ IRInst* getBaseOp() { return getOperand(0); }
+};
struct IRCall : IRInst
{
diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp
index f35fa6750..6a4e9360a 100644
--- a/source/slang/slang-ir-layout.cpp
+++ b/source/slang/slang-ir-layout.cpp
@@ -497,7 +497,6 @@ struct Std140LayoutRules : IRTypeLayoutRules
Result getNaturalSizeAndAlignment(CompilerOptionSet& optionSet, IRType* type, IRSizeAndAlignment* outSizeAndAlignment)
{
return getSizeAndAlignment(optionSet, IRTypeLayoutRules::getNatural(), type, outSizeAndAlignment);
-
}
Result getNaturalOffset(CompilerOptionSet& optionSet, IRStructField* field, IRIntegerValue* outOffset)
diff --git a/source/slang/slang-ir-peephole.cpp b/source/slang/slang-ir-peephole.cpp
index 88b26fbd3..16e440b32 100644
--- a/source/slang/slang-ir-peephole.cpp
+++ b/source/slang/slang-ir-peephole.cpp
@@ -250,6 +250,30 @@ struct PeepholeContext : InstPassBase
switch (inst->getOp())
{
+ case kIROp_AlignOf:
+ // Fold alignOf<T>() calls that return a plain `int` into a literal: the stride of the operand type's natural layout.
+ if (inst->getDataType()->getOp() == kIROp_IntType)
+ {
+ if (!targetProgram)
+ break;
+
+ // Save the alignment information and exit early if it is invalid
+ IRSizeAndAlignment sizeAlignment;
+ auto alignOfInst = as<IRAlignOf>(inst);
+ auto baseType = alignOfInst->getBaseOp()->getDataType();
+ if (SLANG_FAILED(getNaturalSizeAndAlignment(targetProgram->getOptionSet(), baseType, &sizeAlignment)))
+ break;
+ if (sizeAlignment.size == 0)
+ break;
+
+ IRBuilder builder(module);
+ builder.setInsertBefore(inst);
+ auto stride = builder.getIntValue(inst->getDataType(), sizeAlignment.getStride());
+ inst->replaceUsesWith(stride);
+ maybeRemoveOldInst(inst);
+ changed = true;
+ }
+ break;
case kIROp_GetResultError:
if (inst->getOperand(0)->getOp() == kIROp_MakeResultError)
{
diff --git a/tests/compute/byte-address-buffer-align-error.slang b/tests/compute/byte-address-buffer-align-error.slang
new file mode 100644
index 000000000..34300d7c3
--- /dev/null
+++ b/tests/compute/byte-address-buffer-align-error.slang
@@ -0,0 +1,24 @@
+// byte-address-buffer-align-error.slang
+
+//TEST:SIMPLE(filecheck=CHECK):-target glsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target hlsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target spirv -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target spirv -emit-spirv-directly -entry computeMain -stage compute
+
+// Confirm that `(RW)ByteAddressBuffer` loads / stores given an invalid explicit alignment are rejected with a diagnostic.
+
+[vk::binding(2, 3)] RWByteAddressBuffer buffer;
+struct Block {
+ float4 val[2];
+};
+[shader("compute")]
+[numthreads(1,1,1)]
+void computeMain(uint3 threadId : SV_DispatchThreadID)
+{
+ // CHECK: error 41300: invalid alignment `{{.*}}` specified for the byte address buffer resource with the element size of `{{.*}}`
+ // CHECK: error 41300: invalid alignment `{{.*}}` specified for the byte address buffer resource with the element size of `{{.*}}`
+ buffer.Store<Block>(0, buffer.Load<Block>(1, 5));
+ buffer.Store<Block>(1, buffer.Load<Block>(0), 3);
+
+}
+
diff --git a/tests/compute/byte-address-buffer-aligned.slang b/tests/compute/byte-address-buffer-aligned.slang
index 5024987aa..f959ec66d 100644
--- a/tests/compute/byte-address-buffer-aligned.slang
+++ b/tests/compute/byte-address-buffer-aligned.slang
@@ -109,8 +109,8 @@ void computeMain(uint3 threadId : SV_DispatchThreadID)
// CHECK3-DAG: OpStore %[[V33]] %[[V28]]
// CHECK3-DAG: %[[V34:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]]
// CHECK3-DAG: OpStore %[[V34]] %[[V30]]
- buffer0.Store(32, buffer0.Load<float4>(32));
- buffer0.Store(32, buffer0.Load<float4>(8));
- buffer0.Store(8, buffer0.Load<float4>(32));
- buffer0.Store(8, buffer0.Load<float4>(8));
+ buffer0.StoreAligned(32, buffer0.LoadAligned<float4>(32));
+ buffer0.StoreAligned(32, buffer0.LoadAligned<float4>(8));
+ buffer0.StoreAligned(8, buffer0.LoadAligned<float4>(32));
+ buffer0.StoreAligned(8, buffer0.LoadAligned<float4>(8));
}
diff --git a/tests/compute/byte-address-buffer-array.slang b/tests/compute/byte-address-buffer-array.slang
new file mode 100644
index 000000000..1a23821a1
--- /dev/null
+++ b/tests/compute/byte-address-buffer-array.slang
@@ -0,0 +1,77 @@
+// byte-address-buffer-array.slang
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -d3d12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type
+//DISABLED_TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -output-using-type
+
+//TEST:SIMPLE(filecheck=CHECK1):-target glsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK2):-target hlsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK3):-target spirv -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK3):-target spirv -emit-spirv-directly -entry computeMain -stage compute
+
+// Confirm compilation of `(RW)ByteAddressBuffer` with aligned load / stores to wider data types.
+
+[vk::binding(2, 3)] RWByteAddressBuffer buffer;
+struct Block {
+ float4 val[2];
+};
+[shader("compute")]
+[numthreads(1,1,1)]
+void computeMain(uint3 threadId : SV_DispatchThreadID)
+{
+ // CHECK-NOT: warning
+ // CHECK1: _Array_std430_vector{{.*}} _data[]
+ // CHECK1: _Array_std430_vector{{.*}} packStorage_0(vec4 {{.*}}[2])
+ // CHECK1: vec4 {{.*}}[2] = { {{.*}}[0], {{.*}}[1] };
+ // CHECK1: _Array_std430_vector{{.*}} {{.*}} = { {{.*}} };
+ // CHECK1: void unpackStorage_0(_Array_std430_vector{{.*}}, out vec4 {{.*}}[2])
+ // CHECK1: {{.*}}[0] = {{.*}}.data_0[0];
+ // CHECK1: {{.*}}[1] = {{.*}}.data_0[1];
+ // CHECK1: vec4 {{.*}}[2];
+ // CHECK1: unpackStorage_0(buffer_0._data[0], {{.*}});
+ // CHECK1: vec4 {{.*}}[2] = buffer_0._data[0] = packStorage_0({{.*}});
+ // CHECK1: vec4 {{.*}} = buffer_2._data[1];
+ // CHECK1: vec4 {{.*}} = buffer_2._data[1] = vec4(buffer_1._data[1], buffer_1._data[2], buffer_1._data[3], buffer_1._data[4]);
+
+ // CHECK2: float4 {{.*}}[int(2)] = (buffer_0).Load<float4 [int(2)] >(int(0));
+ // CHECK2: buffer_0.Store(int(0),{{.*}});
+ // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(4));
+ // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(8));
+ // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(12));
+ // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(16));
+ // CHECK2: float4 {{.*}} = float4({{.*}}, {{.*}}, {{.*}}, {{.*}});
+ // CHECK2: float4 {{.*}} = (buffer_0).Load<float4 >(int(20));
+ // CHECK2: buffer_0.Store(int(16),{{.*}});
+ // CHECK2: buffer_0.Store(int(32),{{.*}});
+
+ // CHECK3-DAG: %[[ARRV4_2:[a-zA-Z0-9_]+]] = OpTypeArray %v4float %int_2
+ // CHECK3-DAG: %[[SARRV4_2:[a-zA-Z0-9_]+]] = OpTypeStruct %[[ARRV4_2]]
+ // CHECK3-DAG: %[[SBA:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[SARRV4_2]]
+ // CHECK3-DAG: %[[SBF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %float
+ // CHECK3-DAG: %[[SBVF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %v4float
+ // CHECK3-DAG: %[[V0:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}}
+ // CHECK3-DAG: %[[V1:[a-zA-Z0-9_]+]] = OpLoad %[[SARRV4_2]] %[[V0]]
+ // CHECK3-DAG: %[[V2:[a-zA-Z0-9_]+]] = OpCompositeExtract %[[ARRV4_2]] %[[V1]] 0
+ // CHECK3-DAG: %[[V3:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 0
+ // CHECK3-DAG: %[[V4:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 1
+ // CHECK3-DAG: %[[V5:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[ARRV4_2]] %[[V3]] %[[V4]]
+ // CHECK3-DAG: %[[V6:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[SARRV4_2]] %[[V5]]
+ // CHECK3-DAG: %[[V7:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}}
+ // CHECK3-DAG: OpStore %[[V7]] %[[V6]]
+ // CHECK3-DAG: %[[V8:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+ // CHECK3-DAG: %[[V9:[a-zA-Z0-9_]+]] = OpLoad %float %[[V8]]
+ // CHECK3-DAG: %[[V10:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+ // CHECK3-DAG: %[[V11:[a-zA-Z0-9_]+]] = OpLoad %float %[[V10]]
+ // CHECK3-DAG: %[[V12:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+ // CHECK3-DAG: %[[V13:[a-zA-Z0-9_]+]] = OpLoad %float %[[V12]]
+ // CHECK3-DAG: %[[V14:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+ // CHECK3-DAG: %[[V15:[a-zA-Z0-9_]+]] = OpLoad %float %[[V14]]
+ // CHECK3-DAG: %[[V16:[a-zA-Z0-9_]+]] = OpCompositeConstruct %v4float %[[V9]] %[[V11]] %[[V13]] %[[V15]]
+ // CHECK3-DAG: %[[V17:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+ // CHECK3-DAG: %[[V18:[a-zA-Z0-9_]+]] = OpLoad %v4float %[[V17]]
+ // CHECK3-DAG: %[[V19:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+ // CHECK3-DAG: OpStore %[[V19]] %[[V16]]
+ // CHECK3-DAG: %[[V20:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+ // CHECK3-DAG: OpStore %[[V20]] %[[V18]]
+ buffer.Store(0, buffer.LoadAligned<Block>(0));
+ buffer.StoreAligned(16, buffer.Load<Block>(4, 16));
+}
+