Add LoadAligned and StoreAligned methods to ByteAddressBuffers (#4066)

Fixes #4062. This change enables wide loads/stores for byte-address-buffer backed resources, when the data is accessed at an offset that is aligned. **Goals** - Improve performance by issuing wider instructions instead of a sequence of scalar instructions, for loads and stores of byte-address buffers. - Reduce the code size and improve the readability of the generated shaders. - Help naive users as well as ninja programmers generate optimal code. **Non Goals** - Help with Structured buffers, or other resources. - Target compilation time improvements. **Key changes** Adds 2 new overloads for Load and Store operations on ByteAddress Buffers. 1. Load / Store with an extra alignment parameter ``` resource.Load<T>(offset, alignment); resource.Store<T>(offset, value, alignment); ``` 2. LoadAligned / StoreAligned with no extra parameter, with the same signature as the original Load / Store. ``` resource.LoadAligned<T>(offset); resource.StoreAligned<T>(offset, value); ``` - This overload will implicitly identify the alignment value from the base type T of the elementary unit of the resource. **Supported resources** 1. Vectors This can be up to 4 elements, i.e. float through float4. 2. Arrays This does not have a limit on the number of elements, but as a conservative estimate, we can limit it to a few hundred. 3. Structures This is used to group a resource of a single type. ``` struct { float4 x; } ``` **Code updates** - Modified byte-address-ir legalize to handle struct, array and vector kinds of load or store access. - Added custom hlsl stdlib functions to implement all the overloads for Load, Store etc. - Added C-like emitter and SPIR-V emitter support for handling ByteAddressBuffers. - Added a new core stdlib function intrinsic to wrap around alignOf<T>(). - Added a new peephole optimization entry to identify the equivalent IntLiteral value from the alignOf<T>() inst. - Added tests to check explicit and implicit aligned Load and Store operations.
author: Sriram Murali <85252063+sriramm-nv@users.noreply.github.com> 2024-05-13 23:57:57 -0700
committer: GitHub <noreply@github.com> 2024-05-13 23:57:57 -0700
commit: 487ae034e2b03ddd67945132c8fecbd937952705 (patch)
tree: 036d318a64385151ad9d5e7275c2e387fdca6cee
parent: 9f23046138629f78995d54a7722ad6749bd84db9 (diff)
14 files changed, 561 insertions, 90 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index bde943972..63bc2571b 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -2412,6 +2412,15 @@ int __naturalStrideOf()
     return __naturalStrideOf_impl(__declVal<T>());
 }
 
+__intrinsic_op($(kIROp_AlignOf))
+int __alignOf_intrinsic_impl<T>(T t);
+
+[ForceInline]
+int __alignOf_intrinsic<T>()
+{
+    return __alignOf_intrinsic_impl<T>(__default<T>());
+}
+
 __intrinsic_op($(kIROp_TreatAsDynamicUniform))
 T asDynamicUniform<T>(T v);
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 303d18771..95ca03beb 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -108,7 +108,7 @@ struct ByteAddressBuffer
         {
         case hlsl: __intrinsic_asm ".Load";
         default:
-            return __byteAddressBufferLoad<uint>(this, location);
+            return __byteAddressBufferLoad<uint>(this, location, 0);
         }
     }
 
@@ -124,7 +124,33 @@ struct ByteAddressBuffer
         {
         case hlsl: __intrinsic_asm ".Load2";
         default:
-            return __byteAddressBufferLoad<uint2>(this, location);
+            return __byteAddressBufferLoad<uint2>(this, location, 0);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint2 Load2(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load2";
+        default:
+            return __byteAddressBufferLoad<uint2>(this, location, alignment);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint2 Load2Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load2";
+        default:
+            return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>());
         }
     }
 
@@ -140,7 +166,33 @@ struct ByteAddressBuffer
         {
         case hlsl: __intrinsic_asm ".Load3";
         default:
-            return __byteAddressBufferLoad<uint3>(this, location);
+            return __byteAddressBufferLoad<uint3>(this, location, 0);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint3 Load3(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load3";
+        default:
+            return __byteAddressBufferLoad<uint3>(this, location, alignment);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint3 Load3Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load3";
+        default:
+            return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>());
         }
     }
 
@@ -156,7 +208,33 @@ struct ByteAddressBuffer
         {
         case hlsl: __intrinsic_asm ".Load4";
         default:
-            return __byteAddressBufferLoad<uint4>(this, location);
+            return __byteAddressBufferLoad<uint4>(this, location, 0);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint4 Load4(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load4";
+        default:
+            return __byteAddressBufferLoad<uint4>(this, location, alignment);
+        }
+    }
+
+    [__readNone]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
+    uint4 Load4Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load4";
+        default:
+            return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>());
         }
     }
 
@@ -167,7 +245,21 @@ struct ByteAddressBuffer
     [ForceInline]
     T Load<T>(int location)
     {
-        return __byteAddressBufferLoad<T>(this, location);
+        return __byteAddressBufferLoad<T>(this, location, 0);
+    }
+
+    [__readNone]
+    [ForceInline]
+    T Load<T>(int location, int alignment)
+    {
+        return __byteAddressBufferLoad<T>(this, location, alignment);
+    }
+
+    [__readNone]
+    [ForceInline]
+    T LoadAligned<T>(int location)
+    {
+        return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>());
     }
 };
 
@@ -2765,23 +2857,23 @@ uint64_t __asuint64(uint2 i)
 
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer)]
-T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset, int alignment);
 
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset, int alignment);
 
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset);
+T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment);
 
 __intrinsic_op($(kIROp_ByteAddressBufferStore))
 [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, T value);
+void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, int alignment, T value);
 
 __intrinsic_op($(kIROp_ByteAddressBufferStore))
 [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, T value);
+void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment, T value);
 
 __generic<T, L:IBufferDataLayout=DefaultDataLayout>
 __magic_type(HLSLStructuredBufferType)
@@ -2898,7 +2990,7 @@ struct $(item.name)
         {
         case hlsl: __intrinsic_asm ".Load";
         default:
-            return __byteAddressBufferLoad<uint>(this, location);
+            return __byteAddressBufferLoad<uint>(this, location, 0);
         }
     }
 
@@ -2914,7 +3006,33 @@ struct $(item.name)
         {
         case hlsl: __intrinsic_asm ".Load2";
         default:
-            return __byteAddressBufferLoad<uint2>(this, location);
+            return __byteAddressBufferLoad<uint2>(this, location, 0);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint2 Load2(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load2";
+        default:
+            return __byteAddressBufferLoad<uint2>(this, location, alignment);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint2 Load2Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load2";
+        default:
+            return __byteAddressBufferLoad<uint2>(this, location, __alignOf_intrinsic<uint2>());
         }
     }
 
@@ -2930,7 +3048,33 @@ struct $(item.name)
         {
         case hlsl: __intrinsic_asm ".Load3";
         default:
-            return __byteAddressBufferLoad<uint3>(this, location);
+            return __byteAddressBufferLoad<uint3>(this, location, 0);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint3 Load3(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load3";
+        default:
+            return __byteAddressBufferLoad<uint3>(this, location, alignment);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint3 Load3Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load3";
+        default:
+            return __byteAddressBufferLoad<uint3>(this, location, __alignOf_intrinsic<uint3>());
         }
     }
 
@@ -2946,7 +3090,33 @@ struct $(item.name)
         {
         case hlsl: __intrinsic_asm ".Load4";
         default:
-            return __byteAddressBufferLoad<uint4>(this, location);
+            return __byteAddressBufferLoad<uint4>(this, location, 0);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint4 Load4(int location, int alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load4";
+        default:
+            return __byteAddressBufferLoad<uint4>(this, location, alignment);
+        }
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    uint4 Load4Aligned(int location)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Load4";
+        default:
+            return __byteAddressBufferLoad<uint4>(this, location, __alignOf_intrinsic<uint4>());
         }
     }
 
@@ -2958,8 +3128,25 @@ struct $(item.name)
     [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
     T Load<T>(int location)
     {
-        return __byteAddressBufferLoad<T>(this, location);
+        return __byteAddressBufferLoad<T>(this, location, 0);
     }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    T Load<T>(int location, int alignment)
+    {
+        return __byteAddressBufferLoad<T>(this, location, alignment);
+    }
+
+    [__NoSideEffect]
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    T LoadAligned<T>(int location)
+    {
+        return __byteAddressBufferLoad<T>(this, location, __alignOf_intrinsic<T>());
+    }
+
 ${{{{
     if (item.op == kIROp_HLSLRWByteAddressBufferType)
     {
@@ -3806,18 +3993,17 @@ ${{{{
 
     [ForceInline]
     [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
-    void Store(
-        uint address,
-        uint value)
+    void Store(uint address, uint value)
     {
         __target_switch
         {
         case hlsl: __intrinsic_asm ".Store";
         default:
-            __byteAddressBufferStore(this, address, value);
+            __byteAddressBufferStore(this, address, 0, value);
         }
     }
 
+
     [ForceInline]
     [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
     void Store2(uint address, uint2 value)
@@ -3826,42 +4012,125 @@ ${{{{
         {
         case hlsl: __intrinsic_asm ".Store2";
         default:
-            __byteAddressBufferStore(this, address, value);
+            __byteAddressBufferStore(this, address, 0, value);
+        }
+    }
+
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store2(uint address, uint2 value, uint alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store2";
+        default:
+            __byteAddressBufferStore(this, address, alignment, value);
         }
     }
 
     [ForceInline]
     [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
-    void Store3(
-        uint address,
-        uint3 value)
+    void Store2Aligned(uint address, uint2 value)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store2";
+        default:
+            __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint2>(), value);
+        }
+    }
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store3(uint address, uint3 value)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store3";
+        default:
+            __byteAddressBufferStore(this, address, 0, value);
+        }
+    }
+
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store3(uint address, uint3 value, uint alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store3";
+        default:
+            __byteAddressBufferStore(this, address, alignment, value);
+        }
+    }
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store3Aligned(uint address, uint3 value)
     {
         __target_switch
         {
         case hlsl: __intrinsic_asm ".Store3";
         default:
-            __byteAddressBufferStore(this, address, value);
+            __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint3>(), value);
+        }
+    }
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store4(uint address, uint4 value)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store4";
+        default:
+            __byteAddressBufferStore(this, address, 0, value);
+        }
+    }
+
+
+    [ForceInline]
+    [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+    void Store4(uint address, uint4 value, uint alignment)
+    {
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm ".Store4";
+        default:
+            __byteAddressBufferStore(this, address, alignment, value);
         }
     }
 
     [ForceInline]
     [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
-    void Store4(
-        uint address,
-        uint4 value)
+    void Store4Aligned(uint address, uint4 value)
     {
         __target_switch
         {
         case hlsl: __intrinsic_asm ".Store4";
         default:
-            __byteAddressBufferStore(this, address, value);
+            __byteAddressBufferStore(this, address, __alignOf_intrinsic<uint4>(), value);
         }
     }
 
     [ForceInline]
     void Store<T>(int offset, T value)
     {
-        __byteAddressBufferStore(this, offset, value);
+        __byteAddressBufferStore(this, offset, 0, value);
+    }
+
+    [ForceInline]
+    void Store<T>(int offset, T value, uint alignment)
+    {
+        __byteAddressBufferStore(this, offset, alignment, value);
+    }
+
+    [ForceInline]
+    void StoreAligned<T>(int offset, T value)
+    {
+        __byteAddressBufferStore(this, offset, __alignOf_intrinsic<T>(), value);
     }
 };
 
diff --git a/source/slang/slang-diagnostic-defs.h b/source/slang/slang-diagnostic-defs.h
index eb131df21..c2c4953e0 100644
--- a/source/slang/slang-diagnostic-defs.h
+++ b/source/slang/slang-diagnostic-defs.h
@@ -753,6 +753,7 @@ DIAGNOSTIC(41201, Warning, expectDynamicUniformValue, "value stored at this loca
 DIAGNOSTIC(41202, Error, notEqualBitCastSize, "invalid to bit_cast differently sized types: '$0' with size '$1' casted into '$2' with size '$3'")
 DIAGNOSTIC(41203, Warning, notEqualReinterpretCastSize, "reinterpret<> into not equally sized types: '$0' with size '$1' casted into '$2' with size '$3'")
 
+DIAGNOSTIC(41300, Error, byteAddressBufferUnaligned, "invalid alignment `$0` specified for the byte address buffer resource with the element size of `$1`")
 //
 // 5xxxx - Target code generation.
 //
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index b44fa677c..19a7930f6 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -2674,6 +2674,7 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
         break;
 
     case kIROp_ByteAddressBufferLoad:
+    {
         m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(").Load<");
@@ -2682,20 +2683,21 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(")");
         break;
+    }
 
     case kIROp_ByteAddressBufferStore:
-        {
-            auto prec = getInfo(EmitOp::Postfix);
-            needClose = maybeEmitParens(outerPrec, prec);
+    {
+        auto prec = getInfo(EmitOp::Postfix);
+        needClose = maybeEmitParens(outerPrec, prec);
 
-            emitOperand(inst->getOperand(0), leftSide(outerPrec, prec));
-            m_writer->emit(".Store(");
-            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-            m_writer->emit(",");
-            emitOperand(inst->getOperand(2), getInfo(EmitOp::General));
-            m_writer->emit(")");
-        }
+        emitOperand(inst->getOperand(0), leftSide(outerPrec, prec));
+        m_writer->emit(".Store(");
+        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+        m_writer->emit(",");
+        emitOperand(inst->getOperand(inst->getOperandCount() - 1), getInfo(EmitOp::General));
+        m_writer->emit(")");
         break;
+    }
     case kIROp_PackAnyValue:
     {
         m_writer->emit("packAnyValue<");
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 39ebaa64d..ab22e1f90 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -777,7 +777,7 @@ Result linkAndOptimizeIR(
             break;
         }
 
-        legalizeByteAddressBufferOps(session, targetProgram, irModule, byteAddressBufferOptions);
+        legalizeByteAddressBufferOps(session, targetProgram, irModule, codeGenContext->getSink(), byteAddressBufferOptions);
     }
 
     // For CUDA targets only, we will need to turn operations
diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp
index 35040da64..0561d8744 100644
--- a/source/slang/slang-ir-byte-address-legalize.cpp
+++ b/source/slang/slang-ir-byte-address-legalize.cpp
@@ -28,6 +28,7 @@ struct ByteAddressBufferLegalizationContext
     TargetRequest* m_target = nullptr;
     ByteAddressBufferLegalizationOptions m_options;
 
+    DiagnosticSink* m_sink = nullptr;
     // We will also use a central IR builder when generating new
     // code as part of legalization (rather than create/destroy
     // IR builders on the fly).
@@ -124,14 +125,15 @@ struct ByteAddressBufferLegalizationContext
         //
         auto buffer = load->getOperand(0);
         auto offset = load->getOperand(1);
-        auto legalLoad = emitLegalLoad(type, buffer, offset, 0);
+        auto alignment = load->getOperand(2);
+        auto legalLoad = emitLegalLoad(type, buffer, offset, 0, alignment);
 
         // If it currently possible for the legalization
         // to fail (perhaps because of something else that
         // is invalid in the IR), so we will defensively
         // leave the code along in that case.
         //
-        if(!legalLoad)
+        if (!legalLoad)
             return;
 
         // If we were able to generate a legal load operation,
@@ -154,21 +156,21 @@ struct ByteAddressBufferLegalizationContext
         // operations, then that means *no* type is
         // legal for byte-address load/store.
         //
-        if(m_options.translateToStructuredBufferOps)
+        if (m_options.translateToStructuredBufferOps)
             return false;
 
         // Basic types are usually legal to load/store
         // on all targets.
         //
-        if( auto basicType = as<IRBasicType>(type) )
+        if (auto basicType = as<IRBasicType>(type))
         {
             // On targets that require translation to
             // make all load/store use `uint` values,
             // any scalar type that isn't `uint` is
             // illegal.
             //
-            if( m_options.useBitCastFromUInt
-                && basicType->getBaseType() != BaseType::UInt )
+            if (m_options.useBitCastFromUInt
+                && basicType->getBaseType() != BaseType::UInt)
             {
                 return false;
             }
@@ -181,13 +183,13 @@ struct ByteAddressBufferLegalizationContext
 
         // Vector types also depend on the options.
         //
-        if( as<IRVectorType>(type) )
+        if (as<IRVectorType>(type))
         {
             // If we've been asked to scalarize all
             // vector load/store, then we need to
             // tread them as illegal.
             //
-            if(m_options.scalarizeVectorLoadStore)
+            if (m_options.scalarizeVectorLoadStore)
                 return false;
 
         }
@@ -205,17 +207,35 @@ struct ByteAddressBufferLegalizationContext
         return false;
     }
 
-    bool checkUnaligned(IRInst* baseOffset, IRIntegerValue immediateOffset, IRType* elementType, IRIntegerValue elementCount)
+    // Helper function to check whether an access is aligned to `alignmentVal`:
+    // either the constant offset at which the resource is indexed, or else the
+    // declared alignment, must be a multiple of it for the load or store to be vectorized.
+    bool isAligned(IRInst* offset, IRInst* unknownOffsetAlignment, IRIntegerValue alignmentVal)
     {
-        // Check whether the given composite resource type is aligned to the baseOffset
-        IRSizeAndAlignment elementLayout;
-        SLANG_RETURN_FALSE_ON_FAIL(getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), elementType, &elementLayout));
-        IRIntegerValue elementStride = elementLayout.getStride();
-        bool isUnaligned = true;
-        if (auto baseOffsetVal = as<IRIntLit>(baseOffset)) {
-            isUnaligned = ((baseOffsetVal->getValue() + immediateOffset) % (elementStride * elementCount)) != 0;
+        if (auto baseOffsetVal = as<IRIntLit>(offset))
+        {
+            // If the offset is a constant known at compile time, simply check if it is
+            // a multiple of the alignment value required for the access.
+            return (baseOffsetVal->getValue() % alignmentVal) == 0;
+        }
+        else if (auto alignInst = as<IRIntLit>(unknownOffsetAlignment))
+        {
+            // If the offset is not known at compile time, use the explicit alignment
+            // argument of the overloaded `Load` or `Store` operation, or the one supplied
+            // via the `LoadAligned` or `StoreAligned` function.
+            //
+            // Unaligned `Load`s or `Store`s are identified with 0 alignment, to prevent
+            // accidentally issuing wide vectorized operations.
+            if (!alignInst->getValue())
+                return false;
+
+            if ((alignInst->getValue() % alignmentVal) == 0)
+            {
+                return true;
+            }
+            m_sink->diagnose(offset->sourceLoc, Slang::Diagnostics::byteAddressBufferUnaligned, alignInst->getValue(), alignmentVal);
         }
-        return isUnaligned;
+        return false;
     }
 
     SlangResult getOffset(TargetProgram* target, IRStructField* field, IRIntegerValue* outOffset)
@@ -241,7 +261,7 @@ struct ByteAddressBufferLegalizationContext
     // given `type` from the given `buffer` at the required `baseOffset`
     // plus the `immediateOffset` if any.
     //
-    IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset)
+    IRInst* emitLegalLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment)
     {
         // The right way to load a value depends primarily
         // on the type, and secondarily on the options
@@ -299,7 +319,7 @@ struct ByteAddressBufferLegalizationContext
                 // for earlier fields will be left behind but can be eliminated
                 // as dead code.
                 //
-                auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset);
+                auto fieldVal = emitLegalLoad(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, alignment);
                 if(!fieldVal)
                     return nullptr;
 
@@ -324,9 +344,23 @@ struct ByteAddressBufferLegalizationContext
             // legalization if the array type isn't in the right form
             // for us to proceed.
             //
+
             if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
             {
-                return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue());
+                // Emit an aligned load operation on an array when using a LoadAligned inst.
+                // Else, fallback to scalarizing the loads.
+                IRSizeAndAlignment elementLayout;
+                SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout));
+                IRIntegerValue elementStride = elementLayout.getStride();
+                auto alignmentVal = elementStride * elementCountInst->getValue();
+                if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
+                {
+                    return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeArray, arrayType->getElementType(), elementCountInst->getValue(), alignment);
+                }
+                else
+                {
+                    return emitSimpleLoad(type, buffer, baseOffset, immediateOffset);
+                }
             }
         }
         else if( auto matType = as<IRMatrixType>(type) )
@@ -341,7 +375,7 @@ struct ByteAddressBufferLegalizationContext
                 if( rowCountInst )
                 {
                     auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount());
-                    return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue());
+                    return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeMatrix, rowType, rowCountInst->getValue(), alignment);
                 }
             }
             else
@@ -354,7 +388,7 @@ struct ByteAddressBufferLegalizationContext
                 getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment);
                 for (Index c = 0; c < colCount; c++)
                 {
-                    auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset);
+                    auto colVector = emitLegalLoad(colVectorType, buffer, baseOffset, immediateOffset, alignment);
                     for (Index r = 0; r < rowCount; r++)
                     {
                         elements.add(m_builder.emitElementExtract(colVector, (IRIntegerValue)r));
@@ -382,11 +416,15 @@ struct ByteAddressBufferLegalizationContext
             //
             if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
             {
-                // Emit an aligned vector load operation when the data (elementCount * elementSize) is divisible
-                // by the offset. Else, fallback to scalarizing the loads.
-                if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+                // Emit an aligned vector load operation when using a LoadAligned inst.
+                // Else, fallback to scalarizing the loads.
+                IRSizeAndAlignment elementLayout;
+                SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout));
+                IRIntegerValue elementStride = elementLayout.getStride();
+                auto alignmentVal = elementStride * elementCountInst->getValue();
+                if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
                 {
-                    return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue());
+                    return emitLegalSequenceLoad(type, buffer, baseOffset, immediateOffset, kIROp_MakeVector, vecType->getElementType(), elementCountInst->getValue(), alignment);
                 }
                 else
                 {
@@ -464,7 +502,7 @@ struct ByteAddressBufferLegalizationContext
     // Loading of sequences for arrays, matrices, and vectors is
     // bottlenecked through a single function.
     //
-    IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount)
+    IRInst* emitLegalSequenceLoad(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IROp op, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment)
     {
         // Or goal here is to produce a value of the given `type`, loaded from `buffer`
         // at `baseOffset` plus `immediateOffset`.
@@ -486,7 +524,7 @@ struct ByteAddressBufferLegalizationContext
         List<IRInst*> elementVals;
         for( IRIntegerValue ii = 0; ii < elementCount; ++ii )
         {
-            auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride);
+            auto elementVal = emitLegalLoad(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment);
             if(!elementVal)
                 return nullptr;
 
@@ -844,7 +882,7 @@ struct ByteAddressBufferLegalizationContext
         // the type of the store operation, but instead the operand
         // that represents the value to be stored.
         //
-        auto value = store->getOperand(2);
+        auto value = store->getOperand(3);
         auto type = value->getDataType();
 
         // Types that are already legal to use don't require any processing.
@@ -863,14 +901,14 @@ struct ByteAddressBufferLegalizationContext
         // performance issue, but we should still consider trying to
         // tighten this up and make all uhandled cases be hard errors).
         //
-        auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, value);
+        auto result = emitLegalStore(type, store->getOperand(0), store->getOperand(1), 0, store->getOperand(2), value);
         if(SLANG_FAILED(result))
             return;
 
         store->removeAndDeallocate();
     }
 
-    Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value)
+    Result emitLegalStore(IRType* type, IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* alignment, IRInst* value)
     {
         // The flow for emitting a legal store is very similar to that for
         // legal loads; we will recurse on the structure of `type` and
@@ -889,7 +927,7 @@ struct ByteAddressBufferLegalizationContext
                 SLANG_RETURN_ON_FAIL(getOffset(m_targetProgram, field, &fieldOffset));
 
                 auto fieldVal = m_builder.emitFieldExtract(fieldType, value, field->getKey());
-                SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, fieldVal));
+                SLANG_RETURN_ON_FAIL(emitLegalStore(fieldType, buffer, baseOffset, immediateOffset + fieldOffset, alignment, fieldVal));
             }
             return SLANG_OK;
         }
@@ -900,7 +938,20 @@ struct ByteAddressBufferLegalizationContext
             //
             if (auto elementCountInst = as<IRIntLit>(arrayType->getElementCount()))
             {
-                return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue());
+                // Emit an aligned store operation on an array when using a StoreAligned inst.
+                // Else, fallback to scalarizing the stores.
+                IRSizeAndAlignment elementLayout;
+				SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), arrayType->getElementType(), &elementLayout));
+                IRIntegerValue elementStride = elementLayout.getStride();
+                auto alignmentVal = elementStride * elementCountInst->getValue();
+                if (!isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
+                {
+                    return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, arrayType->getElementType(), elementCountInst->getValue(), alignment);
+                }
+                else
+                {
+                    return emitSimpleStore(value->getDataType(), buffer, baseOffset, immediateOffset, value);
+                }
             }
         }
         else if( auto matType = as<IRMatrixType>(type) )
@@ -912,7 +963,7 @@ struct ByteAddressBufferLegalizationContext
                 if( rowCountInst )
                 {
                     auto rowType = m_builder.getVectorType(matType->getElementType(), matType->getColumnCount());
-                    return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue());
+                    return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, rowType, rowCountInst->getValue(), alignment);
                 }
             }
             else
@@ -935,7 +986,7 @@ struct ByteAddressBufferLegalizationContext
                     auto colVector = m_builder.emitMakeVector(colVectorType, colVectorArgs);
                     IRSizeAndAlignment colVectorSizeAlignment;
                     getSizeAndAlignment(m_targetProgram, colVectorType, &colVectorSizeAlignment);
-                    emitLegalStore(colVectorType, buffer, baseOffset, immediateOffset, colVector);
+                    emitLegalStore(colVectorType, buffer, baseOffset, immediateOffset, alignment, colVector);
                     immediateOffset += colVectorSizeAlignment.getStride();
                 }
                 return SLANG_OK;
@@ -945,11 +996,16 @@ struct ByteAddressBufferLegalizationContext
         {
             if (auto elementCountInst = as<IRIntLit>(vecType->getElementCount()))
             {
-                // Emit an aligned vector store operation when the data (elementCount * elementSize) is divisible
-                // by the offset. Else, fallback to scalarizing the stores.
-                if (m_options.scalarizeVectorLoadStore || checkUnaligned(baseOffset, immediateOffset, vecType->getElementType(), elementCountInst->getValue()))
+                // Emit an aligned vector store operation when using a StoreAligned inst.
+                // Else, fallback to scalarizing the stores.
+
+                IRSizeAndAlignment elementLayout;
+				SLANG_RELEASE_ASSERT(!getNaturalSizeAndAlignment(m_targetProgram->getOptionSet(), vecType->getElementType(), &elementLayout));
+                IRIntegerValue elementStride = elementLayout.getStride();
+                auto alignmentVal = elementStride * elementCountInst->getValue();
+                if (m_options.scalarizeVectorLoadStore || !isAligned(emitOffsetAddIfNeeded(baseOffset, immediateOffset), alignment, alignmentVal))
                 {
-                    return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue());
+                    return emitLegalSequenceStore(buffer, baseOffset, immediateOffset, value, vecType->getElementType(), elementCountInst->getValue(), alignment);
                 }
                 else
                 {
@@ -1023,7 +1079,7 @@ struct ByteAddressBufferLegalizationContext
         }
     }
 
-    Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount)
+    Result emitLegalSequenceStore(IRInst* buffer, IRInst* baseOffset, IRIntegerValue immediateOffset, IRInst* value, IRType* elementType, IRIntegerValue elementCount, IRInst* alignment)
     {
         // The store case for sequences is similar to the load case.
         //
@@ -1038,7 +1094,7 @@ struct ByteAddressBufferLegalizationContext
         {
             auto elementIndex = m_builder.getIntValue(indexType, ii);
             auto elementVal = m_builder.emitElementExtract(elementType, value, elementIndex);
-            SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, elementVal));
+            SLANG_RETURN_ON_FAIL(emitLegalStore(elementType, buffer, baseOffset, immediateOffset + ii*elementStride, alignment, elementVal));
         }
 
         return SLANG_OK;
@@ -1050,6 +1106,7 @@ void legalizeByteAddressBufferOps(
     Session*                                    session,
     TargetProgram*                              program,
     IRModule*                                   module,
+    DiagnosticSink*                             sink,
     ByteAddressBufferLegalizationOptions const& options)
 {
     ByteAddressBufferLegalizationContext context;
@@ -1057,6 +1114,7 @@ void legalizeByteAddressBufferOps(
     context.m_target = program->getTargetReq();
     context.m_options = options;
     context.m_targetProgram = program;
+    context.m_sink = sink;
     context.processModule(module);
 }
 
diff --git a/source/slang/slang-ir-byte-address-legalize.h b/source/slang/slang-ir-byte-address-legalize.h
index 996a93c73..71ab8a4e1 100644
--- a/source/slang/slang-ir-byte-address-legalize.h
+++ b/source/slang/slang-ir-byte-address-legalize.h
@@ -6,6 +6,7 @@ namespace Slang
 class Session;
 class TargetProgram;
 struct IRModule;
+class DiagnosticSink;
 
 struct ByteAddressBufferLegalizationOptions
 {
@@ -24,6 +25,7 @@ void legalizeByteAddressBufferOps(
     Session*                                    session,
     TargetProgram*                              target,
     IRModule*                                   module,
+    DiagnosticSink*                             sink,
     ByteAddressBufferLegalizationOptions const& options);
 }
 
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index f4954375d..4bad614b3 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -428,25 +428,27 @@ INST(ImageStore, imageStore, 3, 0)
 
 // Load (almost) arbitrary-type data from a byte-address buffer
 //
-// %dst = byteAddressBufferLoad(%buffer, %offset)
+// %dst = byteAddressBufferLoad(%buffer, %offset, %alignment)
 //
 // where
 // - `buffer` is a value of some `ByteAddressBufferTypeBase` type
 // - `offset` is an `int`
+// - `alignment` is an `int`
 // - `dst` is a value of some type containing only ordinary data
 //
-INST(ByteAddressBufferLoad, byteAddressBufferLoad, 2, 0)
+INST(ByteAddressBufferLoad, byteAddressBufferLoad, 3, 0)
 
 // Store (almost) arbitrary-type data to a byte-address buffer
 //
-// byteAddressBufferLoad(%buffer, %offset, %src)
+// byteAddressBufferStore(%buffer, %offset, %alignment, %src)
 //
 // where
 // - `buffer` is a value of some `ByteAddressBufferTypeBase` type
 // - `offset` is an `int`
+// - `alignment` is an `int`
 // - `src` is a value of some type containing only ordinary data
 //
-INST(ByteAddressBufferStore, byteAddressBufferStore, 3, 0)
+INST(ByteAddressBufferStore, byteAddressBufferStore, 4, 0)
 
 // Load data from a structured buffer
 //
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index 5c4f01ae7..f0613dfa5 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -2252,6 +2252,10 @@ struct IRLayoutDecoration : IRDecoration
 };
 
 //
+struct IRAlignOf : IRInst
+{
+    IRInst* getBaseOp() { return getOperand(0); } // operand 0; the peephole fold of kIROp_AlignOf lays out this operand's data type
+};
 
 struct IRCall : IRInst
 {
diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp
index f35fa6750..6a4e9360a 100644
--- a/source/slang/slang-ir-layout.cpp
+++ b/source/slang/slang-ir-layout.cpp
@@ -497,7 +497,6 @@ struct Std140LayoutRules : IRTypeLayoutRules
 Result getNaturalSizeAndAlignment(CompilerOptionSet& optionSet, IRType* type, IRSizeAndAlignment* outSizeAndAlignment)
 {
     return getSizeAndAlignment(optionSet, IRTypeLayoutRules::getNatural(), type, outSizeAndAlignment);
-
 }
 
 Result getNaturalOffset(CompilerOptionSet& optionSet, IRStructField* field, IRIntegerValue* outOffset)
diff --git a/source/slang/slang-ir-peephole.cpp b/source/slang/slang-ir-peephole.cpp
index 88b26fbd3..16e440b32 100644
--- a/source/slang/slang-ir-peephole.cpp
+++ b/source/slang/slang-ir-peephole.cpp
@@ -250,6 +250,30 @@ struct PeepholeContext : InstPassBase
 
         switch (inst->getOp())
         {
+        case kIROp_AlignOf:
+            // Fold alignOf<T>() into an integer literal; only attempted when the inst's result type is kIROp_IntType.
+            if (inst->getDataType()->getOp() == kIROp_IntType)
+            {
+                if (!targetProgram)
+                    break;
+
+                // Query the natural layout of the base operand's data type; leave the inst unfolded if the query fails or reports a zero size.
+                IRSizeAndAlignment sizeAlignment;
+                auto alignOfInst = as<IRAlignOf>(inst);
+                auto baseType = alignOfInst->getBaseOp()->getDataType();
+                if (SLANG_FAILED(getNaturalSizeAndAlignment(targetProgram->getOptionSet(), baseType, &sizeAlignment)))
+                    break;
+                if (sizeAlignment.size == 0)
+                    break;
+
+                IRBuilder builder(module);
+                builder.setInsertBefore(inst);
+                auto stride = builder.getIntValue(inst->getDataType(), sizeAlignment.getStride()); // NOTE(review): folds to the layout's stride (getStride()) — confirm this is the intended alignOf value
+                inst->replaceUsesWith(stride);
+                maybeRemoveOldInst(inst);
+                changed = true;
+            }
+            break;
         case kIROp_GetResultError:
             if (inst->getOperand(0)->getOp() == kIROp_MakeResultError)
             {
diff --git a/tests/compute/byte-address-buffer-align-error.slang b/tests/compute/byte-address-buffer-align-error.slang
new file mode 100644
index 000000000..34300d7c3
--- /dev/null
+++ b/tests/compute/byte-address-buffer-align-error.slang
@@ -0,0 +1,24 @@
+// byte-address-buffer-align-error.slang
+
+//TEST:SIMPLE(filecheck=CHECK):-target glsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target hlsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target spirv -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK):-target spirv -emit-spirv-directly -entry computeMain -stage compute
+
+// Confirm that `(RW)ByteAddressBuffer` loads / stores given an invalid explicit alignment are diagnosed with error 41300.
+
+[vk::binding(2, 3)] RWByteAddressBuffer buffer;
+struct Block {
+    float4 val[2];
+};
+[shader("compute")]
+[numthreads(1,1,1)]
+void computeMain(uint3 threadId : SV_DispatchThreadID)
+{
+    // CHECK: error 41300: invalid alignment `{{.*}}` specified for the byte address buffer resource with the element size of `{{.*}}`
+    // CHECK: error 41300: invalid alignment `{{.*}}` specified for the byte address buffer resource with the element size of `{{.*}}`
+    buffer.Store<Block>(0, buffer.Load<Block>(1, 5));
+    buffer.Store<Block>(1, buffer.Load<Block>(0), 3);
+
+}
+
diff --git a/tests/compute/byte-address-buffer-aligned.slang b/tests/compute/byte-address-buffer-aligned.slang
index 5024987aa..f959ec66d 100644
--- a/tests/compute/byte-address-buffer-aligned.slang
+++ b/tests/compute/byte-address-buffer-aligned.slang
@@ -109,8 +109,8 @@ void computeMain(uint3 threadId : SV_DispatchThreadID)
     // CHECK3-DAG: OpStore %[[V33]] %[[V28]]
     // CHECK3-DAG: %[[V34:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBf]] %[[BUF00]]
     // CHECK3-DAG: OpStore %[[V34]] %[[V30]]
-    buffer0.Store(32, buffer0.Load<float4>(32));
-	buffer0.Store(32, buffer0.Load<float4>(8));
-	buffer0.Store(8, buffer0.Load<float4>(32));
-	buffer0.Store(8, buffer0.Load<float4>(8));
+    buffer0.StoreAligned(32, buffer0.LoadAligned<float4>(32));
+    buffer0.StoreAligned(32, buffer0.LoadAligned<float4>(8));
+    buffer0.StoreAligned(8, buffer0.LoadAligned<float4>(32));
+    buffer0.StoreAligned(8, buffer0.LoadAligned<float4>(8));
 }
diff --git a/tests/compute/byte-address-buffer-array.slang b/tests/compute/byte-address-buffer-array.slang
new file mode 100644
index 000000000..1a23821a1
--- /dev/null
+++ b/tests/compute/byte-address-buffer-array.slang
@@ -0,0 +1,77 @@
+// byte-address-buffer-array.slang
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -d3d12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type
+//DISABLED_TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -output-using-type
+
+//TEST:SIMPLE(filecheck=CHECK1):-target glsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK2):-target hlsl -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK3):-target spirv -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=CHECK3):-target spirv -emit-spirv-directly -entry computeMain -stage compute
+
+// Confirm compilation of `(RW)ByteAddressBuffer` with aligned load / stores to wider data types.
+
+[vk::binding(2, 3)] RWByteAddressBuffer buffer;
+struct Block {
+    float4 val[2];
+};
+[shader("compute")]
+[numthreads(1,1,1)]
+void computeMain(uint3 threadId : SV_DispatchThreadID)
+{
+    // CHECK-NOT: warning
+    // CHECK1: _Array_std430_vector{{.*}} _data[]
+    // CHECK1: _Array_std430_vector{{.*}} packStorage_0(vec4 {{.*}}[2])
+    // CHECK1: vec4  {{.*}}[2] = { {{.*}}[0], {{.*}}[1] };
+    // CHECK1: _Array_std430_vector{{.*}} {{.*}} = { {{.*}} };
+    // CHECK1: void unpackStorage_0(_Array_std430_vector{{.*}}, out vec4 {{.*}}[2])
+    // CHECK1: {{.*}}[0] =  {{.*}}.data_0[0];
+    // CHECK1: {{.*}}[1] =  {{.*}}.data_0[1];
+    // CHECK1: vec4  {{.*}}[2];
+    // CHECK1: unpackStorage_0(buffer_0._data[0], {{.*}});
+    // CHECK1: vec4  {{.*}}[2] = buffer_0._data[0] = packStorage_0({{.*}});
+    // CHECK1: vec4 {{.*}} = buffer_2._data[1];
+    // CHECK1: vec4 {{.*}} = buffer_2._data[1] = vec4(buffer_1._data[1], buffer_1._data[2], buffer_1._data[3], buffer_1._data[4]);
+
+    // CHECK2: float4 {{.*}}[int(2)] = (buffer_0).Load<float4 [int(2)] >(int(0));
+    // CHECK2: buffer_0.Store(int(0),{{.*}});
+    // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(4));
+    // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(8));
+    // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(12));
+    // CHECK2: float {{.*}} = (buffer_0).Load<float >(int(16));
+    // CHECK2: float4 {{.*}} = float4({{.*}}, {{.*}}, {{.*}}, {{.*}});
+    // CHECK2: float4 {{.*}} = (buffer_0).Load<float4 >(int(20));
+    // CHECK2: buffer_0.Store(int(16),{{.*}});
+    // CHECK2: buffer_0.Store(int(32),{{.*}});
+
+    // CHECK3-DAG: %[[ARRV4_2:[a-zA-Z0-9_]+]] = OpTypeArray %v4float %int_2
+    // CHECK3-DAG: %[[SARRV4_2:[a-zA-Z0-9_]+]] = OpTypeStruct %[[ARRV4_2]]
+    // CHECK3-DAG: %[[SBA:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %[[SARRV4_2]]
+    // CHECK3-DAG: %[[SBF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %float
+    // CHECK3-DAG: %[[SBVF:[a-zA-Z0-9_]+]] = OpTypePointer StorageBuffer %v4float
+    // CHECK3-DAG: %[[V0:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}}
+    // CHECK3-DAG: %[[V1:[a-zA-Z0-9_]+]] = OpLoad %[[SARRV4_2]] %[[V0]]
+    // CHECK3-DAG: %[[V2:[a-zA-Z0-9_]+]] = OpCompositeExtract %[[ARRV4_2]] %[[V1]] 0
+    // CHECK3-DAG: %[[V3:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 0
+    // CHECK3-DAG: %[[V4:[a-zA-Z0-9_]+]] = OpCompositeExtract %v4float %[[V2]] 1
+    // CHECK3-DAG: %[[V5:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[ARRV4_2]] %[[V3]] %[[V4]]
+    // CHECK3-DAG: %[[V6:[a-zA-Z0-9_]+]] = OpCompositeConstruct %[[SARRV4_2]] %[[V5]]
+    // CHECK3-DAG: %[[V7:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBA]] %buffer {{.*}}
+    // CHECK3-DAG: OpStore %[[V7]] %[[V6]]
+    // CHECK3-DAG: %[[V8:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+    // CHECK3-DAG: %[[V9:[a-zA-Z0-9_]+]] = OpLoad %float %[[V8]]
+    // CHECK3-DAG: %[[V10:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+    // CHECK3-DAG: %[[V11:[a-zA-Z0-9_]+]] = OpLoad %float %[[V10]]
+    // CHECK3-DAG: %[[V12:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+    // CHECK3-DAG: %[[V13:[a-zA-Z0-9_]+]] = OpLoad %float %[[V12]]
+    // CHECK3-DAG: %[[V14:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBF]] %buffer_0 {{.*}}
+    // CHECK3-DAG: %[[V15:[a-zA-Z0-9_]+]] = OpLoad %float %[[V14]]
+    // CHECK3-DAG: %[[V16:[a-zA-Z0-9_]+]] = OpCompositeConstruct %v4float %[[V9]] %[[V11]] %[[V13]] %[[V15]]
+    // CHECK3-DAG: %[[V17:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+    // CHECK3-DAG: %[[V18:[a-zA-Z0-9_]+]] = OpLoad %v4float %[[V17]]
+    // CHECK3-DAG: %[[V19:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+    // CHECK3-DAG: OpStore %[[V19]] %[[V16]]
+    // CHECK3-DAG: %[[V20:[a-zA-Z0-9_]+]] = OpAccessChain %[[SBVF]] %buffer_1 {{.*}}
+    // CHECK3-DAG: OpStore %[[V20]] %[[V18]]
+    buffer.Store(0, buffer.LoadAligned<Block>(0));
+    buffer.StoreAligned(16, buffer.Load<Block>(4, 16));
+}
+
author	Sriram Murali <85252063+sriramm-nv@users.noreply.github.com>	2024-05-13 23:57:57 -0700
committer	GitHub <noreply@github.com>	2024-05-13 23:57:57 -0700
commit	487ae034e2b03ddd67945132c8fecbd937952705 (patch)
tree	036d318a64385151ad9d5e7275c2e387fdca6cee
parent	9f23046138629f78995d54a7722ad6749bd84db9 (diff)