31 files changed, 1153 insertions, 323 deletions
diff --git a/docs/user-guide/a3-02-reference-capability-atoms.md b/docs/user-guide/a3-02-reference-capability-atoms.md
index 7f759ca24..7f27284d9 100644
--- a/docs/user-guide/a3-02-reference-capability-atoms.md
+++ b/docs/user-guide/a3-02-reference-capability-atoms.md
@@ -626,6 +626,9 @@ Extensions
 `SPV_KHR_vulkan_memory_model`
 > Represents the SPIR-V extension for SPV_KHR_vulkan_memory_model.
 
+`SPV_NV_bindless_texture`
+> Represents the SPIR-V extension for SPV_NV_bindless_texture.
+
 `SPV_NV_cluster_acceleration_structure`
 > Represents the SPIR-V extension for cluster acceleration structure.
 
@@ -675,6 +678,9 @@ Extensions
 `spvAtomicFloat64MinMaxEXT`
 > Represents the SPIR-V capability for atomic float 64 min/max operations.
 
+`spvBindlessTextureNV`
+> Represents the SPIR-V capability for the bindless texture.
+
 `spvCooperativeMatrixBlockLoadsNV`
 > Represents the SPIR-V capability for cooperative matrix 2
 
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 491438c80..26b45d53f 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -59,12 +59,12 @@ struct RWStructuredBuffer
 template<typename T>
 struct StructuredBuffer
 {
-    SLANG_FORCE_INLINE const T& operator[](size_t index) const
+    SLANG_FORCE_INLINE T& operator[](size_t index) const
     {
         SLANG_BOUND_CHECK(index, count);
         return data[index];
     }
-    const T& Load(size_t index) const
+    T& Load(size_t index) const
     {
         SLANG_BOUND_CHECK(index, count);
         return data[index];
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 5c5335ac5..6c68cdb71 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2312,7 +2312,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uintptr_t UPTR_max(uintptr_t a, uintptr_t b)
 template<typename T>
 struct StructuredBuffer
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const
+    SLANG_CUDA_CALL T& operator[](size_t index) const
     {
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
         SLANG_BOUND_CHECK(index, count);
@@ -2320,7 +2320,7 @@ struct StructuredBuffer
         return data[index];
     }
 
-    SLANG_CUDA_CALL const T& Load(size_t index) const
+    SLANG_CUDA_CALL T& Load(size_t index) const
     {
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
         SLANG_BOUND_CHECK(index, count);
diff --git a/source/slang/slang-emit-wgsl.cpp b/source/slang/slang-emit-wgsl.cpp
index 53c3aa487..b115c723a 100644
--- a/source/slang/slang-emit-wgsl.cpp
+++ b/source/slang/slang-emit-wgsl.cpp
@@ -295,6 +295,11 @@ void WGSLSourceEmitter::emitStructFieldAttributes(
 {
     SLANG_UNUSED(allowOffsetLayout);
 
+    // If the struct type is not used for physical storage, then we don't need to
+    // emit any layout attributes.
+    if (!structType->findDecoration<IRPhysicalTypeDecoration>())
+        return;
+
     // Tint emits errors unless we explicitly spell out the layout in some cases, so emit
     // offset and align attribtues for all fields.
     IRSizeAndAlignmentDecoration* const sizeAndAlignmentDecoration =
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index f1cc6090d..09c2efea9 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -1387,16 +1387,10 @@ Result linkAndOptimizeIR(
     specializeFuncsForBufferLoadArgs(codeGenContext, irModule);
 
     // Push `structuredBufferLoad` to the end of access chain to avoid loading unnecessary data.
-    if (isKhronosTarget(targetRequest) || isMetalTarget(targetRequest) ||
-        isWGPUTarget(targetRequest))
-        deferBufferLoad(irModule);
+    deferBufferLoad(codeGenContext, irModule);
 
     // We also want to specialize calls to functions that
     // takes unsized array parameters if possible.
-    // Moreover, for Khronos targets, we also want to specialize calls to functions
-    // that takes arrays/structs containing arrays as parameters with the actual
-    // global array object to avoid loading big arrays into SSA registers, which seems
-    // to cause performance issues.
     specializeArrayParameters(codeGenContext, irModule);
 
 #if 0
diff --git a/source/slang/slang-ir-defer-buffer-load.cpp b/source/slang/slang-ir-defer-buffer-load.cpp
index 51c6a161b..ccdfe4538 100644
--- a/source/slang/slang-ir-defer-buffer-load.cpp
+++ b/source/slang/slang-ir-defer-buffer-load.cpp
@@ -3,142 +3,211 @@
 #include "slang-ir-clone.h"
 #include "slang-ir-dominators.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-redundancy-removal.h"
 #include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
 {
-struct DeferBufferLoadContext
-{
-    // Map an original SSA value to a pointer that can be used to load the value.
-    Dictionary<IRInst*, IRInst*> mapValueToPtr;
 
-    // Map an ptr to its loaded value.
-    Dictionary<IRInst*, IRInst*> mapPtrToValue;
+// Generally, we want to specialize arguments that are large in size, or arguments that
+// are arrays or composite type that contains arrays.
+// This is because:
+// 1. Struct types without arrays will eventually be SROA's into registers and then effectively
+//    DCE'd, so they usually won't cause performance issues. In fact, front loading structs
+//    and reusing the loaded value instead of repetitively loading from constant memory is
+//    usually beneficial to performance. However large struct values can be SROA'd into a large
+//    number of registers, causing slow downstream compilation. Therefore we should avoid/defer
+//    loading them into registers if we can.
+// 2. Arrays usually cannot be SROA'd into individual registers, which usually leads to
+//    large register consumption if they ever get loaded, so we want to defer loading array
+//    typed values as much as possible.
 
-    IRFunc* currentFunc = nullptr;
+// If the argument data is bigger than this threshold, it is considered a large object
+// and we will try to specialize it even if it doesn't contain arrays.
+static const int kBufferLoadElementSizeSpecializationThreshold = 128;
 
-    // Ensure that for an original SSA value, we have formed a pointer that can be used to load the
-    // value.
-    IRInst* ensurePtr(IRInst* valueInst)
-    {
-        IRInst* result = nullptr;
-        if (mapValueToPtr.tryGetValue(valueInst, result))
-            return result;
+// If the argument data is smaller than this threshold, it is considered a tiny object
+// and we will not consider specializing it, even if it contains arrays.
+static const int kBufferLoadElementSizeSpecializationMinThreshold = 16;
 
-        IRBuilder b(valueInst);
-        b.setInsertBefore(valueInst);
-
-        switch (valueInst->getOp())
+static bool isCompositeTypeContainingArrays(IRType* type)
+{
+    if (auto structType = as<IRStructType>(type))
+    {
+        for (auto field : structType->getFields())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            {
-                result = b.emitRWStructuredBufferGetElementPtr(
-                    valueInst->getOperand(0),
-                    valueInst->getOperand(1));
-                break;
-            }
-        case kIROp_GetElement:
+            if (const auto arrayType = as<IRArrayTypeBase>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitElementAddress(ptr, valueInst->getOperand(1));
-                break;
+                return true;
             }
-        case kIROp_FieldExtract:
+            if (auto subStructType = as<IRStructType>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitFieldAddress(ptr, valueInst->getOperand(1));
-                break;
+                if (isCompositeTypeContainingArrays(subStructType))
+                    return true;
             }
-        case kIROp_Load:
-            result = valueInst->getOperand(0);
-            break;
-        }
-        if (result)
-        {
-            mapValueToPtr[valueInst] = result;
         }
-        return result;
     }
+    else if (as<IRArrayTypeBase>(type))
+    {
+        return true;
+    }
+    return false;
+}
 
-    static bool isImmutableBufferLoad(IRInst* inst)
+bool isTypePreferrableToDeferLoad(CodeGenContext* codeGenContext, IRType* type)
+{
+    // If parameter is a pointer/reference, we should consider specialize it.
+    if (as<IROutTypeBase>(type) || as<IRRefType>(type) || as<IRConstRefType>(type))
+        return true;
+
+    // We only want to defer loading values that are "large enough" that
+    // we expect them to be expensive to pass by value.
+    //
+    IRSizeAndAlignment sizeAlignment = {};
+    if (SLANG_FAILED(getNaturalSizeAndAlignment(
+            codeGenContext->getTargetProgram()->getOptionSet(),
+            type,
+            &sizeAlignment)))
     {
-        // Note: we cannot defer loads from RWStructuredBuffer because there can be other
-        // instructions that modify the buffer.
+        // If type contains fields that we don't know how to compute natural size
+        // for, default to specialize if it contains arrays.
+        return isCompositeTypeContainingArrays(type);
+    }
+
+    // If the argument is very small, don't bother specializing.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationMinThreshold)
+        return false;
+
+    // If the argument is somewhat small, don't specialize, unless it contains
+    // arrays.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationThreshold)
+    {
+        // We generally do not specialize for small values, except it contains
+        // arrays that usually present a challenge for the SROA pass to eliminate
+        // unnecessary loads.
+        if (!isCompositeTypeContainingArrays(type))
+            return false;
+    }
+    return true;
+}
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+// This method is currently implementing a very conservative analysis that only allows
+// `loadInst` to be in the same block as `userInst`, with basic aliasing analysis for any
+// stores in between. All other cases are conservatively treated as the memory location may be
+// modified.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst)
+{
+    auto func = getParentFunc(loadInst);
+    if (!func)
+        return false;
+
+    // For now we only check if loadInst and userInst are in the same block.
+    if (loadInst->getParent() != userInst->getParent())
+        return false;
+
+    for (IRInst* inst = loadInst->getNextInst(); inst; inst = inst->getNextInst())
+    {
+        // We found callInst before hitting any instruction that may modify the memory.
+        if (inst == userInst)
+            return true;
+
+        if (!inst->mightHaveSideEffects())
+            continue;
+
+        // If we see any inst that has side effect, check if it is simple case that we can rule
+        // out the possibility of modifying the memory location.
         switch (inst->getOp())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            return true;
-        case kIROp_Load:
+        case kIROp_Store:
             {
-                auto rootAddr = getRootAddr(inst->getOperand(0));
-                return isPointerToImmutableLocation(rootAddr);
+                auto storedDest = inst->getOperand(0);
+                if (canAddressesPotentiallyAlias(target, func, loadInst->getOperand(0), storedDest))
+                    return false;
+                continue;
             }
         default:
+            // For any other case, conservatively assume the memory location may be modified.
             return false;
         }
     }
+    // We didn't found callInst after loadInst within the same basic block.
+    // We conservatively assume the memory location may be modified.
+    // This check can be extended to use the dominator tree to allow
+    // loadInst and userInst to be in different blocks.
+    return false;
+}
 
-    // Ensure that for a pointer value, we have created a load instruction to materialize the value.
-    IRInst* materializePointer(IRBuilder& builder, IRInst* loadInst)
+struct DeferBufferLoadContext
+{
+    CodeGenContext* codeGenContext;
+
+
+    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
     {
-        auto ptr = ensurePtr(loadInst);
-        if (!ptr)
-            return nullptr;
-        IRInst* result = nullptr;
-        if (mapPtrToValue.tryGetValue(ptr, result))
-            return result;
-        IRAlignedAttr* align = nullptr;
-        if (auto load = as<IRLoad>(loadInst))
-            align = load->findAttr<IRAlignedAttr>();
-        if (!as<IRModuleInst>(ptr->getParent()))
+        // Don't defer the load anymore if the type is simple.
+        if (!isTypePreferrableToDeferLoad(codeGenContext, loadInst->getDataType()) ||
+            loadInst->findAttr<IRAlignedAttr>())
         {
-            setInsertAfterOrdinaryInst(&builder, ptr);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            mapPtrToValue[ptr] = result;
+            return;
         }
-        else
+
+        auto rootAddr = getRootAddr(loadInst->getOperand(0));
+        bool isImmutableBufferLoad = isPointerToImmutableLocation(rootAddr);
+
+        // Don't defer the load if there are uses that are not getElement or fieldExtract.
+        // Because in this case we need to use the entire loaded value, and further deferring
+        // the load down any access chain will introduce redundant loads.
+        for (auto use = loadInst->firstUse; use; use = use->nextUse)
         {
-            setInsertBeforeOrdinaryInst(&builder, loadInst);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            // Since we are inserting the load in a local scope, we can't register
-            // the mapping to the pointer, since the global pointer needs to be
-            // loaded once per function.
+            auto user = use->getUser();
+            switch (user->getOp())
+            {
+            case kIROp_GetElement:
+            case kIROp_FieldExtract:
+                // Can we defer the load to load only the requested element right before
+                // the element extract inst?
+                // If the buffer is immutable, we can always do that.
+                // If it is not, we need to make sure there is no other instructions that can modify
+                // the buffer between the load and the use.
+                //
+                if (isImmutableBufferLoad)
+                    continue;
+                if (isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codeGenContext->getTargetReq(),
+                        loadInst,
+                        user))
+                    continue;
+                return;
+            default:
+                // If we see any other use the laod instruction, we assume the entire loaded value
+                // is needed, and we can't defer the load anymore.
+                return;
+            }
         }
-        return result;
-    }
 
-    static bool isSimpleType(IRInst* type)
-    {
-        if (auto modType = as<IRRateQualifiedType>(type))
-            type = modType->getValueType();
-        if (as<IRStructType>(type))
-            return false;
-        if (as<IRTupleType>(type))
-            return false;
-        if (as<IRArrayTypeBase>(type))
-            return false;
-        return true;
-    }
+        // If we reach here, it means all uses are getElement or fieldExtract, and
+        // it is safe to defer the load down the access chain.
 
-    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
-    {
-        // Don't defer the load anymore if the type is simple.
-        if (isSimpleType(loadInst->getDataType()) || loadInst->findAttr<IRAlignedAttr>())
+        if (loadInst->getOp() == kIROp_StructuredBufferLoad)
         {
-            auto materializedVal = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(materializedVal);
-            loadInst->replaceUsesWith(materializedVal);
-            return;
+            // Convert the structuredBufferLoad to a regular load to reuse
+            // the same logic for deferring regular loads.
+            builder.setInsertBefore(loadInst);
+            auto bufferPtr = builder.emitRWStructuredBufferGetElementPtr(
+                loadInst->getOperand(0),
+                loadInst->getOperand(1));
+            auto sbLoad = builder.emitLoad(bufferPtr);
+            loadInst->transferDecorationsTo(sbLoad);
+            loadInst->replaceUsesWith(sbLoad);
+            loadInst->removeAndDeallocate();
+            loadInst = sbLoad;
         }
 
         // Otherwise, look for all uses and try to defer the load before actual use of the value.
@@ -148,19 +217,29 @@ struct DeferBufferLoadContext
             loadInst,
             [&](IRUse* use)
             {
-                if (needMaterialize)
-                    return;
-
                 auto user = use->getUser();
+
                 switch (user->getOp())
                 {
                 case kIROp_GetElement:
                 case kIROp_FieldExtract:
                     {
-                        auto basePtr = ensurePtr(loadInst);
-                        if (!basePtr)
-                            return;
-                        pendingWorkList.add(user);
+                        // If we see a getElement or fieldExtract, we defer the load by
+                        // replacing the getElement/fieldExtract with a load of the
+                        // elementAddr/fieldAddr.
+                        builder.setInsertBefore(user);
+                        auto basePtr = loadInst->getOperand(0);
+                        IRInst* gepArg = user->getOperand(1);
+                        auto elementPtr = builder.emitElementAddress(
+                            basePtr,
+                            makeArrayViewSingle<IRInst*>(gepArg));
+                        auto newLoad = builder.emitLoad(elementPtr);
+                        user->transferDecorationsTo(newLoad);
+                        user->replaceUsesWith(newLoad);
+                        user->removeAndDeallocate();
+
+                        // Now add the new load to work list to try to defer it further.
+                        pendingWorkList.add(newLoad);
                     }
                     break;
                 default:
@@ -169,41 +248,37 @@ struct DeferBufferLoadContext
                 }
             });
 
-        if (needMaterialize)
-        {
-            auto val = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(val);
-            loadInst->replaceUsesWith(val);
-            loadInst->removeAndDeallocate();
-        }
-        else
-        {
-            // Append to worklist in reverse order so we process the uses in natural appearance
-            // order.
-            for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
-                workList.add(pendingWorkList[i]);
-        }
+        // Append to worklist in reverse order so we process the uses in natural appearance
+        // order.
+        for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
+            workList.add(pendingWorkList[i]);
     }
 
     void deferBufferLoadInFunc(IRFunc* func)
     {
         removeRedundancyInFunc(func, false);
 
-        currentFunc = func;
-
         List<IRInst*> workList;
 
+        // Discover all load instructions and add to work list.
+
         for (auto block : func->getBlocks())
         {
             for (auto inst : block->getChildren())
             {
-                if (isImmutableBufferLoad(inst))
+                switch (inst->getOp())
                 {
+                case kIROp_Load:
+                case kIROp_StructuredBufferLoad:
+                    // Note: We don't handle `kIROp_StructuredBufferLoadStatus` here because
+                    // it also writes to the status code out parameter, which we can't defer.
                     workList.add(inst);
+                    break;
                 }
             }
         }
 
+        // Iteratively process the work list until it is empty.
         IRBuilder builder(func);
         for (Index i = 0; i < workList.getCount(); i++)
         {
@@ -227,9 +302,10 @@ struct DeferBufferLoadContext
     }
 };
 
-void deferBufferLoad(IRModule* module)
+void deferBufferLoad(CodeGenContext* codeGenContext, IRModule* module)
 {
     DeferBufferLoadContext context;
+    context.codeGenContext = codeGenContext;
     for (auto childInst : module->getGlobalInsts())
     {
         if (auto code = as<IRGlobalValueWithCode>(childInst))
diff --git a/source/slang/slang-ir-defer-buffer-load.h b/source/slang/slang-ir-defer-buffer-load.h
index b54271883..0f692b39a 100644
--- a/source/slang/slang-ir-defer-buffer-load.h
+++ b/source/slang/slang-ir-defer-buffer-load.h
@@ -4,9 +4,8 @@ namespace Slang
 {
 
 /*
-This pass implements a targeted optimization that defers the loading of structured buffer elements
-to the end of the access chain to avoid loading and repacking unnecessary data.
-For example, if we see:
+This pass implements a intra-function optimization that defers the loading of buffer
+elements to the end of the access chain to avoid loading unnecessary data. For example, if we see:
     val = StructuredBufferLoad(s, i)
     val2 = GetElement(val, j)
     val3 = FieldExtract(val2, field_key_0)
@@ -20,7 +19,22 @@ We should rewrite the code into:
 */
 
 struct IRModule;
+struct IRType;
+struct CodeGenContext;
+struct IRInst;
+class TargetRequest;
 
-void deferBufferLoad(IRModule* module);
+void deferBufferLoad(CodeGenContext* context, IRModule* module);
+
+// Returns true if the type is suitable for defer-load optimization.
+// Generally, we want to defer loading large structs or composites that contain arrays.
+bool isTypePreferrableToDeferLoad(CodeGenContext* context, IRType* type);
+
+// Returns true if memory loaded by `loadInst` may be modified before `userInst` after it is
+// loaded.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst);
 
 } // namespace Slang
diff --git a/source/slang/slang-ir-defunctionalization.cpp b/source/slang/slang-ir-defunctionalization.cpp
index af84ec78a..424971f90 100644
--- a/source/slang/slang-ir-defunctionalization.cpp
+++ b/source/slang/slang-ir-defunctionalization.cpp
@@ -12,7 +12,7 @@ struct FunctionParameterSpecializationCondition : FunctionCallSpecializeConditio
 {
     TargetRequest* targetRequest = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/, IRCall* /*callInst*/)
     {
         IRType* type = param->getDataType();
         return as<IRFuncType>(type);
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index a79ca2379..d87d96da0 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2694,7 +2694,10 @@ static void legalizeMeshPayloadInputParam(
     pp->replaceUsesWith(g);
     struct MeshPayloadInputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
@@ -2794,7 +2797,10 @@ static void legalizeMeshOutputParam(
     // pp is only removed later on, so sadly we have to keep it around for now
     struct MeshOutputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
diff --git a/source/slang/slang-ir-metal-legalize.cpp b/source/slang/slang-ir-metal-legalize.cpp
index e66617e72..e91da136a 100644
--- a/source/slang/slang-ir-metal-legalize.cpp
+++ b/source/slang/slang-ir-metal-legalize.cpp
@@ -172,7 +172,7 @@ struct MetalAddressSpaceAssigner : InitialAddressSpaceAssigner
         {
             if (ptrType->hasAddressSpace())
                 return ptrType->getAddressSpace();
-            return AddressSpace::Global;
+            return AddressSpace::Generic;
         }
         return AddressSpace::Generic;
     }
diff --git a/source/slang/slang-ir-specialize-address-space.cpp b/source/slang/slang-ir-specialize-address-space.cpp
index c4a155eec..04792bd8b 100644
--- a/source/slang/slang-ir-specialize-address-space.cpp
+++ b/source/slang/slang-ir-specialize-address-space.cpp
@@ -131,7 +131,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
     bool processFunction(IRFunc* func)
     {
         bool retValAddrSpaceChanged = false;
-        Dictionary<IRInst*, AddressSpace> mapVarValueToAddrSpace;
         bool changed = true;
         while (changed)
         {
@@ -152,18 +151,23 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         continue;
                     }
 
-                    // If the inst already has a pointer type with explicit address space, then use
-                    // it.
-                    if (auto ptrType = as<IRPtrTypeBase>(inst->getDataType()))
+                    // If the inst already has a pointer/pointer-like type with explicit address
+                    // space, then use it.
+                    auto addrSpaceFromType =
+                        addrSpaceAssigner->getAddressSpaceFromVarType(inst->getDataType());
+                    if (addrSpaceFromType != AddressSpace::Generic)
                     {
-                        if (ptrType->hasAddressSpace())
-                        {
-                            mapInstToAddrSpace[inst] = ptrType->getAddressSpace();
+                        mapInstToAddrSpace[inst] = addrSpaceFromType;
+                        changed = true;
+
+                        // Don't return early if the inst itself is a call, as we may still need to
+                        // specialize it down below.
+                        if (inst->getOp() != kIROp_Call)
                             continue;
-                        }
                     }
 
-                    // Otherwise, try to assign an address space based on the instruction type.
+                    // Try to assign an address space based on the instruction type, and specialize
+                    // calls.
                     switch (inst->getOp())
                     {
                     case kIROp_Var:
@@ -195,15 +199,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         }
                         break;
                     case kIROp_Store:
-                        {
-                            auto addrSpace = getAddrSpace(inst->getOperand(1));
-                            if (addrSpace != AddressSpace::Generic)
-                            {
-                                mapVarValueToAddrSpace[inst->getOperand(0)] = addrSpace;
-                                mapInstToAddrSpace[inst] = addrSpace;
-                                changed = true;
-                            }
-                        }
                         break;
                     case kIROp_Param:
                         if (!isFirstBlock)
@@ -243,8 +238,9 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                                 for (UInt i = 0; i < callInst->getArgCount(); i++)
                                 {
                                     auto arg = callInst->getArg(i);
-                                    argAddrSpaces.add(getAddrSpace(arg));
-                                    if (as<IRPtrTypeBase>(arg->getDataType()))
+                                    auto addrSpace = getAddrSpace(arg);
+                                    argAddrSpaces.add(addrSpace);
+                                    if (addrSpace != AddressSpace::Generic)
                                     {
                                         hasSpecializableArg = true;
                                     }
@@ -477,8 +473,13 @@ void propagateAddressSpaceFromInsts(List<IRInst*>&& workList)
     }
 }
 
-AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst*)
+AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst* type)
 {
+    if (auto ptrType = as<IRPtrTypeBase>(type))
+    {
+        if (ptrType->hasAddressSpace())
+            return ptrType->getAddressSpace();
+    }
     return AddressSpace::Generic;
 }
 
diff --git a/source/slang/slang-ir-specialize-arrays.cpp b/source/slang/slang-ir-specialize-arrays.cpp
index 4a4a72ee9..edb6cfa28 100644
--- a/source/slang/slang-ir-specialize-arrays.cpp
+++ b/source/slang/slang-ir-specialize-arrays.cpp
@@ -11,38 +11,14 @@ namespace Slang
 struct ArrayParameterSpecializationCondition : FunctionCallSpecializeCondition
 {
     // This pass is intended to specialize functions
-    // with struct parameters that has array fields
-    // to avoid performance problems for GLSL targets.
-    // Returns true if `type` is an `IRStructType` with array-typed fields.
-    // It will also specialize functions with unsized array parameters into
-    // sized arrays, if the function is called with an argument that has a
-    // sized array type.
+    // with unsized array parameter called with a sized-array argument.
     //
-    bool isStructTypeWithArray(IRType* type)
-    {
-        if (auto structType = as<IRStructType>(type))
-        {
-            for (auto field : structType->getFields())
-            {
-                if (const auto arrayType = as<IRArrayType>(field->getFieldType()))
-                {
-                    return true;
-                }
-                if (auto subStructType = as<IRStructType>(field->getFieldType()))
-                {
-                    if (isStructTypeWithArray(subStructType))
-                        return true;
-                }
-            }
-        }
-        return false;
-    }
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
+        SLANG_UNUSED(param);
         SLANG_UNUSED(arg);
-        if (isKhronosTarget(codeGenContext->getTargetReq()))
-            return isStructTypeWithArray(param->getDataType());
+        SLANG_UNUSED(callInst);
         return false;
     }
 
diff --git a/source/slang/slang-ir-specialize-buffer-load-arg.cpp b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
index 905f2e058..a5a3dd2d9 100644
--- a/source/slang/slang-ir-specialize-buffer-load-arg.cpp
+++ b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
@@ -1,8 +1,11 @@
 // slang-ir-specialize-buffer-load-arg.cpp
 #include "slang-ir-specialize-buffer-load-arg.h"
 
+#include "slang-ir-defer-buffer-load.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-specialize-function-call.h"
+#include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
@@ -17,76 +20,115 @@ namespace Slang
 // As swith most of our IR passes, we encapsulate the logic here in a context
 // type so that the data that needs to be shared throughout the pass can
 // be conveniently scoped.
+//
+
+// Note that this pass also ensures other more contrived cases are properly
+// handled. For example:
+//
+// * A load of a large structure from field in a constant buffer, so that
+//   the value loaded is not the entire buffer contents.
+//
+// * A load of a large structure from a structured buffer, or any other kind
+//   of buffer that requires an index.
+//
 
 struct FuncBufferLoadSpecializationCondition : FunctionCallSpecializeCondition
 {
     typedef FunctionCallSpecializeCondition Super;
 
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    CodeGenContext* codegenContext;
+
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         // We only want to specialize for `struct` types and not base types.
         //
-        // TODO: We might want to consider some criteria here for the "large-ness"
-        // of a structure (in terms of bytes and/or fields), so that we don't
-        // eliminate loads of sufficiently small types (which are cheap to pass
-        // by value).
-        //
-        auto paramType = param->getDataType();
-        if (!as<IRStructType>(paramType))
+        auto paramType = (IRType*)unwrapAttributedType(param->getDataType());
+        if (!isTypePreferrableToDeferLoad(codegenContext, paramType))
             return false;
 
-        // We also only want to specialize for arguments that are a load
-        // from some kind of global shader parameter.
+        // We want to handle loads from arbitrary access chains rooting from a shader parameter.
         //
         IRInst* a = arg;
-        if (auto argLoad = as<IRLoad>(arg))
-        {
-            a = argLoad->getPtr();
-        }
-        else
+        for (;;)
         {
-            return false;
-        }
+            // A user pointer can be directly passed into the function, so we no
+            // longer need to trace up further.
+            if (isUserPointerType(a->getDataType()))
+                break;
 
-        // We want to handle loads from a shader parameter that is an array
-        // of buffers, and not just a single global buffer.
-        //
-        while (auto argGetElement = as<IRGetElement>(a))
-        {
-            a = argGetElement->getBase();
+            if (auto argGetElement = as<IRGetElement>(a))
+            {
+                a = argGetElement->getBase();
+            }
+            else if (auto argSbLoad = as<IRStructuredBufferLoad>(a))
+            {
+                a = argSbLoad->getOperand(0);
+            }
+            else if (auto argBbLoad = as<IRByteAddressBufferLoad>(a))
+            {
+                a = argBbLoad->getOperand(0);
+            }
+            else if (auto argFieldExtract = as<IRFieldExtract>(a))
+            {
+                a = argFieldExtract->getBase();
+            }
+            else if (auto argGetElementPtr = as<IRGetElementPtr>(a))
+            {
+                a = argGetElementPtr->getBase();
+            }
+            else if (auto argSBGetElementPtr = as<IRRWStructuredBufferGetElementPtr>(a))
+            {
+                a = argSBGetElementPtr->getBase();
+            }
+            else if (auto argFieldAddr = as<IRFieldAddress>(a))
+            {
+                a = argFieldAddr->getBase();
+            }
+            else if (auto argLoad = as<IRLoad>(a))
+            {
+                a = argLoad->getPtr();
+
+                // We can safely defer a load to the callee if the source dest is immutable.
+                if (isPointerToImmutableLocation(a))
+                    continue;
+
+                // Otherwise, we check if there is no other instructions in between the load and the
+                // call that can modify the memory location. If so, we can still safely defer the
+                // load to the callee.
+                if (!isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codegenContext->getTargetReq(),
+                        argLoad,
+                        callInst))
+                    return false;
+            }
+            else
+            {
+                break;
+            }
         }
 
-        // The "root" of the parameter must be a reference to a global-scope
-        // shader parameter, so that we know we can substitute it into the callee.
+        // The "root" of the parameter must be one of the following:
+        // 1. A reference to a global-scope shader parameter that can be referenced directly from
+        //    the callee.
+        // 2. A user pointer or bindless resource handle that can be passed to the callee as
+        //    ordinary argument.
         //
         if (const auto argGlobalParam = as<IRGlobalParam>(a))
         {
             return true;
         }
-        else
+        else if (isUserPointerType(a->getDataType()) || as<IRCastDescriptorHandleToResource>(a))
         {
-            return false;
+            return true;
         }
-
-        // TODO: There are other patterns that we could attempt to optimize here.
-        // For example, this logic only handles loads of the *entire* contents of
-        // a buffer, so it would miss:
-        //
-        // * A load of a large structure from field in a constant buffer, so that
-        //   the value loaded is not the entire buffer contents.
-        //
-        // * A load of a large structure from a structured buffer, or any other kind
-        //   of buffer that requires an index.
-        //
-        // * Any resource load that is not expressed at the IR level with a `load`
-        //   instruction (e.g., those that might use an intrinsic function).
-        //
+        return false;
     }
 };
 
 void specializeFuncsForBufferLoadArgs(CodeGenContext* codegenContext, IRModule* module)
 {
     FuncBufferLoadSpecializationCondition condition;
+    condition.codegenContext = codegenContext;
     specializeFunctionCalls(codegenContext, module, &condition);
 }
 
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 7c82891a6..aead69258 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -40,6 +40,12 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         if (as<IRGlobalValueWithCode>(arg))
             return true;
 
+        if (isUserPointerType(arg->getDataType()))
+            return true;
+
+        if (as<IRCastDescriptorHandleToResource>(arg))
+            return true;
+
         // As we will see later, we can also
         // specialize a call when the argument
         // is the result of indexing into an
@@ -47,17 +53,29 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         // of the indexing operation is also
         // suitable for specialization.
         //
-        if (arg->getOp() == kIROp_GetElement || arg->getOp() == kIROp_Load)
+        switch (arg->getOp())
         {
-            auto base = arg->getOperand(0);
-
-            // We will "recurse" on the base of
-            // the indexing operation by continuing
-            // our loop with the `base` as our new
-            // argument.
-            //
-            arg = base;
-            continue;
+        case kIROp_GetElement:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+        case kIROp_GetElementPtr:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+        case kIROp_Load:
+            {
+                auto base = arg->getOperand(0);
+
+                // We will "recurse" on the base of
+                // the indexing operation by continuing
+                // our loop with the `base` as our new
+                // argument.
+                //
+                arg = base;
+                continue;
+            }
+        default:
+            break;
         }
 
         // By default, we will *not* consider an argument
@@ -225,7 +243,7 @@ struct FunctionParameterSpecializationContext
             // If neither the parameter nor the argument wants specialization,
             // then we need to keep looking.
             //
-            auto paramWantSpecialization = doesParamWantSpecialization(param, arg);
+            auto paramWantSpecialization = doesParamWantSpecialization(param, arg, call);
             auto paramTypeWantSpecialization = doesParamTypeWantSpecialization(param, arg);
             if (!paramWantSpecialization && !paramTypeWantSpecialization)
                 continue;
@@ -255,9 +273,9 @@ struct FunctionParameterSpecializationContext
     // Of course, now we need to back-fill the predicates that
     // the above function used to evaluate prameters and arguments.
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
-        return condition->doesParamWantSpecialization(param, arg);
+        return condition->doesParamWantSpecialization(param, arg, callInst);
     }
 
     bool doesParamTypeWantSpecialization(IRParam* param, IRInst* arg)
@@ -484,16 +502,20 @@ struct FunctionParameterSpecializationContext
             UInt oldArgIndex = oldArgCounter++;
             auto oldArg = oldCall->getArg(oldArgIndex);
 
-            getCallInfoForParam(callInfo, oldParam, oldArg);
+            getCallInfoForParam(callInfo, oldParam, oldArg, oldCall);
         }
     }
 
-    void getCallInfoForParam(CallSpecializationInfo& ioInfo, IRParam* oldParam, IRInst* oldArg)
+    void getCallInfoForParam(
+        CallSpecializationInfo& ioInfo,
+        IRParam* oldParam,
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // We know that the case where the parameter
         // and argument don't want specialization is easy.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The new call site will use the same argument
             // value as the old one, and we don't need
@@ -546,7 +568,15 @@ struct FunctionParameterSpecializationContext
             // Similarly for other global constants
             ioInfo.key.vals.add(globalConstant);
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If the arg is a user pointer, we can pass it as an ordinary argument,
+            // and we won't need further tracing down the access chain.
+            //
+            ioInfo.key.vals.add(oldArg->getFullType());
+            ioInfo.newArgs.add(oldArg);
+        }
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the `oldArg` is
             // in the form `oldBase[oldIndex]`
@@ -587,19 +617,45 @@ struct FunctionParameterSpecializationContext
 
             ioInfo.newArgs.add(oldIndex);
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the `oldArg` is
+            // in the form `oldBase.structKey`
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // Similar to the getElement case, we recursively setting up whatever
+            // `oldBase` needs first.
+            //
+            getCallInfoForArg(ioInfo, oldBase);
+
+            // The main difference from the `getElement` case is we actually want
+            // the structKey to be in the specialization key because it will be baked
+            // into the specialized function.
+            // And we won't introduce a new parameter to hold the index.
+            //
+            ioInfo.key.vals.add(structKey);
+        }
         else if (oldArg->getOp() == kIROp_Load)
         {
             auto oldBase = oldArg->getOperand(0);
             getCallInfoForArg(ioInfo, oldBase);
         }
+        else if (oldArg->getOp() == kIROp_CastDescriptorHandleToResource)
+        {
+            // We are accessing a resource from a bindless handle.
+            // We can stop recursion here and just pass in the bindless handle as
+            // an argument.
+            auto oldBase = oldArg->getOperand(0);
+            ioInfo.key.vals.add(oldBase->getFullType());
+            ioInfo.newArgs.add(oldBase);
+        }
         else
         {
             // If we fail to match any of the cases above
-            // then a precondition was violated in that
-            // `isArgSuitableForSpecialization` is allowing
-            // a case that this routine is not covering.
-            //
-            SLANG_UNEXPECTED("mising case in 'getCallInfoForArg'");
+            // then the `SpecializeCondition` is letting through constructs that we cannot handle.
+            SLANG_UNEXPECTED("unexpected function call specialization argument form.");
         }
     }
 
@@ -641,7 +697,7 @@ struct FunctionParameterSpecializationContext
             // will stand in for the parameter in the specialized
             // function.
             //
-            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg);
+            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg, oldCall);
 
             // We will collect the replacement value to use
             // for each of the original parameters in an array.
@@ -681,12 +737,13 @@ struct FunctionParameterSpecializationContext
     IRInst* getSpecializedValueForParam(
         FuncSpecializationInfo& ioInfo,
         IRParam* oldParam,
-        IRInst* oldArg)
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // As always, the easy case is when the parameter of
         // the original function doesn't need specialization.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The specialized callee will need a new parameter
             // that fills the same role as the old one, so we
@@ -718,6 +775,36 @@ struct FunctionParameterSpecializationContext
         }
     }
 
+    // Returns true if `inst` is an instruction that accesses an element from an array or a buffer.
+    //
+    static bool isElementAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_GetElementPtr:
+        case kIROp_GetElement:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+            return true;
+        }
+        return false;
+    }
+
+    // Returns true if `inst` is an instruction that accesses a field from a struct, that is
+    // either a FieldAddress or FieldExtract.
+    //
+    static bool isFieldAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+            return true;
+        }
+        return false;
+    }
+
     IRInst* getSpecializedValueForArg(FuncSpecializationInfo& ioInfo, IRInst* oldArg)
     {
         // The logic here parallels `gatherCallInfoForArg`,
@@ -735,13 +822,24 @@ struct FunctionParameterSpecializationContext
             //
             return globalParam;
         }
+        if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If argument is a user pointer, we can pass it into the callee
+            // directly as an oridinary argument without further specializing
+            // for the access chain beyond the pointer.
+            //
+            auto builder = getBuilder();
+            auto newParam = builder->createParam(oldArg->getFullType());
+            ioInfo.newParams.add(newParam);
+            return newParam;
+        }
         if (auto globalFunc = as<IRGlobalValueWithCode>(oldArg))
         {
             // As above, the identity of the specialized function is sufficient
             // to resolve the uses
             return globalFunc;
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the argument is
             // in the form `oldBase[oldIndex]`.
@@ -801,7 +899,9 @@ struct FunctionParameterSpecializationContext
             // of things, and then inserted to a more permanent location later.
             //
             builder->setInsertLoc(IRInsertLoc());
-            auto newVal = builder->emitElementExtract(oldArg->getFullType(), newBase, newIndex);
+            IRInst* newOperands[] = {newBase, newIndex};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
 
             // Because our new instruction wasn't
             // actually inserted anywhere, we need to
@@ -813,6 +913,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the argument is
+            // in the form `oldBase.structKey`.
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // We handle this case in a similar way as the `oldBase[oldIndex]`
+            // case, except that we don't need to introduce a new parameter
+            // for the index, since the struct key is known at compile-time.
+            auto newBase = getSpecializedValueForArg(ioInfo, oldBase);
+
+            auto builder = getBuilder();
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newBase, structKey};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
+
+            ioInfo.newBodyInsts.add(newVal);
+
+            return newVal;
+        }
         else if (auto oldArgLoad = as<IRLoad>(oldArg))
         {
             auto oldPtr = oldArgLoad->getPtr();
@@ -825,15 +949,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (auto castHandleToResource = as<IRCastDescriptorHandleToResource>(oldArg))
+        {
+            // We are accessing a resource from a bindless handle.
+            // We should create a param for the handle, and load the resource from the param.
+            auto builder = getBuilder();
+            auto oldHandle = castHandleToResource->getOperand(0);
+            auto newHandle = builder->createParam(oldHandle->getFullType());
+            ioInfo.newParams.add(newHandle);
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newHandle};
+            auto newVal = builder->emitIntrinsicInst(
+                oldArg->getFullType(),
+                kIROp_CastDescriptorHandleToResource,
+                1,
+                newOperands);
+            ioInfo.newBodyInsts.add(newVal);
+            return newVal;
+        }
         else
         {
             // If we don't match one of the above cases,
-            // then `isArgSuitableForSpecialization` is
-            // letting through cases that this function
-            // hasn't been updated to handle.
-            //
-            SLANG_UNEXPECTED("mising case in 'getSpecializedValueForArg'");
-            UNREACHABLE_RETURN(nullptr);
+            // then we are running into an invalid case.
+            SLANG_UNEXPECTED("unknown argument form for function call specialization.");
         }
     }
 
diff --git a/source/slang/slang-ir-specialize-function-call.h b/source/slang/slang-ir-specialize-function-call.h
index bab4ce2f4..afb8c2365 100644
--- a/source/slang/slang-ir-specialize-function-call.h
+++ b/source/slang/slang-ir-specialize-function-call.h
@@ -7,12 +7,14 @@ struct CodeGenContext;
 struct IRInst;
 struct IRModule;
 struct IRParam;
+struct IRCall;
+
 class Module;
 
 class FunctionCallSpecializeCondition
 {
 public:
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg) = 0;
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst) = 0;
 
     virtual bool isParamSuitableForSpecialization(IRParam* param, IRInst* arg);
 
diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp
index 871ba2c24..0ac08236f 100644
--- a/source/slang/slang-ir-specialize-resources.cpp
+++ b/source/slang/slang-ir-specialize-resources.cpp
@@ -20,9 +20,10 @@ struct ResourceParameterSpecializationCondition : FunctionCallSpecializeConditio
     TargetRequest* targetRequest = nullptr;
     TargetProgram* targetProgram = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         SLANG_UNUSED(arg);
+        SLANG_UNUSED(callInst);
 
         // Whether or not a parameter needs specialization is really
         // a function of its type:
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 8584ea95e..551a72fc7 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -17,6 +17,14 @@ bool isPointerOfType(IRInst* type, IROp opCode)
     return false;
 }
 
+bool isUserPointerType(IRInst* type)
+{
+    auto ptrType = as<IRPtrType>(type);
+    if (!ptrType)
+        return false;
+    return ptrType->getAddressSpace() == AddressSpace::UserPointer;
+}
+
 IRType* getVectorElementType(IRType* type)
 {
     if (auto vectorType = as<IRVectorType>(type))
@@ -792,35 +800,212 @@ IRInst* getRootAddr(IRInst* addr, List<IRInst*>& outAccessChain, List<IRInst*>*
     return addr;
 }
 
-// A simple and conservative address aliasing check.
-bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+IRInst* getRootBufferOrAddr(IRInst* addr)
 {
-    if (addr1 == addr2)
-        return true;
+    auto rootAddr = getRootAddr(addr);
+    if (as<IRRWStructuredBufferGetElementPtr>(rootAddr))
+    {
+        auto bufferHandle = rootAddr->getOperand(0);
+        // Check if the bufferHandle itself is a load from a global parameter.
+        if (auto load = as<IRLoad>(bufferHandle))
+        {
+            auto newRoot = getRootAddr(load->getPtr());
+            if (newRoot->getOp() == kIROp_GlobalParam)
+                return newRoot;
+        }
+    }
+    return rootAddr;
+}
+
+// The aliasing class of an address. This is used to determine
+// if two addresses may alias.
+enum class AddressAliasingClass
+{
+    Unknown,
+    UserPointer,      // A user pointer into global memory
+    Var,              // A thread-local or groupshared var.
+    ConstantBuffer,   // A constant buffer or parameter block.
+    BoundBuffer,      // A bound buffer.
+    BoundTexture,     // A bound texture resource.
+    DescriptorHandle, // A bindless buffer or resource.
+};
+
+AddressAliasingClass getAliasingClass(IRInst* addr)
+{
+    if (auto globalParam = as<IRGlobalParam>(addr))
+    {
+        auto type = unwrapArray(globalParam->getDataType());
+        if (!type)
+            return AddressAliasingClass::Unknown;
+        switch (type->getOp())
+        {
+        case kIROp_TextureType:
+            return AddressAliasingClass::BoundTexture;
+        case kIROp_HLSLStructuredBufferType:
+        case kIROp_HLSLRWStructuredBufferType:
+        case kIROp_HLSLAppendStructuredBufferType:
+        case kIROp_HLSLConsumeStructuredBufferType:
+        case kIROp_HLSLRasterizerOrderedStructuredBufferType:
+        case kIROp_HLSLByteAddressBufferType:
+        case kIROp_HLSLRWByteAddressBufferType:
+        case kIROp_HLSLRasterizerOrderedByteAddressBufferType:
+        case kIROp_GLSLShaderStorageBufferType:
+            return AddressAliasingClass::BoundBuffer;
+        case kIROp_ConstantBufferType:
+        case kIROp_ParameterBlockType:
+            return AddressAliasingClass::ConstantBuffer;
+        case kIROp_PtrType:
+            if (isUserPointerType(type))
+                return AddressAliasingClass::UserPointer;
+            return AddressAliasingClass::Unknown;
+        case kIROp_DynamicResourceType:
+            return AddressAliasingClass::DescriptorHandle;
+        default:
+            return AddressAliasingClass::Unknown;
+        }
+    }
+    else if (as<IRVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRGlobalVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRRWStructuredBufferGetElementPtr>(addr))
+        return AddressAliasingClass::DescriptorHandle;
+    else if (as<IRCastDescriptorHandleToResource>(addr))
+        return AddressAliasingClass::DescriptorHandle;
 
-    // Two variables can never alias.
-    addr1 = getRootAddr(addr1);
-    addr2 = getRootAddr(addr2);
+    auto type = addr->getDataType();
+    if (isUserPointerType(type))
+        return AddressAliasingClass::UserPointer;
+    return AddressAliasingClass::Unknown;
+}
 
-    // Global addresses can alias with anything.
-    if (!isChildInstOf(addr1, func))
+bool canAddrClassesAlias(AddressAliasingClass c1, AddressAliasingClass c2)
+{
+    if (c1 == AddressAliasingClass::Unknown || c2 == AddressAliasingClass::Unknown)
         return true;
 
-    if (!isChildInstOf(addr2, func))
+    switch (c1)
+    {
+    case AddressAliasingClass::Unknown:
         return true;
+    case AddressAliasingClass::UserPointer:
+    case AddressAliasingClass::Var:
+        // A users pointer or var can only alias with another
+        // object that is either a user pointer or var.
+        //
+        // Generally, a var should never alias with anything else that isn't a var,
+        // if we never allow the user to take address of a local var.
+        // We don't allow taking addresses of a local var on most GPU targets, but
+        // we currently do expose an internal intrinsic to do so when targeting CPU.
+        // We should consider disallowing this across the board, or enable more aggresive
+        // criteria when targeting GPU backends.
+        // For now we stay conservative and just report true even when addr1 is var and
+        // addr2 is not rooted from a var.
+        //
+        return c2 == AddressAliasingClass::UserPointer || c2 == AddressAliasingClass::Var;
+    case AddressAliasingClass::BoundBuffer:
+    case AddressAliasingClass::BoundTexture:
+        // A bound resource can only alias with another
+        // object that is a bound resource or descriptor handle
+        return c2 == c1 || c2 == AddressAliasingClass::DescriptorHandle;
+
+    case AddressAliasingClass::DescriptorHandle:
+        // Can alias with any other resource.
+        switch (c2)
+        {
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::DescriptorHandle:
+            return true;
+        default:
+            return false;
+        }
+    case AddressAliasingClass::ConstantBuffer:
+        // Constant buffer cannot alias with anything.
+        return false;
+    }
+    // For any other unknown case, assume they may alias.
+    return true;
+}
+
+// Has `var` being used in a way that may allow it to alias with a user pointer?
+bool canVarAliasWithUserPointer(TargetRequest* target, IRInst* var)
+{
+    if (target && !isCPUTarget(target))
+    {
+        // We don't allow taking the address of a variable on anything other
+        // than the CPU target. Therefore a var can never alias with a user
+        // pointer on these targets.
+        return false;
+    }
+
+    SLANG_UNUSED(var);
+    return true;
+}
+
+// A simple and conservative address aliasing check.
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2)
+{
+    if (addr1 == addr2)
+        return true;
+
+    addr1 = getRootBufferOrAddr(addr1);
+    addr2 = getRootBufferOrAddr(addr2);
+
+    auto addr1Class = getAliasingClass(addr1);
+    auto addr2Class = getAliasingClass(addr2);
 
-    if (addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Var && addr1 != addr2)
+    if (!canAddrClassesAlias(addr1Class, addr2Class))
         return false;
 
+    if (addr1Class == addr2Class)
+    {
+        // For these classes of addresses, the identity of the root
+        // determines whether or not the addresse can alias.
+        // Note that we assume two different bound resources can never
+        // alias, and two different variables can never alias.
+        switch (addr1Class)
+        {
+        case AddressAliasingClass::Var:
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::ConstantBuffer:
+            if (addr1 != addr2)
+                return false;
+            break;
+        }
+    }
+
     // A param and a var can never alias.
     if (addr1->getOp() == kIROp_Param && addr1->getParent() == func->getFirstBlock() &&
             addr2->getOp() == kIROp_Var ||
         addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Param &&
             addr2->getParent() == func->getFirstBlock())
         return false;
+
+    // If one addr is user pointer and one addr is a var,
+    // they can never alias, if the user code never took the address of
+    // the var.
+    if (addr1Class == AddressAliasingClass::Var && addr2Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr1);
+    }
+    if (addr2Class == AddressAliasingClass::Var && addr1Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr2);
+    }
     return true;
 }
 
+bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+{
+    return canAddressesPotentiallyAlias(nullptr, func, addr1, addr2);
+}
+
 bool isPtrLikeOrHandleType(IRInst* type)
 {
     if (!type)
@@ -1141,15 +1326,15 @@ bool areCallArgumentsSideEffectFree(IRCall* call, SideEffectAnalysisOptions opti
             if (isBitSet(options, SideEffectAnalysisOptions::UseDominanceTree))
                 dom = module->findOrCreateDominatorTree(parentFunc);
 
-            // If the pointer argument is a local variable (thus can't alias with other addresses)
-            // and it is never read from in the function, we can safely treat the call as having
-            // no side-effect.
-            // This is a conservative test, but is sufficient to detect the most common case where
-            // a temporary variable is used as the inout argument and the result stored in the temp
-            // variable isn't being used elsewhere in the parent func.
+            // If the pointer argument is a local variable (thus can't alias with other
+            // addresses) and it is never read from in the function, we can safely treat the
+            // call as having no side-effect. This is a conservative test, but is sufficient to
+            // detect the most common case where a temporary variable is used as the inout
+            // argument and the result stored in the temp variable isn't being used elsewhere in
+            // the parent func.
             //
-            // A more aggresive test can check all other address uses reachable from the call site
-            // and see if any of them are aliasing with the argument.
+            // A more aggresive test can check all other address uses reachable from the call
+            // site and see if any of them are aliasing with the argument.
             for (auto use = arg->firstUse; use; use = use->nextUse)
             {
                 if (as<IRDecoration>(use->getUser()))
@@ -1323,8 +1508,8 @@ bool doesCalleeHaveSideEffect(IRInst* callee)
         }
     }
 
-    // If the callee has no side effect, check if any of its associated functions have side effect.
-    // If so, we want to keep the callee around.
+    // If the callee has no side effect, check if any of its associated functions have side
+    // effect. If so, we want to keep the callee around.
     //
     // Typically, once the relevant pass has completed, the association is removed,
     // and at that point we can remove the function.
@@ -2230,13 +2415,12 @@ void legalizeDefUse(IRGlobalValueWithCode* func)
                 !(as<IRVar>(inst) && loopHeaderBlockMap.containsKey(block)))
                 continue;
 
-            // Normally, if the common dominator is not `block`, we can simply move the definition
-            // to the common dominator.
-            // An exception is when the common dominator is the target block of a
-            // loop.
-            // Another exception is when a var in the loop condition block is accessed both inside
-            // and outside the loop. It is technically visible, but effects on the 'var' are not
-            // visible outside the loop, so we'll need to hoist it out of the loop.
+            // Normally, if the common dominator is not `block`, we can simply move the
+            // definition to the common dominator. An exception is when the common dominator is
+            // the target block of a loop. Another exception is when a var in the loop condition
+            // block is accessed both inside and outside the loop. It is technically visible,
+            // but effects on the 'var' are not visible outside the loop, so we'll need to hoist
+            // it out of the loop.
             //
             // Note that after normalization, loops are in the form of:
             // ```
@@ -2377,9 +2561,9 @@ bool canOperationBeSpecConst(IROp op, IRType* resultType, IRInst* const* fixedAr
     // Returns true for ops that can be declared as an operation under `OpSpecConstantOp`.
     //
     // Integer arithmetic and comparison operations can be `OpSpecConstantOp` with the `Shader`
-    // capability, while floating-point arithmetic and comparison operations require the `Kernel`
-    // capability. We only support `Shader` capability for now, return false when floating-point
-    // arithmetic/comparison is encountered.
+    // capability, while floating-point arithmetic and comparison operations require the
+    // `Kernel` capability. We only support `Shader` capability for now, return false when
+    // floating-point arithmetic/comparison is encountered.
     switch (op)
     {
     case kIROp_Add:
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index c0410fa3c..b8937d569 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -70,6 +70,8 @@ bool isPointerOfType(IRInst* ptrType, IRInst* elementType);
 // True if ptrType is a pointer type to a type of opCode
 bool isPointerOfType(IRInst* ptrType, IROp opCode);
 
+bool isUserPointerType(IRInst* type);
+
 // Builds a dictionary that maps from requirement key to requirement value for `interfaceType`.
 Dictionary<IRInst*, IRInst*> buildInterfaceRequirementDict(IRInterfaceType* interfaceType);
 
@@ -205,6 +207,12 @@ IRInst* getRootAddr(
 
 bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2);
 
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2);
+
 String dumpIRToString(
     IRInst* root,
     IRDumpOptions options = {IRDumpOptions::Mode::Simplified, IRDumpOptions::Flag::DumpDebugIds});
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
index 295b45c73..273e6dc58 100644
--- a/tests/cuda/copy-elision-this-1.slang
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -1,4 +1,6 @@
-//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda
+//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=PTX): -stage compute -entry computeMain -target cuda
+
 struct Data {
     StructuredBuffer<float> input[2];
     RWStructuredBuffer<float> output;
@@ -6,7 +8,9 @@ struct Data {
     StructuredBuffer<uint> index_buffer;
     uint index_count;
 
-    // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this
+    // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}})
+    // CUDA-NEXT: {
+    // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load
     float fetch(int buffer, int index)
     {
         return input[buffer][index];
@@ -15,6 +19,8 @@ struct Data {
 
 ParameterBlock<Data> data;
 
+// PTX: computeMain
+
 [shader("compute")]
 [numthreads(8, 8, 1)]
 void computeMain(uint3 tid: SV_DispatchThreadID)
diff --git a/tests/glsl/global-uniform-with-varyings.slang b/tests/glsl/global-uniform-with-varyings.slang
index 678855dbf..174560840 100644
--- a/tests/glsl/global-uniform-with-varyings.slang
+++ b/tests/glsl/global-uniform-with-varyings.slang
@@ -3,9 +3,9 @@
 
 // CHECK_SPIRV: OpEntryPoint
 // CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 0
-// CHECK_SPIRV: OpVariable {{.*}} Uniform
-// CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 1
-// CHECK_SPIRV: OpVariable {{.*}} Output {{.*}} Location 0
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Uniform
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Input {{.*}} Location 1
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Output {{.*}} Location 0
 
 // CHECK_GLSL: layout(location = 0)
 // CHECK_GLSL-NEXT: in
diff --git a/tests/metal/out-param.slang b/tests/metal/out-param.slang
index e488f8844..68b6e2b62 100644
--- a/tests/metal/out-param.slang
+++ b/tests/metal/out-param.slang
@@ -9,13 +9,20 @@
 //TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
 
-// METAL: void Test_out_param{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
 
-// METAL: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
 
 void Test_out_param(out int value)
 {
diff --git a/tests/optimization/buffer-load-defer-aliasing-1.slang b/tests/optimization/buffer-load-defer-aliasing-1.slang
new file mode 100644
index 000000000..f50d5306c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing-1.slang
@@ -0,0 +1,45 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we can defer buffer loads by ruling out potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // RWStructuredBuffer is considered to not alias with anything else.
+        // this write should not prevent deferring loading bigArray.
+        gOther[0] = 100;
+        // this write should not prevent deferring loading bigArray.
+        gSharedVar = 1;
+        // this write should not prevent deferring loading bigArray.
+        gStaticVar = 2;
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform RWStructuredBuffer<int> gOther;
+static int gStaticVar;
+groupshared int gSharedVar;
+
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpEntryPoint
+    // SPV-NOT: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-aliasing.slang b/tests/optimization/buffer-load-defer-aliasing.slang
new file mode 100644
index 000000000..a0240cc40
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we are not deferring buffer loads due to potential aliasing writes.
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    float bottomGetValue(int index)
+    {
+        // this write may cause data stored at gRoot to be modified,
+        // thus bigArray[index] may be different from what it was before the call to
+        // bottomGetValue. So we should not defer loading bigArray until after this write.
+        *gOther = 100; 
+
+        // We should return the value from bigArray from a previously loaded value of `this`.
+        return bigArray[index];
+    }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform int* gOther;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    // SPV: OpLoad %Bottom_natural
+    outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-bindless.slang b/tests/optimization/buffer-load-defer-bindless.slang
new file mode 100644
index 000000000..2108d562c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-bindless.slang
@@ -0,0 +1,58 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Check that we can specialize buffer loads through bindless handles, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `StructuredBuffer<Middle>.Handle` and an `int`
+    // index, and recursively specialize `bottomGetValue` to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpLoad %Middle
+// SPV: %[[REG:[A-Za-z0-9_]+]] = OpLoad %float
+// SPV: OpStore {{.*}} %[[REG]]
+
+// Check that the generated CUDA code contains a specialized `bottomGetValue` function that has
+// the complete parameter list to access the `bigArray` element directly, without needing to load
+// the entire `Bottom` struct from the caller.
+//
+// CUDA-DAG: __device__ float Bottom_bottomGetValue{{.*}}(StructuredBuffer<Middle{{.*}}> {{.*}}, int {{.*}}, int {{.*}})
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-user-pointer.slang b/tests/optimization/buffer-load-defer-user-pointer.slang
new file mode 100644
index 000000000..58e6386f9
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-user-pointer.slang
@@ -0,0 +1,63 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+// Check that we can specialize buffer loads through user pointers, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    StructuredBuffer<Middle*>.Handle middle;
+
+    // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+    // struct to be loaded into registers. Instead, we should be able to specialize
+    // `middleGetValue` to take a `Middle*` and recursively specialize `bottomGetValue`
+    // to only load the `Bottom.bigArray[index]` element.
+    float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that the generated CUDA code never loads a `Middle` or `Bottom` struct into a local var.
+// CUDA-NOT: Middle{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Bottom{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Top{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+
+// Check that the generated CUDA code can be compiled by nvrtc correctly into PTX.
+// PTX: compute_main
+
+// Check that the generated (unoptimized) SPIR-V contains a specialized Bottom_bottomGetValue function
+// that takes in a Bottom* and use access chain to load the required array element directly, without 
+// needing to load the entire Bottom struct.
+// SPV: %Bottom_bottomGetValue = OpFunction %float None
+// SPV: OpFunctionParameter %_ptr_PhysicalStorageBuffer_Middle_natural
+// SPV: %[[INDEX:[A-Za-z0-9_]+]] = OpFunctionParameter %int
+// SPV: %[[PTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_float %{{.*}} %[[INDEX]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[PTR]]
+// SPV: OpReturnValue %[[VALUE]]
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer.slang b/tests/optimization/buffer-load-defer.slang
new file mode 100644
index 000000000..b2df43c13
--- /dev/null
+++ b/tests/optimization/buffer-load-defer.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    Middle middle;
+    float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-specialize-1.slang b/tests/optimization/buffer-load-specialize-1.slang
new file mode 100644
index 000000000..55f2df473
--- /dev/null
+++ b/tests/optimization/buffer-load-specialize-1.slang
@@ -0,0 +1,35 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+struct Bottom
+{
+    float bigArray[1024];
+    // SPV: %Bottom_bottomGetValue = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    // SPV-NOT: OpCompositeConstruct
+    // SPV: OpFunctionEnd
+
+    // SPV: %Bottom_bottomGetValue_0 = OpFunction %float None %{{.*}}
+    // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+    // SPV-NEXT: OpLabel
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom1;
+    Bottom bottom2;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.bottom1.bottomGetValue(0);
+    outputBuffer[1] = cb.bottom2.bottomGetValue(1);
+    outputBuffer[2] = cb.bottom2.bottomGetValue(2);
+}
diff --git a/tests/optimization/buffer-store-defer.slang b/tests/optimization/buffer-store-defer.slang
new file mode 100644
index 000000000..14362a477
--- /dev/null
+++ b/tests/optimization/buffer-store-defer.slang
@@ -0,0 +1,51 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=PTX): -target ptx -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+
+    [mutating]
+    void setVal(int index, float value) { bigArray[index] = value; }
+}
+
+struct Root
+{
+    Bottom top[2];
+    [mutating]
+    void setTopVal(int x, int y, float value)
+    {
+        top[x].setVal(y, value);
+    }
+}
+
+RWStructuredBuffer<Root> sb;
+
+// Check that we don't load the entire `Root` struct, modify it, and then write it back.
+// Instead we should generate a single store instruction to write the single float value
+// directly to the buffer.
+
+// SPV: OpEntryPoint
+// SPV: OpLabel
+// SPV-NEXT: OpAccessChain
+// SPV-NOT: OpCompositeInsert
+// SPV-NOT: OpLoad
+// SPV: OpStore
+// SPV-NOT: OpLoad
+// SPV-NOT: OpCompositeInsert
+// SPV: OpStore
+// SPV: OpReturn
+
+// CUDA: __device__ void Bottom_setVal_0(int [[INDEX0:[A-Za-z0-9_]+]], int [[INDEX1:[A-Za-z0-9_]+]], int [[INDEX2:[A-Za-z0-9_]+]], float [[VAL:[A-Za-z0-9_]+]])
+// CUDA:  (&(&(globalParams{{.*}}->sb{{.*}}){{\[}}[[INDEX0]]{{\]}})->top{{.*}}{{\[}}[[INDEX1]]{{\]}})->bigArray{{.*}}{{\[}}[[INDEX2]]{{\]}} = [[VAL]];
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    sb[0].setTopVal(1, 2, 100.0f);
+
+    sb[3].top[1].setVal(8, 200.0f);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
new file mode 100644
index 000000000..f7f9b1888
--- /dev/null
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Test that we can defer loading big structured buffer elements.
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+    Bottom bottom;
+}
+
+StructuredBuffer<Root> sb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that we don't load the entire `Root` struct and then do ElementExtract to get to `bigArray[0]`.
+// Instead we use access chain all the way to point to the required array element, and load just a single float.
+
+// SPV: OpEntryPoint
+// SPV: %[[SBPTRARRAY:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer__arr_float_int_1024
+// SPV: %[[SBPTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer_float %[[SBPTRARRAY]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[SBPTR]]
+// SPV: OpStore %{{.*}} %[[VALUE]]
+
+// CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
+// CUDA:  return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = sb[tid.x].bottom.bottomGetValue(0);
+}
diff --git a/tests/vkray/raygen-trace-ray-param-non-struct.slang b/tests/vkray/raygen-trace-ray-param-non-struct.slang
index b0a129761..72d85ed02 100644
--- a/tests/vkray/raygen-trace-ray-param-non-struct.slang
+++ b/tests/vkray/raygen-trace-ray-param-non-struct.slang
@@ -28,7 +28,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: TraceRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     TraceRay(as,
              1,
              0xff,
@@ -39,9 +39,9 @@ void main()
              someInData1);
     outputBuffer1[0] = outputBuffer1[0]+someInData1;
 
-    // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
+    // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = rayPayload{{.*}}.data{{.*}};
     // CHECK: TraceMotionRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     TraceMotionRay(as,
             1,
             0xff,
@@ -55,7 +55,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: NvTraceRayHitObject(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject::TraceRay(as,
              1,
              0xff,
@@ -68,7 +68,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: TraceMotionRay(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject::TraceMotionRay(as,
             1,
             0xff,
@@ -82,7 +82,7 @@ void main()
 
     // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
     // CHECK: NvInvokeHitObject(
-    // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+    // CHECK: rayPayload{{.*}}.data{{.*}};
     HitObject hitObject_HitOrMiss;
     HitObject::Invoke(
         as,
diff --git a/tests/wgsl/switch-case.slang b/tests/wgsl/switch-case.slang
index c4ff0996e..fc24bd67a 100644
--- a/tests/wgsl/switch-case.slang
+++ b/tests/wgsl/switch-case.slang
@@ -70,17 +70,14 @@ func fs_main(VertexOutput input)->FragmentOutput
     return output;
 }
 
-//WGSL: fn _S9( _S10 : Tuple_0) -> f32
-//WGSL-NEXT: {
-//WGSL-NEXT:     switch(_S10.value1_0.x)
+//WGSL:     switch({{.*}})
 //WGSL-NEXT:     {
 //WGSL-NEXT:     case u32(0):
 //WGSL-NEXT:         {
-//WGSL-NEXT:             return Circle_getArea_0(unpackAnyValue16_0(_S10.value2_0));
+//WGSL-NEXT:             return Circle_getArea_0
 //WGSL-NEXT:         }
 //WGSL-NEXT:     default :
 //WGSL-NEXT:         {
-//WGSL-NEXT:             return Rectangle_getArea_0(unpackAnyValue16_1(_S10.value2_0));
+//WGSL-NEXT:             return Rectangle_getArea_0
 //WGSL-NEXT:         }
 //WGSL-NEXT:     }
-//WGSL-NEXT: }