15 files changed, 744 insertions, 296 deletions
diff --git a/source/slang/slang-emit-wgsl.cpp b/source/slang/slang-emit-wgsl.cpp
index 53c3aa487..b115c723a 100644
--- a/source/slang/slang-emit-wgsl.cpp
+++ b/source/slang/slang-emit-wgsl.cpp
@@ -295,6 +295,11 @@ void WGSLSourceEmitter::emitStructFieldAttributes(
 {
     SLANG_UNUSED(allowOffsetLayout);
 
+    // If the struct type is not used for physical storage, then we don't need to
+    // emit any layout attributes.
+    if (!structType->findDecoration<IRPhysicalTypeDecoration>())
+        return;
+
     // Tint emits errors unless we explicitly spell out the layout in some cases, so emit
     // offset and align attribtues for all fields.
     IRSizeAndAlignmentDecoration* const sizeAndAlignmentDecoration =
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index f1cc6090d..09c2efea9 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -1387,16 +1387,10 @@ Result linkAndOptimizeIR(
     specializeFuncsForBufferLoadArgs(codeGenContext, irModule);
 
     // Push `structuredBufferLoad` to the end of access chain to avoid loading unnecessary data.
-    if (isKhronosTarget(targetRequest) || isMetalTarget(targetRequest) ||
-        isWGPUTarget(targetRequest))
-        deferBufferLoad(irModule);
+    deferBufferLoad(codeGenContext, irModule);
 
     // We also want to specialize calls to functions that
     // takes unsized array parameters if possible.
-    // Moreover, for Khronos targets, we also want to specialize calls to functions
-    // that takes arrays/structs containing arrays as parameters with the actual
-    // global array object to avoid loading big arrays into SSA registers, which seems
-    // to cause performance issues.
     specializeArrayParameters(codeGenContext, irModule);
 
 #if 0
diff --git a/source/slang/slang-ir-defer-buffer-load.cpp b/source/slang/slang-ir-defer-buffer-load.cpp
index 51c6a161b..ccdfe4538 100644
--- a/source/slang/slang-ir-defer-buffer-load.cpp
+++ b/source/slang/slang-ir-defer-buffer-load.cpp
@@ -3,142 +3,211 @@
 #include "slang-ir-clone.h"
 #include "slang-ir-dominators.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-redundancy-removal.h"
 #include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
 {
-struct DeferBufferLoadContext
-{
-    // Map an original SSA value to a pointer that can be used to load the value.
-    Dictionary<IRInst*, IRInst*> mapValueToPtr;
 
-    // Map an ptr to its loaded value.
-    Dictionary<IRInst*, IRInst*> mapPtrToValue;
+// Generally, we want to specialize arguments that are large in size, or arguments that
+// are arrays or composite types that contain arrays.
+// This is because:
+// 1. Struct types without arrays will eventually be SROA'd into registers and then effectively
+//    DCE'd, so they usually won't cause performance issues. In fact, front loading structs
+//    and reusing the loaded value instead of repetitively loading from constant memory is
+//    usually beneficial to performance. However large struct values can be SROA'd into a large
+//    number of registers, causing slow downstream compilation. Therefore we should avoid/defer
+//    loading them into registers if we can.
+// 2. Arrays usually cannot be SROA'd into individual registers, which usually leads to
+//    large register consumption if they ever get loaded, so we want to defer loading array
+//    typed values as much as possible.
 
-    IRFunc* currentFunc = nullptr;
+// If the argument data is bigger than this threshold, it is considered a large object
+// and we will try to specialize it even if it doesn't contain arrays.
+static const int kBufferLoadElementSizeSpecializationThreshold = 128;
 
-    // Ensure that for an original SSA value, we have formed a pointer that can be used to load the
-    // value.
-    IRInst* ensurePtr(IRInst* valueInst)
-    {
-        IRInst* result = nullptr;
-        if (mapValueToPtr.tryGetValue(valueInst, result))
-            return result;
+// If the argument data is smaller than this threshold, it is considered a tiny object
+// and we will not consider specializing it, even if it contains arrays.
+static const int kBufferLoadElementSizeSpecializationMinThreshold = 16;
 
-        IRBuilder b(valueInst);
-        b.setInsertBefore(valueInst);
-
-        switch (valueInst->getOp())
+static bool isCompositeTypeContainingArrays(IRType* type)
+{
+    if (auto structType = as<IRStructType>(type))
+    {
+        for (auto field : structType->getFields())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            {
-                result = b.emitRWStructuredBufferGetElementPtr(
-                    valueInst->getOperand(0),
-                    valueInst->getOperand(1));
-                break;
-            }
-        case kIROp_GetElement:
+            if (const auto arrayType = as<IRArrayTypeBase>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitElementAddress(ptr, valueInst->getOperand(1));
-                break;
+                return true;
             }
-        case kIROp_FieldExtract:
+            if (auto subStructType = as<IRStructType>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitFieldAddress(ptr, valueInst->getOperand(1));
-                break;
+                if (isCompositeTypeContainingArrays(subStructType))
+                    return true;
             }
-        case kIROp_Load:
-            result = valueInst->getOperand(0);
-            break;
-        }
-        if (result)
-        {
-            mapValueToPtr[valueInst] = result;
         }
-        return result;
     }
+    else if (as<IRArrayTypeBase>(type))
+    {
+        return true;
+    }
+    return false;
+}
 
-    static bool isImmutableBufferLoad(IRInst* inst)
+bool isTypePreferrableToDeferLoad(CodeGenContext* codeGenContext, IRType* type)
+{
+    // If the parameter is a pointer/reference, we should consider specializing it.
+    if (as<IROutTypeBase>(type) || as<IRRefType>(type) || as<IRConstRefType>(type))
+        return true;
+
+    // We only want to defer loading values that are "large enough" that
+    // we expect them to be expensive to pass by value.
+    //
+    IRSizeAndAlignment sizeAlignment = {};
+    if (SLANG_FAILED(getNaturalSizeAndAlignment(
+            codeGenContext->getTargetProgram()->getOptionSet(),
+            type,
+            &sizeAlignment)))
     {
-        // Note: we cannot defer loads from RWStructuredBuffer because there can be other
-        // instructions that modify the buffer.
+        // If type contains fields that we don't know how to compute natural size
+        // for, default to specialize if it contains arrays.
+        return isCompositeTypeContainingArrays(type);
+    }
+
+    // If the argument is very small, don't bother specializing.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationMinThreshold)
+        return false;
+
+    // If the argument is somewhat small, don't specialize, unless it contains
+    // arrays.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationThreshold)
+    {
+        // We generally do not specialize for small values, unless they contain
+        // arrays, which usually present a challenge for the SROA pass when it
+        // tries to eliminate unnecessary loads.
+        if (!isCompositeTypeContainingArrays(type))
+            return false;
+    }
+    return true;
+}
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+// This method is currently implementing a very conservative analysis that only allows
+// `loadInst` to be in the same block as `userInst`, with basic aliasing analysis for any
+// stores in between. All other cases are conservatively treated as the memory location may be
+// modified.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst)
+{
+    auto func = getParentFunc(loadInst);
+    if (!func)
+        return false;
+
+    // For now we only check if loadInst and userInst are in the same block.
+    if (loadInst->getParent() != userInst->getParent())
+        return false;
+
+    for (IRInst* inst = loadInst->getNextInst(); inst; inst = inst->getNextInst())
+    {
+        // We reached userInst before hitting any instruction that may modify the memory.
+        if (inst == userInst)
+            return true;
+
+        if (!inst->mightHaveSideEffects())
+            continue;
+
+        // If we see any inst that has side effect, check if it is simple case that we can rule
+        // out the possibility of modifying the memory location.
         switch (inst->getOp())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            return true;
-        case kIROp_Load:
+        case kIROp_Store:
             {
-                auto rootAddr = getRootAddr(inst->getOperand(0));
-                return isPointerToImmutableLocation(rootAddr);
+                auto storedDest = inst->getOperand(0);
+                if (canAddressesPotentiallyAlias(target, func, loadInst->getOperand(0), storedDest))
+                    return false;
+                continue;
             }
         default:
+            // For any other case, conservatively assume the memory location may be modified.
             return false;
         }
     }
+    // We didn't find userInst after loadInst within the same basic block.
+    // We conservatively assume the memory location may be modified.
+    // This check can be extended to use the dominator tree to allow
+    // loadInst and userInst to be in different blocks.
+    return false;
+}
 
-    // Ensure that for a pointer value, we have created a load instruction to materialize the value.
-    IRInst* materializePointer(IRBuilder& builder, IRInst* loadInst)
+struct DeferBufferLoadContext
+{
+    CodeGenContext* codeGenContext;
+
+
+    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
     {
-        auto ptr = ensurePtr(loadInst);
-        if (!ptr)
-            return nullptr;
-        IRInst* result = nullptr;
-        if (mapPtrToValue.tryGetValue(ptr, result))
-            return result;
-        IRAlignedAttr* align = nullptr;
-        if (auto load = as<IRLoad>(loadInst))
-            align = load->findAttr<IRAlignedAttr>();
-        if (!as<IRModuleInst>(ptr->getParent()))
+        // Don't defer the load if the type isn't worth deferring or the load has an IRAlignedAttr.
+        if (!isTypePreferrableToDeferLoad(codeGenContext, loadInst->getDataType()) ||
+            loadInst->findAttr<IRAlignedAttr>())
         {
-            setInsertAfterOrdinaryInst(&builder, ptr);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            mapPtrToValue[ptr] = result;
+            return;
         }
-        else
+
+        auto rootAddr = getRootAddr(loadInst->getOperand(0));
+        bool isImmutableBufferLoad = isPointerToImmutableLocation(rootAddr);
+
+        // Don't defer the load if there are uses that are not getElement or fieldExtract.
+        // Because in this case we need to use the entire loaded value, and further deferring
+        // the load down any access chain will introduce redundant loads.
+        for (auto use = loadInst->firstUse; use; use = use->nextUse)
         {
-            setInsertBeforeOrdinaryInst(&builder, loadInst);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            // Since we are inserting the load in a local scope, we can't register
-            // the mapping to the pointer, since the global pointer needs to be
-            // loaded once per function.
+            auto user = use->getUser();
+            switch (user->getOp())
+            {
+            case kIROp_GetElement:
+            case kIROp_FieldExtract:
+                // Can we defer the load to load only the requested element right before
+                // the element extract inst?
+                // If the buffer is immutable, we can always do that.
+                // If it is not, we need to make sure there are no other instructions that can
+                // modify the buffer between the load and the use.
+                //
+                if (isImmutableBufferLoad)
+                    continue;
+                if (isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codeGenContext->getTargetReq(),
+                        loadInst,
+                        user))
+                    continue;
+                return;
+            default:
+                // If we see any other use of the load instruction, we assume the entire loaded
+                // value is needed, and we can't defer the load anymore.
+                return;
+            }
         }
-        return result;
-    }
 
-    static bool isSimpleType(IRInst* type)
-    {
-        if (auto modType = as<IRRateQualifiedType>(type))
-            type = modType->getValueType();
-        if (as<IRStructType>(type))
-            return false;
-        if (as<IRTupleType>(type))
-            return false;
-        if (as<IRArrayTypeBase>(type))
-            return false;
-        return true;
-    }
+        // If we reach here, it means all uses are getElement or fieldExtract, and
+        // it is safe to defer the load down the access chain.
 
-    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
-    {
-        // Don't defer the load anymore if the type is simple.
-        if (isSimpleType(loadInst->getDataType()) || loadInst->findAttr<IRAlignedAttr>())
+        if (loadInst->getOp() == kIROp_StructuredBufferLoad)
         {
-            auto materializedVal = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(materializedVal);
-            loadInst->replaceUsesWith(materializedVal);
-            return;
+            // Convert the structuredBufferLoad to a regular load to reuse
+            // the same logic for deferring regular loads.
+            builder.setInsertBefore(loadInst);
+            auto bufferPtr = builder.emitRWStructuredBufferGetElementPtr(
+                loadInst->getOperand(0),
+                loadInst->getOperand(1));
+            auto sbLoad = builder.emitLoad(bufferPtr);
+            loadInst->transferDecorationsTo(sbLoad);
+            loadInst->replaceUsesWith(sbLoad);
+            loadInst->removeAndDeallocate();
+            loadInst = sbLoad;
         }
 
         // Otherwise, look for all uses and try to defer the load before actual use of the value.
@@ -148,19 +217,29 @@ struct DeferBufferLoadContext
             loadInst,
             [&](IRUse* use)
             {
-                if (needMaterialize)
-                    return;
-
                 auto user = use->getUser();
+
                 switch (user->getOp())
                 {
                 case kIROp_GetElement:
                 case kIROp_FieldExtract:
                     {
-                        auto basePtr = ensurePtr(loadInst);
-                        if (!basePtr)
-                            return;
-                        pendingWorkList.add(user);
+                        // If we see a getElement or fieldExtract, we defer the load by
+                        // replacing the getElement/fieldExtract with a load of the
+                        // elementAddr/fieldAddr.
+                        builder.setInsertBefore(user);
+                        auto basePtr = loadInst->getOperand(0);
+                        IRInst* gepArg = user->getOperand(1);
+                        auto elementPtr = builder.emitElementAddress(
+                            basePtr,
+                            makeArrayViewSingle<IRInst*>(gepArg));
+                        auto newLoad = builder.emitLoad(elementPtr);
+                        user->transferDecorationsTo(newLoad);
+                        user->replaceUsesWith(newLoad);
+                        user->removeAndDeallocate();
+
+                        // Now add the new load to work list to try to defer it further.
+                        pendingWorkList.add(newLoad);
                     }
                     break;
                 default:
@@ -169,41 +248,37 @@ struct DeferBufferLoadContext
                 }
             });
 
-        if (needMaterialize)
-        {
-            auto val = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(val);
-            loadInst->replaceUsesWith(val);
-            loadInst->removeAndDeallocate();
-        }
-        else
-        {
-            // Append to worklist in reverse order so we process the uses in natural appearance
-            // order.
-            for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
-                workList.add(pendingWorkList[i]);
-        }
+        // Append to worklist in reverse order so we process the uses in natural appearance
+        // order.
+        for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
+            workList.add(pendingWorkList[i]);
     }
 
     void deferBufferLoadInFunc(IRFunc* func)
     {
         removeRedundancyInFunc(func, false);
 
-        currentFunc = func;
-
         List<IRInst*> workList;
 
+        // Discover all load instructions and add to work list.
+
         for (auto block : func->getBlocks())
         {
             for (auto inst : block->getChildren())
             {
-                if (isImmutableBufferLoad(inst))
+                switch (inst->getOp())
                 {
+                case kIROp_Load:
+                case kIROp_StructuredBufferLoad:
+                    // Note: We don't handle `kIROp_StructuredBufferLoadStatus` here because
+                    // it also writes to the status code out parameter, which we can't defer.
                     workList.add(inst);
+                    break;
                 }
             }
         }
 
+        // Iteratively process the work list until it is empty.
         IRBuilder builder(func);
         for (Index i = 0; i < workList.getCount(); i++)
         {
@@ -227,9 +302,10 @@ struct DeferBufferLoadContext
     }
 };
 
-void deferBufferLoad(IRModule* module)
+void deferBufferLoad(CodeGenContext* codeGenContext, IRModule* module)
 {
     DeferBufferLoadContext context;
+    context.codeGenContext = codeGenContext;
     for (auto childInst : module->getGlobalInsts())
     {
         if (auto code = as<IRGlobalValueWithCode>(childInst))
diff --git a/source/slang/slang-ir-defer-buffer-load.h b/source/slang/slang-ir-defer-buffer-load.h
index b54271883..0f692b39a 100644
--- a/source/slang/slang-ir-defer-buffer-load.h
+++ b/source/slang/slang-ir-defer-buffer-load.h
@@ -4,9 +4,8 @@ namespace Slang
 {
 
 /*
-This pass implements a targeted optimization that defers the loading of structured buffer elements
-to the end of the access chain to avoid loading and repacking unnecessary data.
-For example, if we see:
+This pass implements an intra-function optimization that defers the loading of buffer
+elements to the end of the access chain to avoid loading unnecessary data. For example, if we see:
     val = StructuredBufferLoad(s, i)
     val2 = GetElement(val, j)
     val3 = FieldExtract(val2, field_key_0)
@@ -20,7 +19,22 @@ We should rewrite the code into:
 */
 
 struct IRModule;
+struct IRType;
+struct CodeGenContext;
+struct IRInst;
+class TargetRequest;
 
-void deferBufferLoad(IRModule* module);
+void deferBufferLoad(CodeGenContext* context, IRModule* module);
+
+// Returns true if the type is suitable for defer-load optimization.
+// Generally, we want to defer loading large structs or composites that contain arrays.
+bool isTypePreferrableToDeferLoad(CodeGenContext* context, IRType* type);
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst);
 
 } // namespace Slang
diff --git a/source/slang/slang-ir-defunctionalization.cpp b/source/slang/slang-ir-defunctionalization.cpp
index af84ec78a..424971f90 100644
--- a/source/slang/slang-ir-defunctionalization.cpp
+++ b/source/slang/slang-ir-defunctionalization.cpp
@@ -12,7 +12,7 @@ struct FunctionParameterSpecializationCondition : FunctionCallSpecializeConditio
 {
     TargetRequest* targetRequest = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/, IRCall* /*callInst*/)
     {
         IRType* type = param->getDataType();
         return as<IRFuncType>(type);
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index a79ca2379..d87d96da0 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2694,7 +2694,10 @@ static void legalizeMeshPayloadInputParam(
     pp->replaceUsesWith(g);
     struct MeshPayloadInputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
@@ -2794,7 +2797,10 @@ static void legalizeMeshOutputParam(
     // pp is only removed later on, so sadly we have to keep it around for now
     struct MeshOutputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
diff --git a/source/slang/slang-ir-metal-legalize.cpp b/source/slang/slang-ir-metal-legalize.cpp
index e66617e72..e91da136a 100644
--- a/source/slang/slang-ir-metal-legalize.cpp
+++ b/source/slang/slang-ir-metal-legalize.cpp
@@ -172,7 +172,7 @@ struct MetalAddressSpaceAssigner : InitialAddressSpaceAssigner
         {
             if (ptrType->hasAddressSpace())
                 return ptrType->getAddressSpace();
-            return AddressSpace::Global;
+            return AddressSpace::Generic;
         }
         return AddressSpace::Generic;
     }
diff --git a/source/slang/slang-ir-specialize-address-space.cpp b/source/slang/slang-ir-specialize-address-space.cpp
index c4a155eec..04792bd8b 100644
--- a/source/slang/slang-ir-specialize-address-space.cpp
+++ b/source/slang/slang-ir-specialize-address-space.cpp
@@ -131,7 +131,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
     bool processFunction(IRFunc* func)
     {
         bool retValAddrSpaceChanged = false;
-        Dictionary<IRInst*, AddressSpace> mapVarValueToAddrSpace;
         bool changed = true;
         while (changed)
         {
@@ -152,18 +151,23 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         continue;
                     }
 
-                    // If the inst already has a pointer type with explicit address space, then use
-                    // it.
-                    if (auto ptrType = as<IRPtrTypeBase>(inst->getDataType()))
+                    // If the inst already has a pointer/pointer-like type with explicit address
+                    // space, then use it.
+                    auto addrSpaceFromType =
+                        addrSpaceAssigner->getAddressSpaceFromVarType(inst->getDataType());
+                    if (addrSpaceFromType != AddressSpace::Generic)
                     {
-                        if (ptrType->hasAddressSpace())
-                        {
-                            mapInstToAddrSpace[inst] = ptrType->getAddressSpace();
+                        mapInstToAddrSpace[inst] = addrSpaceFromType;
+                        changed = true;
+
+                        // Don't return early if the inst itself is a call, as we may still need to
+                        // specialize it down below.
+                        if (inst->getOp() != kIROp_Call)
                             continue;
-                        }
                     }
 
-                    // Otherwise, try to assign an address space based on the instruction type.
+                    // Try to assign an address space based on the instruction type, and specialize
+                    // calls.
                     switch (inst->getOp())
                     {
                     case kIROp_Var:
@@ -195,15 +199,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         }
                         break;
                     case kIROp_Store:
-                        {
-                            auto addrSpace = getAddrSpace(inst->getOperand(1));
-                            if (addrSpace != AddressSpace::Generic)
-                            {
-                                mapVarValueToAddrSpace[inst->getOperand(0)] = addrSpace;
-                                mapInstToAddrSpace[inst] = addrSpace;
-                                changed = true;
-                            }
-                        }
                         break;
                     case kIROp_Param:
                         if (!isFirstBlock)
@@ -243,8 +238,9 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                                 for (UInt i = 0; i < callInst->getArgCount(); i++)
                                 {
                                     auto arg = callInst->getArg(i);
-                                    argAddrSpaces.add(getAddrSpace(arg));
-                                    if (as<IRPtrTypeBase>(arg->getDataType()))
+                                    auto addrSpace = getAddrSpace(arg);
+                                    argAddrSpaces.add(addrSpace);
+                                    if (addrSpace != AddressSpace::Generic)
                                     {
                                         hasSpecializableArg = true;
                                     }
@@ -477,8 +473,13 @@ void propagateAddressSpaceFromInsts(List<IRInst*>&& workList)
     }
 }
 
-AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst*)
+AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst* type)
 {
+    if (auto ptrType = as<IRPtrTypeBase>(type))
+    {
+        if (ptrType->hasAddressSpace())
+            return ptrType->getAddressSpace();
+    }
     return AddressSpace::Generic;
 }
 
diff --git a/source/slang/slang-ir-specialize-arrays.cpp b/source/slang/slang-ir-specialize-arrays.cpp
index 4a4a72ee9..edb6cfa28 100644
--- a/source/slang/slang-ir-specialize-arrays.cpp
+++ b/source/slang/slang-ir-specialize-arrays.cpp
@@ -11,38 +11,14 @@ namespace Slang
 struct ArrayParameterSpecializationCondition : FunctionCallSpecializeCondition
 {
     // This pass is intended to specialize functions
-    // with struct parameters that has array fields
-    // to avoid performance problems for GLSL targets.
-    // Returns true if `type` is an `IRStructType` with array-typed fields.
-    // It will also specialize functions with unsized array parameters into
-    // sized arrays, if the function is called with an argument that has a
-    // sized array type.
+    // with unsized array parameters that are called with a sized-array argument.
     //
-    bool isStructTypeWithArray(IRType* type)
-    {
-        if (auto structType = as<IRStructType>(type))
-        {
-            for (auto field : structType->getFields())
-            {
-                if (const auto arrayType = as<IRArrayType>(field->getFieldType()))
-                {
-                    return true;
-                }
-                if (auto subStructType = as<IRStructType>(field->getFieldType()))
-                {
-                    if (isStructTypeWithArray(subStructType))
-                        return true;
-                }
-            }
-        }
-        return false;
-    }
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
+        SLANG_UNUSED(param);
         SLANG_UNUSED(arg);
-        if (isKhronosTarget(codeGenContext->getTargetReq()))
-            return isStructTypeWithArray(param->getDataType());
+        SLANG_UNUSED(callInst);
         return false;
     }
 
diff --git a/source/slang/slang-ir-specialize-buffer-load-arg.cpp b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
index 905f2e058..a5a3dd2d9 100644
--- a/source/slang/slang-ir-specialize-buffer-load-arg.cpp
+++ b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
@@ -1,8 +1,11 @@
 // slang-ir-specialize-buffer-load-arg.cpp
 #include "slang-ir-specialize-buffer-load-arg.h"
 
+#include "slang-ir-defer-buffer-load.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-specialize-function-call.h"
+#include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
@@ -17,76 +20,115 @@ namespace Slang
 // As swith most of our IR passes, we encapsulate the logic here in a context
 // type so that the data that needs to be shared throughout the pass can
 // be conveniently scoped.
+//
+
+// Note that this pass also ensures other more contrived cases are properly
+// handled. For example:
+//
+// * A load of a large structure from a field in a constant buffer, so that
+//   the value loaded is not the entire buffer contents.
+//
+// * A load of a large structure from a structured buffer, or any other kind
+//   of buffer that requires an index.
+//
 
 struct FuncBufferLoadSpecializationCondition : FunctionCallSpecializeCondition
 {
     typedef FunctionCallSpecializeCondition Super;
 
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    CodeGenContext* codegenContext;
+
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         // We only want to specialize for `struct` types and not base types.
         //
-        // TODO: We might want to consider some criteria here for the "large-ness"
-        // of a structure (in terms of bytes and/or fields), so that we don't
-        // eliminate loads of sufficiently small types (which are cheap to pass
-        // by value).
-        //
-        auto paramType = param->getDataType();
-        if (!as<IRStructType>(paramType))
+        auto paramType = (IRType*)unwrapAttributedType(param->getDataType());
+        if (!isTypePreferrableToDeferLoad(codegenContext, paramType))
             return false;
 
-        // We also only want to specialize for arguments that are a load
-        // from some kind of global shader parameter.
+        // We want to handle loads from arbitrary access chains rooting from a shader parameter.
         //
         IRInst* a = arg;
-        if (auto argLoad = as<IRLoad>(arg))
-        {
-            a = argLoad->getPtr();
-        }
-        else
+        for (;;)
         {
-            return false;
-        }
+            // A user pointer can be directly passed into the function, so we no
+            // longer need to trace up further.
+            if (isUserPointerType(a->getDataType()))
+                break;
 
-        // We want to handle loads from a shader parameter that is an array
-        // of buffers, and not just a single global buffer.
-        //
-        while (auto argGetElement = as<IRGetElement>(a))
-        {
-            a = argGetElement->getBase();
+            if (auto argGetElement = as<IRGetElement>(a))
+            {
+                a = argGetElement->getBase();
+            }
+            else if (auto argSbLoad = as<IRStructuredBufferLoad>(a))
+            {
+                a = argSbLoad->getOperand(0);
+            }
+            else if (auto argBbLoad = as<IRByteAddressBufferLoad>(a))
+            {
+                a = argBbLoad->getOperand(0);
+            }
+            else if (auto argFieldExtract = as<IRFieldExtract>(a))
+            {
+                a = argFieldExtract->getBase();
+            }
+            else if (auto argGetElementPtr = as<IRGetElementPtr>(a))
+            {
+                a = argGetElementPtr->getBase();
+            }
+            else if (auto argSBGetElementPtr = as<IRRWStructuredBufferGetElementPtr>(a))
+            {
+                a = argSBGetElementPtr->getBase();
+            }
+            else if (auto argFieldAddr = as<IRFieldAddress>(a))
+            {
+                a = argFieldAddr->getBase();
+            }
+            else if (auto argLoad = as<IRLoad>(a))
+            {
+                a = argLoad->getPtr();
+
+                // We can safely defer a load to the callee if the source location is immutable.
+                if (isPointerToImmutableLocation(a))
+                    continue;
+
+                // Otherwise, we check whether any instruction between the load and the call
+                // can modify the memory location. If none can, we can still safely defer the
+                // load to the callee.
+                if (!isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codegenContext->getTargetReq(),
+                        argLoad,
+                        callInst))
+                    return false;
+            }
+            else
+            {
+                break;
+            }
         }
 
-        // The "root" of the parameter must be a reference to a global-scope
-        // shader parameter, so that we know we can substitute it into the callee.
+        // The "root" of the parameter must be one of the following:
+        // 1. A reference to a global-scope shader parameter that can be referenced directly from
+        //    the callee.
+        // 2. A user pointer or bindless resource handle that can be passed to the callee as
+        //    an ordinary argument.
         //
         if (const auto argGlobalParam = as<IRGlobalParam>(a))
         {
             return true;
         }
-        else
+        else if (isUserPointerType(a->getDataType()) || as<IRCastDescriptorHandleToResource>(a))
         {
-            return false;
+            return true;
         }
-
-        // TODO: There are other patterns that we could attempt to optimize here.
-        // For example, this logic only handles loads of the *entire* contents of
-        // a buffer, so it would miss:
-        //
-        // * A load of a large structure from field in a constant buffer, so that
-        //   the value loaded is not the entire buffer contents.
-        //
-        // * A load of a large structure from a structured buffer, or any other kind
-        //   of buffer that requires an index.
-        //
-        // * Any resource load that is not expressed at the IR level with a `load`
-        //   instruction (e.g., those that might use an intrinsic function).
-        //
+        return false;
     }
 };
 
 void specializeFuncsForBufferLoadArgs(CodeGenContext* codegenContext, IRModule* module)
 {
     FuncBufferLoadSpecializationCondition condition;
+    condition.codegenContext = codegenContext;
     specializeFunctionCalls(codegenContext, module, &condition);
 }
 
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 7c82891a6..aead69258 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -40,6 +40,12 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         if (as<IRGlobalValueWithCode>(arg))
             return true;
 
+        if (isUserPointerType(arg->getDataType()))
+            return true;
+
+        if (as<IRCastDescriptorHandleToResource>(arg))
+            return true;
+
         // As we will see later, we can also
         // specialize a call when the argument
         // is the result of indexing into an
@@ -47,17 +53,29 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         // of the indexing operation is also
         // suitable for specialization.
         //
-        if (arg->getOp() == kIROp_GetElement || arg->getOp() == kIROp_Load)
+        switch (arg->getOp())
         {
-            auto base = arg->getOperand(0);
-
-            // We will "recurse" on the base of
-            // the indexing operation by continuing
-            // our loop with the `base` as our new
-            // argument.
-            //
-            arg = base;
-            continue;
+        case kIROp_GetElement:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+        case kIROp_GetElementPtr:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+        case kIROp_Load:
+            {
+                auto base = arg->getOperand(0);
+
+                // We will "recurse" on the base of
+                // the indexing operation by continuing
+                // our loop with the `base` as our new
+                // argument.
+                //
+                arg = base;
+                continue;
+            }
+        default:
+            break;
         }
 
         // By default, we will *not* consider an argument
@@ -225,7 +243,7 @@ struct FunctionParameterSpecializationContext
             // If neither the parameter nor the argument wants specialization,
             // then we need to keep looking.
             //
-            auto paramWantSpecialization = doesParamWantSpecialization(param, arg);
+            auto paramWantSpecialization = doesParamWantSpecialization(param, arg, call);
             auto paramTypeWantSpecialization = doesParamTypeWantSpecialization(param, arg);
             if (!paramWantSpecialization && !paramTypeWantSpecialization)
                 continue;
@@ -255,9 +273,9 @@ struct FunctionParameterSpecializationContext
     // Of course, now we need to back-fill the predicates that
     // the above function used to evaluate prameters and arguments.
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
-        return condition->doesParamWantSpecialization(param, arg);
+        return condition->doesParamWantSpecialization(param, arg, callInst);
     }
 
     bool doesParamTypeWantSpecialization(IRParam* param, IRInst* arg)
@@ -484,16 +502,20 @@ struct FunctionParameterSpecializationContext
             UInt oldArgIndex = oldArgCounter++;
             auto oldArg = oldCall->getArg(oldArgIndex);
 
-            getCallInfoForParam(callInfo, oldParam, oldArg);
+            getCallInfoForParam(callInfo, oldParam, oldArg, oldCall);
         }
     }
 
-    void getCallInfoForParam(CallSpecializationInfo& ioInfo, IRParam* oldParam, IRInst* oldArg)
+    void getCallInfoForParam(
+        CallSpecializationInfo& ioInfo,
+        IRParam* oldParam,
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // We know that the case where the parameter
         // and argument don't want specialization is easy.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The new call site will use the same argument
             // value as the old one, and we don't need
@@ -546,7 +568,15 @@ struct FunctionParameterSpecializationContext
             // Similarly for other global constants
             ioInfo.key.vals.add(globalConstant);
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If the arg is a user pointer, we can pass it as an ordinary argument,
+            // and we won't need further tracing down the access chain.
+            //
+            ioInfo.key.vals.add(oldArg->getFullType());
+            ioInfo.newArgs.add(oldArg);
+        }
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the `oldArg` is
             // in the form `oldBase[oldIndex]`
@@ -587,19 +617,45 @@ struct FunctionParameterSpecializationContext
 
             ioInfo.newArgs.add(oldIndex);
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the `oldArg` is
+            // in the form `oldBase.structKey`
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // Similar to the getElement case, we recursively set up whatever
+            // `oldBase` needs first.
+            //
+            getCallInfoForArg(ioInfo, oldBase);
+
+            // The main difference from the `getElement` case is we actually want
+            // the structKey to be in the specialization key because it will be baked
+            // into the specialized function.
+            // And we won't introduce a new parameter to hold the index.
+            //
+            ioInfo.key.vals.add(structKey);
+        }
         else if (oldArg->getOp() == kIROp_Load)
         {
             auto oldBase = oldArg->getOperand(0);
             getCallInfoForArg(ioInfo, oldBase);
         }
+        else if (oldArg->getOp() == kIROp_CastDescriptorHandleToResource)
+        {
+            // We are accessing a resource from a bindless handle.
+            // We can stop recursion here and just pass in the bindless handle as
+            // an argument.
+            auto oldBase = oldArg->getOperand(0);
+            ioInfo.key.vals.add(oldBase->getFullType());
+            ioInfo.newArgs.add(oldBase);
+        }
         else
         {
             // If we fail to match any of the cases above
-            // then a precondition was violated in that
-            // `isArgSuitableForSpecialization` is allowing
-            // a case that this routine is not covering.
-            //
-            SLANG_UNEXPECTED("mising case in 'getCallInfoForArg'");
+            // then the `SpecializeCondition` is letting through constructs that we cannot handle.
+            SLANG_UNEXPECTED("unexpected function call specialization argument form.");
         }
     }
 
@@ -641,7 +697,7 @@ struct FunctionParameterSpecializationContext
             // will stand in for the parameter in the specialized
             // function.
             //
-            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg);
+            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg, oldCall);
 
             // We will collect the replacement value to use
             // for each of the original parameters in an array.
@@ -681,12 +737,13 @@ struct FunctionParameterSpecializationContext
     IRInst* getSpecializedValueForParam(
         FuncSpecializationInfo& ioInfo,
         IRParam* oldParam,
-        IRInst* oldArg)
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // As always, the easy case is when the parameter of
         // the original function doesn't need specialization.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The specialized callee will need a new parameter
             // that fills the same role as the old one, so we
@@ -718,6 +775,36 @@ struct FunctionParameterSpecializationContext
         }
     }
 
+    // Returns true if `inst` is an instruction that accesses an element from an array or a buffer.
+    //
+    static bool isElementAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_GetElementPtr:
+        case kIROp_GetElement:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+            return true;
+        }
+        return false;
+    }
+
+    // Returns true if `inst` is an instruction that accesses a field from a struct, that is
+    // either a FieldAddress or FieldExtract.
+    //
+    static bool isFieldAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+            return true;
+        }
+        return false;
+    }
+
     IRInst* getSpecializedValueForArg(FuncSpecializationInfo& ioInfo, IRInst* oldArg)
     {
         // The logic here parallels `gatherCallInfoForArg`,
@@ -735,13 +822,24 @@ struct FunctionParameterSpecializationContext
             //
             return globalParam;
         }
+        if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If argument is a user pointer, we can pass it into the callee
+            // directly as an ordinary argument without further specializing
+            // for the access chain beyond the pointer.
+            //
+            auto builder = getBuilder();
+            auto newParam = builder->createParam(oldArg->getFullType());
+            ioInfo.newParams.add(newParam);
+            return newParam;
+        }
         if (auto globalFunc = as<IRGlobalValueWithCode>(oldArg))
         {
             // As above, the identity of the specialized function is sufficient
             // to resolve the uses
             return globalFunc;
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the argument is
             // in the form `oldBase[oldIndex]`.
@@ -801,7 +899,9 @@ struct FunctionParameterSpecializationContext
             // of things, and then inserted to a more permanent location later.
             //
             builder->setInsertLoc(IRInsertLoc());
-            auto newVal = builder->emitElementExtract(oldArg->getFullType(), newBase, newIndex);
+            IRInst* newOperands[] = {newBase, newIndex};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
 
             // Because our new instruction wasn't
             // actually inserted anywhere, we need to
@@ -813,6 +913,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the argument is
+            // in the form `oldBase.structKey`.
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // We handle this case in a similar way as the `oldBase[oldIndex]`
+            // case, except that we don't need to introduce a new parameter
+            // for the index, since the struct key is known at compile-time.
+            auto newBase = getSpecializedValueForArg(ioInfo, oldBase);
+
+            auto builder = getBuilder();
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newBase, structKey};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
+
+            ioInfo.newBodyInsts.add(newVal);
+
+            return newVal;
+        }
         else if (auto oldArgLoad = as<IRLoad>(oldArg))
         {
             auto oldPtr = oldArgLoad->getPtr();
@@ -825,15 +949,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (auto castHandleToResource = as<IRCastDescriptorHandleToResource>(oldArg))
+        {
+            // We are accessing a resource from a bindless handle.
+            // We should create a param for the handle, and load the resource from the param.
+            auto builder = getBuilder();
+            auto oldHandle = castHandleToResource->getOperand(0);
+            auto newHandle = builder->createParam(oldHandle->getFullType());
+            ioInfo.newParams.add(newHandle);
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newHandle};
+            auto newVal = builder->emitIntrinsicInst(
+                oldArg->getFullType(),
+                kIROp_CastDescriptorHandleToResource,
+                1,
+                newOperands);
+            ioInfo.newBodyInsts.add(newVal);
+            return newVal;
+        }
         else
         {
             // If we don't match one of the above cases,
-            // then `isArgSuitableForSpecialization` is
-            // letting through cases that this function
-            // hasn't been updated to handle.
-            //
-            SLANG_UNEXPECTED("mising case in 'getSpecializedValueForArg'");
-            UNREACHABLE_RETURN(nullptr);
+            // then we are running into an invalid case.
+            SLANG_UNEXPECTED("unknown argument form for function call specialization.");
         }
     }
 
diff --git a/source/slang/slang-ir-specialize-function-call.h b/source/slang/slang-ir-specialize-function-call.h
index bab4ce2f4..afb8c2365 100644
--- a/source/slang/slang-ir-specialize-function-call.h
+++ b/source/slang/slang-ir-specialize-function-call.h
@@ -7,12 +7,14 @@ struct CodeGenContext;
 struct IRInst;
 struct IRModule;
 struct IRParam;
+struct IRCall;
+
 class Module;
 
 class FunctionCallSpecializeCondition
 {
 public:
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg) = 0;
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst) = 0;
 
     virtual bool isParamSuitableForSpecialization(IRParam* param, IRInst* arg);
 
diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp
index 871ba2c24..0ac08236f 100644
--- a/source/slang/slang-ir-specialize-resources.cpp
+++ b/source/slang/slang-ir-specialize-resources.cpp
@@ -20,9 +20,10 @@ struct ResourceParameterSpecializationCondition : FunctionCallSpecializeConditio
     TargetRequest* targetRequest = nullptr;
     TargetProgram* targetProgram = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         SLANG_UNUSED(arg);
+        SLANG_UNUSED(callInst);
 
         // Whether or not a parameter needs specialization is really
         // a function of its type:
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 8584ea95e..551a72fc7 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -17,6 +17,14 @@ bool isPointerOfType(IRInst* type, IROp opCode)
     return false;
 }
 
+bool isUserPointerType(IRInst* type)
+{
+    auto ptrType = as<IRPtrType>(type);
+    if (!ptrType)
+        return false;
+    return ptrType->getAddressSpace() == AddressSpace::UserPointer;
+}
+
 IRType* getVectorElementType(IRType* type)
 {
     if (auto vectorType = as<IRVectorType>(type))
@@ -792,35 +800,212 @@ IRInst* getRootAddr(IRInst* addr, List<IRInst*>& outAccessChain, List<IRInst*>*
     return addr;
 }
 
-// A simple and conservative address aliasing check.
-bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+IRInst* getRootBufferOrAddr(IRInst* addr)
 {
-    if (addr1 == addr2)
-        return true;
+    auto rootAddr = getRootAddr(addr);
+    if (as<IRRWStructuredBufferGetElementPtr>(rootAddr))
+    {
+        auto bufferHandle = rootAddr->getOperand(0);
+        // Check if the bufferHandle itself is a load from a global parameter.
+        if (auto load = as<IRLoad>(bufferHandle))
+        {
+            auto newRoot = getRootAddr(load->getPtr());
+            if (newRoot->getOp() == kIROp_GlobalParam)
+                return newRoot;
+        }
+    }
+    return rootAddr;
+}
+
+// The aliasing class of an address. This is used to determine
+// if two addresses may alias.
+enum class AddressAliasingClass
+{
+    Unknown,
+    UserPointer,      // A user pointer into global memory
+    Var,              // A thread-local or groupshared var.
+    ConstantBuffer,   // A constant buffer or parameter block.
+    BoundBuffer,      // A bound buffer.
+    BoundTexture,     // A bound texture resource.
+    DescriptorHandle, // A bindless buffer or resource.
+};
+
+AddressAliasingClass getAliasingClass(IRInst* addr)
+{
+    if (auto globalParam = as<IRGlobalParam>(addr))
+    {
+        auto type = unwrapArray(globalParam->getDataType());
+        if (!type)
+            return AddressAliasingClass::Unknown;
+        switch (type->getOp())
+        {
+        case kIROp_TextureType:
+            return AddressAliasingClass::BoundTexture;
+        case kIROp_HLSLStructuredBufferType:
+        case kIROp_HLSLRWStructuredBufferType:
+        case kIROp_HLSLAppendStructuredBufferType:
+        case kIROp_HLSLConsumeStructuredBufferType:
+        case kIROp_HLSLRasterizerOrderedStructuredBufferType:
+        case kIROp_HLSLByteAddressBufferType:
+        case kIROp_HLSLRWByteAddressBufferType:
+        case kIROp_HLSLRasterizerOrderedByteAddressBufferType:
+        case kIROp_GLSLShaderStorageBufferType:
+            return AddressAliasingClass::BoundBuffer;
+        case kIROp_ConstantBufferType:
+        case kIROp_ParameterBlockType:
+            return AddressAliasingClass::ConstantBuffer;
+        case kIROp_PtrType:
+            if (isUserPointerType(type))
+                return AddressAliasingClass::UserPointer;
+            return AddressAliasingClass::Unknown;
+        case kIROp_DynamicResourceType:
+            return AddressAliasingClass::DescriptorHandle;
+        default:
+            return AddressAliasingClass::Unknown;
+        }
+    }
+    else if (as<IRVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRGlobalVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRRWStructuredBufferGetElementPtr>(addr))
+        return AddressAliasingClass::DescriptorHandle;
+    else if (as<IRCastDescriptorHandleToResource>(addr))
+        return AddressAliasingClass::DescriptorHandle;
 
-    // Two variables can never alias.
-    addr1 = getRootAddr(addr1);
-    addr2 = getRootAddr(addr2);
+    auto type = addr->getDataType();
+    if (isUserPointerType(type))
+        return AddressAliasingClass::UserPointer;
+    return AddressAliasingClass::Unknown;
+}
 
-    // Global addresses can alias with anything.
-    if (!isChildInstOf(addr1, func))
+bool canAddrClassesAlias(AddressAliasingClass c1, AddressAliasingClass c2)
+{
+    if (c1 == AddressAliasingClass::Unknown || c2 == AddressAliasingClass::Unknown)
         return true;
 
-    if (!isChildInstOf(addr2, func))
+    switch (c1)
+    {
+    case AddressAliasingClass::Unknown:
         return true;
+    case AddressAliasingClass::UserPointer:
+    case AddressAliasingClass::Var:
+        // A user pointer or var can only alias with another
+        // object that is either a user pointer or var.
+        //
+        // Generally, a var should never alias with anything else that isn't a var,
+        // if we never allow the user to take address of a local var.
+        // We don't allow taking addresses of a local var on most GPU targets, but
+        // we currently do expose an internal intrinsic to do so when targeting CPU.
+        // We should consider disallowing this across the board, or enable more aggressive
+        // criteria when targeting GPU backends.
+        // For now we stay conservative and just report true even when addr1 is var and
+        // addr2 is not rooted from a var.
+        //
+        return c2 == AddressAliasingClass::UserPointer || c2 == AddressAliasingClass::Var;
+    case AddressAliasingClass::BoundBuffer:
+    case AddressAliasingClass::BoundTexture:
+        // A bound resource can only alias with another
+        // object that is a bound resource or descriptor handle
+        return c2 == c1 || c2 == AddressAliasingClass::DescriptorHandle;
+
+    case AddressAliasingClass::DescriptorHandle:
+        // Can alias with any other resource.
+        switch (c2)
+        {
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::DescriptorHandle:
+            return true;
+        default:
+            return false;
+        }
+    case AddressAliasingClass::ConstantBuffer:
+        // Constant buffer cannot alias with anything.
+        return false;
+    }
+    // For any other unknown case, assume they may alias.
+    return true;
+}
+
+// Has `var` been used in a way that may allow it to alias with a user pointer?
+bool canVarAliasWithUserPointer(TargetRequest* target, IRInst* var)
+{
+    if (target && !isCPUTarget(target))
+    {
+        // We don't allow taking the address of a variable on anything other
+        // than the CPU target. Therefore a var can never alias with a user
+        // pointer on these targets.
+        return false;
+    }
+
+    SLANG_UNUSED(var);
+    return true;
+}
+
+// A simple and conservative address aliasing check.
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2)
+{
+    if (addr1 == addr2)
+        return true;
+
+    addr1 = getRootBufferOrAddr(addr1);
+    addr2 = getRootBufferOrAddr(addr2);
+
+    auto addr1Class = getAliasingClass(addr1);
+    auto addr2Class = getAliasingClass(addr2);
 
-    if (addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Var && addr1 != addr2)
+    if (!canAddrClassesAlias(addr1Class, addr2Class))
         return false;
 
+    if (addr1Class == addr2Class)
+    {
+        // For these classes of addresses, the identity of the root
+        // determines whether or not the addresses can alias.
+        // Note that we assume two different bound resources can never
+        // alias, and two different variables can never alias.
+        switch (addr1Class)
+        {
+        case AddressAliasingClass::Var:
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::ConstantBuffer:
+            if (addr1 != addr2)
+                return false;
+            break;
+        }
+    }
+
     // A param and a var can never alias.
     if (addr1->getOp() == kIROp_Param && addr1->getParent() == func->getFirstBlock() &&
             addr2->getOp() == kIROp_Var ||
         addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Param &&
             addr2->getParent() == func->getFirstBlock())
         return false;
+
+    // If one addr is user pointer and one addr is a var,
+    // they can never alias, if the user code never took the address of
+    // the var.
+    if (addr1Class == AddressAliasingClass::Var && addr2Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr1);
+    }
+    if (addr2Class == AddressAliasingClass::Var && addr1Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr2);
+    }
     return true;
 }
 
+bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+{
+    return canAddressesPotentiallyAlias(nullptr, func, addr1, addr2);
+}
+
 bool isPtrLikeOrHandleType(IRInst* type)
 {
     if (!type)
@@ -1141,15 +1326,15 @@ bool areCallArgumentsSideEffectFree(IRCall* call, SideEffectAnalysisOptions opti
             if (isBitSet(options, SideEffectAnalysisOptions::UseDominanceTree))
                 dom = module->findOrCreateDominatorTree(parentFunc);
 
-            // If the pointer argument is a local variable (thus can't alias with other addresses)
-            // and it is never read from in the function, we can safely treat the call as having
-            // no side-effect.
-            // This is a conservative test, but is sufficient to detect the most common case where
-            // a temporary variable is used as the inout argument and the result stored in the temp
-            // variable isn't being used elsewhere in the parent func.
+            // If the pointer argument is a local variable (thus can't alias with other
+            // addresses) and it is never read from in the function, we can safely treat the
+            // call as having no side-effect. This is a conservative test, but is sufficient to
+            // detect the most common case where a temporary variable is used as the inout
+            // argument and the result stored in the temp variable isn't being used elsewhere in
+            // the parent func.
             //
-            // A more aggresive test can check all other address uses reachable from the call site
-            // and see if any of them are aliasing with the argument.
+            // A more aggressive test can check all other address uses reachable from the call
+            // site and see if any of them are aliasing with the argument.
             for (auto use = arg->firstUse; use; use = use->nextUse)
             {
                 if (as<IRDecoration>(use->getUser()))
@@ -1323,8 +1508,8 @@ bool doesCalleeHaveSideEffect(IRInst* callee)
         }
     }
 
-    // If the callee has no side effect, check if any of its associated functions have side effect.
-    // If so, we want to keep the callee around.
+    // If the callee has no side effect, check if any of its associated functions have side
+    // effect. If so, we want to keep the callee around.
     //
     // Typically, once the relevant pass has completed, the association is removed,
     // and at that point we can remove the function.
@@ -2230,13 +2415,12 @@ void legalizeDefUse(IRGlobalValueWithCode* func)
                 !(as<IRVar>(inst) && loopHeaderBlockMap.containsKey(block)))
                 continue;
 
-            // Normally, if the common dominator is not `block`, we can simply move the definition
-            // to the common dominator.
-            // An exception is when the common dominator is the target block of a
-            // loop.
-            // Another exception is when a var in the loop condition block is accessed both inside
-            // and outside the loop. It is technically visible, but effects on the 'var' are not
-            // visible outside the loop, so we'll need to hoist it out of the loop.
+            // Normally, if the common dominator is not `block`, we can simply move the
+            // definition to the common dominator. An exception is when the common dominator is
+            // the target block of a loop. Another exception is when a var in the loop condition
+            // block is accessed both inside and outside the loop. It is technically visible,
+            // but effects on the 'var' are not visible outside the loop, so we'll need to hoist
+            // it out of the loop.
             //
             // Note that after normalization, loops are in the form of:
             // ```
@@ -2377,9 +2561,9 @@ bool canOperationBeSpecConst(IROp op, IRType* resultType, IRInst* const* fixedAr
     // Returns true for ops that can be declared as an operation under `OpSpecConstantOp`.
     //
     // Integer arithmetic and comparison operations can be `OpSpecConstantOp` with the `Shader`
-    // capability, while floating-point arithmetic and comparison operations require the `Kernel`
-    // capability. We only support `Shader` capability for now, return false when floating-point
-    // arithmetic/comparison is encountered.
+    // capability, while floating-point arithmetic and comparison operations require the
+    // `Kernel` capability. We only support `Shader` capability for now, return false when
+    // floating-point arithmetic/comparison is encountered.
     switch (op)
     {
     case kIROp_Add:
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index c0410fa3c..b8937d569 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -70,6 +70,8 @@ bool isPointerOfType(IRInst* ptrType, IRInst* elementType);
 // True if ptrType is a pointer type to a type of opCode
 bool isPointerOfType(IRInst* ptrType, IROp opCode);
 
+bool isUserPointerType(IRInst* type);
+
 // Builds a dictionary that maps from requirement key to requirement value for `interfaceType`.
 Dictionary<IRInst*, IRInst*> buildInterfaceRequirementDict(IRInterfaceType* interfaceType);
 
@@ -205,6 +207,12 @@ IRInst* getRootAddr(
 
 bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2);
 
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2);
+
 String dumpIRToString(
     IRInst* root,
     IRDumpOptions options = {IRDumpOptions::Mode::Simplified, IRDumpOptions::Flag::DumpDebugIds});