Enhance buffer load specialization pass to specialize past field extracts. (#8547)

This allows us to specialize functions whose argument is a sub element of a constant buffer, instead of being only applicable to entire buffer element. Closes #8421. This change also implements a proper heuristic to determine when to specialize the calls and defer the buffer loads. This PR addresses a pathological case exposed in `slangpy\slangpy\benchmarks\test_benchmark_tensor.py`, which used to take 27ms to finish, and now takes 1.25ms. For example, given: ``` struct Bottom { float bigArray[1024]; [mutating] void setVal(int index, float value) { bigArray[index] = value; } } struct Root { Bottom top[2]; [mutating] void setTopVal(int x, int y, float value) { top[x].setVal(y, value); } } RWStructuredBuffer<Root> sb; [shader("compute")] [numthreads(1, 1, 1)] void compute_main(uint3 tid: SV_DispatchThreadID) { sb[0].setTopVal(1, 2, 100.0f); } ``` We are now able to specialize the call to `setTopVal` into: ``` void compute_main(uint3 tid: SV_DispatchThreadID) { setTopVal_specialized(0, 1, 2, 100.0f); } void setTopVal_specialized(int sbIdx, int x, int y, float value) { Bottom_setVal_specialized(sbIdx, x, y, value); } void Bottom_setVal_specialized(int sbIdx, int x, int y, float value) { sb[sbIdx].top[x].bigArray[y] = value; } ``` And get rid of all unnecessary loads. Achieving this requires a combination of function call specialization and buffer-load-defer pass. The buffer-load-defer pass has been completely rewritten to be more correct and avoid introducing redundant loads. This PR also adds tests to make sure pointers, bindless handles, and loads from structured buffer or constant buffers works as expected.
author: Yong He <yonghe@outlook.com> 2025-09-30 19:08:23 -0700
committer: GitHub <noreply@github.com> 2025-09-30 19:08:23 -0700
commit: e4611e2e30a3e5969d402f5ed7e72706a0e3b024 (patch)
tree: 0f4240ccf8c4f0786949ab33adb0fcc332890d11 /source
parent: b6422e50cb19f7f790f29678ba22f31b0b305511 (diff)
15 files changed, 744 insertions, 296 deletions
diff --git a/source/slang/slang-emit-wgsl.cpp b/source/slang/slang-emit-wgsl.cpp
index 53c3aa487..b115c723a 100644
--- a/source/slang/slang-emit-wgsl.cpp
+++ b/source/slang/slang-emit-wgsl.cpp
@@ -295,6 +295,11 @@ void WGSLSourceEmitter::emitStructFieldAttributes(
 {
     SLANG_UNUSED(allowOffsetLayout);
 
+    // If the struct type is not used for physical storage, then we don't need to
+    // emit any layout attributes.
+    if (!structType->findDecoration<IRPhysicalTypeDecoration>())
+        return;
+
     // Tint emits errors unless we explicitly spell out the layout in some cases, so emit
     // offset and align attribtues for all fields.
     IRSizeAndAlignmentDecoration* const sizeAndAlignmentDecoration =
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index f1cc6090d..09c2efea9 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -1387,16 +1387,10 @@ Result linkAndOptimizeIR(
     specializeFuncsForBufferLoadArgs(codeGenContext, irModule);
 
     // Push `structuredBufferLoad` to the end of access chain to avoid loading unnecessary data.
-    if (isKhronosTarget(targetRequest) || isMetalTarget(targetRequest) ||
-        isWGPUTarget(targetRequest))
-        deferBufferLoad(irModule);
+    deferBufferLoad(codeGenContext, irModule);
 
     // We also want to specialize calls to functions that
     // takes unsized array parameters if possible.
-    // Moreover, for Khronos targets, we also want to specialize calls to functions
-    // that takes arrays/structs containing arrays as parameters with the actual
-    // global array object to avoid loading big arrays into SSA registers, which seems
-    // to cause performance issues.
     specializeArrayParameters(codeGenContext, irModule);
 
 #if 0
diff --git a/source/slang/slang-ir-defer-buffer-load.cpp b/source/slang/slang-ir-defer-buffer-load.cpp
index 51c6a161b..ccdfe4538 100644
--- a/source/slang/slang-ir-defer-buffer-load.cpp
+++ b/source/slang/slang-ir-defer-buffer-load.cpp
@@ -3,142 +3,211 @@
 #include "slang-ir-clone.h"
 #include "slang-ir-dominators.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-redundancy-removal.h"
 #include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
 {
-struct DeferBufferLoadContext
-{
-    // Map an original SSA value to a pointer that can be used to load the value.
-    Dictionary<IRInst*, IRInst*> mapValueToPtr;
 
-    // Map an ptr to its loaded value.
-    Dictionary<IRInst*, IRInst*> mapPtrToValue;
+// Generally, we want to specialize arguments that are large in size, or arguments that
+// are arrays or composite type that contains arrays.
+// This is because:
+// 1. Struct types without arrays will eventually be SROA's into registers and then effectively
+//    DCE'd, so they usually won't cause performance issues. In fact, front loading structs
+//    and reusing the loaded value instead of repetitively loading from constant memory is
+//    usually beneficial to performance. However large struct values can be SROA'd into a large
+//    number of registers, causing slow downstream compilation. Therefore we should avoid/defer
+//    loading them into registers if we can.
+// 2. Arrays usually cannot be SROA'd into individual registers, which usually leads to
+//    large register consumption if they ever get loaded, so we want to defer loading array
+//    typed values as much as possible.
 
-    IRFunc* currentFunc = nullptr;
+// If the argument data is bigger than this threshold, it is considered a large object
+// and we will try to specialize it even if it doesn't contain arrays.
+static const int kBufferLoadElementSizeSpecializationThreshold = 128;
 
-    // Ensure that for an original SSA value, we have formed a pointer that can be used to load the
-    // value.
-    IRInst* ensurePtr(IRInst* valueInst)
-    {
-        IRInst* result = nullptr;
-        if (mapValueToPtr.tryGetValue(valueInst, result))
-            return result;
+// If the argument data is smaller than this threshold, it is considered a tiny object
+// and we will not consider specializing it, even if it contains arrays.
+static const int kBufferLoadElementSizeSpecializationMinThreshold = 16;
 
-        IRBuilder b(valueInst);
-        b.setInsertBefore(valueInst);
-
-        switch (valueInst->getOp())
+static bool isCompositeTypeContainingArrays(IRType* type)
+{
+    if (auto structType = as<IRStructType>(type))
+    {
+        for (auto field : structType->getFields())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            {
-                result = b.emitRWStructuredBufferGetElementPtr(
-                    valueInst->getOperand(0),
-                    valueInst->getOperand(1));
-                break;
-            }
-        case kIROp_GetElement:
+            if (const auto arrayType = as<IRArrayTypeBase>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitElementAddress(ptr, valueInst->getOperand(1));
-                break;
+                return true;
             }
-        case kIROp_FieldExtract:
+            if (auto subStructType = as<IRStructType>(field->getFieldType()))
             {
-                auto ptr = ensurePtr(valueInst->getOperand(0));
-                if (!ptr)
-                    return nullptr;
-                result = b.emitFieldAddress(ptr, valueInst->getOperand(1));
-                break;
+                if (isCompositeTypeContainingArrays(subStructType))
+                    return true;
             }
-        case kIROp_Load:
-            result = valueInst->getOperand(0);
-            break;
-        }
-        if (result)
-        {
-            mapValueToPtr[valueInst] = result;
         }
-        return result;
     }
+    else if (as<IRArrayTypeBase>(type))
+    {
+        return true;
+    }
+    return false;
+}
 
-    static bool isImmutableBufferLoad(IRInst* inst)
+bool isTypePreferrableToDeferLoad(CodeGenContext* codeGenContext, IRType* type)
+{
+    // If parameter is a pointer/reference, we should consider specialize it.
+    if (as<IROutTypeBase>(type) || as<IRRefType>(type) || as<IRConstRefType>(type))
+        return true;
+
+    // We only want to defer loading values that are "large enough" that
+    // we expect them to be expensive to pass by value.
+    //
+    IRSizeAndAlignment sizeAlignment = {};
+    if (SLANG_FAILED(getNaturalSizeAndAlignment(
+            codeGenContext->getTargetProgram()->getOptionSet(),
+            type,
+            &sizeAlignment)))
     {
-        // Note: we cannot defer loads from RWStructuredBuffer because there can be other
-        // instructions that modify the buffer.
+        // If type contains fields that we don't know how to compute natural size
+        // for, default to specialize if it contains arrays.
+        return isCompositeTypeContainingArrays(type);
+    }
+
+    // If the argument is very small, don't bother specializing.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationMinThreshold)
+        return false;
+
+    // If the argument is somewhat small, don't specialize, unless it contains
+    // arrays.
+    if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationThreshold)
+    {
+        // We generally do not specialize for small values, except it contains
+        // arrays that usually present a challenge for the SROA pass to eliminate
+        // unnecessary loads.
+        if (!isCompositeTypeContainingArrays(type))
+            return false;
+    }
+    return true;
+}
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+// This method is currently implementing a very conservative analysis that only allows
+// `loadInst` to be in the same block as `userInst`, with basic aliasing analysis for any
+// stores in between. All other cases are conservatively treated as the memory location may be
+// modified.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst)
+{
+    auto func = getParentFunc(loadInst);
+    if (!func)
+        return false;
+
+    // For now we only check if loadInst and userInst are in the same block.
+    if (loadInst->getParent() != userInst->getParent())
+        return false;
+
+    for (IRInst* inst = loadInst->getNextInst(); inst; inst = inst->getNextInst())
+    {
+        // We found callInst before hitting any instruction that may modify the memory.
+        if (inst == userInst)
+            return true;
+
+        if (!inst->mightHaveSideEffects())
+            continue;
+
+        // If we see any inst that has side effect, check if it is simple case that we can rule
+        // out the possibility of modifying the memory location.
         switch (inst->getOp())
         {
-        case kIROp_StructuredBufferLoad:
-        case kIROp_StructuredBufferLoadStatus:
-            return true;
-        case kIROp_Load:
+        case kIROp_Store:
             {
-                auto rootAddr = getRootAddr(inst->getOperand(0));
-                return isPointerToImmutableLocation(rootAddr);
+                auto storedDest = inst->getOperand(0);
+                if (canAddressesPotentiallyAlias(target, func, loadInst->getOperand(0), storedDest))
+                    return false;
+                continue;
             }
         default:
+            // For any other case, conservatively assume the memory location may be modified.
             return false;
         }
     }
+    // We didn't found callInst after loadInst within the same basic block.
+    // We conservatively assume the memory location may be modified.
+    // This check can be extended to use the dominator tree to allow
+    // loadInst and userInst to be in different blocks.
+    return false;
+}
 
-    // Ensure that for a pointer value, we have created a load instruction to materialize the value.
-    IRInst* materializePointer(IRBuilder& builder, IRInst* loadInst)
+struct DeferBufferLoadContext
+{
+    CodeGenContext* codeGenContext;
+
+
+    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
     {
-        auto ptr = ensurePtr(loadInst);
-        if (!ptr)
-            return nullptr;
-        IRInst* result = nullptr;
-        if (mapPtrToValue.tryGetValue(ptr, result))
-            return result;
-        IRAlignedAttr* align = nullptr;
-        if (auto load = as<IRLoad>(loadInst))
-            align = load->findAttr<IRAlignedAttr>();
-        if (!as<IRModuleInst>(ptr->getParent()))
+        // Don't defer the load anymore if the type is simple.
+        if (!isTypePreferrableToDeferLoad(codeGenContext, loadInst->getDataType()) ||
+            loadInst->findAttr<IRAlignedAttr>())
         {
-            setInsertAfterOrdinaryInst(&builder, ptr);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            mapPtrToValue[ptr] = result;
+            return;
         }
-        else
+
+        auto rootAddr = getRootAddr(loadInst->getOperand(0));
+        bool isImmutableBufferLoad = isPointerToImmutableLocation(rootAddr);
+
+        // Don't defer the load if there are uses that are not getElement or fieldExtract.
+        // Because in this case we need to use the entire loaded value, and further deferring
+        // the load down any access chain will introduce redundant loads.
+        for (auto use = loadInst->firstUse; use; use = use->nextUse)
         {
-            setInsertBeforeOrdinaryInst(&builder, loadInst);
-            IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
-            result = builder.emitLoad(valueType, ptr, align);
-            // Since we are inserting the load in a local scope, we can't register
-            // the mapping to the pointer, since the global pointer needs to be
-            // loaded once per function.
+            auto user = use->getUser();
+            switch (user->getOp())
+            {
+            case kIROp_GetElement:
+            case kIROp_FieldExtract:
+                // Can we defer the load to load only the requested element right before
+                // the element extract inst?
+                // If the buffer is immutable, we can always do that.
+                // If it is not, we need to make sure there is no other instructions that can modify
+                // the buffer between the load and the use.
+                //
+                if (isImmutableBufferLoad)
+                    continue;
+                if (isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codeGenContext->getTargetReq(),
+                        loadInst,
+                        user))
+                    continue;
+                return;
+            default:
+                // If we see any other use the laod instruction, we assume the entire loaded value
+                // is needed, and we can't defer the load anymore.
+                return;
+            }
         }
-        return result;
-    }
 
-    static bool isSimpleType(IRInst* type)
-    {
-        if (auto modType = as<IRRateQualifiedType>(type))
-            type = modType->getValueType();
-        if (as<IRStructType>(type))
-            return false;
-        if (as<IRTupleType>(type))
-            return false;
-        if (as<IRArrayTypeBase>(type))
-            return false;
-        return true;
-    }
+        // If we reach here, it means all uses are getElement or fieldExtract, and
+        // it is safe to defer the load down the access chain.
 
-    void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
-    {
-        // Don't defer the load anymore if the type is simple.
-        if (isSimpleType(loadInst->getDataType()) || loadInst->findAttr<IRAlignedAttr>())
+        if (loadInst->getOp() == kIROp_StructuredBufferLoad)
         {
-            auto materializedVal = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(materializedVal);
-            loadInst->replaceUsesWith(materializedVal);
-            return;
+            // Convert the structuredBufferLoad to a regular load to reuse
+            // the same logic for deferring regular loads.
+            builder.setInsertBefore(loadInst);
+            auto bufferPtr = builder.emitRWStructuredBufferGetElementPtr(
+                loadInst->getOperand(0),
+                loadInst->getOperand(1));
+            auto sbLoad = builder.emitLoad(bufferPtr);
+            loadInst->transferDecorationsTo(sbLoad);
+            loadInst->replaceUsesWith(sbLoad);
+            loadInst->removeAndDeallocate();
+            loadInst = sbLoad;
         }
 
         // Otherwise, look for all uses and try to defer the load before actual use of the value.
@@ -148,19 +217,29 @@ struct DeferBufferLoadContext
             loadInst,
             [&](IRUse* use)
             {
-                if (needMaterialize)
-                    return;
-
                 auto user = use->getUser();
+
                 switch (user->getOp())
                 {
                 case kIROp_GetElement:
                 case kIROp_FieldExtract:
                     {
-                        auto basePtr = ensurePtr(loadInst);
-                        if (!basePtr)
-                            return;
-                        pendingWorkList.add(user);
+                        // If we see a getElement or fieldExtract, we defer the load by
+                        // replacing the getElement/fieldExtract with a load of the
+                        // elementAddr/fieldAddr.
+                        builder.setInsertBefore(user);
+                        auto basePtr = loadInst->getOperand(0);
+                        IRInst* gepArg = user->getOperand(1);
+                        auto elementPtr = builder.emitElementAddress(
+                            basePtr,
+                            makeArrayViewSingle<IRInst*>(gepArg));
+                        auto newLoad = builder.emitLoad(elementPtr);
+                        user->transferDecorationsTo(newLoad);
+                        user->replaceUsesWith(newLoad);
+                        user->removeAndDeallocate();
+
+                        // Now add the new load to work list to try to defer it further.
+                        pendingWorkList.add(newLoad);
                     }
                     break;
                 default:
@@ -169,41 +248,37 @@ struct DeferBufferLoadContext
                 }
             });
 
-        if (needMaterialize)
-        {
-            auto val = materializePointer(builder, loadInst);
-            loadInst->transferDecorationsTo(val);
-            loadInst->replaceUsesWith(val);
-            loadInst->removeAndDeallocate();
-        }
-        else
-        {
-            // Append to worklist in reverse order so we process the uses in natural appearance
-            // order.
-            for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
-                workList.add(pendingWorkList[i]);
-        }
+        // Append to worklist in reverse order so we process the uses in natural appearance
+        // order.
+        for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
+            workList.add(pendingWorkList[i]);
     }
 
     void deferBufferLoadInFunc(IRFunc* func)
     {
         removeRedundancyInFunc(func, false);
 
-        currentFunc = func;
-
         List<IRInst*> workList;
 
+        // Discover all load instructions and add to work list.
+
         for (auto block : func->getBlocks())
         {
             for (auto inst : block->getChildren())
             {
-                if (isImmutableBufferLoad(inst))
+                switch (inst->getOp())
                 {
+                case kIROp_Load:
+                case kIROp_StructuredBufferLoad:
+                    // Note: We don't handle `kIROp_StructuredBufferLoadStatus` here because
+                    // it also writes to the status code out parameter, which we can't defer.
                     workList.add(inst);
+                    break;
                 }
             }
         }
 
+        // Iteratively process the work list until it is empty.
         IRBuilder builder(func);
         for (Index i = 0; i < workList.getCount(); i++)
         {
@@ -227,9 +302,10 @@ struct DeferBufferLoadContext
     }
 };
 
-void deferBufferLoad(IRModule* module)
+void deferBufferLoad(CodeGenContext* codeGenContext, IRModule* module)
 {
     DeferBufferLoadContext context;
+    context.codeGenContext = codeGenContext;
     for (auto childInst : module->getGlobalInsts())
     {
         if (auto code = as<IRGlobalValueWithCode>(childInst))
diff --git a/source/slang/slang-ir-defer-buffer-load.h b/source/slang/slang-ir-defer-buffer-load.h
index b54271883..0f692b39a 100644
--- a/source/slang/slang-ir-defer-buffer-load.h
+++ b/source/slang/slang-ir-defer-buffer-load.h
@@ -4,9 +4,8 @@ namespace Slang
 {
 
 /*
-This pass implements a targeted optimization that defers the loading of structured buffer elements
-to the end of the access chain to avoid loading and repacking unnecessary data.
-For example, if we see:
+This pass implements a intra-function optimization that defers the loading of buffer
+elements to the end of the access chain to avoid loading unnecessary data. For example, if we see:
     val = StructuredBufferLoad(s, i)
     val2 = GetElement(val, j)
     val3 = FieldExtract(val2, field_key_0)
@@ -20,7 +19,22 @@ We should rewrite the code into:
 */
 
 struct IRModule;
+struct IRType;
+struct CodeGenContext;
+struct IRInst;
+class TargetRequest;
 
-void deferBufferLoad(IRModule* module);
+void deferBufferLoad(CodeGenContext* context, IRModule* module);
+
+// Returns true if the type is suitable for defer-load optimization.
+// Generally, we want to defer loading large structs or composites that contain arrays.
+bool isTypePreferrableToDeferLoad(CodeGenContext* context, IRType* type);
+
+// Returns true if memory loaded by `loadInst` may be modified before `userInst` after it is
+// loaded.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+    TargetRequest* target,
+    IRInst* loadInst,
+    IRInst* userInst);
 
 } // namespace Slang
diff --git a/source/slang/slang-ir-defunctionalization.cpp b/source/slang/slang-ir-defunctionalization.cpp
index af84ec78a..424971f90 100644
--- a/source/slang/slang-ir-defunctionalization.cpp
+++ b/source/slang/slang-ir-defunctionalization.cpp
@@ -12,7 +12,7 @@ struct FunctionParameterSpecializationCondition : FunctionCallSpecializeConditio
 {
     TargetRequest* targetRequest = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/, IRCall* /*callInst*/)
     {
         IRType* type = param->getDataType();
         return as<IRFuncType>(type);
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index a79ca2379..d87d96da0 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2694,7 +2694,10 @@ static void legalizeMeshPayloadInputParam(
     pp->replaceUsesWith(g);
     struct MeshPayloadInputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
@@ -2794,7 +2797,10 @@ static void legalizeMeshOutputParam(
     // pp is only removed later on, so sadly we have to keep it around for now
     struct MeshOutputSpecializationCondition : FunctionCallSpecializeCondition
     {
-        bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+        bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+        {
+            return arg == g;
+        }
         IRInst* g;
     } condition;
     condition.g = g;
diff --git a/source/slang/slang-ir-metal-legalize.cpp b/source/slang/slang-ir-metal-legalize.cpp
index e66617e72..e91da136a 100644
--- a/source/slang/slang-ir-metal-legalize.cpp
+++ b/source/slang/slang-ir-metal-legalize.cpp
@@ -172,7 +172,7 @@ struct MetalAddressSpaceAssigner : InitialAddressSpaceAssigner
         {
             if (ptrType->hasAddressSpace())
                 return ptrType->getAddressSpace();
-            return AddressSpace::Global;
+            return AddressSpace::Generic;
         }
         return AddressSpace::Generic;
     }
diff --git a/source/slang/slang-ir-specialize-address-space.cpp b/source/slang/slang-ir-specialize-address-space.cpp
index c4a155eec..04792bd8b 100644
--- a/source/slang/slang-ir-specialize-address-space.cpp
+++ b/source/slang/slang-ir-specialize-address-space.cpp
@@ -131,7 +131,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
     bool processFunction(IRFunc* func)
     {
         bool retValAddrSpaceChanged = false;
-        Dictionary<IRInst*, AddressSpace> mapVarValueToAddrSpace;
         bool changed = true;
         while (changed)
         {
@@ -152,18 +151,23 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         continue;
                     }
 
-                    // If the inst already has a pointer type with explicit address space, then use
-                    // it.
-                    if (auto ptrType = as<IRPtrTypeBase>(inst->getDataType()))
+                    // If the inst already has a pointer/pointer-like type with explicit address
+                    // space, then use it.
+                    auto addrSpaceFromType =
+                        addrSpaceAssigner->getAddressSpaceFromVarType(inst->getDataType());
+                    if (addrSpaceFromType != AddressSpace::Generic)
                     {
-                        if (ptrType->hasAddressSpace())
-                        {
-                            mapInstToAddrSpace[inst] = ptrType->getAddressSpace();
+                        mapInstToAddrSpace[inst] = addrSpaceFromType;
+                        changed = true;
+
+                        // Don't return early if the inst itself is a call, as we may still need to
+                        // specialize it down below.
+                        if (inst->getOp() != kIROp_Call)
                             continue;
-                        }
                     }
 
-                    // Otherwise, try to assign an address space based on the instruction type.
+                    // Try to assign an address space based on the instruction type, and specialize
+                    // calls.
                     switch (inst->getOp())
                     {
                     case kIROp_Var:
@@ -195,15 +199,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                         }
                         break;
                     case kIROp_Store:
-                        {
-                            auto addrSpace = getAddrSpace(inst->getOperand(1));
-                            if (addrSpace != AddressSpace::Generic)
-                            {
-                                mapVarValueToAddrSpace[inst->getOperand(0)] = addrSpace;
-                                mapInstToAddrSpace[inst] = addrSpace;
-                                changed = true;
-                            }
-                        }
                         break;
                     case kIROp_Param:
                         if (!isFirstBlock)
@@ -243,8 +238,9 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
                                 for (UInt i = 0; i < callInst->getArgCount(); i++)
                                 {
                                     auto arg = callInst->getArg(i);
-                                    argAddrSpaces.add(getAddrSpace(arg));
-                                    if (as<IRPtrTypeBase>(arg->getDataType()))
+                                    auto addrSpace = getAddrSpace(arg);
+                                    argAddrSpaces.add(addrSpace);
+                                    if (addrSpace != AddressSpace::Generic)
                                     {
                                         hasSpecializableArg = true;
                                     }
@@ -477,8 +473,13 @@ void propagateAddressSpaceFromInsts(List<IRInst*>&& workList)
     }
 }
 
-AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst*)
+AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst* type)
 {
+    if (auto ptrType = as<IRPtrTypeBase>(type))
+    {
+        if (ptrType->hasAddressSpace())
+            return ptrType->getAddressSpace();
+    }
     return AddressSpace::Generic;
 }
 
diff --git a/source/slang/slang-ir-specialize-arrays.cpp b/source/slang/slang-ir-specialize-arrays.cpp
index 4a4a72ee9..edb6cfa28 100644
--- a/source/slang/slang-ir-specialize-arrays.cpp
+++ b/source/slang/slang-ir-specialize-arrays.cpp
@@ -11,38 +11,14 @@ namespace Slang
 struct ArrayParameterSpecializationCondition : FunctionCallSpecializeCondition
 {
     // This pass is intended to specialize functions
-    // with struct parameters that has array fields
-    // to avoid performance problems for GLSL targets.
-    // Returns true if `type` is an `IRStructType` with array-typed fields.
-    // It will also specialize functions with unsized array parameters into
-    // sized arrays, if the function is called with an argument that has a
-    // sized array type.
+    // with unsized array parameter called with a sized-array argument.
     //
-    bool isStructTypeWithArray(IRType* type)
-    {
-        if (auto structType = as<IRStructType>(type))
-        {
-            for (auto field : structType->getFields())
-            {
-                if (const auto arrayType = as<IRArrayType>(field->getFieldType()))
-                {
-                    return true;
-                }
-                if (auto subStructType = as<IRStructType>(field->getFieldType()))
-                {
-                    if (isStructTypeWithArray(subStructType))
-                        return true;
-                }
-            }
-        }
-        return false;
-    }
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
+        SLANG_UNUSED(param);
         SLANG_UNUSED(arg);
-        if (isKhronosTarget(codeGenContext->getTargetReq()))
-            return isStructTypeWithArray(param->getDataType());
+        SLANG_UNUSED(callInst);
         return false;
     }
 
diff --git a/source/slang/slang-ir-specialize-buffer-load-arg.cpp b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
index 905f2e058..a5a3dd2d9 100644
--- a/source/slang/slang-ir-specialize-buffer-load-arg.cpp
+++ b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
@@ -1,8 +1,11 @@
 // slang-ir-specialize-buffer-load-arg.cpp
 #include "slang-ir-specialize-buffer-load-arg.h"
 
+#include "slang-ir-defer-buffer-load.h"
 #include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
 #include "slang-ir-specialize-function-call.h"
+#include "slang-ir-util.h"
 #include "slang-ir.h"
 
 namespace Slang
@@ -17,76 +20,115 @@ namespace Slang
 // As swith most of our IR passes, we encapsulate the logic here in a context
 // type so that the data that needs to be shared throughout the pass can
 // be conveniently scoped.
+//
+
+// Note that this pass also ensures other more contrived cases are properly
+// handled. For example:
+//
+// * A load of a large structure from field in a constant buffer, so that
+//   the value loaded is not the entire buffer contents.
+//
+// * A load of a large structure from a structured buffer, or any other kind
+//   of buffer that requires an index.
+//
 
 struct FuncBufferLoadSpecializationCondition : FunctionCallSpecializeCondition
 {
     typedef FunctionCallSpecializeCondition Super;
 
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    CodeGenContext* codegenContext;
+
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         // We only want to specialize for `struct` types and not base types.
         //
-        // TODO: We might want to consider some criteria here for the "large-ness"
-        // of a structure (in terms of bytes and/or fields), so that we don't
-        // eliminate loads of sufficiently small types (which are cheap to pass
-        // by value).
-        //
-        auto paramType = param->getDataType();
-        if (!as<IRStructType>(paramType))
+        auto paramType = (IRType*)unwrapAttributedType(param->getDataType());
+        if (!isTypePreferrableToDeferLoad(codegenContext, paramType))
             return false;
 
-        // We also only want to specialize for arguments that are a load
-        // from some kind of global shader parameter.
+        // We want to handle loads from arbitrary access chains rooting from a shader parameter.
         //
         IRInst* a = arg;
-        if (auto argLoad = as<IRLoad>(arg))
-        {
-            a = argLoad->getPtr();
-        }
-        else
+        for (;;)
         {
-            return false;
-        }
+            // A user pointer can be directly passed into the function, so we no
+            // longer need to trace up further.
+            if (isUserPointerType(a->getDataType()))
+                break;
 
-        // We want to handle loads from a shader parameter that is an array
-        // of buffers, and not just a single global buffer.
-        //
-        while (auto argGetElement = as<IRGetElement>(a))
-        {
-            a = argGetElement->getBase();
+            if (auto argGetElement = as<IRGetElement>(a))
+            {
+                a = argGetElement->getBase();
+            }
+            else if (auto argSbLoad = as<IRStructuredBufferLoad>(a))
+            {
+                a = argSbLoad->getOperand(0);
+            }
+            else if (auto argBbLoad = as<IRByteAddressBufferLoad>(a))
+            {
+                a = argBbLoad->getOperand(0);
+            }
+            else if (auto argFieldExtract = as<IRFieldExtract>(a))
+            {
+                a = argFieldExtract->getBase();
+            }
+            else if (auto argGetElementPtr = as<IRGetElementPtr>(a))
+            {
+                a = argGetElementPtr->getBase();
+            }
+            else if (auto argSBGetElementPtr = as<IRRWStructuredBufferGetElementPtr>(a))
+            {
+                a = argSBGetElementPtr->getBase();
+            }
+            else if (auto argFieldAddr = as<IRFieldAddress>(a))
+            {
+                a = argFieldAddr->getBase();
+            }
+            else if (auto argLoad = as<IRLoad>(a))
+            {
+                a = argLoad->getPtr();
+
+                // We can safely defer a load to the callee if the source dest is immutable.
+                if (isPointerToImmutableLocation(a))
+                    continue;
+
+                // Otherwise, we check if there is no other instructions in between the load and the
+                // call that can modify the memory location. If so, we can still safely defer the
+                // load to the callee.
+                if (!isMemoryLocationUnmodifiedBetweenLoadAndUser(
+                        codegenContext->getTargetReq(),
+                        argLoad,
+                        callInst))
+                    return false;
+            }
+            else
+            {
+                break;
+            }
         }
 
-        // The "root" of the parameter must be a reference to a global-scope
-        // shader parameter, so that we know we can substitute it into the callee.
+        // The "root" of the parameter must be one of the following:
+        // 1. A reference to a global-scope shader parameter that can be referenced directly from
+        //    the callee.
+        // 2. A user pointer or bindless resource handle that can be passed to the callee as
+        //    ordinary argument.
         //
         if (const auto argGlobalParam = as<IRGlobalParam>(a))
         {
             return true;
         }
-        else
+        else if (isUserPointerType(a->getDataType()) || as<IRCastDescriptorHandleToResource>(a))
         {
-            return false;
+            return true;
         }
-
-        // TODO: There are other patterns that we could attempt to optimize here.
-        // For example, this logic only handles loads of the *entire* contents of
-        // a buffer, so it would miss:
-        //
-        // * A load of a large structure from field in a constant buffer, so that
-        //   the value loaded is not the entire buffer contents.
-        //
-        // * A load of a large structure from a structured buffer, or any other kind
-        //   of buffer that requires an index.
-        //
-        // * Any resource load that is not expressed at the IR level with a `load`
-        //   instruction (e.g., those that might use an intrinsic function).
-        //
+        return false;
     }
 };
 
 void specializeFuncsForBufferLoadArgs(CodeGenContext* codegenContext, IRModule* module)
 {
     FuncBufferLoadSpecializationCondition condition;
+    condition.codegenContext = codegenContext;
     specializeFunctionCalls(codegenContext, module, &condition);
 }
 
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 7c82891a6..aead69258 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -40,6 +40,12 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         if (as<IRGlobalValueWithCode>(arg))
             return true;
 
+        if (isUserPointerType(arg->getDataType()))
+            return true;
+
+        if (as<IRCastDescriptorHandleToResource>(arg))
+            return true;
+
         // As we will see later, we can also
         // specialize a call when the argument
         // is the result of indexing into an
@@ -47,17 +53,29 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
         // of the indexing operation is also
         // suitable for specialization.
         //
-        if (arg->getOp() == kIROp_GetElement || arg->getOp() == kIROp_Load)
+        switch (arg->getOp())
         {
-            auto base = arg->getOperand(0);
-
-            // We will "recurse" on the base of
-            // the indexing operation by continuing
-            // our loop with the `base` as our new
-            // argument.
-            //
-            arg = base;
-            continue;
+        case kIROp_GetElement:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+        case kIROp_GetElementPtr:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+        case kIROp_Load:
+            {
+                auto base = arg->getOperand(0);
+
+                // We will "recurse" on the base of
+                // the indexing operation by continuing
+                // our loop with the `base` as our new
+                // argument.
+                //
+                arg = base;
+                continue;
+            }
+        default:
+            break;
         }
 
         // By default, we will *not* consider an argument
@@ -225,7 +243,7 @@ struct FunctionParameterSpecializationContext
             // If neither the parameter nor the argument wants specialization,
             // then we need to keep looking.
             //
-            auto paramWantSpecialization = doesParamWantSpecialization(param, arg);
+            auto paramWantSpecialization = doesParamWantSpecialization(param, arg, call);
             auto paramTypeWantSpecialization = doesParamTypeWantSpecialization(param, arg);
             if (!paramWantSpecialization && !paramTypeWantSpecialization)
                 continue;
@@ -255,9 +273,9 @@ struct FunctionParameterSpecializationContext
     // Of course, now we need to back-fill the predicates that
     // the above function used to evaluate prameters and arguments.
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
-        return condition->doesParamWantSpecialization(param, arg);
+        return condition->doesParamWantSpecialization(param, arg, callInst);
     }
 
     bool doesParamTypeWantSpecialization(IRParam* param, IRInst* arg)
@@ -484,16 +502,20 @@ struct FunctionParameterSpecializationContext
             UInt oldArgIndex = oldArgCounter++;
             auto oldArg = oldCall->getArg(oldArgIndex);
 
-            getCallInfoForParam(callInfo, oldParam, oldArg);
+            getCallInfoForParam(callInfo, oldParam, oldArg, oldCall);
         }
     }
 
-    void getCallInfoForParam(CallSpecializationInfo& ioInfo, IRParam* oldParam, IRInst* oldArg)
+    void getCallInfoForParam(
+        CallSpecializationInfo& ioInfo,
+        IRParam* oldParam,
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // We know that the case where the parameter
         // and argument don't want specialization is easy.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The new call site will use the same argument
             // value as the old one, and we don't need
@@ -546,7 +568,15 @@ struct FunctionParameterSpecializationContext
             // Similarly for other global constants
             ioInfo.key.vals.add(globalConstant);
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If the arg is a user pointer, we can pass it as an ordinary argument,
+            // and we won't need further tracing down the access chain.
+            //
+            ioInfo.key.vals.add(oldArg->getFullType());
+            ioInfo.newArgs.add(oldArg);
+        }
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the `oldArg` is
             // in the form `oldBase[oldIndex]`
@@ -587,19 +617,45 @@ struct FunctionParameterSpecializationContext
 
             ioInfo.newArgs.add(oldIndex);
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the `oldArg` is
+            // in the form `oldBase.structKey`
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // Similar to the getElement case, we recursively setting up whatever
+            // `oldBase` needs first.
+            //
+            getCallInfoForArg(ioInfo, oldBase);
+
+            // The main difference from the `getElement` case is we actually want
+            // the structKey to be in the specialization key because it will be baked
+            // into the specialized function.
+            // And we won't introduce a new parameter to hold the index.
+            //
+            ioInfo.key.vals.add(structKey);
+        }
         else if (oldArg->getOp() == kIROp_Load)
         {
             auto oldBase = oldArg->getOperand(0);
             getCallInfoForArg(ioInfo, oldBase);
         }
+        else if (oldArg->getOp() == kIROp_CastDescriptorHandleToResource)
+        {
+            // We are accessing a resource from a bindless handle.
+            // We can stop recursion here and just pass in the bindless handle as
+            // an argument.
+            auto oldBase = oldArg->getOperand(0);
+            ioInfo.key.vals.add(oldBase->getFullType());
+            ioInfo.newArgs.add(oldBase);
+        }
         else
         {
             // If we fail to match any of the cases above
-            // then a precondition was violated in that
-            // `isArgSuitableForSpecialization` is allowing
-            // a case that this routine is not covering.
-            //
-            SLANG_UNEXPECTED("mising case in 'getCallInfoForArg'");
+            // then the `SpecializeCondition` is letting through constructs that we cannot handle.
+            SLANG_UNEXPECTED("unexpected function call specialization argument form.");
         }
     }
 
@@ -641,7 +697,7 @@ struct FunctionParameterSpecializationContext
             // will stand in for the parameter in the specialized
             // function.
             //
-            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg);
+            auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg, oldCall);
 
             // We will collect the replacement value to use
             // for each of the original parameters in an array.
@@ -681,12 +737,13 @@ struct FunctionParameterSpecializationContext
     IRInst* getSpecializedValueForParam(
         FuncSpecializationInfo& ioInfo,
         IRParam* oldParam,
-        IRInst* oldArg)
+        IRInst* oldArg,
+        IRCall* callInst)
     {
         // As always, the easy case is when the parameter of
         // the original function doesn't need specialization.
         //
-        if (!doesParamWantSpecialization(oldParam, oldArg))
+        if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
         {
             // The specialized callee will need a new parameter
             // that fills the same role as the old one, so we
@@ -718,6 +775,36 @@ struct FunctionParameterSpecializationContext
         }
     }
 
+    // Returns true if `inst` is an instruction that accesses an element from an array or a buffer.
+    //
+    static bool isElementAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_GetElementPtr:
+        case kIROp_GetElement:
+        case kIROp_RWStructuredBufferGetElementPtr:
+        case kIROp_StructuredBufferLoad:
+        case kIROp_ByteAddressBufferLoad:
+            return true;
+        }
+        return false;
+    }
+
+    // Returns true if `inst` is an instruction that accesses a field from a struct, that is
+    // either a FieldAddress or FieldExtract.
+    //
+    static bool isFieldAccessInst(IRInst* inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_FieldAddress:
+        case kIROp_FieldExtract:
+            return true;
+        }
+        return false;
+    }
+
     IRInst* getSpecializedValueForArg(FuncSpecializationInfo& ioInfo, IRInst* oldArg)
     {
         // The logic here parallels `gatherCallInfoForArg`,
@@ -735,13 +822,24 @@ struct FunctionParameterSpecializationContext
             //
             return globalParam;
         }
+        if (isUserPointerType(oldArg->getDataType()))
+        {
+            // If argument is a user pointer, we can pass it into the callee
+            // directly as an oridinary argument without further specializing
+            // for the access chain beyond the pointer.
+            //
+            auto builder = getBuilder();
+            auto newParam = builder->createParam(oldArg->getFullType());
+            ioInfo.newParams.add(newParam);
+            return newParam;
+        }
         if (auto globalFunc = as<IRGlobalValueWithCode>(oldArg))
         {
             // As above, the identity of the specialized function is sufficient
             // to resolve the uses
             return globalFunc;
         }
-        else if (oldArg->getOp() == kIROp_GetElement)
+        else if (isElementAccessInst(oldArg))
         {
             // This is the case where the argument is
             // in the form `oldBase[oldIndex]`.
@@ -801,7 +899,9 @@ struct FunctionParameterSpecializationContext
             // of things, and then inserted to a more permanent location later.
             //
             builder->setInsertLoc(IRInsertLoc());
-            auto newVal = builder->emitElementExtract(oldArg->getFullType(), newBase, newIndex);
+            IRInst* newOperands[] = {newBase, newIndex};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
 
             // Because our new instruction wasn't
             // actually inserted anywhere, we need to
@@ -813,6 +913,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (isFieldAccessInst(oldArg))
+        {
+            // This is the case where the argument is
+            // in the form `oldBase.structKey`.
+            //
+            auto oldBase = oldArg->getOperand(0);
+            auto structKey = oldArg->getOperand(1);
+
+            // We handle this case in a similar way as the `oldBase[oldIndex]`
+            // case, except that we don't need to introduce a new parameter
+            // for the index, since the struct key is known at compile-time.
+            auto newBase = getSpecializedValueForArg(ioInfo, oldBase);
+
+            auto builder = getBuilder();
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newBase, structKey};
+            auto newVal =
+                builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
+
+            ioInfo.newBodyInsts.add(newVal);
+
+            return newVal;
+        }
         else if (auto oldArgLoad = as<IRLoad>(oldArg))
         {
             auto oldPtr = oldArgLoad->getPtr();
@@ -825,15 +949,30 @@ struct FunctionParameterSpecializationContext
 
             return newVal;
         }
+        else if (auto castHandleToResource = as<IRCastDescriptorHandleToResource>(oldArg))
+        {
+            // We are accessing a resource from a bindless handle.
+            // We should create a param for the handle, and load the resource from the param.
+            auto builder = getBuilder();
+            auto oldHandle = castHandleToResource->getOperand(0);
+            auto newHandle = builder->createParam(oldHandle->getFullType());
+            ioInfo.newParams.add(newHandle);
+
+            builder->setInsertLoc(IRInsertLoc());
+            IRInst* newOperands[] = {newHandle};
+            auto newVal = builder->emitIntrinsicInst(
+                oldArg->getFullType(),
+                kIROp_CastDescriptorHandleToResource,
+                1,
+                newOperands);
+            ioInfo.newBodyInsts.add(newVal);
+            return newVal;
+        }
         else
         {
             // If we don't match one of the above cases,
-            // then `isArgSuitableForSpecialization` is
-            // letting through cases that this function
-            // hasn't been updated to handle.
-            //
-            SLANG_UNEXPECTED("mising case in 'getSpecializedValueForArg'");
-            UNREACHABLE_RETURN(nullptr);
+            // then we are running into an invalid case.
+            SLANG_UNEXPECTED("unknown argument form for function call specialization.");
         }
     }
 
diff --git a/source/slang/slang-ir-specialize-function-call.h b/source/slang/slang-ir-specialize-function-call.h
index bab4ce2f4..afb8c2365 100644
--- a/source/slang/slang-ir-specialize-function-call.h
+++ b/source/slang/slang-ir-specialize-function-call.h
@@ -7,12 +7,14 @@ struct CodeGenContext;
 struct IRInst;
 struct IRModule;
 struct IRParam;
+struct IRCall;
+
 class Module;
 
 class FunctionCallSpecializeCondition
 {
 public:
-    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg) = 0;
+    virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst) = 0;
 
     virtual bool isParamSuitableForSpecialization(IRParam* param, IRInst* arg);
 
diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp
index 871ba2c24..0ac08236f 100644
--- a/source/slang/slang-ir-specialize-resources.cpp
+++ b/source/slang/slang-ir-specialize-resources.cpp
@@ -20,9 +20,10 @@ struct ResourceParameterSpecializationCondition : FunctionCallSpecializeConditio
     TargetRequest* targetRequest = nullptr;
     TargetProgram* targetProgram = nullptr;
 
-    bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+    bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
     {
         SLANG_UNUSED(arg);
+        SLANG_UNUSED(callInst);
 
         // Whether or not a parameter needs specialization is really
         // a function of its type:
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 8584ea95e..551a72fc7 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -17,6 +17,14 @@ bool isPointerOfType(IRInst* type, IROp opCode)
     return false;
 }
 
+bool isUserPointerType(IRInst* type)
+{
+    auto ptrType = as<IRPtrType>(type);
+    if (!ptrType)
+        return false;
+    return ptrType->getAddressSpace() == AddressSpace::UserPointer;
+}
+
 IRType* getVectorElementType(IRType* type)
 {
     if (auto vectorType = as<IRVectorType>(type))
@@ -792,35 +800,212 @@ IRInst* getRootAddr(IRInst* addr, List<IRInst*>& outAccessChain, List<IRInst*>*
     return addr;
 }
 
-// A simple and conservative address aliasing check.
-bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+IRInst* getRootBufferOrAddr(IRInst* addr)
 {
-    if (addr1 == addr2)
-        return true;
+    auto rootAddr = getRootAddr(addr);
+    if (as<IRRWStructuredBufferGetElementPtr>(rootAddr))
+    {
+        auto bufferHandle = rootAddr->getOperand(0);
+        // Check if the bufferHandle itself is a load from a global parameter.
+        if (auto load = as<IRLoad>(bufferHandle))
+        {
+            auto newRoot = getRootAddr(load->getPtr());
+            if (newRoot->getOp() == kIROp_GlobalParam)
+                return newRoot;
+        }
+    }
+    return rootAddr;
+}
+
+// The aliasing class of an address. This is used to determine
+// if two addresses may alias.
+enum class AddressAliasingClass
+{
+    Unknown,
+    UserPointer,      // A user pointer into global memory
+    Var,              // A thread-local or groupshared var.
+    ConstantBuffer,   // A constant buffer or parameter block.
+    BoundBuffer,      // A bound buffer.
+    BoundTexture,     // A bound texture resource.
+    DescriptorHandle, // A bindless buffer or resource.
+};
+
+AddressAliasingClass getAliasingClass(IRInst* addr)
+{
+    if (auto globalParam = as<IRGlobalParam>(addr))
+    {
+        auto type = unwrapArray(globalParam->getDataType());
+        if (!type)
+            return AddressAliasingClass::Unknown;
+        switch (type->getOp())
+        {
+        case kIROp_TextureType:
+            return AddressAliasingClass::BoundTexture;
+        case kIROp_HLSLStructuredBufferType:
+        case kIROp_HLSLRWStructuredBufferType:
+        case kIROp_HLSLAppendStructuredBufferType:
+        case kIROp_HLSLConsumeStructuredBufferType:
+        case kIROp_HLSLRasterizerOrderedStructuredBufferType:
+        case kIROp_HLSLByteAddressBufferType:
+        case kIROp_HLSLRWByteAddressBufferType:
+        case kIROp_HLSLRasterizerOrderedByteAddressBufferType:
+        case kIROp_GLSLShaderStorageBufferType:
+            return AddressAliasingClass::BoundBuffer;
+        case kIROp_ConstantBufferType:
+        case kIROp_ParameterBlockType:
+            return AddressAliasingClass::ConstantBuffer;
+        case kIROp_PtrType:
+            if (isUserPointerType(type))
+                return AddressAliasingClass::UserPointer;
+            return AddressAliasingClass::Unknown;
+        case kIROp_DynamicResourceType:
+            return AddressAliasingClass::DescriptorHandle;
+        default:
+            return AddressAliasingClass::Unknown;
+        }
+    }
+    else if (as<IRVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRGlobalVar>(addr))
+        return AddressAliasingClass::Var;
+    else if (as<IRRWStructuredBufferGetElementPtr>(addr))
+        return AddressAliasingClass::DescriptorHandle;
+    else if (as<IRCastDescriptorHandleToResource>(addr))
+        return AddressAliasingClass::DescriptorHandle;
 
-    // Two variables can never alias.
-    addr1 = getRootAddr(addr1);
-    addr2 = getRootAddr(addr2);
+    auto type = addr->getDataType();
+    if (isUserPointerType(type))
+        return AddressAliasingClass::UserPointer;
+    return AddressAliasingClass::Unknown;
+}
 
-    // Global addresses can alias with anything.
-    if (!isChildInstOf(addr1, func))
+bool canAddrClassesAlias(AddressAliasingClass c1, AddressAliasingClass c2)
+{
+    if (c1 == AddressAliasingClass::Unknown || c2 == AddressAliasingClass::Unknown)
         return true;
 
-    if (!isChildInstOf(addr2, func))
+    switch (c1)
+    {
+    case AddressAliasingClass::Unknown:
         return true;
+    case AddressAliasingClass::UserPointer:
+    case AddressAliasingClass::Var:
+        // A users pointer or var can only alias with another
+        // object that is either a user pointer or var.
+        //
+        // Generally, a var should never alias with anything else that isn't a var,
+        // if we never allow the user to take address of a local var.
+        // We don't allow taking addresses of a local var on most GPU targets, but
+        // we currently do expose an internal intrinsic to do so when targeting CPU.
+        // We should consider disallowing this across the board, or enable more aggresive
+        // criteria when targeting GPU backends.
+        // For now we stay conservative and just report true even when addr1 is var and
+        // addr2 is not rooted from a var.
+        //
+        return c2 == AddressAliasingClass::UserPointer || c2 == AddressAliasingClass::Var;
+    case AddressAliasingClass::BoundBuffer:
+    case AddressAliasingClass::BoundTexture:
+        // A bound resource can only alias with another
+        // object that is a bound resource or descriptor handle
+        return c2 == c1 || c2 == AddressAliasingClass::DescriptorHandle;
+
+    case AddressAliasingClass::DescriptorHandle:
+        // Can alias with any other resource.
+        switch (c2)
+        {
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::DescriptorHandle:
+            return true;
+        default:
+            return false;
+        }
+    case AddressAliasingClass::ConstantBuffer:
+        // Constant buffer cannot alias with anything.
+        return false;
+    }
+    // For any other unknown case, assume they may alias.
+    return true;
+}
+
+// Has `var` being used in a way that may allow it to alias with a user pointer?
+bool canVarAliasWithUserPointer(TargetRequest* target, IRInst* var)
+{
+    if (target && !isCPUTarget(target))
+    {
+        // We don't allow taking the address of a variable on anything other
+        // than the CPU target. Therefore a var can never alias with a user
+        // pointer on these targets.
+        return false;
+    }
+
+    SLANG_UNUSED(var);
+    return true;
+}
+
+// A simple and conservative address aliasing check.
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2)
+{
+    if (addr1 == addr2)
+        return true;
+
+    addr1 = getRootBufferOrAddr(addr1);
+    addr2 = getRootBufferOrAddr(addr2);
+
+    auto addr1Class = getAliasingClass(addr1);
+    auto addr2Class = getAliasingClass(addr2);
 
-    if (addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Var && addr1 != addr2)
+    if (!canAddrClassesAlias(addr1Class, addr2Class))
         return false;
 
+    if (addr1Class == addr2Class)
+    {
+        // For these classes of addresses, the identity of the root
+        // determines whether or not the addresse can alias.
+        // Note that we assume two different bound resources can never
+        // alias, and two different variables can never alias.
+        switch (addr1Class)
+        {
+        case AddressAliasingClass::Var:
+        case AddressAliasingClass::BoundBuffer:
+        case AddressAliasingClass::BoundTexture:
+        case AddressAliasingClass::ConstantBuffer:
+            if (addr1 != addr2)
+                return false;
+            break;
+        }
+    }
+
     // A param and a var can never alias.
     if (addr1->getOp() == kIROp_Param && addr1->getParent() == func->getFirstBlock() &&
             addr2->getOp() == kIROp_Var ||
         addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Param &&
             addr2->getParent() == func->getFirstBlock())
         return false;
+
+    // If one addr is user pointer and one addr is a var,
+    // they can never alias, if the user code never took the address of
+    // the var.
+    if (addr1Class == AddressAliasingClass::Var && addr2Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr1);
+    }
+    if (addr2Class == AddressAliasingClass::Var && addr1Class == AddressAliasingClass::UserPointer)
+    {
+        return canVarAliasWithUserPointer(target, addr2);
+    }
     return true;
 }
 
+bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+{
+    return canAddressesPotentiallyAlias(nullptr, func, addr1, addr2);
+}
+
 bool isPtrLikeOrHandleType(IRInst* type)
 {
     if (!type)
@@ -1141,15 +1326,15 @@ bool areCallArgumentsSideEffectFree(IRCall* call, SideEffectAnalysisOptions opti
             if (isBitSet(options, SideEffectAnalysisOptions::UseDominanceTree))
                 dom = module->findOrCreateDominatorTree(parentFunc);
 
-            // If the pointer argument is a local variable (thus can't alias with other addresses)
-            // and it is never read from in the function, we can safely treat the call as having
-            // no side-effect.
-            // This is a conservative test, but is sufficient to detect the most common case where
-            // a temporary variable is used as the inout argument and the result stored in the temp
-            // variable isn't being used elsewhere in the parent func.
+            // If the pointer argument is a local variable (thus can't alias with other
+            // addresses) and it is never read from in the function, we can safely treat the
+            // call as having no side-effect. This is a conservative test, but is sufficient to
+            // detect the most common case where a temporary variable is used as the inout
+            // argument and the result stored in the temp variable isn't being used elsewhere in
+            // the parent func.
             //
-            // A more aggresive test can check all other address uses reachable from the call site
-            // and see if any of them are aliasing with the argument.
+            // A more aggresive test can check all other address uses reachable from the call
+            // site and see if any of them are aliasing with the argument.
             for (auto use = arg->firstUse; use; use = use->nextUse)
             {
                 if (as<IRDecoration>(use->getUser()))
@@ -1323,8 +1508,8 @@ bool doesCalleeHaveSideEffect(IRInst* callee)
         }
     }
 
-    // If the callee has no side effect, check if any of its associated functions have side effect.
-    // If so, we want to keep the callee around.
+    // If the callee has no side effect, check if any of its associated functions have side
+    // effect. If so, we want to keep the callee around.
     //
     // Typically, once the relevant pass has completed, the association is removed,
     // and at that point we can remove the function.
@@ -2230,13 +2415,12 @@ void legalizeDefUse(IRGlobalValueWithCode* func)
                 !(as<IRVar>(inst) && loopHeaderBlockMap.containsKey(block)))
                 continue;
 
-            // Normally, if the common dominator is not `block`, we can simply move the definition
-            // to the common dominator.
-            // An exception is when the common dominator is the target block of a
-            // loop.
-            // Another exception is when a var in the loop condition block is accessed both inside
-            // and outside the loop. It is technically visible, but effects on the 'var' are not
-            // visible outside the loop, so we'll need to hoist it out of the loop.
+            // Normally, if the common dominator is not `block`, we can simply move the
+            // definition to the common dominator. An exception is when the common dominator is
+            // the target block of a loop. Another exception is when a var in the loop condition
+            // block is accessed both inside and outside the loop. It is technically visible,
+            // but effects on the 'var' are not visible outside the loop, so we'll need to hoist
+            // it out of the loop.
             //
             // Note that after normalization, loops are in the form of:
             // ```
@@ -2377,9 +2561,9 @@ bool canOperationBeSpecConst(IROp op, IRType* resultType, IRInst* const* fixedAr
     // Returns true for ops that can be declared as an operation under `OpSpecConstantOp`.
     //
     // Integer arithmetic and comparison operations can be `OpSpecConstantOp` with the `Shader`
-    // capability, while floating-point arithmetic and comparison operations require the `Kernel`
-    // capability. We only support `Shader` capability for now, return false when floating-point
-    // arithmetic/comparison is encountered.
+    // capability, while floating-point arithmetic and comparison operations require the
+    // `Kernel` capability. We only support `Shader` capability for now, return false when
+    // floating-point arithmetic/comparison is encountered.
     switch (op)
     {
     case kIROp_Add:
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index c0410fa3c..b8937d569 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -70,6 +70,8 @@ bool isPointerOfType(IRInst* ptrType, IRInst* elementType);
 // True if ptrType is a pointer type to a type of opCode
 bool isPointerOfType(IRInst* ptrType, IROp opCode);
 
+bool isUserPointerType(IRInst* type);
+
 // Builds a dictionary that maps from requirement key to requirement value for `interfaceType`.
 Dictionary<IRInst*, IRInst*> buildInterfaceRequirementDict(IRInterfaceType* interfaceType);
 
@@ -205,6 +207,12 @@ IRInst* getRootAddr(
 
 bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2);
 
+bool canAddressesPotentiallyAlias(
+    TargetRequest* target,
+    IRGlobalValueWithCode* func,
+    IRInst* addr1,
+    IRInst* addr2);
+
 String dumpIRToString(
     IRInst* root,
     IRDumpOptions options = {IRDumpOptions::Mode::Simplified, IRDumpOptions::Flag::DumpDebugIds});
author	Yong He <yonghe@outlook.com>	2025-09-30 19:08:23 -0700
committer	GitHub <noreply@github.com>	2025-09-30 19:08:23 -0700
commit	e4611e2e30a3e5969d402f5ed7e72706a0e3b024 (patch)
tree	0f4240ccf8c4f0786949ab33adb0fcc332890d11 /source
parent	b6422e50cb19f7f790f29678ba22f31b0b305511 (diff)