22 files changed, 568 insertions, 50 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 9508ea796..69d01920c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -250,6 +250,23 @@ struct __align__(4) bool4
     }
 };
 
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool __ldg(const bool* ptr)
+{
+    return (bool)(__ldg((const char*)ptr));
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 __ldg(const bool2* ptr)
+{
+    auto val = __ldg((const char2*)ptr);
+    return {val.x != 0, val.y != 0};
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 __ldg(const bool4* ptr)
+{
+    auto val = __ldg((const char4*)ptr);
+    return {val.x != 0, val.y != 0, val.z != 0, val.w != 0};
+}
+
 #if SLANG_CUDA_RTC
 
 typedef signed char int8_t;
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index ddee52b6b..2bf832eef 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -1300,11 +1300,29 @@ enum MemoryScope : int32_t
 }
 
 /// @category misc_types
+/// Describes the access permissions for a pointer type.
 __magic_enum(AccessQualifier)
 enum Access : uint64_t
 {
+    /// The data maybe read and modified through the pointer. This is the default access qualifier
+    /// for a pointer type.
     ReadWrite = $((uint64_t)AccessQualifier::ReadWrite),
+
+    /// The data being pointed to by a pointer can only be read through the pointer.
+    /// This is to be distinguished from `Immutable`, which means the data being pointed to
+    /// won't be changed by any means. In contrast, data pointed to by a `Read` pointer
+    /// may still be changed through another pointer that is not read-only.
+    /// This means that a pointer with `Read` access is meaningful only to the front-end
+    /// type system, and is not expected to provide any optimization opportunities to
+    /// the back-end.
     Read = $((uint64_t)AccessQualifier::Read),
+
+    /// The data being pointed to by a pointer is known to be immutable and won't
+    /// be changed by any means during the execution of the program. It is UB if
+    /// the data is changed during the program execution. This is a stronger
+    /// qualifier than `Read`, and may allow the backend to perform more aggressive
+    /// optimizations.
+    Immutable = $((uint64_t)AccessQualifier::Immutable),
 }
 
 //@public:
@@ -1314,8 +1332,8 @@ enum Access : uint64_t
 __magic_type(PtrType)
 struct Ptr<
     T, 
-    Access access = Access::ReadWrite, 
-    AddressSpace addrSpace = AddressSpace::Device>
+    Access access = Access.ReadWrite, 
+    AddressSpace addrSpace = AddressSpace.Device>
 {
     // A user is allowed to explicitly cast between any pointer type of
     // the same address space
@@ -1373,6 +1391,12 @@ struct Ptr<
     }
 };
 
+/// Represents a pointer to immutable data. Immutable data is known at compile time to remain unchanged during the entire
+/// execution of the program. This knowledge allows the compiler to perform more aggressive optimizations around the memory
+/// accesses through such pointers. If the data is changed during the program execution, the behavior is undefined and loaded
+/// data through such pointers may be invalid.
+typealias ImmutablePtr<T, AddressSpace addrSpace = AddressSpace.Device> = Ptr<T, Access.Immutable, addrSpace>;
+
 //@hidden:
 __intrinsic_op($(kIROp_AlignedAttr))
 internal int __align_attr(int alignment);
diff --git a/source/slang/slang-ast-support-types.cpp b/source/slang/slang-ast-support-types.cpp
index 4054d8dc4..d021cb9cb 100644
--- a/source/slang/slang-ast-support-types.cpp
+++ b/source/slang/slang-ast-support-types.cpp
@@ -23,6 +23,7 @@ QualType::QualType(Type* type)
                 break;
 
             case AccessQualifier::Read:
+            case AccessQualifier::Immutable:
                 isLeftValue = false;
                 break;
 
diff --git a/source/slang/slang-ast-type.cpp b/source/slang/slang-ast-type.cpp
index a3b4b91f6..e3f75cb2f 100644
--- a/source/slang/slang-ast-type.cpp
+++ b/source/slang/slang-ast-type.cpp
@@ -481,23 +481,23 @@ void maybePrintAddrSpaceOperand(StringBuilder& out, AddressSpace addrSpace)
     switch (addrSpace)
     {
     case AddressSpace::Generic:
-        out << toSlice(", AddressSpace::Generic");
+        out << toSlice(", AddressSpace.Generic");
         break;
     case AddressSpace::UserPointer:
         // We expose UserPointer as Device to users
-        out << toSlice(", AddressSpace::Device");
+        out << toSlice(", AddressSpace.Device");
         break;
     case AddressSpace::GroupShared:
-        out << toSlice(", AddressSpace::GroupShared");
+        out << toSlice(", AddressSpace.GroupShared");
         break;
     case AddressSpace::Global:
-        out << toSlice(", AddressSpace::Global");
+        out << toSlice(", AddressSpace.Global");
         break;
     case AddressSpace::ThreadLocal:
-        out << toSlice(", AddressSpace::ThreadLocal");
+        out << toSlice(", AddressSpace.ThreadLocal");
         break;
     case AddressSpace::Uniform:
-        out << toSlice(", AddressSpace::Uniform");
+        out << toSlice(", AddressSpace.Uniform");
         break;
     default:
         break;
@@ -509,10 +509,13 @@ void maybePrintAccessQualifierOperand(StringBuilder& out, AccessQualifier access
     switch (accessQualifier)
     {
     case AccessQualifier::ReadWrite:
-        out << toSlice(", Access::ReadWrite");
+        out << toSlice(", Access.ReadWrite");
         break;
     case AccessQualifier::Read:
-        out << toSlice(", Access::Read");
+        out << toSlice(", Access.Read");
+        break;
+    case AccessQualifier::Immutable:
+        out << toSlice(", Access.Immutable");
         break;
     default:
         break;
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index a5b90740d..885414b92 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -921,6 +921,13 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             m_writer->emit(")");
             return true;
         }
+    case kIROp_CUDALDG:
+        {
+            m_writer->emit("__ldg(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(")");
+        }
+        return true;
     default:
         break;
     }
diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp
index d5697117a..1b59d3070 100644
--- a/source/slang/slang-emit-spirv.cpp
+++ b/source/slang/slang-emit-spirv.cpp
@@ -6481,12 +6481,19 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex
             SpvStorageClassPhysicalStorageBuffer)
         {
             // If inst has a pointer type with PhysicalStorageBuffer address space,
-            // emit AliasedPointer decoration.
-            emitOpDecorate(
-                getSection(SpvLogicalSectionID::Annotations),
-                nullptr,
-                varInst,
-                (isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased));
+            // emit AliasedPointer or RestrictPointer decoration.
+            SpvDecoration decor;
+            if (ptrType->getAccessQualifier() == AccessQualifier::Immutable)
+            {
+                // We can always safely use RestrictPointer for immutable pointers.
+                // This will allow better optimization.
+                decor = isVar ? SpvDecorationRestrictPointer : SpvDecorationRestrict;
+            }
+            else
+            {
+                decor = isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased;
+            }
+            emitOpDecorate(getSection(SpvLogicalSectionID::Annotations), nullptr, varInst, decor);
         }
         else
         {
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 1bd1f8b5c..d5e89b1fe 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -32,6 +32,7 @@
 #include "slang-ir-collect-global-uniforms.h"
 #include "slang-ir-com-interface.h"
 #include "slang-ir-composite-reg-to-mem.h"
+#include "slang-ir-cuda-immutable-load.h"
 #include "slang-ir-dce.h"
 #include "slang-ir-defer-buffer-load.h"
 #include "slang-ir-defunctionalization.h"
@@ -1886,6 +1887,13 @@ Result linkAndOptimizeIR(
         specializeAddressSpaceForWGSL(irModule);
     }
 
+    // If we are generating code for CUDA, we should translate all immutable buffer loads to
+    // using `__ldg` intrinsic for improved performance.
+    if (isCUDATarget(targetRequest))
+    {
+        lowerImmutableBufferLoadForCUDA(targetProgram, irModule);
+    }
+
     performForceInlining(irModule);
 
     bool emitSpirvDirectly = targetProgram->shouldEmitSPIRVDirectly();
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index 28023e9e2..0b23c14b7 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -174,7 +174,6 @@ static bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
 
     if (numElems != imageFormatInfo.channelCount)
     {
-        SLANG_ASSERT(!"Format doesn't match channel count");
         return false;
     }
 
diff --git a/source/slang/slang-ir-cuda-immutable-load.cpp b/source/slang/slang-ir-cuda-immutable-load.cpp
new file mode 100644
index 000000000..713d5cb5a
--- /dev/null
+++ b/source/slang/slang-ir-cuda-immutable-load.cpp
@@ -0,0 +1,375 @@
+#include "slang-ir-cuda-immutable-load.h"
+
+#include "slang-ir-inst-pass-base.h"
+#include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
+#include "slang-ir-util.h"
+
+namespace Slang
+{
+
+enum LoadMethodKind
+{
+    Func,
+    Opcode
+};
+
+struct LoadMethod
+{
+    LoadMethodKind kind = LoadMethodKind::Func;
+    union
+    {
+        IRFunc* func;
+        IROp op;
+    };
+    LoadMethod() { func = nullptr; }
+    operator bool() { return kind == LoadMethodKind::Func ? func != nullptr : op != kIROp_Nop; }
+    LoadMethod(IRFunc* f)
+        : kind(LoadMethodKind::Func), func(f)
+    {
+    }
+    LoadMethod(IROp irop)
+        : kind(LoadMethodKind::Opcode), op(irop)
+    {
+    }
+    LoadMethod& operator=(IRFunc* f)
+    {
+        kind = LoadMethodKind::Func;
+        this->func = f;
+        return *this;
+    }
+    LoadMethod& operator=(IROp irop)
+    {
+        kind = LoadMethodKind::Opcode;
+        this->op = irop;
+        return *this;
+    }
+    IRInst* apply(IRBuilder& builder, IRType* resultType, IRInst* operandAddr)
+    {
+        if (kind == LoadMethodKind::Func)
+        {
+            return builder.emitCallInst(resultType, func, 1, &operandAddr);
+        }
+        else
+        {
+            return builder.emitIntrinsicInst(resultType, op, 1, &operandAddr);
+        }
+    }
+};
+
+struct ImmutableBufferLoadLoweringContext : InstPassBase
+{
+    Dictionary<IRType*, LoadMethod> loadFuncs;
+    TargetProgram* targetProgram;
+
+    IRFunc* createLoadFunc(IRBuilder& builder, IRType* valueType, IRParam*& outParam)
+    {
+        auto func = builder.createFunc();
+        builder.addNameHintDecoration(func, toSlice("slang_ldg"));
+        builder.setInsertInto(func);
+        auto block = builder.emitBlock();
+        auto ptrType = builder.getPtrType(valueType);
+        builder.setInsertInto(block);
+        outParam = builder.emitParam(ptrType);
+        builder.addNameHintDecoration(outParam, toSlice("ptr"));
+        func->setFullType(builder.getFuncType(ptrType, valueType));
+        return func;
+    }
+
+    LoadMethod createLoadFuncForType(IRType* type)
+    {
+        IRBuilder builder(type);
+        builder.setInsertAfter(type);
+        switch (type->getOp())
+        {
+        case kIROp_FloatType:
+        case kIROp_HalfType:
+        case kIROp_DoubleType:
+        case kIROp_Int8Type:
+        case kIROp_Int16Type:
+        case kIROp_IntType:
+        case kIROp_Int64Type:
+        case kIROp_IntPtrType:
+        case kIROp_UInt8Type:
+        case kIROp_UInt16Type:
+        case kIROp_UIntType:
+        case kIROp_UInt64Type:
+        case kIROp_UIntPtrType:
+        case kIROp_BoolType:
+        case kIROp_CharType:
+            return kIROp_CUDALDG;
+        case kIROp_VectorType:
+            {
+                // For vector types that has a direct mapping to CUDA __ldg,
+                // use the instruction directly.
+                auto vectorType = as<IRVectorType>(type);
+                auto elementType = vectorType->getElementType();
+                auto elementCount = getIntVal(vectorType->getElementCount());
+                IRSizeAndAlignment elementSize;
+                getNaturalSizeAndAlignment(
+                    targetProgram->getOptionSet(),
+                    elementType,
+                    &elementSize);
+                if (elementCount <= 2)
+                    return kIROp_CUDALDG;
+                else if (elementCount == 4)
+                {
+                    switch (elementType->getOp())
+                    {
+                    case kIROp_FloatType:
+                    case kIROp_UIntType:
+                    case kIROp_IntType:
+                    case kIROp_Int8Type:
+                    case kIROp_UInt8Type:
+                    case kIROp_Int16Type:
+                    case kIROp_UInt16Type:
+                        return kIROp_CUDALDG;
+                    }
+                }
+                // For other vector types, we need to generate a function to load its content.
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (UInt i = 0; i < (UInt)elementCount; i++)
+                {
+                    auto elementPtr = builder.emitElementAddress(
+                        builder.getPtrType(elementType),
+                        ptrParam,
+                        builder.getIntValue(builder.getIntType(), i));
+                    auto loadedElement =
+                        builder.emitIntrinsicInst(elementType, kIROp_CUDALDG, 1, &elementPtr);
+                    args.add(loadedElement);
+                }
+                auto result = builder.emitMakeVector(type, args);
+                builder.emitReturn(result);
+                return func;
+            }
+            break;
+        case kIROp_MatrixType:
+            {
+                // For matrix types, we should generate a function to load its content by row or
+                // column, depending on the layout.
+                auto matrixType = as<IRMatrixType>(type);
+                auto elementType = matrixType->getElementType();
+                auto rowCount = getIntVal(matrixType->getRowCount());
+                auto colCount = getIntVal(matrixType->getColumnCount());
+                auto layout = (MatrixLayoutMode)getIntVal(matrixType->getLayout());
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                if (layout == kMatrixLayoutMode_ColumnMajor)
+                {
+                    // For column major matrix, we can load it by column (vector) directly.
+                    auto vectorType = builder.getVectorType(elementType, rowCount);
+                    auto vectorPtrType = builder.getPtrType(vectorType);
+                    auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam);
+                    List<IRInst*> args;
+                    for (UInt i = 0; i < (UInt)colCount; i++)
+                    {
+                        auto colPtr = builder.emitGetOffsetPtr(
+                            elementBasePtr,
+                            builder.getIntValue(builder.getIntType(), i));
+                        auto loadedCol = emitImmutableLoad(builder, colPtr);
+                        args.add(loadedCol);
+                    }
+                    // Rearrange loaded vectors in row-major order.
+                    List<IRInst*> elements;
+                    for (UInt i = 0; i < (UInt)rowCount; i++)
+                    {
+                        for (UInt j = 0; j < (UInt)colCount; j++)
+                        {
+                            elements.add(builder.emitElementExtract(
+                                elementType,
+                                args[j],
+                                builder.getIntValue(builder.getIntType(), i)));
+                        }
+                    }
+                    auto result = builder.emitMakeMatrix(
+                        type,
+                        (UInt)elements.getCount(),
+                        elements.getArrayView().getBuffer());
+                    builder.emitReturn(result);
+                    return func;
+                }
+                else
+                {
+                    // For row major matrix, we can load it by row (vector) directly.
+                    auto vectorType = builder.getVectorType(elementType, colCount);
+                    auto vectorPtrType = builder.getPtrType(vectorType);
+                    auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam);
+                    List<IRInst*> args;
+                    for (UInt i = 0; i < (UInt)rowCount; i++)
+                    {
+                        auto rowPtr = builder.emitGetOffsetPtr(
+                            elementBasePtr,
+                            builder.getIntValue(builder.getIntType(), i));
+                        auto loadedRow = emitImmutableLoad(builder, rowPtr);
+                        args.add(loadedRow);
+                    }
+                    auto result =
+                        builder.emitMakeMatrix(type, (UInt)args.getCount(), args.getBuffer());
+                    builder.emitReturn(result);
+                    return func;
+                }
+            }
+            break;
+        case kIROp_ArrayType:
+            {
+                // For array types, we need to generate a function to load its content by element.
+                auto arrayType = as<IRArrayType>(type);
+                auto elementType = arrayType->getElementType();
+                auto elementCount = getIntVal(arrayType->getElementCount());
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (UInt i = 0; i < (UInt)elementCount; i++)
+                {
+                    auto elementPtr = builder.emitElementAddress(
+                        builder.getPtrType(elementType),
+                        ptrParam,
+                        builder.getIntValue(builder.getIntType(), i));
+                    auto loadedElement = emitImmutableLoad(builder, elementPtr);
+                    if (!loadedElement)
+                    {
+                        func->removeAndDeallocate();
+                        return LoadMethod();
+                    }
+                    args.add(loadedElement);
+                }
+                auto result = builder.emitMakeArray(type, (UInt)args.getCount(), args.getBuffer());
+                builder.emitReturn(result);
+                return func;
+            }
+        case kIROp_StructType:
+            {
+                // For struct types, we need to generate a function to load its content by field.
+                auto structType = as<IRStructType>(type);
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (auto field : structType->getFields())
+                {
+                    auto fieldType = field->getFieldType();
+                    auto fieldPtr = builder.emitFieldAddress(
+                        builder.getPtrType(fieldType),
+                        ptrParam,
+                        field->getKey());
+                    auto loadedField = emitImmutableLoad(builder, fieldPtr);
+                    if (!loadedField)
+                    {
+                        func->removeAndDeallocate();
+                        return LoadMethod();
+                    }
+                    args.add(loadedField);
+                }
+                auto result = builder.emitMakeStruct(type, args);
+                builder.emitReturn(result);
+                return func;
+            }
+        }
+        return LoadMethod();
+    }
+
+    LoadMethod getOrCreateLoadFuncForType(IRType* type)
+    {
+        if (auto func = loadFuncs.tryGetValue(type))
+            return *func;
+        auto result = createLoadFuncForType(type);
+        loadFuncs[type] = result;
+        return result;
+    }
+
+    IRInst* emitImmutableLoad(IRBuilder& builder, IRInst* ptr)
+    {
+        IRType* valueType = tryGetPointedToType(&builder, ptr->getDataType());
+        if (!valueType)
+            return nullptr;
+        auto loadFunc = getOrCreateLoadFuncForType(valueType);
+        if (!loadFunc)
+            return nullptr;
+        return loadFunc.apply(builder, valueType, ptr);
+    }
+
+    void processInst(IRInst* inst)
+    {
+        // For every load instruction we see in the module, if the it is loading from
+        // an immutable location, try to lower it into a series of __ldg calls.
+        // We need to handle both ordinary loads and structured buffer loads.
+        //
+        switch (inst->getOp())
+        {
+        case kIROp_Load:
+            {
+                auto load = as<IRLoad>(inst);
+                if (isPointerToImmutableLocation(getRootAddr(load->getPtr())))
+                {
+                    IRBuilder builder(load);
+                    builder.setInsertBefore(load);
+                    if (auto newLoad = emitImmutableLoad(builder, load->getPtr()))
+                    {
+                        load->replaceUsesWith(newLoad);
+                        load->removeAndDeallocate();
+                    }
+                }
+            }
+            break;
+        case kIROp_StructuredBufferLoad:
+            {
+                IRBuilder builder(inst);
+                builder.setInsertBefore(inst);
+                auto ptr = builder.emitRWStructuredBufferGetElementPtr(
+                    inst->getOperand(0),
+                    inst->getOperand(1));
+                if (auto newLoad = emitImmutableLoad(builder, ptr))
+                {
+                    inst->replaceUsesWith(newLoad);
+                    inst->removeAndDeallocate();
+                }
+                else
+                {
+                    // For some reason this load cannot be lowered, remove the ptr we just created.
+                    ptr->removeAndDeallocate();
+                }
+            }
+            break;
+        case kIROp_CUDALDG:
+            {
+                // Does the load needs lowering? If so insert lowered loads.
+                IRBuilder builder(inst);
+                builder.setInsertBefore(inst);
+                auto ptr = inst->getOperand(0);
+                auto valueType = tryGetPointedToType(&builder, ptr->getDataType());
+                if (!valueType)
+                    break;
+                auto loadFunc = getOrCreateLoadFuncForType(valueType);
+                if (!loadFunc)
+                    break;
+                // If the type doesn't need further lowering, we don't need to do anything.
+                if (loadFunc.kind == LoadMethodKind::Opcode && loadFunc.op == kIROp_CUDALDG)
+                    break;
+                auto newLoad = loadFunc.apply(builder, valueType, ptr);
+                inst->replaceUsesWith(newLoad);
+                inst->removeAndDeallocate();
+            }
+            break;
+        }
+    }
+
+    void processModule()
+    {
+        processAllInsts([&](IRInst* inst) { processInst(inst); });
+    }
+
+    ImmutableBufferLoadLoweringContext(IRModule* inModule)
+        : InstPassBase(inModule)
+    {
+    }
+};
+
+void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module)
+{
+    ImmutableBufferLoadLoweringContext context(module);
+    context.targetProgram = targetProgram;
+    context.processModule();
+}
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-cuda-immutable-load.h b/source/slang/slang-ir-cuda-immutable-load.h
new file mode 100644
index 000000000..adaa58dc8
--- /dev/null
+++ b/source/slang/slang-ir-cuda-immutable-load.h
@@ -0,0 +1,17 @@
+#pragma once
+
+namespace Slang
+{
+
+/*
+This pass will lower all immutable buffer loads into CUDA `__ldg` intrinsic calls
+to make sure these loads are performed through the read-only data cache on the GPU
+for better performance.
+*/
+
+struct IRModule;
+class TargetProgram;
+
+void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module);
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index 1ecae6574..ef7eed485 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -1477,7 +1477,7 @@ ScalarizedVal createSimpleGLSLGlobalVarying(
 
             auto accessQualifier = AccessQualifier::ReadWrite;
             if (kind == LayoutResourceKind::VaryingInput)
-                accessQualifier = AccessQualifier::Read;
+                accessQualifier = AccessQualifier::Immutable;
             IRType* paramType =
                 builder->getPtrType(ptrOpCode, arrayType, accessQualifier, addrSpace);
 
@@ -3109,7 +3109,7 @@ IRInst* getOrCreatePerVertexInputArray(GLSLLegalizationContext* context, IRInst*
         tryGetPointedToType(&builder, inputVertexAttr->getDataType()),
         builder.getIntValue(builder.getIntType(), 3));
     arrayInst = builder.createGlobalParam(
-        builder.getPtrType(arrayType, AccessQualifier::Read, AddressSpace::Input));
+        builder.getPtrType(arrayType, AccessQualifier::Immutable, AddressSpace::Input));
     context->mapVertexInputToPerVertexArray[inputVertexAttr] = arrayInst;
     builder.addDecoration(arrayInst, kIROp_PerVertexDecoration);
 
diff --git a/source/slang/slang-ir-insts-stable-names.lua b/source/slang/slang-ir-insts-stable-names.lua
index 923e73ac4..ee999d382 100644
--- a/source/slang/slang-ir-insts-stable-names.lua
+++ b/source/slang/slang-ir-insts-stable-names.lua
@@ -681,4 +681,5 @@ return {
 	["Decoration.InParamProxyVar"] = 677,
 	["Attr.MemoryScope"] = 678,
 	["Undefined.LoadFromUninitializedMemory"] = 679,
+	["CUDA_LDG"] = 680,
 }
diff --git a/source/slang/slang-ir-insts.lua b/source/slang/slang-ir-insts.lua
index ac72e718a..1290507d0 100644
--- a/source/slang/slang-ir-insts.lua
+++ b/source/slang/slang-ir-insts.lua
@@ -675,6 +675,7 @@ local insts = {
 	{ var = {} },
 	{ load = { min_operands = 1 } },
 	{ store = { min_operands = 2 } },
+	{ CUDA_LDG = {min_operands = 1 } },
 	-- Atomic Operations
 	{
 		AtomicOperation = {
diff --git a/source/slang/slang-ir-redundancy-removal.cpp b/source/slang/slang-ir-redundancy-removal.cpp
index 0308b50c2..e2822ba09 100644
--- a/source/slang/slang-ir-redundancy-removal.cpp
+++ b/source/slang/slang-ir-redundancy-removal.cpp
@@ -128,26 +128,8 @@ bool removeRedundancy(IRModule* module, bool hoistLoopInvariantInsts)
 
 bool isAddressMutable(IRInst* inst)
 {
-    auto rootType = getRootAddr(inst)->getDataType();
-    switch (rootType->getOp())
-    {
-    case kIROp_ParameterBlockType:
-    case kIROp_ConstantBufferType:
-    case kIROp_BorrowInParamType:
-        return false; // immutable
-
-    // We should consider StructuredBuffer as mutable by default, since the resources may alias.
-    // There could be anotherRWStructuredBuffer pointing to the same memory location as the
-    // structured buffer.
-    case kIROp_StructuredBufferLoad:
-    case kIROp_GetStructuredBufferPtr:
-        return true; // mutable
-    }
-
-    // Similarly, IRPtrTypeBase should also be considered writable always,
-    // because there can be aliasing.
-
-    return true; // mutable
+    auto rootAddr = getRootAddr(inst);
+    return !isPointerToImmutableLocation(rootAddr);
 }
 
 /// Eliminate redundant temporary variable copies in load-store patterns.
diff --git a/source/slang/slang-ir-spirv-legalize.cpp b/source/slang/slang-ir-spirv-legalize.cpp
index 6bdd02d73..965d6dc9b 100644
--- a/source/slang/slang-ir-spirv-legalize.cpp
+++ b/source/slang/slang-ir-spirv-legalize.cpp
@@ -517,7 +517,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
                 // structured buffers in GLSL should be annotated as ReadOnly
                 if (as<IRHLSLStructuredBufferType>(structuredBufferType))
                 {
-                    access = AccessQualifier::Read;
+                    access = AccessQualifier::Immutable;
                     memoryFlags = MemoryQualifierSetModifier::Flags::kReadOnly;
                 }
                 if (as<IRHLSLRasterizerOrderedStructuredBufferType>(structuredBufferType))
@@ -2276,7 +2276,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
 
             AccessQualifier accessQualifier = AccessQualifier::ReadWrite;
             if (as<IRHLSLStructuredBufferType>(t))
-                accessQualifier = AccessQualifier::Read;
+                accessQualifier = AccessQualifier::Immutable;
 
             IRBuilder builder(t);
             builder.setInsertBefore(t);
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 11eb4dfa0..4fb4d61ae 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -2749,6 +2749,7 @@ bool isPointerToImmutableLocation(IRInst* loc)
     switch (loc->getOp())
     {
     case kIROp_GetStructuredBufferPtr:
+    case kIROp_RWStructuredBufferGetElementPtr:
     case kIROp_ImageSubscript:
         return isPointerToImmutableLocation(loc->getOperand(0));
     default:
@@ -2784,6 +2785,8 @@ bool isPointerToImmutableLocation(IRInst* loc)
         case AddressSpace::UniformConstant:
             return true;
         }
+        if (ptrType->getAccessQualifier() == AccessQualifier::Immutable)
+            return true;
     }
     return false;
 }
diff --git a/source/slang/slang-type-system-shared.h b/source/slang/slang-type-system-shared.h
index aa9b07d89..b68e44b32 100644
--- a/source/slang/slang-type-system-shared.h
+++ b/source/slang/slang-type-system-shared.h
@@ -129,10 +129,26 @@ enum class MemoryScope : int32_t
     ShaderCall = 6,
 };
 
+// Represents the access qualifier of a pointer type.
 enum class AccessQualifier : uint64_t
 {
     ReadWrite = 0,
+
+    // The data being pointed to by a pointer can only be read through the pointer.
+    // This is to be distinguished from `Immutable`, which means the data being pointed to
+    // won't be changed by any means. In contrast, data pointed to by a `Read` pointer
+    // may still be changed through another pointer that is not read-only.
+    // This means that a pointer with `Read` access is meaningful only to the front-end
+    // type system, and is not expected to provide any optimization opportunities to
+    // the back-end.
     Read = 1,
+
+    // The data being pointed to by a pointer is known to be immutable and won't
+    // be changed by any means during the execution of the program. It is UB if
+    // the data is changed during the program execution. This is a stronger
+    // qualifier than `Read`, and may allow the backend to perform more aggresive
+    // optimizations.
+    Immutable = 2,
 };
 
 } // namespace Slang
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
index 273e6dc58..376ef1f80 100644
--- a/tests/cuda/copy-elision-this-1.slang
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -10,7 +10,7 @@ struct Data {
 
     // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}})
     // CUDA-NEXT: {
-    // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load
+    // CUDA-NEXT: globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}]
     float fetch(int buffer, int index)
     {
         return input[buffer][index];
diff --git a/tests/cuda/dispatch-thread-id-extraction.slang b/tests/cuda/dispatch-thread-id-extraction.slang
index 02705ff24..b1fac2d00 100644
--- a/tests/cuda/dispatch-thread-id-extraction.slang
+++ b/tests/cuda/dispatch-thread-id-extraction.slang
@@ -12,7 +12,7 @@ void computeMain(uint tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: uint _S1 = (blockIdx * blockDim + threadIdx).x;
+// CHECK: uint {{.*}} = (blockIdx * blockDim + threadIdx).x;
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -20,7 +20,7 @@ void computeMain2(uint2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RW
 {
     dst[tid.x] = src[tid.y];
 }
-// CHECK: uint2  _S2 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
+// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -28,9 +28,7 @@ void computeMain3(int2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWS
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: uint2  _S3 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
-// CHECK: int2  _S4 = make_int2 ((int)_S3.x, (int)_S3.y);
-// CHECK: int _S5 = _S4.x;
+// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -38,7 +36,7 @@ void computeMain4(int tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: int _S6 = int((blockIdx * blockDim + threadIdx).x);
+// CHECK: int {{.*}} = int((blockIdx * blockDim + threadIdx).x);
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -46,4 +44,4 @@ void computeMain5(int tid: SV_GroupIndex, StructuredBuffer<uint> src, RWStructur
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: int _S7 = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x);
+// CHECK: int {{.*}} = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x);
diff --git a/tests/optimization/buffer-load-defer-ptr.slang b/tests/optimization/buffer-load-defer-ptr.slang
new file mode 100644
index 000000000..cde006dcf
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-ptr.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    Middle middle;
+    float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+uniform ImmutablePtr<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
index f7f9b1888..22f93bcd1 100644
--- a/tests/optimization/defer-structured-buffer-load.slang
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -28,7 +28,7 @@ RWStructuredBuffer<float> outputBuffer;
 // SPV: OpStore %{{.*}} %[[VALUE]]
 
 // CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
-// CUDA:  return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+// CUDA:  __ldg(&(&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}});
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
diff --git a/tests/optimization/immutable-buffer-load.slang b/tests/optimization/immutable-buffer-load.slang
new file mode 100644
index 000000000..251a1ce9c
--- /dev/null
+++ b/tests/optimization/immutable-buffer-load.slang
@@ -0,0 +1,21 @@
+// Test that we can use ImmutablePtr<T> to result in more optimized buffer loads
+// in SPIR-V and CUDA.
+
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+uniform ImmutablePtr<float4> data;
+
+uniform float4* result;
+
+float4 work(ImmutablePtr<float4> ptr)
+{
+    return *ptr;
+}
+
+[numthreads(1,1,1)]
+void computeMain()
+{
+    // SPV: Restrict
+    // PTX: ld.global.nc.v4.f32
+    *result = work(data) + float4(1,2,3,4);
+}
+\ No newline at end of file