From 01510f2c922af8629c7a730ef92a31fa83bd9f49 Mon Sep 17 00:00:00 2001
From: Yong He <yonghe@outlook.com>
Date: Wed, 15 Oct 2025 20:59:47 -0700
Subject: Immutable access qualifier for pointers and use `__ldg` on cuda.
 (#8710)

This PR implements `Access.Immutable` to allow pointers to immutable
data.

The new type `ImmutablePtr<T>` is defined as an alias of `Ptr<T,
Address.Immutable>`.
By forming a immutable pointer, the programmer is conveying to the
compiler that the data at the pointer address will never change during
the execution of the current program. Therefore loads from immutable
pointers can be deduplicated by the compiler, and will translate to
`__ldg` when generating code for CUDA.

The SPIRV backend is not changed in this PR, since the current SPIRV
spec makes it very difficult to specify loads from immutable address
without generating tons of wrappers and boilerplate type declarations.
We would like to see the spec evolved a bit to around its support of
`NonWritable` physical storage pointers or immutable loads before we
attempt to express such immutability in SPIRV. For now we simply emit
ordinary pointers and loads when generating spirv.

---------

Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
---
 prelude/slang-cuda-prelude.h                       |  17 +
 source/slang/core.meta.slang                       |  28 +-
 source/slang/slang-ast-support-types.cpp           |   1 +
 source/slang/slang-ast-type.cpp                    |  19 +-
 source/slang/slang-emit-cuda.cpp                   |   7 +
 source/slang/slang-emit-spirv.cpp                  |  19 +-
 source/slang/slang-emit.cpp                        |   8 +
 source/slang/slang-intrinsic-expand.cpp            |   1 -
 source/slang/slang-ir-cuda-immutable-load.cpp      | 375 +++++++++++++++++++++
 source/slang/slang-ir-cuda-immutable-load.h        |  17 +
 source/slang/slang-ir-glsl-legalize.cpp            |   4 +-
 source/slang/slang-ir-insts-stable-names.lua       |   1 +
 source/slang/slang-ir-insts.lua                    |   1 +
 source/slang/slang-ir-redundancy-removal.cpp       |  22 +-
 source/slang/slang-ir-spirv-legalize.cpp           |   4 +-
 source/slang/slang-ir-util.cpp                     |   3 +
 source/slang/slang-type-system-shared.h            |  16 +
 tests/cuda/copy-elision-this-1.slang               |   2 +-
 tests/cuda/dispatch-thread-id-extraction.slang     |  12 +-
 tests/optimization/buffer-load-defer-ptr.slang     |  38 +++
 .../defer-structured-buffer-load.slang             |   2 +-
 tests/optimization/immutable-buffer-load.slang     |  21 ++
 22 files changed, 568 insertions(+), 50 deletions(-)
 create mode 100644 source/slang/slang-ir-cuda-immutable-load.cpp
 create mode 100644 source/slang/slang-ir-cuda-immutable-load.h
 create mode 100644 tests/optimization/buffer-load-defer-ptr.slang
 create mode 100644 tests/optimization/immutable-buffer-load.slang
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 9508ea796..69d01920c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -250,6 +250,23 @@ struct __align__(4) bool4
     }
 };
 
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool __ldg(const bool* ptr)
+{
+    return (bool)(__ldg((const char*)ptr));
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 __ldg(const bool2* ptr)
+{
+    auto val = __ldg((const char2*)ptr);
+    return {val.x != 0, val.y != 0};
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 __ldg(const bool4* ptr)
+{
+    auto val = __ldg((const char4*)ptr);
+    return {val.x != 0, val.y != 0, val.z != 0, val.w != 0};
+}
+
 #if SLANG_CUDA_RTC
 
 typedef signed char int8_t;
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index ddee52b6b..2bf832eef 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -1300,11 +1300,29 @@ enum MemoryScope : int32_t
 }
 
 /// @category misc_types
+/// Describes the access permissions for a pointer type.
 __magic_enum(AccessQualifier)
 enum Access : uint64_t
 {
+    /// The data maybe read and modified through the pointer. This is the default access qualifier
+    /// for a pointer type.
     ReadWrite = $((uint64_t)AccessQualifier::ReadWrite),
+
+    /// The data being pointed to by a pointer can only be read through the pointer.
+    /// This is to be distinguished from `Immutable`, which means the data being pointed to
+    /// won't be changed by any means. In contrast, data pointed to by a `Read` pointer
+    /// may still be changed through another pointer that is not read-only.
+    /// This means that a pointer with `Read` access is meaningful only to the front-end
+    /// type system, and is not expected to provide any optimization opportunities to
+    /// the back-end.
     Read = $((uint64_t)AccessQualifier::Read),
+
+    /// The data being pointed to by a pointer is known to be immutable and won't
+    /// be changed by any means during the execution of the program. It is UB if
+    /// the data is changed during the program execution. This is a stronger
+    /// qualifier than `Read`, and may allow the backend to perform more aggressive
+    /// optimizations.
+    Immutable = $((uint64_t)AccessQualifier::Immutable),
 }
 
 //@public:
@@ -1314,8 +1332,8 @@ enum Access : uint64_t
 __magic_type(PtrType)
 struct Ptr<
     T, 
-    Access access = Access::ReadWrite, 
-    AddressSpace addrSpace = AddressSpace::Device>
+    Access access = Access.ReadWrite, 
+    AddressSpace addrSpace = AddressSpace.Device>
 {
     // A user is allowed to explicitly cast between any pointer type of
     // the same address space
@@ -1373,6 +1391,12 @@ struct Ptr<
     }
 };
 
+/// Represents a pointer to immutable data. Immutable data is known at compile time to remain unchanged during the entire
+/// execution of the program. This knowledge allows the compiler to perform more aggressive optimizations around the memory
+/// accesses through such pointers. If the data is changed during the program execution, the behavior is undefined and loaded
+/// data through such pointers may be invalid.
+typealias ImmutablePtr<T, AddressSpace addrSpace = AddressSpace.Device> = Ptr<T, Access.Immutable, addrSpace>;
+
 //@hidden:
 __intrinsic_op($(kIROp_AlignedAttr))
 internal int __align_attr(int alignment);
diff --git a/source/slang/slang-ast-support-types.cpp b/source/slang/slang-ast-support-types.cpp
index 4054d8dc4..d021cb9cb 100644
--- a/source/slang/slang-ast-support-types.cpp
+++ b/source/slang/slang-ast-support-types.cpp
@@ -23,6 +23,7 @@ QualType::QualType(Type* type)
                 break;
 
             case AccessQualifier::Read:
+            case AccessQualifier::Immutable:
                 isLeftValue = false;
                 break;
 
diff --git a/source/slang/slang-ast-type.cpp b/source/slang/slang-ast-type.cpp
index a3b4b91f6..e3f75cb2f 100644
--- a/source/slang/slang-ast-type.cpp
+++ b/source/slang/slang-ast-type.cpp
@@ -481,23 +481,23 @@ void maybePrintAddrSpaceOperand(StringBuilder& out, AddressSpace addrSpace)
     switch (addrSpace)
     {
     case AddressSpace::Generic:
-        out << toSlice(", AddressSpace::Generic");
+        out << toSlice(", AddressSpace.Generic");
         break;
     case AddressSpace::UserPointer:
         // We expose UserPointer as Device to users
-        out << toSlice(", AddressSpace::Device");
+        out << toSlice(", AddressSpace.Device");
         break;
     case AddressSpace::GroupShared:
-        out << toSlice(", AddressSpace::GroupShared");
+        out << toSlice(", AddressSpace.GroupShared");
         break;
     case AddressSpace::Global:
-        out << toSlice(", AddressSpace::Global");
+        out << toSlice(", AddressSpace.Global");
         break;
     case AddressSpace::ThreadLocal:
-        out << toSlice(", AddressSpace::ThreadLocal");
+        out << toSlice(", AddressSpace.ThreadLocal");
         break;
     case AddressSpace::Uniform:
-        out << toSlice(", AddressSpace::Uniform");
+        out << toSlice(", AddressSpace.Uniform");
         break;
     default:
         break;
@@ -509,10 +509,13 @@ void maybePrintAccessQualifierOperand(StringBuilder& out, AccessQualifier access
     switch (accessQualifier)
     {
     case AccessQualifier::ReadWrite:
-        out << toSlice(", Access::ReadWrite");
+        out << toSlice(", Access.ReadWrite");
         break;
     case AccessQualifier::Read:
-        out << toSlice(", Access::Read");
+        out << toSlice(", Access.Read");
+        break;
+    case AccessQualifier::Immutable:
+        out << toSlice(", Access.Immutable");
         break;
     default:
         break;
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index a5b90740d..885414b92 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -921,6 +921,13 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             m_writer->emit(")");
             return true;
         }
+    case kIROp_CUDALDG:
+        {
+            m_writer->emit("__ldg(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(")");
+        }
+        return true;
     default:
         break;
     }
diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp
index d5697117a..1b59d3070 100644
--- a/source/slang/slang-emit-spirv.cpp
+++ b/source/slang/slang-emit-spirv.cpp
@@ -6481,12 +6481,19 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex
             SpvStorageClassPhysicalStorageBuffer)
         {
             // If inst has a pointer type with PhysicalStorageBuffer address space,
-            // emit AliasedPointer decoration.
-            emitOpDecorate(
-                getSection(SpvLogicalSectionID::Annotations),
-                nullptr,
-                varInst,
-                (isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased));
+            // emit AliasedPointer or RestrictPointer decoration.
+            SpvDecoration decor;
+            if (ptrType->getAccessQualifier() == AccessQualifier::Immutable)
+            {
+                // We can always safely use RestrictPointer for immutable pointers.
+                // This will allow better optimization.
+                decor = isVar ? SpvDecorationRestrictPointer : SpvDecorationRestrict;
+            }
+            else
+            {
+                decor = isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased;
+            }
+            emitOpDecorate(getSection(SpvLogicalSectionID::Annotations), nullptr, varInst, decor);
         }
         else
         {
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 1bd1f8b5c..d5e89b1fe 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -32,6 +32,7 @@
 #include "slang-ir-collect-global-uniforms.h"
 #include "slang-ir-com-interface.h"
 #include "slang-ir-composite-reg-to-mem.h"
+#include "slang-ir-cuda-immutable-load.h"
 #include "slang-ir-dce.h"
 #include "slang-ir-defer-buffer-load.h"
 #include "slang-ir-defunctionalization.h"
@@ -1886,6 +1887,13 @@ Result linkAndOptimizeIR(
         specializeAddressSpaceForWGSL(irModule);
     }
 
+    // If we are generating code for CUDA, we should translate all immutable buffer loads to
+    // using `__ldg` intrinsic for improved performance.
+    if (isCUDATarget(targetRequest))
+    {
+        lowerImmutableBufferLoadForCUDA(targetProgram, irModule);
+    }
+
     performForceInlining(irModule);
 
     bool emitSpirvDirectly = targetProgram->shouldEmitSPIRVDirectly();
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index 28023e9e2..0b23c14b7 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -174,7 +174,6 @@ static bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
 
     if (numElems != imageFormatInfo.channelCount)
     {
-        SLANG_ASSERT(!"Format doesn't match channel count");
         return false;
     }
 
diff --git a/source/slang/slang-ir-cuda-immutable-load.cpp b/source/slang/slang-ir-cuda-immutable-load.cpp
new file mode 100644
index 000000000..713d5cb5a
--- /dev/null
+++ b/source/slang/slang-ir-cuda-immutable-load.cpp
@@ -0,0 +1,375 @@
+#include "slang-ir-cuda-immutable-load.h"
+
+#include "slang-ir-inst-pass-base.h"
+#include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
+#include "slang-ir-util.h"
+
+namespace Slang
+{
+
+enum LoadMethodKind
+{
+    Func,
+    Opcode
+};
+
+struct LoadMethod
+{
+    LoadMethodKind kind = LoadMethodKind::Func;
+    union
+    {
+        IRFunc* func;
+        IROp op;
+    };
+    LoadMethod() { func = nullptr; }
+    operator bool() { return kind == LoadMethodKind::Func ? func != nullptr : op != kIROp_Nop; }
+    LoadMethod(IRFunc* f)
+        : kind(LoadMethodKind::Func), func(f)
+    {
+    }
+    LoadMethod(IROp irop)
+        : kind(LoadMethodKind::Opcode), op(irop)
+    {
+    }
+    LoadMethod& operator=(IRFunc* f)
+    {
+        kind = LoadMethodKind::Func;
+        this->func = f;
+        return *this;
+    }
+    LoadMethod& operator=(IROp irop)
+    {
+        kind = LoadMethodKind::Opcode;
+        this->op = irop;
+        return *this;
+    }
+    IRInst* apply(IRBuilder& builder, IRType* resultType, IRInst* operandAddr)
+    {
+        if (kind == LoadMethodKind::Func)
+        {
+            return builder.emitCallInst(resultType, func, 1, &operandAddr);
+        }
+        else
+        {
+            return builder.emitIntrinsicInst(resultType, op, 1, &operandAddr);
+        }
+    }
+};
+
+struct ImmutableBufferLoadLoweringContext : InstPassBase
+{
+    Dictionary<IRType*, LoadMethod> loadFuncs;
+    TargetProgram* targetProgram;
+
+    IRFunc* createLoadFunc(IRBuilder& builder, IRType* valueType, IRParam*& outParam)
+    {
+        auto func = builder.createFunc();
+        builder.addNameHintDecoration(func, toSlice("slang_ldg"));
+        builder.setInsertInto(func);
+        auto block = builder.emitBlock();
+        auto ptrType = builder.getPtrType(valueType);
+        builder.setInsertInto(block);
+        outParam = builder.emitParam(ptrType);
+        builder.addNameHintDecoration(outParam, toSlice("ptr"));
+        func->setFullType(builder.getFuncType(ptrType, valueType));
+        return func;
+    }
+
+    LoadMethod createLoadFuncForType(IRType* type)
+    {
+        IRBuilder builder(type);
+        builder.setInsertAfter(type);
+        switch (type->getOp())
+        {
+        case kIROp_FloatType:
+        case kIROp_HalfType:
+        case kIROp_DoubleType:
+        case kIROp_Int8Type:
+        case kIROp_Int16Type:
+        case kIROp_IntType:
+        case kIROp_Int64Type:
+        case kIROp_IntPtrType:
+        case kIROp_UInt8Type:
+        case kIROp_UInt16Type:
+        case kIROp_UIntType:
+        case kIROp_UInt64Type:
+        case kIROp_UIntPtrType:
+        case kIROp_BoolType:
+        case kIROp_CharType:
+            return kIROp_CUDALDG;
+        case kIROp_VectorType:
+            {
+                // For vector types that has a direct mapping to CUDA __ldg,
+                // use the instruction directly.
+                auto vectorType = as<IRVectorType>(type);
+                auto elementType = vectorType->getElementType();
+                auto elementCount = getIntVal(vectorType->getElementCount());
+                IRSizeAndAlignment elementSize;
+                getNaturalSizeAndAlignment(
+                    targetProgram->getOptionSet(),
+                    elementType,
+                    &elementSize);
+                if (elementCount <= 2)
+                    return kIROp_CUDALDG;
+                else if (elementCount == 4)
+                {
+                    switch (elementType->getOp())
+                    {
+                    case kIROp_FloatType:
+                    case kIROp_UIntType:
+                    case kIROp_IntType:
+                    case kIROp_Int8Type:
+                    case kIROp_UInt8Type:
+                    case kIROp_Int16Type:
+                    case kIROp_UInt16Type:
+                        return kIROp_CUDALDG;
+                    }
+                }
+                // For other vector types, we need to generate a function to load its content.
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (UInt i = 0; i < (UInt)elementCount; i++)
+                {
+                    auto elementPtr = builder.emitElementAddress(
+                        builder.getPtrType(elementType),
+                        ptrParam,
+                        builder.getIntValue(builder.getIntType(), i));
+                    auto loadedElement =
+                        builder.emitIntrinsicInst(elementType, kIROp_CUDALDG, 1, &elementPtr);
+                    args.add(loadedElement);
+                }
+                auto result = builder.emitMakeVector(type, args);
+                builder.emitReturn(result);
+                return func;
+            }
+            break;
+        case kIROp_MatrixType:
+            {
+                // For matrix types, we should generate a function to load its content by row or
+                // column, depending on the layout.
+                auto matrixType = as<IRMatrixType>(type);
+                auto elementType = matrixType->getElementType();
+                auto rowCount = getIntVal(matrixType->getRowCount());
+                auto colCount = getIntVal(matrixType->getColumnCount());
+                auto layout = (MatrixLayoutMode)getIntVal(matrixType->getLayout());
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                if (layout == kMatrixLayoutMode_ColumnMajor)
+                {
+                    // For column major matrix, we can load it by column (vector) directly.
+                    auto vectorType = builder.getVectorType(elementType, rowCount);
+                    auto vectorPtrType = builder.getPtrType(vectorType);
+                    auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam);
+                    List<IRInst*> args;
+                    for (UInt i = 0; i < (UInt)colCount; i++)
+                    {
+                        auto colPtr = builder.emitGetOffsetPtr(
+                            elementBasePtr,
+                            builder.getIntValue(builder.getIntType(), i));
+                        auto loadedCol = emitImmutableLoad(builder, colPtr);
+                        args.add(loadedCol);
+                    }
+                    // Rearrange loaded vectors in row-major order.
+                    List<IRInst*> elements;
+                    for (UInt i = 0; i < (UInt)rowCount; i++)
+                    {
+                        for (UInt j = 0; j < (UInt)colCount; j++)
+                        {
+                            elements.add(builder.emitElementExtract(
+                                elementType,
+                                args[j],
+                                builder.getIntValue(builder.getIntType(), i)));
+                        }
+                    }
+                    auto result = builder.emitMakeMatrix(
+                        type,
+                        (UInt)elements.getCount(),
+                        elements.getArrayView().getBuffer());
+                    builder.emitReturn(result);
+                    return func;
+                }
+                else
+                {
+                    // For row major matrix, we can load it by row (vector) directly.
+                    auto vectorType = builder.getVectorType(elementType, colCount);
+                    auto vectorPtrType = builder.getPtrType(vectorType);
+                    auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam);
+                    List<IRInst*> args;
+                    for (UInt i = 0; i < (UInt)rowCount; i++)
+                    {
+                        auto rowPtr = builder.emitGetOffsetPtr(
+                            elementBasePtr,
+                            builder.getIntValue(builder.getIntType(), i));
+                        auto loadedRow = emitImmutableLoad(builder, rowPtr);
+                        args.add(loadedRow);
+                    }
+                    auto result =
+                        builder.emitMakeMatrix(type, (UInt)args.getCount(), args.getBuffer());
+                    builder.emitReturn(result);
+                    return func;
+                }
+            }
+            break;
+        case kIROp_ArrayType:
+            {
+                // For array types, we need to generate a function to load its content by element.
+                auto arrayType = as<IRArrayType>(type);
+                auto elementType = arrayType->getElementType();
+                auto elementCount = getIntVal(arrayType->getElementCount());
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (UInt i = 0; i < (UInt)elementCount; i++)
+                {
+                    auto elementPtr = builder.emitElementAddress(
+                        builder.getPtrType(elementType),
+                        ptrParam,
+                        builder.getIntValue(builder.getIntType(), i));
+                    auto loadedElement = emitImmutableLoad(builder, elementPtr);
+                    if (!loadedElement)
+                    {
+                        func->removeAndDeallocate();
+                        return LoadMethod();
+                    }
+                    args.add(loadedElement);
+                }
+                auto result = builder.emitMakeArray(type, (UInt)args.getCount(), args.getBuffer());
+                builder.emitReturn(result);
+                return func;
+            }
+        case kIROp_StructType:
+            {
+                // For struct types, we need to generate a function to load its content by field.
+                auto structType = as<IRStructType>(type);
+                IRParam* ptrParam = nullptr;
+                auto func = createLoadFunc(builder, type, ptrParam);
+                List<IRInst*> args;
+                for (auto field : structType->getFields())
+                {
+                    auto fieldType = field->getFieldType();
+                    auto fieldPtr = builder.emitFieldAddress(
+                        builder.getPtrType(fieldType),
+                        ptrParam,
+                        field->getKey());
+                    auto loadedField = emitImmutableLoad(builder, fieldPtr);
+                    if (!loadedField)
+                    {
+                        func->removeAndDeallocate();
+                        return LoadMethod();
+                    }
+                    args.add(loadedField);
+                }
+                auto result = builder.emitMakeStruct(type, args);
+                builder.emitReturn(result);
+                return func;
+            }
+        }
+        return LoadMethod();
+    }
+
+    LoadMethod getOrCreateLoadFuncForType(IRType* type)
+    {
+        if (auto func = loadFuncs.tryGetValue(type))
+            return *func;
+        auto result = createLoadFuncForType(type);
+        loadFuncs[type] = result;
+        return result;
+    }
+
+    IRInst* emitImmutableLoad(IRBuilder& builder, IRInst* ptr)
+    {
+        IRType* valueType = tryGetPointedToType(&builder, ptr->getDataType());
+        if (!valueType)
+            return nullptr;
+        auto loadFunc = getOrCreateLoadFuncForType(valueType);
+        if (!loadFunc)
+            return nullptr;
+        return loadFunc.apply(builder, valueType, ptr);
+    }
+
+    void processInst(IRInst* inst)
+    {
+        // For every load instruction we see in the module, if the it is loading from
+        // an immutable location, try to lower it into a series of __ldg calls.
+        // We need to handle both ordinary loads and structured buffer loads.
+        //
+        switch (inst->getOp())
+        {
+        case kIROp_Load:
+            {
+                auto load = as<IRLoad>(inst);
+                if (isPointerToImmutableLocation(getRootAddr(load->getPtr())))
+                {
+                    IRBuilder builder(load);
+                    builder.setInsertBefore(load);
+                    if (auto newLoad = emitImmutableLoad(builder, load->getPtr()))
+                    {
+                        load->replaceUsesWith(newLoad);
+                        load->removeAndDeallocate();
+                    }
+                }
+            }
+            break;
+        case kIROp_StructuredBufferLoad:
+            {
+                IRBuilder builder(inst);
+                builder.setInsertBefore(inst);
+                auto ptr = builder.emitRWStructuredBufferGetElementPtr(
+                    inst->getOperand(0),
+                    inst->getOperand(1));
+                if (auto newLoad = emitImmutableLoad(builder, ptr))
+                {
+                    inst->replaceUsesWith(newLoad);
+                    inst->removeAndDeallocate();
+                }
+                else
+                {
+                    // For some reason this load cannot be lowered, remove the ptr we just created.
+                    ptr->removeAndDeallocate();
+                }
+            }
+            break;
+        case kIROp_CUDALDG:
+            {
+                // Does the load needs lowering? If so insert lowered loads.
+                IRBuilder builder(inst);
+                builder.setInsertBefore(inst);
+                auto ptr = inst->getOperand(0);
+                auto valueType = tryGetPointedToType(&builder, ptr->getDataType());
+                if (!valueType)
+                    break;
+                auto loadFunc = getOrCreateLoadFuncForType(valueType);
+                if (!loadFunc)
+                    break;
+                // If the type doesn't need further lowering, we don't need to do anything.
+                if (loadFunc.kind == LoadMethodKind::Opcode && loadFunc.op == kIROp_CUDALDG)
+                    break;
+                auto newLoad = loadFunc.apply(builder, valueType, ptr);
+                inst->replaceUsesWith(newLoad);
+                inst->removeAndDeallocate();
+            }
+            break;
+        }
+    }
+
+    void processModule()
+    {
+        processAllInsts([&](IRInst* inst) { processInst(inst); });
+    }
+
+    ImmutableBufferLoadLoweringContext(IRModule* inModule)
+        : InstPassBase(inModule)
+    {
+    }
+};
+
+void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module)
+{
+    ImmutableBufferLoadLoweringContext context(module);
+    context.targetProgram = targetProgram;
+    context.processModule();
+}
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-cuda-immutable-load.h b/source/slang/slang-ir-cuda-immutable-load.h
new file mode 100644
index 000000000..adaa58dc8
--- /dev/null
+++ b/source/slang/slang-ir-cuda-immutable-load.h
@@ -0,0 +1,17 @@
+#pragma once
+
+namespace Slang
+{
+
+/*
+This pass will lower all immutable buffer loads into CUDA `__ldg` intrinsic calls
+to make sure these loads are performed through the read-only data cache on the GPU
+for better performance.
+*/
+
+struct IRModule;
+class TargetProgram;
+
+void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module);
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index 1ecae6574..ef7eed485 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -1477,7 +1477,7 @@ ScalarizedVal createSimpleGLSLGlobalVarying(
 
             auto accessQualifier = AccessQualifier::ReadWrite;
             if (kind == LayoutResourceKind::VaryingInput)
-                accessQualifier = AccessQualifier::Read;
+                accessQualifier = AccessQualifier::Immutable;
             IRType* paramType =
                 builder->getPtrType(ptrOpCode, arrayType, accessQualifier, addrSpace);
 
@@ -3109,7 +3109,7 @@ IRInst* getOrCreatePerVertexInputArray(GLSLLegalizationContext* context, IRInst*
         tryGetPointedToType(&builder, inputVertexAttr->getDataType()),
         builder.getIntValue(builder.getIntType(), 3));
     arrayInst = builder.createGlobalParam(
-        builder.getPtrType(arrayType, AccessQualifier::Read, AddressSpace::Input));
+        builder.getPtrType(arrayType, AccessQualifier::Immutable, AddressSpace::Input));
     context->mapVertexInputToPerVertexArray[inputVertexAttr] = arrayInst;
     builder.addDecoration(arrayInst, kIROp_PerVertexDecoration);
 
diff --git a/source/slang/slang-ir-insts-stable-names.lua b/source/slang/slang-ir-insts-stable-names.lua
index 923e73ac4..ee999d382 100644
--- a/source/slang/slang-ir-insts-stable-names.lua
+++ b/source/slang/slang-ir-insts-stable-names.lua
@@ -681,4 +681,5 @@ return {
 	["Decoration.InParamProxyVar"] = 677,
 	["Attr.MemoryScope"] = 678,
 	["Undefined.LoadFromUninitializedMemory"] = 679,
+	["CUDA_LDG"] = 680,
 }
diff --git a/source/slang/slang-ir-insts.lua b/source/slang/slang-ir-insts.lua
index ac72e718a..1290507d0 100644
--- a/source/slang/slang-ir-insts.lua
+++ b/source/slang/slang-ir-insts.lua
@@ -675,6 +675,7 @@ local insts = {
 	{ var = {} },
 	{ load = { min_operands = 1 } },
 	{ store = { min_operands = 2 } },
+	{ CUDA_LDG = {min_operands = 1 } },
 	-- Atomic Operations
 	{
 		AtomicOperation = {
diff --git a/source/slang/slang-ir-redundancy-removal.cpp b/source/slang/slang-ir-redundancy-removal.cpp
index 0308b50c2..e2822ba09 100644
--- a/source/slang/slang-ir-redundancy-removal.cpp
+++ b/source/slang/slang-ir-redundancy-removal.cpp
@@ -128,26 +128,8 @@ bool removeRedundancy(IRModule* module, bool hoistLoopInvariantInsts)
 
 bool isAddressMutable(IRInst* inst)
 {
-    auto rootType = getRootAddr(inst)->getDataType();
-    switch (rootType->getOp())
-    {
-    case kIROp_ParameterBlockType:
-    case kIROp_ConstantBufferType:
-    case kIROp_BorrowInParamType:
-        return false; // immutable
-
-    // We should consider StructuredBuffer as mutable by default, since the resources may alias.
-    // There could be anotherRWStructuredBuffer pointing to the same memory location as the
-    // structured buffer.
-    case kIROp_StructuredBufferLoad:
-    case kIROp_GetStructuredBufferPtr:
-        return true; // mutable
-    }
-
-    // Similarly, IRPtrTypeBase should also be considered writable always,
-    // because there can be aliasing.
-
-    return true; // mutable
+    auto rootAddr = getRootAddr(inst);
+    return !isPointerToImmutableLocation(rootAddr);
 }
 
 /// Eliminate redundant temporary variable copies in load-store patterns.
diff --git a/source/slang/slang-ir-spirv-legalize.cpp b/source/slang/slang-ir-spirv-legalize.cpp
index 6bdd02d73..965d6dc9b 100644
--- a/source/slang/slang-ir-spirv-legalize.cpp
+++ b/source/slang/slang-ir-spirv-legalize.cpp
@@ -517,7 +517,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
                 // structured buffers in GLSL should be annotated as ReadOnly
                 if (as<IRHLSLStructuredBufferType>(structuredBufferType))
                 {
-                    access = AccessQualifier::Read;
+                    access = AccessQualifier::Immutable;
                     memoryFlags = MemoryQualifierSetModifier::Flags::kReadOnly;
                 }
                 if (as<IRHLSLRasterizerOrderedStructuredBufferType>(structuredBufferType))
@@ -2276,7 +2276,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
 
             AccessQualifier accessQualifier = AccessQualifier::ReadWrite;
             if (as<IRHLSLStructuredBufferType>(t))
-                accessQualifier = AccessQualifier::Read;
+                accessQualifier = AccessQualifier::Immutable;
 
             IRBuilder builder(t);
             builder.setInsertBefore(t);
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 11eb4dfa0..4fb4d61ae 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -2749,6 +2749,7 @@ bool isPointerToImmutableLocation(IRInst* loc)
     switch (loc->getOp())
     {
     case kIROp_GetStructuredBufferPtr:
+    case kIROp_RWStructuredBufferGetElementPtr:
     case kIROp_ImageSubscript:
         return isPointerToImmutableLocation(loc->getOperand(0));
     default:
@@ -2784,6 +2785,8 @@ bool isPointerToImmutableLocation(IRInst* loc)
         case AddressSpace::UniformConstant:
             return true;
         }
+        if (ptrType->getAccessQualifier() == AccessQualifier::Immutable)
+            return true;
     }
     return false;
 }
diff --git a/source/slang/slang-type-system-shared.h b/source/slang/slang-type-system-shared.h
index aa9b07d89..b68e44b32 100644
--- a/source/slang/slang-type-system-shared.h
+++ b/source/slang/slang-type-system-shared.h
@@ -129,10 +129,26 @@ enum class MemoryScope : int32_t
     ShaderCall = 6,
 };
 
+// Represents the access qualifier of a pointer type.
 enum class AccessQualifier : uint64_t
 {
     ReadWrite = 0,
+
+    // The data being pointed to by a pointer can only be read through the pointer.
+    // This is to be distinguished from `Immutable`, which means the data being pointed to
+    // won't be changed by any means. In contrast, data pointed to by a `Read` pointer
+    // may still be changed through another pointer that is not read-only.
+    // This means that a pointer with `Read` access is meaningful only to the front-end
+    // type system, and is not expected to provide any optimization opportunities to
+    // the back-end.
     Read = 1,
+
+    // The data being pointed to by a pointer is known to be immutable and won't
+    // be changed by any means during the execution of the program. It is UB if
+    // the data is changed during the program execution. This is a stronger
+    // qualifier than `Read`, and may allow the backend to perform more aggresive
+    // optimizations.
+    Immutable = 2,
 };
 
 } // namespace Slang
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
index 273e6dc58..376ef1f80 100644
--- a/tests/cuda/copy-elision-this-1.slang
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -10,7 +10,7 @@ struct Data {
 
     // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}})
     // CUDA-NEXT: {
-    // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load
+    // CUDA-NEXT: globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}]
     float fetch(int buffer, int index)
     {
         return input[buffer][index];
diff --git a/tests/cuda/dispatch-thread-id-extraction.slang b/tests/cuda/dispatch-thread-id-extraction.slang
index 02705ff24..b1fac2d00 100644
--- a/tests/cuda/dispatch-thread-id-extraction.slang
+++ b/tests/cuda/dispatch-thread-id-extraction.slang
@@ -12,7 +12,7 @@ void computeMain(uint tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: uint _S1 = (blockIdx * blockDim + threadIdx).x;
+// CHECK: uint {{.*}} = (blockIdx * blockDim + threadIdx).x;
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -20,7 +20,7 @@ void computeMain2(uint2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RW
 {
     dst[tid.x] = src[tid.y];
 }
-// CHECK: uint2  _S2 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
+// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -28,9 +28,7 @@ void computeMain3(int2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWS
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: uint2  _S3 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
-// CHECK: int2  _S4 = make_int2 ((int)_S3.x, (int)_S3.y);
-// CHECK: int _S5 = _S4.x;
+// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -38,7 +36,7 @@ void computeMain4(int tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: int _S6 = int((blockIdx * blockDim + threadIdx).x);
+// CHECK: int {{.*}} = int((blockIdx * blockDim + threadIdx).x);
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
@@ -46,4 +44,4 @@ void computeMain5(int tid: SV_GroupIndex, StructuredBuffer<uint> src, RWStructur
 {
     dst[tid.x] = src[tid.x];
 }
-// CHECK: int _S7 = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x);
+// CHECK: int {{.*}} = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x);
diff --git a/tests/optimization/buffer-load-defer-ptr.slang b/tests/optimization/buffer-load-defer-ptr.slang
new file mode 100644
index 000000000..cde006dcf
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-ptr.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+    float bigArray[1024];
+    float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+    Bottom bottom;
+    float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+    Middle middle;
+    float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+    Top top;
+}
+
+uniform ImmutablePtr<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+    outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
index f7f9b1888..22f93bcd1 100644
--- a/tests/optimization/defer-structured-buffer-load.slang
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -28,7 +28,7 @@ RWStructuredBuffer<float> outputBuffer;
 // SPV: OpStore %{{.*}} %[[VALUE]]
 
 // CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
-// CUDA:  return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+// CUDA:  __ldg(&(&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}});
 
 [shader("compute")]
 [numthreads(1, 1, 1)]
diff --git a/tests/optimization/immutable-buffer-load.slang b/tests/optimization/immutable-buffer-load.slang
new file mode 100644
index 000000000..251a1ce9c
--- /dev/null
+++ b/tests/optimization/immutable-buffer-load.slang
@@ -0,0 +1,21 @@
+// Test that we can use ImmutablePtr<T> to result in more optimized buffer loads
+// in SPIR-V and CUDA.
+
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry computeMain -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+uniform ImmutablePtr<float4> data;
+
+uniform float4* result;
+
+float4 work(ImmutablePtr<float4> ptr)
+{
+    return *ptr;
+}
+
+[numthreads(1,1,1)]
+void computeMain()
+{
+    // SPV: Restrict
+    // PTX: ld.global.nc.v4.f32
+    *result = work(data) + float4(1,2,3,4);
+}
\ No newline at end of file
-- 
cgit v1.2.3