diff options
22 files changed, 568 insertions, 50 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 9508ea796..69d01920c 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -250,6 +250,23 @@ struct __align__(4) bool4 } }; +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool __ldg(const bool* ptr) +{ + return (bool)(__ldg((const char*)ptr)); +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 __ldg(const bool2* ptr) +{ + auto val = __ldg((const char2*)ptr); + return {val.x != 0, val.y != 0}; +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 __ldg(const bool4* ptr) +{ + auto val = __ldg((const char4*)ptr); + return {val.x != 0, val.y != 0, val.z != 0, val.w != 0}; +} + #if SLANG_CUDA_RTC typedef signed char int8_t; diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index ddee52b6b..2bf832eef 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -1300,11 +1300,29 @@ enum MemoryScope : int32_t } /// @category misc_types +/// Describes the access permissions for a pointer type. __magic_enum(AccessQualifier) enum Access : uint64_t { + /// The data maybe read and modified through the pointer. This is the default access qualifier + /// for a pointer type. ReadWrite = $((uint64_t)AccessQualifier::ReadWrite), + + /// The data being pointed to by a pointer can only be read through the pointer. + /// This is to be distinguished from `Immutable`, which means the data being pointed to + /// won't be changed by any means. In contrast, data pointed to by a `Read` pointer + /// may still be changed through another pointer that is not read-only. + /// This means that a pointer with `Read` access is meaningful only to the front-end + /// type system, and is not expected to provide any optimization opportunities to + /// the back-end. Read = $((uint64_t)AccessQualifier::Read), + + /// The data being pointed to by a pointer is known to be immutable and won't + /// be changed by any means during the execution of the program. It is UB if + /// the data is changed during the program execution. This is a stronger + /// qualifier than `Read`, and may allow the backend to perform more aggressive + /// optimizations. + Immutable = $((uint64_t)AccessQualifier::Immutable), } //@public: @@ -1314,8 +1332,8 @@ enum Access : uint64_t __magic_type(PtrType) struct Ptr< T, - Access access = Access::ReadWrite, - AddressSpace addrSpace = AddressSpace::Device> + Access access = Access.ReadWrite, + AddressSpace addrSpace = AddressSpace.Device> { // A user is allowed to explicitly cast between any pointer type of // the same address space @@ -1373,6 +1391,12 @@ struct Ptr< } }; +/// Represents a pointer to immutable data. Immutable data is known at compile time to remain unchanged during the entire +/// execution of the program. This knowledge allows the compiler to perform more aggressive optimizations around the memory +/// accesses through such pointers. If the data is changed during the program execution, the behavior is undefined and loaded +/// data through such pointers may be invalid. +typealias ImmutablePtr<T, AddressSpace addrSpace = AddressSpace.Device> = Ptr<T, Access.Immutable, addrSpace>; + //@hidden: __intrinsic_op($(kIROp_AlignedAttr)) internal int __align_attr(int alignment); diff --git a/source/slang/slang-ast-support-types.cpp b/source/slang/slang-ast-support-types.cpp index 4054d8dc4..d021cb9cb 100644 --- a/source/slang/slang-ast-support-types.cpp +++ b/source/slang/slang-ast-support-types.cpp @@ -23,6 +23,7 @@ QualType::QualType(Type* type) break; case AccessQualifier::Read: + case AccessQualifier::Immutable: isLeftValue = false; break; diff --git a/source/slang/slang-ast-type.cpp b/source/slang/slang-ast-type.cpp index a3b4b91f6..e3f75cb2f 100644 --- a/source/slang/slang-ast-type.cpp +++ b/source/slang/slang-ast-type.cpp @@ -481,23 +481,23 @@ void maybePrintAddrSpaceOperand(StringBuilder& out, AddressSpace addrSpace) switch (addrSpace) { case AddressSpace::Generic: - out << toSlice(", AddressSpace::Generic"); + out << toSlice(", AddressSpace.Generic"); break; case AddressSpace::UserPointer: // We expose UserPointer as Device to users - out << toSlice(", AddressSpace::Device"); + out << toSlice(", AddressSpace.Device"); break; case AddressSpace::GroupShared: - out << toSlice(", AddressSpace::GroupShared"); + out << toSlice(", AddressSpace.GroupShared"); break; case AddressSpace::Global: - out << toSlice(", AddressSpace::Global"); + out << toSlice(", AddressSpace.Global"); break; case AddressSpace::ThreadLocal: - out << toSlice(", AddressSpace::ThreadLocal"); + out << toSlice(", AddressSpace.ThreadLocal"); break; case AddressSpace::Uniform: - out << toSlice(", AddressSpace::Uniform"); + out << toSlice(", AddressSpace.Uniform"); break; default: break; @@ -509,10 +509,13 @@ void maybePrintAccessQualifierOperand(StringBuilder& out, AccessQualifier access switch (accessQualifier) { case AccessQualifier::ReadWrite: - out << toSlice(", Access::ReadWrite"); + out << toSlice(", Access.ReadWrite"); break; case AccessQualifier::Read: - out << toSlice(", Access::Read"); + out << toSlice(", Access.Read"); + break; + case AccessQualifier::Immutable: + out << toSlice(", Access.Immutable"); break; default: break; diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index a5b90740d..885414b92 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -921,6 +921,13 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu m_writer->emit(")"); return true; } + case kIROp_CUDALDG: + { + m_writer->emit("__ldg("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(")"); + } + return true; default: break; } diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp index d5697117a..1b59d3070 100644 --- a/source/slang/slang-emit-spirv.cpp +++ b/source/slang/slang-emit-spirv.cpp @@ -6481,12 +6481,19 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex SpvStorageClassPhysicalStorageBuffer) { // If inst has a pointer type with PhysicalStorageBuffer address space, - // emit AliasedPointer decoration. - emitOpDecorate( - getSection(SpvLogicalSectionID::Annotations), - nullptr, - varInst, - (isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased)); + // emit AliasedPointer or RestrictPointer decoration. + SpvDecoration decor; + if (ptrType->getAccessQualifier() == AccessQualifier::Immutable) + { + // We can always safely use RestrictPointer for immutable pointers. + // This will allow better optimization. + decor = isVar ? SpvDecorationRestrictPointer : SpvDecorationRestrict; + } + else + { + decor = isVar ? SpvDecorationAliasedPointer : SpvDecorationAliased; + } + emitOpDecorate(getSection(SpvLogicalSectionID::Annotations), nullptr, varInst, decor); } else { diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 1bd1f8b5c..d5e89b1fe 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -32,6 +32,7 @@ #include "slang-ir-collect-global-uniforms.h" #include "slang-ir-com-interface.h" #include "slang-ir-composite-reg-to-mem.h" +#include "slang-ir-cuda-immutable-load.h" #include "slang-ir-dce.h" #include "slang-ir-defer-buffer-load.h" #include "slang-ir-defunctionalization.h" @@ -1886,6 +1887,13 @@ Result linkAndOptimizeIR( specializeAddressSpaceForWGSL(irModule); } + // If we are generating code for CUDA, we should translate all immutable buffer loads to + // using `__ldg` intrinsic for improved performance. + if (isCUDATarget(targetRequest)) + { + lowerImmutableBufferLoadForCUDA(targetProgram, irModule); + } + performForceInlining(irModule); bool emitSpirvDirectly = targetProgram->shouldEmitSPIRVDirectly(); diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp index 28023e9e2..0b23c14b7 100644 --- a/source/slang/slang-intrinsic-expand.cpp +++ b/source/slang/slang-intrinsic-expand.cpp @@ -174,7 +174,6 @@ static bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType) if (numElems != imageFormatInfo.channelCount) { - SLANG_ASSERT(!"Format doesn't match channel count"); return false; } diff --git a/source/slang/slang-ir-cuda-immutable-load.cpp b/source/slang/slang-ir-cuda-immutable-load.cpp new file mode 100644 index 000000000..713d5cb5a --- /dev/null +++ b/source/slang/slang-ir-cuda-immutable-load.cpp @@ -0,0 +1,375 @@ +#include "slang-ir-cuda-immutable-load.h" + +#include "slang-ir-inst-pass-base.h" +#include "slang-ir-insts.h" +#include "slang-ir-layout.h" +#include "slang-ir-util.h" + +namespace Slang +{ + +enum LoadMethodKind +{ + Func, + Opcode +}; + +struct LoadMethod +{ + LoadMethodKind kind = LoadMethodKind::Func; + union + { + IRFunc* func; + IROp op; + }; + LoadMethod() { func = nullptr; } + operator bool() { return kind == LoadMethodKind::Func ? func != nullptr : op != kIROp_Nop; } + LoadMethod(IRFunc* f) + : kind(LoadMethodKind::Func), func(f) + { + } + LoadMethod(IROp irop) + : kind(LoadMethodKind::Opcode), op(irop) + { + } + LoadMethod& operator=(IRFunc* f) + { + kind = LoadMethodKind::Func; + this->func = f; + return *this; + } + LoadMethod& operator=(IROp irop) + { + kind = LoadMethodKind::Opcode; + this->op = irop; + return *this; + } + IRInst* apply(IRBuilder& builder, IRType* resultType, IRInst* operandAddr) + { + if (kind == LoadMethodKind::Func) + { + return builder.emitCallInst(resultType, func, 1, &operandAddr); + } + else + { + return builder.emitIntrinsicInst(resultType, op, 1, &operandAddr); + } + } +}; + +struct ImmutableBufferLoadLoweringContext : InstPassBase +{ + Dictionary<IRType*, LoadMethod> loadFuncs; + TargetProgram* targetProgram; + + IRFunc* createLoadFunc(IRBuilder& builder, IRType* valueType, IRParam*& outParam) + { + auto func = builder.createFunc(); + builder.addNameHintDecoration(func, toSlice("slang_ldg")); + builder.setInsertInto(func); + auto block = builder.emitBlock(); + auto ptrType = builder.getPtrType(valueType); + builder.setInsertInto(block); + outParam = builder.emitParam(ptrType); + builder.addNameHintDecoration(outParam, toSlice("ptr")); + func->setFullType(builder.getFuncType(ptrType, valueType)); + return func; + } + + LoadMethod createLoadFuncForType(IRType* type) + { + IRBuilder builder(type); + builder.setInsertAfter(type); + switch (type->getOp()) + { + case kIROp_FloatType: + case kIROp_HalfType: + case kIROp_DoubleType: + case kIROp_Int8Type: + case kIROp_Int16Type: + case kIROp_IntType: + case kIROp_Int64Type: + case kIROp_IntPtrType: + case kIROp_UInt8Type: + case kIROp_UInt16Type: + case kIROp_UIntType: + case kIROp_UInt64Type: + case kIROp_UIntPtrType: + case kIROp_BoolType: + case kIROp_CharType: + return kIROp_CUDALDG; + case kIROp_VectorType: + { + // For vector types that has a direct mapping to CUDA __ldg, + // use the instruction directly. + auto vectorType = as<IRVectorType>(type); + auto elementType = vectorType->getElementType(); + auto elementCount = getIntVal(vectorType->getElementCount()); + IRSizeAndAlignment elementSize; + getNaturalSizeAndAlignment( + targetProgram->getOptionSet(), + elementType, + &elementSize); + if (elementCount <= 2) + return kIROp_CUDALDG; + else if (elementCount == 4) + { + switch (elementType->getOp()) + { + case kIROp_FloatType: + case kIROp_UIntType: + case kIROp_IntType: + case kIROp_Int8Type: + case kIROp_UInt8Type: + case kIROp_Int16Type: + case kIROp_UInt16Type: + return kIROp_CUDALDG; + } + } + // For other vector types, we need to generate a function to load its content. + IRParam* ptrParam = nullptr; + auto func = createLoadFunc(builder, type, ptrParam); + List<IRInst*> args; + for (UInt i = 0; i < (UInt)elementCount; i++) + { + auto elementPtr = builder.emitElementAddress( + builder.getPtrType(elementType), + ptrParam, + builder.getIntValue(builder.getIntType(), i)); + auto loadedElement = + builder.emitIntrinsicInst(elementType, kIROp_CUDALDG, 1, &elementPtr); + args.add(loadedElement); + } + auto result = builder.emitMakeVector(type, args); + builder.emitReturn(result); + return func; + } + break; + case kIROp_MatrixType: + { + // For matrix types, we should generate a function to load its content by row or + // column, depending on the layout. + auto matrixType = as<IRMatrixType>(type); + auto elementType = matrixType->getElementType(); + auto rowCount = getIntVal(matrixType->getRowCount()); + auto colCount = getIntVal(matrixType->getColumnCount()); + auto layout = (MatrixLayoutMode)getIntVal(matrixType->getLayout()); + IRParam* ptrParam = nullptr; + auto func = createLoadFunc(builder, type, ptrParam); + if (layout == kMatrixLayoutMode_ColumnMajor) + { + // For column major matrix, we can load it by column (vector) directly. + auto vectorType = builder.getVectorType(elementType, rowCount); + auto vectorPtrType = builder.getPtrType(vectorType); + auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam); + List<IRInst*> args; + for (UInt i = 0; i < (UInt)colCount; i++) + { + auto colPtr = builder.emitGetOffsetPtr( + elementBasePtr, + builder.getIntValue(builder.getIntType(), i)); + auto loadedCol = emitImmutableLoad(builder, colPtr); + args.add(loadedCol); + } + // Rearrange loaded vectors in row-major order. + List<IRInst*> elements; + for (UInt i = 0; i < (UInt)rowCount; i++) + { + for (UInt j = 0; j < (UInt)colCount; j++) + { + elements.add(builder.emitElementExtract( + elementType, + args[j], + builder.getIntValue(builder.getIntType(), i))); + } + } + auto result = builder.emitMakeMatrix( + type, + (UInt)elements.getCount(), + elements.getArrayView().getBuffer()); + builder.emitReturn(result); + return func; + } + else + { + // For row major matrix, we can load it by row (vector) directly. + auto vectorType = builder.getVectorType(elementType, colCount); + auto vectorPtrType = builder.getPtrType(vectorType); + auto elementBasePtr = builder.emitBitCast(vectorPtrType, ptrParam); + List<IRInst*> args; + for (UInt i = 0; i < (UInt)rowCount; i++) + { + auto rowPtr = builder.emitGetOffsetPtr( + elementBasePtr, + builder.getIntValue(builder.getIntType(), i)); + auto loadedRow = emitImmutableLoad(builder, rowPtr); + args.add(loadedRow); + } + auto result = + builder.emitMakeMatrix(type, (UInt)args.getCount(), args.getBuffer()); + builder.emitReturn(result); + return func; + } + } + break; + case kIROp_ArrayType: + { + // For array types, we need to generate a function to load its content by element. + auto arrayType = as<IRArrayType>(type); + auto elementType = arrayType->getElementType(); + auto elementCount = getIntVal(arrayType->getElementCount()); + IRParam* ptrParam = nullptr; + auto func = createLoadFunc(builder, type, ptrParam); + List<IRInst*> args; + for (UInt i = 0; i < (UInt)elementCount; i++) + { + auto elementPtr = builder.emitElementAddress( + builder.getPtrType(elementType), + ptrParam, + builder.getIntValue(builder.getIntType(), i)); + auto loadedElement = emitImmutableLoad(builder, elementPtr); + if (!loadedElement) + { + func->removeAndDeallocate(); + return LoadMethod(); + } + args.add(loadedElement); + } + auto result = builder.emitMakeArray(type, (UInt)args.getCount(), args.getBuffer()); + builder.emitReturn(result); + return func; + } + case kIROp_StructType: + { + // For struct types, we need to generate a function to load its content by field. + auto structType = as<IRStructType>(type); + IRParam* ptrParam = nullptr; + auto func = createLoadFunc(builder, type, ptrParam); + List<IRInst*> args; + for (auto field : structType->getFields()) + { + auto fieldType = field->getFieldType(); + auto fieldPtr = builder.emitFieldAddress( + builder.getPtrType(fieldType), + ptrParam, + field->getKey()); + auto loadedField = emitImmutableLoad(builder, fieldPtr); + if (!loadedField) + { + func->removeAndDeallocate(); + return LoadMethod(); + } + args.add(loadedField); + } + auto result = builder.emitMakeStruct(type, args); + builder.emitReturn(result); + return func; + } + } + return LoadMethod(); + } + + LoadMethod getOrCreateLoadFuncForType(IRType* type) + { + if (auto func = loadFuncs.tryGetValue(type)) + return *func; + auto result = createLoadFuncForType(type); + loadFuncs[type] = result; + return result; + } + + IRInst* emitImmutableLoad(IRBuilder& builder, IRInst* ptr) + { + IRType* valueType = tryGetPointedToType(&builder, ptr->getDataType()); + if (!valueType) + return nullptr; + auto loadFunc = getOrCreateLoadFuncForType(valueType); + if (!loadFunc) + return nullptr; + return loadFunc.apply(builder, valueType, ptr); + } + + void processInst(IRInst* inst) + { + // For every load instruction we see in the module, if the it is loading from + // an immutable location, try to lower it into a series of __ldg calls. + // We need to handle both ordinary loads and structured buffer loads. + // + switch (inst->getOp()) + { + case kIROp_Load: + { + auto load = as<IRLoad>(inst); + if (isPointerToImmutableLocation(getRootAddr(load->getPtr()))) + { + IRBuilder builder(load); + builder.setInsertBefore(load); + if (auto newLoad = emitImmutableLoad(builder, load->getPtr())) + { + load->replaceUsesWith(newLoad); + load->removeAndDeallocate(); + } + } + } + break; + case kIROp_StructuredBufferLoad: + { + IRBuilder builder(inst); + builder.setInsertBefore(inst); + auto ptr = builder.emitRWStructuredBufferGetElementPtr( + inst->getOperand(0), + inst->getOperand(1)); + if (auto newLoad = emitImmutableLoad(builder, ptr)) + { + inst->replaceUsesWith(newLoad); + inst->removeAndDeallocate(); + } + else + { + // For some reason this load cannot be lowered, remove the ptr we just created. + ptr->removeAndDeallocate(); + } + } + break; + case kIROp_CUDALDG: + { + // Does the load needs lowering? If so insert lowered loads. + IRBuilder builder(inst); + builder.setInsertBefore(inst); + auto ptr = inst->getOperand(0); + auto valueType = tryGetPointedToType(&builder, ptr->getDataType()); + if (!valueType) + break; + auto loadFunc = getOrCreateLoadFuncForType(valueType); + if (!loadFunc) + break; + // If the type doesn't need further lowering, we don't need to do anything. + if (loadFunc.kind == LoadMethodKind::Opcode && loadFunc.op == kIROp_CUDALDG) + break; + auto newLoad = loadFunc.apply(builder, valueType, ptr); + inst->replaceUsesWith(newLoad); + inst->removeAndDeallocate(); + } + break; + } + } + + void processModule() + { + processAllInsts([&](IRInst* inst) { processInst(inst); }); + } + + ImmutableBufferLoadLoweringContext(IRModule* inModule) + : InstPassBase(inModule) + { + } +}; + +void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module) +{ + ImmutableBufferLoadLoweringContext context(module); + context.targetProgram = targetProgram; + context.processModule(); +} + +} // namespace Slang diff --git a/source/slang/slang-ir-cuda-immutable-load.h b/source/slang/slang-ir-cuda-immutable-load.h new file mode 100644 index 000000000..adaa58dc8 --- /dev/null +++ b/source/slang/slang-ir-cuda-immutable-load.h @@ -0,0 +1,17 @@ +#pragma once + +namespace Slang +{ + +/* +This pass will lower all immutable buffer loads into CUDA `__ldg` intrinsic calls +to make sure these loads are performed through the read-only data cache on the GPU +for better performance. +*/ + +struct IRModule; +class TargetProgram; + +void lowerImmutableBufferLoadForCUDA(TargetProgram* targetProgram, IRModule* module); + +} // namespace Slang diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp index 1ecae6574..ef7eed485 100644 --- a/source/slang/slang-ir-glsl-legalize.cpp +++ b/source/slang/slang-ir-glsl-legalize.cpp @@ -1477,7 +1477,7 @@ ScalarizedVal createSimpleGLSLGlobalVarying( auto accessQualifier = AccessQualifier::ReadWrite; if (kind == LayoutResourceKind::VaryingInput) - accessQualifier = AccessQualifier::Read; + accessQualifier = AccessQualifier::Immutable; IRType* paramType = builder->getPtrType(ptrOpCode, arrayType, accessQualifier, addrSpace); @@ -3109,7 +3109,7 @@ IRInst* getOrCreatePerVertexInputArray(GLSLLegalizationContext* context, IRInst* tryGetPointedToType(&builder, inputVertexAttr->getDataType()), builder.getIntValue(builder.getIntType(), 3)); arrayInst = builder.createGlobalParam( - builder.getPtrType(arrayType, AccessQualifier::Read, AddressSpace::Input)); + builder.getPtrType(arrayType, AccessQualifier::Immutable, AddressSpace::Input)); context->mapVertexInputToPerVertexArray[inputVertexAttr] = arrayInst; builder.addDecoration(arrayInst, kIROp_PerVertexDecoration); diff --git a/source/slang/slang-ir-insts-stable-names.lua b/source/slang/slang-ir-insts-stable-names.lua index 923e73ac4..ee999d382 100644 --- a/source/slang/slang-ir-insts-stable-names.lua +++ b/source/slang/slang-ir-insts-stable-names.lua @@ -681,4 +681,5 @@ return { ["Decoration.InParamProxyVar"] = 677, ["Attr.MemoryScope"] = 678, ["Undefined.LoadFromUninitializedMemory"] = 679, + ["CUDA_LDG"] = 680, } diff --git a/source/slang/slang-ir-insts.lua b/source/slang/slang-ir-insts.lua index ac72e718a..1290507d0 100644 --- a/source/slang/slang-ir-insts.lua +++ b/source/slang/slang-ir-insts.lua @@ -675,6 +675,7 @@ local insts = { { var = {} }, { load = { min_operands = 1 } }, { store = { min_operands = 2 } }, + { CUDA_LDG = {min_operands = 1 } }, -- Atomic Operations { AtomicOperation = { diff --git a/source/slang/slang-ir-redundancy-removal.cpp b/source/slang/slang-ir-redundancy-removal.cpp index 0308b50c2..e2822ba09 100644 --- a/source/slang/slang-ir-redundancy-removal.cpp +++ b/source/slang/slang-ir-redundancy-removal.cpp @@ -128,26 +128,8 @@ bool removeRedundancy(IRModule* module, bool hoistLoopInvariantInsts) bool isAddressMutable(IRInst* inst) { - auto rootType = getRootAddr(inst)->getDataType(); - switch (rootType->getOp()) - { - case kIROp_ParameterBlockType: - case kIROp_ConstantBufferType: - case kIROp_BorrowInParamType: - return false; // immutable - - // We should consider StructuredBuffer as mutable by default, since the resources may alias. - // There could be anotherRWStructuredBuffer pointing to the same memory location as the - // structured buffer. - case kIROp_StructuredBufferLoad: - case kIROp_GetStructuredBufferPtr: - return true; // mutable - } - - // Similarly, IRPtrTypeBase should also be considered writable always, - // because there can be aliasing. - - return true; // mutable + auto rootAddr = getRootAddr(inst); + return !isPointerToImmutableLocation(rootAddr); } /// Eliminate redundant temporary variable copies in load-store patterns. diff --git a/source/slang/slang-ir-spirv-legalize.cpp b/source/slang/slang-ir-spirv-legalize.cpp index 6bdd02d73..965d6dc9b 100644 --- a/source/slang/slang-ir-spirv-legalize.cpp +++ b/source/slang/slang-ir-spirv-legalize.cpp @@ -517,7 +517,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase // structured buffers in GLSL should be annotated as ReadOnly if (as<IRHLSLStructuredBufferType>(structuredBufferType)) { - access = AccessQualifier::Read; + access = AccessQualifier::Immutable; memoryFlags = MemoryQualifierSetModifier::Flags::kReadOnly; } if (as<IRHLSLRasterizerOrderedStructuredBufferType>(structuredBufferType)) @@ -2276,7 +2276,7 @@ struct SPIRVLegalizationContext : public SourceEmitterBase AccessQualifier accessQualifier = AccessQualifier::ReadWrite; if (as<IRHLSLStructuredBufferType>(t)) - accessQualifier = AccessQualifier::Read; + accessQualifier = AccessQualifier::Immutable; IRBuilder builder(t); builder.setInsertBefore(t); diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp index 11eb4dfa0..4fb4d61ae 100644 --- a/source/slang/slang-ir-util.cpp +++ b/source/slang/slang-ir-util.cpp @@ -2749,6 +2749,7 @@ bool isPointerToImmutableLocation(IRInst* loc) switch (loc->getOp()) { case kIROp_GetStructuredBufferPtr: + case kIROp_RWStructuredBufferGetElementPtr: case kIROp_ImageSubscript: return isPointerToImmutableLocation(loc->getOperand(0)); default: @@ -2784,6 +2785,8 @@ bool isPointerToImmutableLocation(IRInst* loc) case AddressSpace::UniformConstant: return true; } + if (ptrType->getAccessQualifier() == AccessQualifier::Immutable) + return true; } return false; } diff --git a/source/slang/slang-type-system-shared.h b/source/slang/slang-type-system-shared.h index aa9b07d89..b68e44b32 100644 --- a/source/slang/slang-type-system-shared.h +++ b/source/slang/slang-type-system-shared.h @@ -129,10 +129,26 @@ enum class MemoryScope : int32_t ShaderCall = 6, }; +// Represents the access qualifier of a pointer type. enum class AccessQualifier : uint64_t { ReadWrite = 0, + + // The data being pointed to by a pointer can only be read through the pointer. + // This is to be distinguished from `Immutable`, which means the data being pointed to + // won't be changed by any means. In contrast, data pointed to by a `Read` pointer + // may still be changed through another pointer that is not read-only. + // This means that a pointer with `Read` access is meaningful only to the front-end + // type system, and is not expected to provide any optimization opportunities to + // the back-end. Read = 1, + + // The data being pointed to by a pointer is known to be immutable and won't + // be changed by any means during the execution of the program. It is UB if + // the data is changed during the program execution. This is a stronger + // qualifier than `Read`, and may allow the backend to perform more aggresive + // optimizations. + Immutable = 2, }; } // namespace Slang diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang index 273e6dc58..376ef1f80 100644 --- a/tests/cuda/copy-elision-this-1.slang +++ b/tests/cuda/copy-elision-this-1.slang @@ -10,7 +10,7 @@ struct Data { // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}}) // CUDA-NEXT: { - // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load + // CUDA-NEXT: globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}] float fetch(int buffer, int index) { return input[buffer][index]; diff --git a/tests/cuda/dispatch-thread-id-extraction.slang b/tests/cuda/dispatch-thread-id-extraction.slang index 02705ff24..b1fac2d00 100644 --- a/tests/cuda/dispatch-thread-id-extraction.slang +++ b/tests/cuda/dispatch-thread-id-extraction.slang @@ -12,7 +12,7 @@ void computeMain(uint tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt { dst[tid.x] = src[tid.x]; } -// CHECK: uint _S1 = (blockIdx * blockDim + threadIdx).x; +// CHECK: uint {{.*}} = (blockIdx * blockDim + threadIdx).x; [shader("compute")] [numthreads(1, 1, 1)] @@ -20,7 +20,7 @@ void computeMain2(uint2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RW { dst[tid.x] = src[tid.y]; } -// CHECK: uint2 _S2 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y}; +// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y}; [shader("compute")] [numthreads(1, 1, 1)] @@ -28,9 +28,7 @@ void computeMain3(int2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWS { dst[tid.x] = src[tid.x]; } -// CHECK: uint2 _S3 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y}; -// CHECK: int2 _S4 = make_int2 ((int)_S3.x, (int)_S3.y); -// CHECK: int _S5 = _S4.x; +// CHECK: uint2 {{.*}} = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y}; [shader("compute")] [numthreads(1, 1, 1)] @@ -38,7 +36,7 @@ void computeMain4(int tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWSt { dst[tid.x] = src[tid.x]; } -// CHECK: int _S6 = int((blockIdx * blockDim + threadIdx).x); +// CHECK: int {{.*}} = int((blockIdx * blockDim + threadIdx).x); [shader("compute")] [numthreads(1, 1, 1)] @@ -46,4 +44,4 @@ void computeMain5(int tid: SV_GroupIndex, StructuredBuffer<uint> src, RWStructur { dst[tid.x] = src[tid.x]; } -// CHECK: int _S7 = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x); +// CHECK: int {{.*}} = int((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x); diff --git a/tests/optimization/buffer-load-defer-ptr.slang b/tests/optimization/buffer-load-defer-ptr.slang new file mode 100644 index 000000000..cde006dcf --- /dev/null +++ b/tests/optimization/buffer-load-defer-ptr.slang @@ -0,0 +1,38 @@ +//TEST:SIMPLE(filecheck=SPV): -target spirv + +struct Bottom +{ + float bigArray[1024]; + float bottomGetValue(int index) { return bigArray[index]; } +} + +struct Middle +{ + Bottom bottom; + float middleGetValue(int index) { return bottom.bottomGetValue(index); } +} + +struct Top +{ + Middle middle; + float topGetValue(int index) { return middle.middleGetValue(index); } +} + +struct Root +{ + Top top; +} + +uniform ImmutablePtr<Root> cb; + +RWStructuredBuffer<float> outputBuffer; + +// SPV: OpEntryPoint +// SPV-NOT: OpCompositeConstruct + +[shader("compute")] +[numthreads(1, 1, 1)] +void compute_main(uint3 tid: SV_DispatchThreadID) +{ + outputBuffer[0] = cb.top.topGetValue(0); +} diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang index f7f9b1888..22f93bcd1 100644 --- a/tests/optimization/defer-structured-buffer-load.slang +++ b/tests/optimization/defer-structured-buffer-load.slang @@ -28,7 +28,7 @@ RWStructuredBuffer<float> outputBuffer; // SPV: OpStore %{{.*}} %[[VALUE]] // CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]]) -// CUDA: return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}}; +// CUDA: __ldg(&(&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}}); [shader("compute")] [numthreads(1, 1, 1)] diff --git a/tests/optimization/immutable-buffer-load.slang b/tests/optimization/immutable-buffer-load.slang new file mode 100644 index 000000000..251a1ce9c --- /dev/null +++ b/tests/optimization/immutable-buffer-load.slang @@ -0,0 +1,21 @@ +// Test that we can use ImmutablePtr<T> to result in more optimized buffer loads +// in SPIR-V and CUDA. + +//TEST:SIMPLE(filecheck=PTX): -target ptx -entry computeMain -stage compute +//TEST:SIMPLE(filecheck=SPV): -target spirv -O0 +uniform ImmutablePtr<float4> data; + +uniform float4* result; + +float4 work(ImmutablePtr<float4> ptr) +{ + return *ptr; +} + +[numthreads(1,1,1)] +void computeMain() +{ + // SPV: Restrict + // PTX: ld.global.nc.v4.f32 + *result = work(data) + float4(1,2,3,4); +}
\ No newline at end of file |
