summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/user-guide/a3-02-reference-capability-atoms.md6
-rw-r--r--prelude/slang-cpp-types.h4
-rw-r--r--prelude/slang-cuda-prelude.h4
-rw-r--r--source/slang/slang-emit-wgsl.cpp5
-rw-r--r--source/slang/slang-emit.cpp8
-rw-r--r--source/slang/slang-ir-defer-buffer-load.cpp326
-rw-r--r--source/slang/slang-ir-defer-buffer-load.h22
-rw-r--r--source/slang/slang-ir-defunctionalization.cpp2
-rw-r--r--source/slang/slang-ir-glsl-legalize.cpp10
-rw-r--r--source/slang/slang-ir-metal-legalize.cpp2
-rw-r--r--source/slang/slang-ir-specialize-address-space.cpp43
-rw-r--r--source/slang/slang-ir-specialize-arrays.cpp32
-rw-r--r--source/slang/slang-ir-specialize-buffer-load-arg.cpp124
-rw-r--r--source/slang/slang-ir-specialize-function-call.cpp205
-rw-r--r--source/slang/slang-ir-specialize-function-call.h4
-rw-r--r--source/slang/slang-ir-specialize-resources.cpp3
-rw-r--r--source/slang/slang-ir-util.cpp246
-rw-r--r--source/slang/slang-ir-util.h8
-rw-r--r--tests/cuda/copy-elision-this-1.slang10
-rw-r--r--tests/glsl/global-uniform-with-varyings.slang6
-rw-r--r--tests/metal/out-param.slang19
-rw-r--r--tests/optimization/buffer-load-defer-aliasing-1.slang45
-rw-r--r--tests/optimization/buffer-load-defer-aliasing.slang38
-rw-r--r--tests/optimization/buffer-load-defer-bindless.slang58
-rw-r--r--tests/optimization/buffer-load-defer-user-pointer.slang63
-rw-r--r--tests/optimization/buffer-load-defer.slang38
-rw-r--r--tests/optimization/buffer-load-specialize-1.slang35
-rw-r--r--tests/optimization/buffer-store-defer.slang51
-rw-r--r--tests/optimization/defer-structured-buffer-load.slang38
-rw-r--r--tests/vkray/raygen-trace-ray-param-non-struct.slang12
-rw-r--r--tests/wgsl/switch-case.slang9
31 files changed, 1153 insertions, 323 deletions
diff --git a/docs/user-guide/a3-02-reference-capability-atoms.md b/docs/user-guide/a3-02-reference-capability-atoms.md
index 7f759ca24..7f27284d9 100644
--- a/docs/user-guide/a3-02-reference-capability-atoms.md
+++ b/docs/user-guide/a3-02-reference-capability-atoms.md
@@ -626,6 +626,9 @@ Extensions
`SPV_KHR_vulkan_memory_model`
> Represents the SPIR-V extension for SPV_KHR_vulkan_memory_model.
+`SPV_NV_bindless_texture`
+> Represents the SPIR-V extension for SPV_NV_bindless_texture.
+
`SPV_NV_cluster_acceleration_structure`
> Represents the SPIR-V extension for cluster acceleration structure.
@@ -675,6 +678,9 @@ Extensions
`spvAtomicFloat64MinMaxEXT`
> Represents the SPIR-V capability for atomic float 64 min/max operations.
+`spvBindlessTextureNV`
+> Represents the SPIR-V capability for the bindless texture.
+
`spvCooperativeMatrixBlockLoadsNV`
> Represents the SPIR-V capability for cooperative matrix 2
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 491438c80..26b45d53f 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -59,12 +59,12 @@ struct RWStructuredBuffer
template<typename T>
struct StructuredBuffer
{
- SLANG_FORCE_INLINE const T& operator[](size_t index) const
+ SLANG_FORCE_INLINE T& operator[](size_t index) const
{
SLANG_BOUND_CHECK(index, count);
return data[index];
}
- const T& Load(size_t index) const
+ T& Load(size_t index) const
{
SLANG_BOUND_CHECK(index, count);
return data[index];
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 5c5335ac5..6c68cdb71 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2312,7 +2312,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uintptr_t UPTR_max(uintptr_t a, uintptr_t b)
template<typename T>
struct StructuredBuffer
{
- SLANG_CUDA_CALL const T& operator[](size_t index) const
+ SLANG_CUDA_CALL T& operator[](size_t index) const
{
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
SLANG_BOUND_CHECK(index, count);
@@ -2320,7 +2320,7 @@ struct StructuredBuffer
return data[index];
}
- SLANG_CUDA_CALL const T& Load(size_t index) const
+ SLANG_CUDA_CALL T& Load(size_t index) const
{
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
SLANG_BOUND_CHECK(index, count);
diff --git a/source/slang/slang-emit-wgsl.cpp b/source/slang/slang-emit-wgsl.cpp
index 53c3aa487..b115c723a 100644
--- a/source/slang/slang-emit-wgsl.cpp
+++ b/source/slang/slang-emit-wgsl.cpp
@@ -295,6 +295,11 @@ void WGSLSourceEmitter::emitStructFieldAttributes(
{
SLANG_UNUSED(allowOffsetLayout);
+ // If the struct type is not used for physical storage, then we don't need to
+ // emit any layout attributes.
+ if (!structType->findDecoration<IRPhysicalTypeDecoration>())
+ return;
+
// Tint emits errors unless we explicitly spell out the layout in some cases, so emit
// offset and align attribtues for all fields.
IRSizeAndAlignmentDecoration* const sizeAndAlignmentDecoration =
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index f1cc6090d..09c2efea9 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -1387,16 +1387,10 @@ Result linkAndOptimizeIR(
specializeFuncsForBufferLoadArgs(codeGenContext, irModule);
// Push `structuredBufferLoad` to the end of access chain to avoid loading unnecessary data.
- if (isKhronosTarget(targetRequest) || isMetalTarget(targetRequest) ||
- isWGPUTarget(targetRequest))
- deferBufferLoad(irModule);
+ deferBufferLoad(codeGenContext, irModule);
// We also want to specialize calls to functions that
// takes unsized array parameters if possible.
- // Moreover, for Khronos targets, we also want to specialize calls to functions
- // that takes arrays/structs containing arrays as parameters with the actual
- // global array object to avoid loading big arrays into SSA registers, which seems
- // to cause performance issues.
specializeArrayParameters(codeGenContext, irModule);
#if 0
diff --git a/source/slang/slang-ir-defer-buffer-load.cpp b/source/slang/slang-ir-defer-buffer-load.cpp
index 51c6a161b..ccdfe4538 100644
--- a/source/slang/slang-ir-defer-buffer-load.cpp
+++ b/source/slang/slang-ir-defer-buffer-load.cpp
@@ -3,142 +3,211 @@
#include "slang-ir-clone.h"
#include "slang-ir-dominators.h"
#include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
#include "slang-ir-redundancy-removal.h"
#include "slang-ir-util.h"
#include "slang-ir.h"
namespace Slang
{
-struct DeferBufferLoadContext
-{
- // Map an original SSA value to a pointer that can be used to load the value.
- Dictionary<IRInst*, IRInst*> mapValueToPtr;
- // Map an ptr to its loaded value.
- Dictionary<IRInst*, IRInst*> mapPtrToValue;
+// Generally, we want to specialize arguments that are large in size, or arguments that
+// are arrays or composite type that contains arrays.
+// This is because:
+// 1. Struct types without arrays will eventually be SROA's into registers and then effectively
+// DCE'd, so they usually won't cause performance issues. In fact, front loading structs
+// and reusing the loaded value instead of repetitively loading from constant memory is
+// usually beneficial to performance. However large struct values can be SROA'd into a large
+// number of registers, causing slow downstream compilation. Therefore we should avoid/defer
+// loading them into registers if we can.
+// 2. Arrays usually cannot be SROA'd into individual registers, which usually leads to
+// large register consumption if they ever get loaded, so we want to defer loading array
+// typed values as much as possible.
- IRFunc* currentFunc = nullptr;
+// If the argument data is bigger than this threshold, it is considered a large object
+// and we will try to specialize it even if it doesn't contain arrays.
+static const int kBufferLoadElementSizeSpecializationThreshold = 128;
- // Ensure that for an original SSA value, we have formed a pointer that can be used to load the
- // value.
- IRInst* ensurePtr(IRInst* valueInst)
- {
- IRInst* result = nullptr;
- if (mapValueToPtr.tryGetValue(valueInst, result))
- return result;
+// If the argument data is smaller than this threshold, it is considered a tiny object
+// and we will not consider specializing it, even if it contains arrays.
+static const int kBufferLoadElementSizeSpecializationMinThreshold = 16;
- IRBuilder b(valueInst);
- b.setInsertBefore(valueInst);
-
- switch (valueInst->getOp())
+static bool isCompositeTypeContainingArrays(IRType* type)
+{
+ if (auto structType = as<IRStructType>(type))
+ {
+ for (auto field : structType->getFields())
{
- case kIROp_StructuredBufferLoad:
- case kIROp_StructuredBufferLoadStatus:
- {
- result = b.emitRWStructuredBufferGetElementPtr(
- valueInst->getOperand(0),
- valueInst->getOperand(1));
- break;
- }
- case kIROp_GetElement:
+ if (const auto arrayType = as<IRArrayTypeBase>(field->getFieldType()))
{
- auto ptr = ensurePtr(valueInst->getOperand(0));
- if (!ptr)
- return nullptr;
- result = b.emitElementAddress(ptr, valueInst->getOperand(1));
- break;
+ return true;
}
- case kIROp_FieldExtract:
+ if (auto subStructType = as<IRStructType>(field->getFieldType()))
{
- auto ptr = ensurePtr(valueInst->getOperand(0));
- if (!ptr)
- return nullptr;
- result = b.emitFieldAddress(ptr, valueInst->getOperand(1));
- break;
+ if (isCompositeTypeContainingArrays(subStructType))
+ return true;
}
- case kIROp_Load:
- result = valueInst->getOperand(0);
- break;
- }
- if (result)
- {
- mapValueToPtr[valueInst] = result;
}
- return result;
}
+ else if (as<IRArrayTypeBase>(type))
+ {
+ return true;
+ }
+ return false;
+}
- static bool isImmutableBufferLoad(IRInst* inst)
+bool isTypePreferrableToDeferLoad(CodeGenContext* codeGenContext, IRType* type)
+{
+ // If parameter is a pointer/reference, we should consider specialize it.
+ if (as<IROutTypeBase>(type) || as<IRRefType>(type) || as<IRConstRefType>(type))
+ return true;
+
+ // We only want to defer loading values that are "large enough" that
+ // we expect them to be expensive to pass by value.
+ //
+ IRSizeAndAlignment sizeAlignment = {};
+ if (SLANG_FAILED(getNaturalSizeAndAlignment(
+ codeGenContext->getTargetProgram()->getOptionSet(),
+ type,
+ &sizeAlignment)))
{
- // Note: we cannot defer loads from RWStructuredBuffer because there can be other
- // instructions that modify the buffer.
+ // If type contains fields that we don't know how to compute natural size
+ // for, default to specialize if it contains arrays.
+ return isCompositeTypeContainingArrays(type);
+ }
+
+ // If the argument is very small, don't bother specializing.
+ if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationMinThreshold)
+ return false;
+
+ // If the argument is somewhat small, don't specialize, unless it contains
+ // arrays.
+ if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationThreshold)
+ {
+ // We generally do not specialize for small values, except it contains
+ // arrays that usually present a challenge for the SROA pass to eliminate
+ // unnecessary loads.
+ if (!isCompositeTypeContainingArrays(type))
+ return false;
+ }
+ return true;
+}
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+// This method is currently implementing a very conservative analysis that only allows
+// `loadInst` to be in the same block as `userInst`, with basic aliasing analysis for any
+// stores in between. All other cases are conservatively treated as the memory location may be
+// modified.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ TargetRequest* target,
+ IRInst* loadInst,
+ IRInst* userInst)
+{
+ auto func = getParentFunc(loadInst);
+ if (!func)
+ return false;
+
+ // For now we only check if loadInst and userInst are in the same block.
+ if (loadInst->getParent() != userInst->getParent())
+ return false;
+
+ for (IRInst* inst = loadInst->getNextInst(); inst; inst = inst->getNextInst())
+ {
+ // We found callInst before hitting any instruction that may modify the memory.
+ if (inst == userInst)
+ return true;
+
+ if (!inst->mightHaveSideEffects())
+ continue;
+
+ // If we see any inst that has side effect, check if it is simple case that we can rule
+ // out the possibility of modifying the memory location.
switch (inst->getOp())
{
- case kIROp_StructuredBufferLoad:
- case kIROp_StructuredBufferLoadStatus:
- return true;
- case kIROp_Load:
+ case kIROp_Store:
{
- auto rootAddr = getRootAddr(inst->getOperand(0));
- return isPointerToImmutableLocation(rootAddr);
+ auto storedDest = inst->getOperand(0);
+ if (canAddressesPotentiallyAlias(target, func, loadInst->getOperand(0), storedDest))
+ return false;
+ continue;
}
default:
+ // For any other case, conservatively assume the memory location may be modified.
return false;
}
}
+ // We didn't found callInst after loadInst within the same basic block.
+ // We conservatively assume the memory location may be modified.
+ // This check can be extended to use the dominator tree to allow
+ // loadInst and userInst to be in different blocks.
+ return false;
+}
- // Ensure that for a pointer value, we have created a load instruction to materialize the value.
- IRInst* materializePointer(IRBuilder& builder, IRInst* loadInst)
+struct DeferBufferLoadContext
+{
+ CodeGenContext* codeGenContext;
+
+
+ void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
{
- auto ptr = ensurePtr(loadInst);
- if (!ptr)
- return nullptr;
- IRInst* result = nullptr;
- if (mapPtrToValue.tryGetValue(ptr, result))
- return result;
- IRAlignedAttr* align = nullptr;
- if (auto load = as<IRLoad>(loadInst))
- align = load->findAttr<IRAlignedAttr>();
- if (!as<IRModuleInst>(ptr->getParent()))
+ // Don't defer the load anymore if the type is simple.
+ if (!isTypePreferrableToDeferLoad(codeGenContext, loadInst->getDataType()) ||
+ loadInst->findAttr<IRAlignedAttr>())
{
- setInsertAfterOrdinaryInst(&builder, ptr);
- IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
- result = builder.emitLoad(valueType, ptr, align);
- mapPtrToValue[ptr] = result;
+ return;
}
- else
+
+ auto rootAddr = getRootAddr(loadInst->getOperand(0));
+ bool isImmutableBufferLoad = isPointerToImmutableLocation(rootAddr);
+
+ // Don't defer the load if there are uses that are not getElement or fieldExtract.
+ // Because in this case we need to use the entire loaded value, and further deferring
+ // the load down any access chain will introduce redundant loads.
+ for (auto use = loadInst->firstUse; use; use = use->nextUse)
{
- setInsertBeforeOrdinaryInst(&builder, loadInst);
- IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
- result = builder.emitLoad(valueType, ptr, align);
- // Since we are inserting the load in a local scope, we can't register
- // the mapping to the pointer, since the global pointer needs to be
- // loaded once per function.
+ auto user = use->getUser();
+ switch (user->getOp())
+ {
+ case kIROp_GetElement:
+ case kIROp_FieldExtract:
+ // Can we defer the load to load only the requested element right before
+ // the element extract inst?
+ // If the buffer is immutable, we can always do that.
+ // If it is not, we need to make sure there is no other instructions that can modify
+ // the buffer between the load and the use.
+ //
+ if (isImmutableBufferLoad)
+ continue;
+ if (isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ codeGenContext->getTargetReq(),
+ loadInst,
+ user))
+ continue;
+ return;
+ default:
+ // If we see any other use the laod instruction, we assume the entire loaded value
+ // is needed, and we can't defer the load anymore.
+ return;
+ }
}
- return result;
- }
- static bool isSimpleType(IRInst* type)
- {
- if (auto modType = as<IRRateQualifiedType>(type))
- type = modType->getValueType();
- if (as<IRStructType>(type))
- return false;
- if (as<IRTupleType>(type))
- return false;
- if (as<IRArrayTypeBase>(type))
- return false;
- return true;
- }
+ // If we reach here, it means all uses are getElement or fieldExtract, and
+ // it is safe to defer the load down the access chain.
- void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
- {
- // Don't defer the load anymore if the type is simple.
- if (isSimpleType(loadInst->getDataType()) || loadInst->findAttr<IRAlignedAttr>())
+ if (loadInst->getOp() == kIROp_StructuredBufferLoad)
{
- auto materializedVal = materializePointer(builder, loadInst);
- loadInst->transferDecorationsTo(materializedVal);
- loadInst->replaceUsesWith(materializedVal);
- return;
+ // Convert the structuredBufferLoad to a regular load to reuse
+ // the same logic for deferring regular loads.
+ builder.setInsertBefore(loadInst);
+ auto bufferPtr = builder.emitRWStructuredBufferGetElementPtr(
+ loadInst->getOperand(0),
+ loadInst->getOperand(1));
+ auto sbLoad = builder.emitLoad(bufferPtr);
+ loadInst->transferDecorationsTo(sbLoad);
+ loadInst->replaceUsesWith(sbLoad);
+ loadInst->removeAndDeallocate();
+ loadInst = sbLoad;
}
// Otherwise, look for all uses and try to defer the load before actual use of the value.
@@ -148,19 +217,29 @@ struct DeferBufferLoadContext
loadInst,
[&](IRUse* use)
{
- if (needMaterialize)
- return;
-
auto user = use->getUser();
+
switch (user->getOp())
{
case kIROp_GetElement:
case kIROp_FieldExtract:
{
- auto basePtr = ensurePtr(loadInst);
- if (!basePtr)
- return;
- pendingWorkList.add(user);
+ // If we see a getElement or fieldExtract, we defer the load by
+ // replacing the getElement/fieldExtract with a load of the
+ // elementAddr/fieldAddr.
+ builder.setInsertBefore(user);
+ auto basePtr = loadInst->getOperand(0);
+ IRInst* gepArg = user->getOperand(1);
+ auto elementPtr = builder.emitElementAddress(
+ basePtr,
+ makeArrayViewSingle<IRInst*>(gepArg));
+ auto newLoad = builder.emitLoad(elementPtr);
+ user->transferDecorationsTo(newLoad);
+ user->replaceUsesWith(newLoad);
+ user->removeAndDeallocate();
+
+ // Now add the new load to work list to try to defer it further.
+ pendingWorkList.add(newLoad);
}
break;
default:
@@ -169,41 +248,37 @@ struct DeferBufferLoadContext
}
});
- if (needMaterialize)
- {
- auto val = materializePointer(builder, loadInst);
- loadInst->transferDecorationsTo(val);
- loadInst->replaceUsesWith(val);
- loadInst->removeAndDeallocate();
- }
- else
- {
- // Append to worklist in reverse order so we process the uses in natural appearance
- // order.
- for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
- workList.add(pendingWorkList[i]);
- }
+ // Append to worklist in reverse order so we process the uses in natural appearance
+ // order.
+ for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
+ workList.add(pendingWorkList[i]);
}
void deferBufferLoadInFunc(IRFunc* func)
{
removeRedundancyInFunc(func, false);
- currentFunc = func;
-
List<IRInst*> workList;
+ // Discover all load instructions and add to work list.
+
for (auto block : func->getBlocks())
{
for (auto inst : block->getChildren())
{
- if (isImmutableBufferLoad(inst))
+ switch (inst->getOp())
{
+ case kIROp_Load:
+ case kIROp_StructuredBufferLoad:
+ // Note: We don't handle `kIROp_StructuredBufferLoadStatus` here because
+ // it also writes to the status code out parameter, which we can't defer.
workList.add(inst);
+ break;
}
}
}
+ // Iteratively process the work list until it is empty.
IRBuilder builder(func);
for (Index i = 0; i < workList.getCount(); i++)
{
@@ -227,9 +302,10 @@ struct DeferBufferLoadContext
}
};
-void deferBufferLoad(IRModule* module)
+void deferBufferLoad(CodeGenContext* codeGenContext, IRModule* module)
{
DeferBufferLoadContext context;
+ context.codeGenContext = codeGenContext;
for (auto childInst : module->getGlobalInsts())
{
if (auto code = as<IRGlobalValueWithCode>(childInst))
diff --git a/source/slang/slang-ir-defer-buffer-load.h b/source/slang/slang-ir-defer-buffer-load.h
index b54271883..0f692b39a 100644
--- a/source/slang/slang-ir-defer-buffer-load.h
+++ b/source/slang/slang-ir-defer-buffer-load.h
@@ -4,9 +4,8 @@ namespace Slang
{
/*
-This pass implements a targeted optimization that defers the loading of structured buffer elements
-to the end of the access chain to avoid loading and repacking unnecessary data.
-For example, if we see:
+This pass implements a intra-function optimization that defers the loading of buffer
+elements to the end of the access chain to avoid loading unnecessary data. For example, if we see:
val = StructuredBufferLoad(s, i)
val2 = GetElement(val, j)
val3 = FieldExtract(val2, field_key_0)
@@ -20,7 +19,22 @@ We should rewrite the code into:
*/
struct IRModule;
+struct IRType;
+struct CodeGenContext;
+struct IRInst;
+class TargetRequest;
-void deferBufferLoad(IRModule* module);
+void deferBufferLoad(CodeGenContext* context, IRModule* module);
+
+// Returns true if the type is suitable for defer-load optimization.
+// Generally, we want to defer loading large structs or composites that contain arrays.
+bool isTypePreferrableToDeferLoad(CodeGenContext* context, IRType* type);
+
+// Returns true if memory loaded by `loadInst` may be modified before `userInst` after it is
+// loaded.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ TargetRequest* target,
+ IRInst* loadInst,
+ IRInst* userInst);
} // namespace Slang
diff --git a/source/slang/slang-ir-defunctionalization.cpp b/source/slang/slang-ir-defunctionalization.cpp
index af84ec78a..424971f90 100644
--- a/source/slang/slang-ir-defunctionalization.cpp
+++ b/source/slang/slang-ir-defunctionalization.cpp
@@ -12,7 +12,7 @@ struct FunctionParameterSpecializationCondition : FunctionCallSpecializeConditio
{
TargetRequest* targetRequest = nullptr;
- bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/)
+ bool doesParamWantSpecialization(IRParam* param, IRInst* /*arg*/, IRCall* /*callInst*/)
{
IRType* type = param->getDataType();
return as<IRFuncType>(type);
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index a79ca2379..d87d96da0 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2694,7 +2694,10 @@ static void legalizeMeshPayloadInputParam(
pp->replaceUsesWith(g);
struct MeshPayloadInputSpecializationCondition : FunctionCallSpecializeCondition
{
- bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+ bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+ {
+ return arg == g;
+ }
IRInst* g;
} condition;
condition.g = g;
@@ -2794,7 +2797,10 @@ static void legalizeMeshOutputParam(
// pp is only removed later on, so sadly we have to keep it around for now
struct MeshOutputSpecializationCondition : FunctionCallSpecializeCondition
{
- bool doesParamWantSpecialization(IRParam*, IRInst* arg) { return arg == g; }
+ bool doesParamWantSpecialization(IRParam*, IRInst* arg, IRCall* /*call*/)
+ {
+ return arg == g;
+ }
IRInst* g;
} condition;
condition.g = g;
diff --git a/source/slang/slang-ir-metal-legalize.cpp b/source/slang/slang-ir-metal-legalize.cpp
index e66617e72..e91da136a 100644
--- a/source/slang/slang-ir-metal-legalize.cpp
+++ b/source/slang/slang-ir-metal-legalize.cpp
@@ -172,7 +172,7 @@ struct MetalAddressSpaceAssigner : InitialAddressSpaceAssigner
{
if (ptrType->hasAddressSpace())
return ptrType->getAddressSpace();
- return AddressSpace::Global;
+ return AddressSpace::Generic;
}
return AddressSpace::Generic;
}
diff --git a/source/slang/slang-ir-specialize-address-space.cpp b/source/slang/slang-ir-specialize-address-space.cpp
index c4a155eec..04792bd8b 100644
--- a/source/slang/slang-ir-specialize-address-space.cpp
+++ b/source/slang/slang-ir-specialize-address-space.cpp
@@ -131,7 +131,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
bool processFunction(IRFunc* func)
{
bool retValAddrSpaceChanged = false;
- Dictionary<IRInst*, AddressSpace> mapVarValueToAddrSpace;
bool changed = true;
while (changed)
{
@@ -152,18 +151,23 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
continue;
}
- // If the inst already has a pointer type with explicit address space, then use
- // it.
- if (auto ptrType = as<IRPtrTypeBase>(inst->getDataType()))
+ // If the inst already has a pointer/pointer-like type with explicit address
+ // space, then use it.
+ auto addrSpaceFromType =
+ addrSpaceAssigner->getAddressSpaceFromVarType(inst->getDataType());
+ if (addrSpaceFromType != AddressSpace::Generic)
{
- if (ptrType->hasAddressSpace())
- {
- mapInstToAddrSpace[inst] = ptrType->getAddressSpace();
+ mapInstToAddrSpace[inst] = addrSpaceFromType;
+ changed = true;
+
+ // Don't return early if the inst itself is a call, as we may still need to
+ // specialize it down below.
+ if (inst->getOp() != kIROp_Call)
continue;
- }
}
- // Otherwise, try to assign an address space based on the instruction type.
+ // Try to assign an address space based on the instruction type, and specialize
+ // calls.
switch (inst->getOp())
{
case kIROp_Var:
@@ -195,15 +199,6 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
}
break;
case kIROp_Store:
- {
- auto addrSpace = getAddrSpace(inst->getOperand(1));
- if (addrSpace != AddressSpace::Generic)
- {
- mapVarValueToAddrSpace[inst->getOperand(0)] = addrSpace;
- mapInstToAddrSpace[inst] = addrSpace;
- changed = true;
- }
- }
break;
case kIROp_Param:
if (!isFirstBlock)
@@ -243,8 +238,9 @@ struct AddressSpaceContext : public AddressSpaceSpecializationContext
for (UInt i = 0; i < callInst->getArgCount(); i++)
{
auto arg = callInst->getArg(i);
- argAddrSpaces.add(getAddrSpace(arg));
- if (as<IRPtrTypeBase>(arg->getDataType()))
+ auto addrSpace = getAddrSpace(arg);
+ argAddrSpaces.add(addrSpace);
+ if (addrSpace != AddressSpace::Generic)
{
hasSpecializableArg = true;
}
@@ -477,8 +473,13 @@ void propagateAddressSpaceFromInsts(List<IRInst*>&& workList)
}
}
-AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst*)
+AddressSpace NoOpInitialAddressSpaceAssigner::getAddressSpaceFromVarType(IRInst* type)
{
+ if (auto ptrType = as<IRPtrTypeBase>(type))
+ {
+ if (ptrType->hasAddressSpace())
+ return ptrType->getAddressSpace();
+ }
return AddressSpace::Generic;
}
diff --git a/source/slang/slang-ir-specialize-arrays.cpp b/source/slang/slang-ir-specialize-arrays.cpp
index 4a4a72ee9..edb6cfa28 100644
--- a/source/slang/slang-ir-specialize-arrays.cpp
+++ b/source/slang/slang-ir-specialize-arrays.cpp
@@ -11,38 +11,14 @@ namespace Slang
struct ArrayParameterSpecializationCondition : FunctionCallSpecializeCondition
{
// This pass is intended to specialize functions
- // with struct parameters that has array fields
- // to avoid performance problems for GLSL targets.
- // Returns true if `type` is an `IRStructType` with array-typed fields.
- // It will also specialize functions with unsized array parameters into
- // sized arrays, if the function is called with an argument that has a
- // sized array type.
+ // with unsized array parameter called with a sized-array argument.
//
- bool isStructTypeWithArray(IRType* type)
- {
- if (auto structType = as<IRStructType>(type))
- {
- for (auto field : structType->getFields())
- {
- if (const auto arrayType = as<IRArrayType>(field->getFieldType()))
- {
- return true;
- }
- if (auto subStructType = as<IRStructType>(field->getFieldType()))
- {
- if (isStructTypeWithArray(subStructType))
- return true;
- }
- }
- }
- return false;
- }
- bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+ bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
{
+ SLANG_UNUSED(param);
SLANG_UNUSED(arg);
- if (isKhronosTarget(codeGenContext->getTargetReq()))
- return isStructTypeWithArray(param->getDataType());
+ SLANG_UNUSED(callInst);
return false;
}
diff --git a/source/slang/slang-ir-specialize-buffer-load-arg.cpp b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
index 905f2e058..a5a3dd2d9 100644
--- a/source/slang/slang-ir-specialize-buffer-load-arg.cpp
+++ b/source/slang/slang-ir-specialize-buffer-load-arg.cpp
@@ -1,8 +1,11 @@
// slang-ir-specialize-buffer-load-arg.cpp
#include "slang-ir-specialize-buffer-load-arg.h"
+#include "slang-ir-defer-buffer-load.h"
#include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
#include "slang-ir-specialize-function-call.h"
+#include "slang-ir-util.h"
#include "slang-ir.h"
namespace Slang
@@ -17,76 +20,115 @@ namespace Slang
// As swith most of our IR passes, we encapsulate the logic here in a context
// type so that the data that needs to be shared throughout the pass can
// be conveniently scoped.
+//
+
+// Note that this pass also ensures other more contrived cases are properly
+// handled. For example:
+//
+// * A load of a large structure from field in a constant buffer, so that
+// the value loaded is not the entire buffer contents.
+//
+// * A load of a large structure from a structured buffer, or any other kind
+// of buffer that requires an index.
+//
struct FuncBufferLoadSpecializationCondition : FunctionCallSpecializeCondition
{
typedef FunctionCallSpecializeCondition Super;
- virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+ CodeGenContext* codegenContext;
+
+ virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
{
// We only want to specialize for `struct` types and not base types.
//
- // TODO: We might want to consider some criteria here for the "large-ness"
- // of a structure (in terms of bytes and/or fields), so that we don't
- // eliminate loads of sufficiently small types (which are cheap to pass
- // by value).
- //
- auto paramType = param->getDataType();
- if (!as<IRStructType>(paramType))
+ auto paramType = (IRType*)unwrapAttributedType(param->getDataType());
+ if (!isTypePreferrableToDeferLoad(codegenContext, paramType))
return false;
- // We also only want to specialize for arguments that are a load
- // from some kind of global shader parameter.
+ // We want to handle loads from arbitrary access chains rooting from a shader parameter.
//
IRInst* a = arg;
- if (auto argLoad = as<IRLoad>(arg))
- {
- a = argLoad->getPtr();
- }
- else
+ for (;;)
{
- return false;
- }
+ // A user pointer can be directly passed into the function, so we no
+ // longer need to trace up further.
+ if (isUserPointerType(a->getDataType()))
+ break;
- // We want to handle loads from a shader parameter that is an array
- // of buffers, and not just a single global buffer.
- //
- while (auto argGetElement = as<IRGetElement>(a))
- {
- a = argGetElement->getBase();
+ if (auto argGetElement = as<IRGetElement>(a))
+ {
+ a = argGetElement->getBase();
+ }
+ else if (auto argSbLoad = as<IRStructuredBufferLoad>(a))
+ {
+ a = argSbLoad->getOperand(0);
+ }
+ else if (auto argBbLoad = as<IRByteAddressBufferLoad>(a))
+ {
+ a = argBbLoad->getOperand(0);
+ }
+ else if (auto argFieldExtract = as<IRFieldExtract>(a))
+ {
+ a = argFieldExtract->getBase();
+ }
+ else if (auto argGetElementPtr = as<IRGetElementPtr>(a))
+ {
+ a = argGetElementPtr->getBase();
+ }
+ else if (auto argSBGetElementPtr = as<IRRWStructuredBufferGetElementPtr>(a))
+ {
+ a = argSBGetElementPtr->getBase();
+ }
+ else if (auto argFieldAddr = as<IRFieldAddress>(a))
+ {
+ a = argFieldAddr->getBase();
+ }
+ else if (auto argLoad = as<IRLoad>(a))
+ {
+ a = argLoad->getPtr();
+
+ // We can safely defer a load to the callee if the source dest is immutable.
+ if (isPointerToImmutableLocation(a))
+ continue;
+
+ // Otherwise, we check if there is no other instructions in between the load and the
+ // call that can modify the memory location. If so, we can still safely defer the
+ // load to the callee.
+ if (!isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ codegenContext->getTargetReq(),
+ argLoad,
+ callInst))
+ return false;
+ }
+ else
+ {
+ break;
+ }
}
- // The "root" of the parameter must be a reference to a global-scope
- // shader parameter, so that we know we can substitute it into the callee.
+ // The "root" of the parameter must be one of the following:
+ // 1. A reference to a global-scope shader parameter that can be referenced directly from
+ // the callee.
+ // 2. A user pointer or bindless resource handle that can be passed to the callee as
+ // ordinary argument.
//
if (const auto argGlobalParam = as<IRGlobalParam>(a))
{
return true;
}
- else
+ else if (isUserPointerType(a->getDataType()) || as<IRCastDescriptorHandleToResource>(a))
{
- return false;
+ return true;
}
-
- // TODO: There are other patterns that we could attempt to optimize here.
- // For example, this logic only handles loads of the *entire* contents of
- // a buffer, so it would miss:
- //
- // * A load of a large structure from field in a constant buffer, so that
- // the value loaded is not the entire buffer contents.
- //
- // * A load of a large structure from a structured buffer, or any other kind
- // of buffer that requires an index.
- //
- // * Any resource load that is not expressed at the IR level with a `load`
- // instruction (e.g., those that might use an intrinsic function).
- //
+ return false;
}
};
void specializeFuncsForBufferLoadArgs(CodeGenContext* codegenContext, IRModule* module)
{
FuncBufferLoadSpecializationCondition condition;
+ condition.codegenContext = codegenContext;
specializeFunctionCalls(codegenContext, module, &condition);
}
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 7c82891a6..aead69258 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -40,6 +40,12 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
if (as<IRGlobalValueWithCode>(arg))
return true;
+ if (isUserPointerType(arg->getDataType()))
+ return true;
+
+ if (as<IRCastDescriptorHandleToResource>(arg))
+ return true;
+
// As we will see later, we can also
// specialize a call when the argument
// is the result of indexing into an
@@ -47,17 +53,29 @@ bool FunctionCallSpecializeCondition::isParamSuitableForSpecialization(
// of the indexing operation is also
// suitable for specialization.
//
- if (arg->getOp() == kIROp_GetElement || arg->getOp() == kIROp_Load)
+ switch (arg->getOp())
{
- auto base = arg->getOperand(0);
-
- // We will "recurse" on the base of
- // the indexing operation by continuing
- // our loop with the `base` as our new
- // argument.
- //
- arg = base;
- continue;
+ case kIROp_GetElement:
+ case kIROp_StructuredBufferLoad:
+ case kIROp_ByteAddressBufferLoad:
+ case kIROp_GetElementPtr:
+ case kIROp_RWStructuredBufferGetElementPtr:
+ case kIROp_FieldAddress:
+ case kIROp_FieldExtract:
+ case kIROp_Load:
+ {
+ auto base = arg->getOperand(0);
+
+ // We will "recurse" on the base of
+ // the indexing operation by continuing
+ // our loop with the `base` as our new
+ // argument.
+ //
+ arg = base;
+ continue;
+ }
+ default:
+ break;
}
// By default, we will *not* consider an argument
@@ -225,7 +243,7 @@ struct FunctionParameterSpecializationContext
// If neither the parameter nor the argument wants specialization,
// then we need to keep looking.
//
- auto paramWantSpecialization = doesParamWantSpecialization(param, arg);
+ auto paramWantSpecialization = doesParamWantSpecialization(param, arg, call);
auto paramTypeWantSpecialization = doesParamTypeWantSpecialization(param, arg);
if (!paramWantSpecialization && !paramTypeWantSpecialization)
continue;
@@ -255,9 +273,9 @@ struct FunctionParameterSpecializationContext
// Of course, now we need to back-fill the predicates that
// the above function used to evaluate prameters and arguments.
- bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+ bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
{
- return condition->doesParamWantSpecialization(param, arg);
+ return condition->doesParamWantSpecialization(param, arg, callInst);
}
bool doesParamTypeWantSpecialization(IRParam* param, IRInst* arg)
@@ -484,16 +502,20 @@ struct FunctionParameterSpecializationContext
UInt oldArgIndex = oldArgCounter++;
auto oldArg = oldCall->getArg(oldArgIndex);
- getCallInfoForParam(callInfo, oldParam, oldArg);
+ getCallInfoForParam(callInfo, oldParam, oldArg, oldCall);
}
}
- void getCallInfoForParam(CallSpecializationInfo& ioInfo, IRParam* oldParam, IRInst* oldArg)
+ void getCallInfoForParam(
+ CallSpecializationInfo& ioInfo,
+ IRParam* oldParam,
+ IRInst* oldArg,
+ IRCall* callInst)
{
// We know that the case where the parameter
// and argument don't want specialization is easy.
//
- if (!doesParamWantSpecialization(oldParam, oldArg))
+ if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
{
// The new call site will use the same argument
// value as the old one, and we don't need
@@ -546,7 +568,15 @@ struct FunctionParameterSpecializationContext
// Similarly for other global constants
ioInfo.key.vals.add(globalConstant);
}
- else if (oldArg->getOp() == kIROp_GetElement)
+ else if (isUserPointerType(oldArg->getDataType()))
+ {
+ // If the arg is a user pointer, we can pass it as an ordinary argument,
+ // and we won't need further tracing down the access chain.
+ //
+ ioInfo.key.vals.add(oldArg->getFullType());
+ ioInfo.newArgs.add(oldArg);
+ }
+ else if (isElementAccessInst(oldArg))
{
// This is the case where the `oldArg` is
// in the form `oldBase[oldIndex]`
@@ -587,19 +617,45 @@ struct FunctionParameterSpecializationContext
ioInfo.newArgs.add(oldIndex);
}
+ else if (isFieldAccessInst(oldArg))
+ {
+ // This is the case where the `oldArg` is
+ // in the form `oldBase.structKey`
+ //
+ auto oldBase = oldArg->getOperand(0);
+ auto structKey = oldArg->getOperand(1);
+
+ // Similar to the getElement case, we recursively setting up whatever
+ // `oldBase` needs first.
+ //
+ getCallInfoForArg(ioInfo, oldBase);
+
+ // The main difference from the `getElement` case is we actually want
+ // the structKey to be in the specialization key because it will be baked
+ // into the specialized function.
+ // And we won't introduce a new parameter to hold the index.
+ //
+ ioInfo.key.vals.add(structKey);
+ }
else if (oldArg->getOp() == kIROp_Load)
{
auto oldBase = oldArg->getOperand(0);
getCallInfoForArg(ioInfo, oldBase);
}
+ else if (oldArg->getOp() == kIROp_CastDescriptorHandleToResource)
+ {
+ // We are accessing a resource from a bindless handle.
+ // We can stop recursion here and just pass in the bindless handle as
+ // an argument.
+ auto oldBase = oldArg->getOperand(0);
+ ioInfo.key.vals.add(oldBase->getFullType());
+ ioInfo.newArgs.add(oldBase);
+ }
else
{
// If we fail to match any of the cases above
- // then a precondition was violated in that
- // `isArgSuitableForSpecialization` is allowing
- // a case that this routine is not covering.
- //
- SLANG_UNEXPECTED("mising case in 'getCallInfoForArg'");
+ // then the `SpecializeCondition` is letting through constructs that we cannot handle.
+ SLANG_UNEXPECTED("unexpected function call specialization argument form.");
}
}
@@ -641,7 +697,7 @@ struct FunctionParameterSpecializationContext
// will stand in for the parameter in the specialized
// function.
//
- auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg);
+ auto newVal = getSpecializedValueForParam(funcInfo, oldParam, oldArg, oldCall);
// We will collect the replacement value to use
// for each of the original parameters in an array.
@@ -681,12 +737,13 @@ struct FunctionParameterSpecializationContext
IRInst* getSpecializedValueForParam(
FuncSpecializationInfo& ioInfo,
IRParam* oldParam,
- IRInst* oldArg)
+ IRInst* oldArg,
+ IRCall* callInst)
{
// As always, the easy case is when the parameter of
// the original function doesn't need specialization.
//
- if (!doesParamWantSpecialization(oldParam, oldArg))
+ if (!doesParamWantSpecialization(oldParam, oldArg, callInst))
{
// The specialized callee will need a new parameter
// that fills the same role as the old one, so we
@@ -718,6 +775,36 @@ struct FunctionParameterSpecializationContext
}
}
+ // Returns true if `inst` is an instruction that accesses an element from an array or a buffer.
+ //
+ static bool isElementAccessInst(IRInst* inst)
+ {
+ switch (inst->getOp())
+ {
+ case kIROp_GetElementPtr:
+ case kIROp_GetElement:
+ case kIROp_RWStructuredBufferGetElementPtr:
+ case kIROp_StructuredBufferLoad:
+ case kIROp_ByteAddressBufferLoad:
+ return true;
+ }
+ return false;
+ }
+
+ // Returns true if `inst` is an instruction that accesses a field from a struct, that is
+ // either a FieldAddress or FieldExtract.
+ //
+ static bool isFieldAccessInst(IRInst* inst)
+ {
+ switch (inst->getOp())
+ {
+ case kIROp_FieldAddress:
+ case kIROp_FieldExtract:
+ return true;
+ }
+ return false;
+ }
+
IRInst* getSpecializedValueForArg(FuncSpecializationInfo& ioInfo, IRInst* oldArg)
{
// The logic here parallels `gatherCallInfoForArg`,
@@ -735,13 +822,24 @@ struct FunctionParameterSpecializationContext
//
return globalParam;
}
+ if (isUserPointerType(oldArg->getDataType()))
+ {
+ // If argument is a user pointer, we can pass it into the callee
+ // directly as an oridinary argument without further specializing
+ // for the access chain beyond the pointer.
+ //
+ auto builder = getBuilder();
+ auto newParam = builder->createParam(oldArg->getFullType());
+ ioInfo.newParams.add(newParam);
+ return newParam;
+ }
if (auto globalFunc = as<IRGlobalValueWithCode>(oldArg))
{
// As above, the identity of the specialized function is sufficient
// to resolve the uses
return globalFunc;
}
- else if (oldArg->getOp() == kIROp_GetElement)
+ else if (isElementAccessInst(oldArg))
{
// This is the case where the argument is
// in the form `oldBase[oldIndex]`.
@@ -801,7 +899,9 @@ struct FunctionParameterSpecializationContext
// of things, and then inserted to a more permanent location later.
//
builder->setInsertLoc(IRInsertLoc());
- auto newVal = builder->emitElementExtract(oldArg->getFullType(), newBase, newIndex);
+ IRInst* newOperands[] = {newBase, newIndex};
+ auto newVal =
+ builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
// Because our new instruction wasn't
// actually inserted anywhere, we need to
@@ -813,6 +913,30 @@ struct FunctionParameterSpecializationContext
return newVal;
}
+ else if (isFieldAccessInst(oldArg))
+ {
+ // This is the case where the argument is
+ // in the form `oldBase.structKey`.
+ //
+ auto oldBase = oldArg->getOperand(0);
+ auto structKey = oldArg->getOperand(1);
+
+ // We handle this case in a similar way as the `oldBase[oldIndex]`
+ // case, except that we don't need to introduce a new parameter
+ // for the index, since the struct key is known at compile-time.
+ auto newBase = getSpecializedValueForArg(ioInfo, oldBase);
+
+ auto builder = getBuilder();
+
+ builder->setInsertLoc(IRInsertLoc());
+ IRInst* newOperands[] = {newBase, structKey};
+ auto newVal =
+ builder->emitIntrinsicInst(oldArg->getFullType(), oldArg->getOp(), 2, newOperands);
+
+ ioInfo.newBodyInsts.add(newVal);
+
+ return newVal;
+ }
else if (auto oldArgLoad = as<IRLoad>(oldArg))
{
auto oldPtr = oldArgLoad->getPtr();
@@ -825,15 +949,30 @@ struct FunctionParameterSpecializationContext
return newVal;
}
+ else if (auto castHandleToResource = as<IRCastDescriptorHandleToResource>(oldArg))
+ {
+ // We are accessing a resource from a bindless handle.
+ // We should create a param for the handle, and load the resource from the param.
+ auto builder = getBuilder();
+ auto oldHandle = castHandleToResource->getOperand(0);
+ auto newHandle = builder->createParam(oldHandle->getFullType());
+ ioInfo.newParams.add(newHandle);
+
+ builder->setInsertLoc(IRInsertLoc());
+ IRInst* newOperands[] = {newHandle};
+ auto newVal = builder->emitIntrinsicInst(
+ oldArg->getFullType(),
+ kIROp_CastDescriptorHandleToResource,
+ 1,
+ newOperands);
+ ioInfo.newBodyInsts.add(newVal);
+ return newVal;
+ }
else
{
// If we don't match one of the above cases,
- // then `isArgSuitableForSpecialization` is
- // letting through cases that this function
- // hasn't been updated to handle.
- //
- SLANG_UNEXPECTED("mising case in 'getSpecializedValueForArg'");
- UNREACHABLE_RETURN(nullptr);
+ // then we are running into an invalid case.
+ SLANG_UNEXPECTED("unknown argument form for function call specialization.");
}
}
diff --git a/source/slang/slang-ir-specialize-function-call.h b/source/slang/slang-ir-specialize-function-call.h
index bab4ce2f4..afb8c2365 100644
--- a/source/slang/slang-ir-specialize-function-call.h
+++ b/source/slang/slang-ir-specialize-function-call.h
@@ -7,12 +7,14 @@ struct CodeGenContext;
struct IRInst;
struct IRModule;
struct IRParam;
+struct IRCall;
+
class Module;
class FunctionCallSpecializeCondition
{
public:
- virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg) = 0;
+ virtual bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst) = 0;
virtual bool isParamSuitableForSpecialization(IRParam* param, IRInst* arg);
diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp
index 871ba2c24..0ac08236f 100644
--- a/source/slang/slang-ir-specialize-resources.cpp
+++ b/source/slang/slang-ir-specialize-resources.cpp
@@ -20,9 +20,10 @@ struct ResourceParameterSpecializationCondition : FunctionCallSpecializeConditio
TargetRequest* targetRequest = nullptr;
TargetProgram* targetProgram = nullptr;
- bool doesParamWantSpecialization(IRParam* param, IRInst* arg)
+ bool doesParamWantSpecialization(IRParam* param, IRInst* arg, IRCall* callInst)
{
SLANG_UNUSED(arg);
+ SLANG_UNUSED(callInst);
// Whether or not a parameter needs specialization is really
// a function of its type:
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 8584ea95e..551a72fc7 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -17,6 +17,14 @@ bool isPointerOfType(IRInst* type, IROp opCode)
return false;
}
+bool isUserPointerType(IRInst* type)
+{
+ auto ptrType = as<IRPtrType>(type);
+ if (!ptrType)
+ return false;
+ return ptrType->getAddressSpace() == AddressSpace::UserPointer;
+}
+
IRType* getVectorElementType(IRType* type)
{
if (auto vectorType = as<IRVectorType>(type))
@@ -792,35 +800,212 @@ IRInst* getRootAddr(IRInst* addr, List<IRInst*>& outAccessChain, List<IRInst*>*
return addr;
}
-// A simple and conservative address aliasing check.
-bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+IRInst* getRootBufferOrAddr(IRInst* addr)
{
- if (addr1 == addr2)
- return true;
+ auto rootAddr = getRootAddr(addr);
+ if (as<IRRWStructuredBufferGetElementPtr>(rootAddr))
+ {
+ auto bufferHandle = rootAddr->getOperand(0);
+ // Check if the bufferHandle itself is a load from a global parameter.
+ if (auto load = as<IRLoad>(bufferHandle))
+ {
+ auto newRoot = getRootAddr(load->getPtr());
+ if (newRoot->getOp() == kIROp_GlobalParam)
+ return newRoot;
+ }
+ }
+ return rootAddr;
+}
+
+// The aliasing class of an address. This is used to determine
+// if two addresses may alias.
+enum class AddressAliasingClass
+{
+ Unknown,
+ UserPointer, // A user pointer into global memory
+ Var, // A thread-local or groupshared var.
+ ConstantBuffer, // A constant buffer or parameter block.
+ BoundBuffer, // A bound buffer.
+ BoundTexture, // A bound texture resource.
+ DescriptorHandle, // A bindless buffer or resource.
+};
+
+AddressAliasingClass getAliasingClass(IRInst* addr)
+{
+ if (auto globalParam = as<IRGlobalParam>(addr))
+ {
+ auto type = unwrapArray(globalParam->getDataType());
+ if (!type)
+ return AddressAliasingClass::Unknown;
+ switch (type->getOp())
+ {
+ case kIROp_TextureType:
+ return AddressAliasingClass::BoundTexture;
+ case kIROp_HLSLStructuredBufferType:
+ case kIROp_HLSLRWStructuredBufferType:
+ case kIROp_HLSLAppendStructuredBufferType:
+ case kIROp_HLSLConsumeStructuredBufferType:
+ case kIROp_HLSLRasterizerOrderedStructuredBufferType:
+ case kIROp_HLSLByteAddressBufferType:
+ case kIROp_HLSLRWByteAddressBufferType:
+ case kIROp_HLSLRasterizerOrderedByteAddressBufferType:
+ case kIROp_GLSLShaderStorageBufferType:
+ return AddressAliasingClass::BoundBuffer;
+ case kIROp_ConstantBufferType:
+ case kIROp_ParameterBlockType:
+ return AddressAliasingClass::ConstantBuffer;
+ case kIROp_PtrType:
+ if (isUserPointerType(type))
+ return AddressAliasingClass::UserPointer;
+ return AddressAliasingClass::Unknown;
+ case kIROp_DynamicResourceType:
+ return AddressAliasingClass::DescriptorHandle;
+ default:
+ return AddressAliasingClass::Unknown;
+ }
+ }
+ else if (as<IRVar>(addr))
+ return AddressAliasingClass::Var;
+ else if (as<IRGlobalVar>(addr))
+ return AddressAliasingClass::Var;
+ else if (as<IRRWStructuredBufferGetElementPtr>(addr))
+ return AddressAliasingClass::DescriptorHandle;
+ else if (as<IRCastDescriptorHandleToResource>(addr))
+ return AddressAliasingClass::DescriptorHandle;
- // Two variables can never alias.
- addr1 = getRootAddr(addr1);
- addr2 = getRootAddr(addr2);
+ auto type = addr->getDataType();
+ if (isUserPointerType(type))
+ return AddressAliasingClass::UserPointer;
+ return AddressAliasingClass::Unknown;
+}
- // Global addresses can alias with anything.
- if (!isChildInstOf(addr1, func))
+bool canAddrClassesAlias(AddressAliasingClass c1, AddressAliasingClass c2)
+{
+ if (c1 == AddressAliasingClass::Unknown || c2 == AddressAliasingClass::Unknown)
return true;
- if (!isChildInstOf(addr2, func))
+ switch (c1)
+ {
+ case AddressAliasingClass::Unknown:
return true;
+ case AddressAliasingClass::UserPointer:
+ case AddressAliasingClass::Var:
+ // A users pointer or var can only alias with another
+ // object that is either a user pointer or var.
+ //
+ // Generally, a var should never alias with anything else that isn't a var,
+ // if we never allow the user to take address of a local var.
+ // We don't allow taking addresses of a local var on most GPU targets, but
+ // we currently do expose an internal intrinsic to do so when targeting CPU.
+ // We should consider disallowing this across the board, or enable more aggresive
+ // criteria when targeting GPU backends.
+ // For now we stay conservative and just report true even when addr1 is var and
+ // addr2 is not rooted from a var.
+ //
+ return c2 == AddressAliasingClass::UserPointer || c2 == AddressAliasingClass::Var;
+ case AddressAliasingClass::BoundBuffer:
+ case AddressAliasingClass::BoundTexture:
+ // A bound resource can only alias with another
+ // object that is a bound resource or descriptor handle
+ return c2 == c1 || c2 == AddressAliasingClass::DescriptorHandle;
+
+ case AddressAliasingClass::DescriptorHandle:
+ // Can alias with any other resource.
+ switch (c2)
+ {
+ case AddressAliasingClass::BoundBuffer:
+ case AddressAliasingClass::BoundTexture:
+ case AddressAliasingClass::DescriptorHandle:
+ return true;
+ default:
+ return false;
+ }
+ case AddressAliasingClass::ConstantBuffer:
+ // Constant buffer cannot alias with anything.
+ return false;
+ }
+ // For any other unknown case, assume they may alias.
+ return true;
+}
+
+// Has `var` being used in a way that may allow it to alias with a user pointer?
+bool canVarAliasWithUserPointer(TargetRequest* target, IRInst* var)
+{
+ if (target && !isCPUTarget(target))
+ {
+ // We don't allow taking the address of a variable on anything other
+ // than the CPU target. Therefore a var can never alias with a user
+ // pointer on these targets.
+ return false;
+ }
+
+ SLANG_UNUSED(var);
+ return true;
+}
+
+// A simple and conservative address aliasing check.
+bool canAddressesPotentiallyAlias(
+ TargetRequest* target,
+ IRGlobalValueWithCode* func,
+ IRInst* addr1,
+ IRInst* addr2)
+{
+ if (addr1 == addr2)
+ return true;
+
+ addr1 = getRootBufferOrAddr(addr1);
+ addr2 = getRootBufferOrAddr(addr2);
+
+ auto addr1Class = getAliasingClass(addr1);
+ auto addr2Class = getAliasingClass(addr2);
- if (addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Var && addr1 != addr2)
+ if (!canAddrClassesAlias(addr1Class, addr2Class))
return false;
+ if (addr1Class == addr2Class)
+ {
+ // For these classes of addresses, the identity of the root
+ // determines whether or not the addresse can alias.
+ // Note that we assume two different bound resources can never
+ // alias, and two different variables can never alias.
+ switch (addr1Class)
+ {
+ case AddressAliasingClass::Var:
+ case AddressAliasingClass::BoundBuffer:
+ case AddressAliasingClass::BoundTexture:
+ case AddressAliasingClass::ConstantBuffer:
+ if (addr1 != addr2)
+ return false;
+ break;
+ }
+ }
+
// A param and a var can never alias.
if (addr1->getOp() == kIROp_Param && addr1->getParent() == func->getFirstBlock() &&
addr2->getOp() == kIROp_Var ||
addr1->getOp() == kIROp_Var && addr2->getOp() == kIROp_Param &&
addr2->getParent() == func->getFirstBlock())
return false;
+
+ // If one addr is user pointer and one addr is a var,
+ // they can never alias, if the user code never took the address of
+ // the var.
+ if (addr1Class == AddressAliasingClass::Var && addr2Class == AddressAliasingClass::UserPointer)
+ {
+ return canVarAliasWithUserPointer(target, addr1);
+ }
+ if (addr2Class == AddressAliasingClass::Var && addr1Class == AddressAliasingClass::UserPointer)
+ {
+ return canVarAliasWithUserPointer(target, addr2);
+ }
return true;
}
+bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2)
+{
+ return canAddressesPotentiallyAlias(nullptr, func, addr1, addr2);
+}
+
bool isPtrLikeOrHandleType(IRInst* type)
{
if (!type)
@@ -1141,15 +1326,15 @@ bool areCallArgumentsSideEffectFree(IRCall* call, SideEffectAnalysisOptions opti
if (isBitSet(options, SideEffectAnalysisOptions::UseDominanceTree))
dom = module->findOrCreateDominatorTree(parentFunc);
- // If the pointer argument is a local variable (thus can't alias with other addresses)
- // and it is never read from in the function, we can safely treat the call as having
- // no side-effect.
- // This is a conservative test, but is sufficient to detect the most common case where
- // a temporary variable is used as the inout argument and the result stored in the temp
- // variable isn't being used elsewhere in the parent func.
+ // If the pointer argument is a local variable (thus can't alias with other
+ // addresses) and it is never read from in the function, we can safely treat the
+ // call as having no side-effect. This is a conservative test, but is sufficient to
+ // detect the most common case where a temporary variable is used as the inout
+ // argument and the result stored in the temp variable isn't being used elsewhere in
+ // the parent func.
//
- // A more aggresive test can check all other address uses reachable from the call site
- // and see if any of them are aliasing with the argument.
+ // A more aggresive test can check all other address uses reachable from the call
+ // site and see if any of them are aliasing with the argument.
for (auto use = arg->firstUse; use; use = use->nextUse)
{
if (as<IRDecoration>(use->getUser()))
@@ -1323,8 +1508,8 @@ bool doesCalleeHaveSideEffect(IRInst* callee)
}
}
- // If the callee has no side effect, check if any of its associated functions have side effect.
- // If so, we want to keep the callee around.
+ // If the callee has no side effect, check if any of its associated functions have side
+ // effect. If so, we want to keep the callee around.
//
// Typically, once the relevant pass has completed, the association is removed,
// and at that point we can remove the function.
@@ -2230,13 +2415,12 @@ void legalizeDefUse(IRGlobalValueWithCode* func)
!(as<IRVar>(inst) && loopHeaderBlockMap.containsKey(block)))
continue;
- // Normally, if the common dominator is not `block`, we can simply move the definition
- // to the common dominator.
- // An exception is when the common dominator is the target block of a
- // loop.
- // Another exception is when a var in the loop condition block is accessed both inside
- // and outside the loop. It is technically visible, but effects on the 'var' are not
- // visible outside the loop, so we'll need to hoist it out of the loop.
+ // Normally, if the common dominator is not `block`, we can simply move the
+ // definition to the common dominator. An exception is when the common dominator is
+ // the target block of a loop. Another exception is when a var in the loop condition
+ // block is accessed both inside and outside the loop. It is technically visible,
+ // but effects on the 'var' are not visible outside the loop, so we'll need to hoist
+ // it out of the loop.
//
// Note that after normalization, loops are in the form of:
// ```
@@ -2377,9 +2561,9 @@ bool canOperationBeSpecConst(IROp op, IRType* resultType, IRInst* const* fixedAr
// Returns true for ops that can be declared as an operation under `OpSpecConstantOp`.
//
// Integer arithmetic and comparison operations can be `OpSpecConstantOp` with the `Shader`
- // capability, while floating-point arithmetic and comparison operations require the `Kernel`
- // capability. We only support `Shader` capability for now, return false when floating-point
- // arithmetic/comparison is encountered.
+ // capability, while floating-point arithmetic and comparison operations require the
+ // `Kernel` capability. We only support `Shader` capability for now, return false when
+ // floating-point arithmetic/comparison is encountered.
switch (op)
{
case kIROp_Add:
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index c0410fa3c..b8937d569 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -70,6 +70,8 @@ bool isPointerOfType(IRInst* ptrType, IRInst* elementType);
// True if ptrType is a pointer type to a type of opCode
bool isPointerOfType(IRInst* ptrType, IROp opCode);
+bool isUserPointerType(IRInst* type);
+
// Builds a dictionary that maps from requirement key to requirement value for `interfaceType`.
Dictionary<IRInst*, IRInst*> buildInterfaceRequirementDict(IRInterfaceType* interfaceType);
@@ -205,6 +207,12 @@ IRInst* getRootAddr(
bool canAddressesPotentiallyAlias(IRGlobalValueWithCode* func, IRInst* addr1, IRInst* addr2);
+bool canAddressesPotentiallyAlias(
+ TargetRequest* target,
+ IRGlobalValueWithCode* func,
+ IRInst* addr1,
+ IRInst* addr2);
+
String dumpIRToString(
IRInst* root,
IRDumpOptions options = {IRDumpOptions::Mode::Simplified, IRDumpOptions::Flag::DumpDebugIds});
diff --git a/tests/cuda/copy-elision-this-1.slang b/tests/cuda/copy-elision-this-1.slang
index 295b45c73..273e6dc58 100644
--- a/tests/cuda/copy-elision-this-1.slang
+++ b/tests/cuda/copy-elision-this-1.slang
@@ -1,4 +1,6 @@
-//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda
+//TEST:SIMPLE(filecheck=CUDA): -stage compute -entry computeMain -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=PTX): -stage compute -entry computeMain -target cuda
+
struct Data {
StructuredBuffer<float> input[2];
RWStructuredBuffer<float> output;
@@ -6,7 +8,9 @@ struct Data {
StructuredBuffer<uint> index_buffer;
uint index_count;
- // CUDA: fetch{{.*}}Data{{.*}}*{{.*}}this
+ // CUDA: __device__ float Data_fetch{{.*}}(int {{.*}}, int {{.*}})
+ // CUDA-NEXT: {
+ // CUDA-NEXT: return globalParams{{.*}}->data{{.*}}->input{{.*}}[{{.*}}].Load
float fetch(int buffer, int index)
{
return input[buffer][index];
@@ -15,6 +19,8 @@ struct Data {
ParameterBlock<Data> data;
+// PTX: computeMain
+
[shader("compute")]
[numthreads(8, 8, 1)]
void computeMain(uint3 tid: SV_DispatchThreadID)
diff --git a/tests/glsl/global-uniform-with-varyings.slang b/tests/glsl/global-uniform-with-varyings.slang
index 678855dbf..174560840 100644
--- a/tests/glsl/global-uniform-with-varyings.slang
+++ b/tests/glsl/global-uniform-with-varyings.slang
@@ -3,9 +3,9 @@
// CHECK_SPIRV: OpEntryPoint
// CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 0
-// CHECK_SPIRV: OpVariable {{.*}} Uniform
-// CHECK_SPIRV: OpVariable {{.*}} Input {{.*}} Location 1
-// CHECK_SPIRV: OpVariable {{.*}} Output {{.*}} Location 0
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Uniform
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Input {{.*}} Location 1
+// CHECK_SPIRV-DAG: OpVariable {{.*}} Output {{.*}} Location 0
// CHECK_GLSL: layout(location = 0)
// CHECK_GLSL-NEXT: in
diff --git a/tests/metal/out-param.slang b/tests/metal/out-param.slang
index e488f8844..68b6e2b62 100644
--- a/tests/metal/out-param.slang
+++ b/tests/metal/out-param.slang
@@ -9,13 +9,20 @@
//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
RWStructuredBuffer<int> outputBuffer;
-// METAL: void Test_out_param{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+// CHECK-DAG: void Test_out_param{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int {{.*}}, KernelContext{{.*}} thread* {{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// CHECK-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param{{.*}}(int threadgroup* value{{.*}})
+
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
+// METAL-DAG: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int thread* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int device* value{{.*}})
-// METAL: void Test_out_param_wrapper{{.*}}(int threadgroup* value{{.*}})
void Test_out_param(out int value)
{
diff --git a/tests/optimization/buffer-load-defer-aliasing-1.slang b/tests/optimization/buffer-load-defer-aliasing-1.slang
new file mode 100644
index 000000000..f50d5306c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing-1.slang
@@ -0,0 +1,45 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we can defer buffer loads by ruling out potential aliasing writes.
+
+struct Bottom
+{
+ float bigArray[1024];
+
+ float bottomGetValue(int index)
+ {
+ // RWStructuredBuffer is considered to not alias with anything else.
+ // this write should not prevent deferring loading bigArray.
+ gOther[0] = 100;
+ // this write should not prevent deferring loading bigArray.
+ gSharedVar = 1;
+ // this write should not prevent deferring loading bigArray.
+ gStaticVar = 2;
+
+ // We should return the value from bigArray from a previously loaded value of `this`.
+ return bigArray[index];
+ }
+}
+
+struct Root
+{
+ Bottom bottom1;
+ Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform RWStructuredBuffer<int> gOther;
+static int gStaticVar;
+groupshared int gSharedVar;
+
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ // SPV: OpEntryPoint
+ // SPV-NOT: OpLoad %Bottom_natural
+ outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-aliasing.slang b/tests/optimization/buffer-load-defer-aliasing.slang
new file mode 100644
index 000000000..a0240cc40
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-aliasing.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+// Test that we are not deferring buffer loads due to potential aliasing writes.
+
+struct Bottom
+{
+ float bigArray[1024];
+
+ float bottomGetValue(int index)
+ {
+ // this write may cause data stored at gRoot to be modified,
+ // thus bigArray[index] may be different from what it was before the call to
+ // bottomGetValue. So we should not defer loading bigArray until after this write.
+ *gOther = 100;
+
+ // We should return the value from bigArray from a previously loaded value of `this`.
+ return bigArray[index];
+ }
+}
+
+struct Root
+{
+ Bottom bottom1;
+ Bottom bottom2;
+}
+
+uniform Root* gRoot;
+uniform int* gOther;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ // SPV: OpLoad %Bottom_natural
+ outputBuffer[0] = gRoot.bottom1.bottomGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-bindless.slang b/tests/optimization/buffer-load-defer-bindless.slang
new file mode 100644
index 000000000..2108d562c
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-bindless.slang
@@ -0,0 +1,58 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Check that we can specialize buffer loads through bindless handles, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+ float bigArray[1024];
+ float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+ Bottom bottom;
+ float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+ StructuredBuffer<Middle>.Handle middle;
+
+ // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+ // struct to be loaded into registers. Instead, we should be able to specialize
+ // `middleGetValue` to take a `StructuredBuffer<Middle>.Handle` and an `int`
+ // index, and recursively specialize `bottomGetValue` to only load the `Bottom.bigArray[index]` element.
+ float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+ Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpLoad %Middle
+// SPV: %[[REG:[A-Za-z0-9_]+]] = OpLoad %float
+// SPV: OpStore {{.*}} %[[REG]]
+
+// Check that the generated CUDA code contains a specialized `bottomGetValue` function that has
+// the complete parameter list to access the `bigArray` element directly, without needing to load
+// the entire `Bottom` struct from the caller.
+//
+// CUDA-DAG: __device__ float Bottom_bottomGetValue{{.*}}(StructuredBuffer<Middle{{.*}}> {{.*}}, int {{.*}}, int {{.*}})
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer-user-pointer.slang b/tests/optimization/buffer-load-defer-user-pointer.slang
new file mode 100644
index 000000000..58e6386f9
--- /dev/null
+++ b/tests/optimization/buffer-load-defer-user-pointer.slang
@@ -0,0 +1,63 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=PTX): -target ptx -entry compute_main -stage compute
+
+// Check that we can specialize buffer loads through user pointers, and
+// do not load big struct elements into registers unnecessarily.
+
+struct Bottom
+{
+ float bigArray[1024];
+ float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+ Bottom bottom;
+ float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+ StructuredBuffer<Middle*>.Handle middle;
+
+ // Calling `middleGetValue` on `middle[0]` should not causing the entire `Middle`
+ // struct to be loaded into registers. Instead, we should be able to specialize
+ // `middleGetValue` to take a `Middle*` and recursively specialize `bottomGetValue`
+ // to only load the `Bottom.bigArray[index]` element.
+ float topGetValue(int index) { return middle[0].middleGetValue(index); }
+}
+
+struct Root
+{
+ Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that the generated CUDA code never loads a `Middle` or `Bottom` struct into a local var.
+// CUDA-NOT: Middle{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Bottom{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+// CUDA-NOT: Top{{[_A-Za-z0-9]*}} {{[a-zA-Z0-9_]+}} =
+
+// Check that the generated CUDA code can be compiled by nvrtc correctly into PTX.
+// PTX: compute_main
+
+// Check that the generated (unoptimized) SPIR-V contains a specialized Bottom_bottomGetValue function
+// that takes in a Bottom* and use access chain to load the required array element directly, without
+// needing to load the entire Bottom struct.
+// SPV: %Bottom_bottomGetValue = OpFunction %float None
+// SPV: OpFunctionParameter %_ptr_PhysicalStorageBuffer_Middle_natural
+// SPV: %[[INDEX:[A-Za-z0-9_]+]] = OpFunctionParameter %int
+// SPV: %[[PTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_float %{{.*}} %[[INDEX]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[PTR]]
+// SPV: OpReturnValue %[[VALUE]]
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-defer.slang b/tests/optimization/buffer-load-defer.slang
new file mode 100644
index 000000000..b2df43c13
--- /dev/null
+++ b/tests/optimization/buffer-load-defer.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+ float bigArray[1024];
+ float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Middle
+{
+ Bottom bottom;
+ float middleGetValue(int index) { return bottom.bottomGetValue(index); }
+}
+
+struct Top
+{
+ Middle middle;
+ float topGetValue(int index) { return middle.middleGetValue(index); }
+}
+
+struct Root
+{
+ Top top;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// SPV: OpEntryPoint
+// SPV-NOT: OpCompositeConstruct
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ outputBuffer[0] = cb.top.topGetValue(0);
+}
diff --git a/tests/optimization/buffer-load-specialize-1.slang b/tests/optimization/buffer-load-specialize-1.slang
new file mode 100644
index 000000000..55f2df473
--- /dev/null
+++ b/tests/optimization/buffer-load-specialize-1.slang
@@ -0,0 +1,35 @@
+//TEST:SIMPLE(filecheck=SPV): -target spirv -O0
+
+struct Bottom
+{
+ float bigArray[1024];
+ // SPV: %Bottom_bottomGetValue = OpFunction %float None %{{.*}}
+ // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+ // SPV-NEXT: OpLabel
+ // SPV-NOT: OpCompositeConstruct
+ // SPV: OpFunctionEnd
+
+ // SPV: %Bottom_bottomGetValue_0 = OpFunction %float None %{{.*}}
+ // SPV-NEXT: %{{.*}} = OpFunctionParameter %int
+ // SPV-NEXT: OpLabel
+ float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+ Bottom bottom1;
+ Bottom bottom2;
+}
+
+ConstantBuffer<Root> cb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ outputBuffer[0] = cb.bottom1.bottomGetValue(0);
+ outputBuffer[1] = cb.bottom2.bottomGetValue(1);
+ outputBuffer[2] = cb.bottom2.bottomGetValue(2);
+}
diff --git a/tests/optimization/buffer-store-defer.slang b/tests/optimization/buffer-store-defer.slang
new file mode 100644
index 000000000..14362a477
--- /dev/null
+++ b/tests/optimization/buffer-store-defer.slang
@@ -0,0 +1,51 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=PTX): -target ptx -stage compute -entry compute_main
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+struct Bottom
+{
+ float bigArray[1024];
+
+ [mutating]
+ void setVal(int index, float value) { bigArray[index] = value; }
+}
+
+struct Root
+{
+ Bottom top[2];
+ [mutating]
+ void setTopVal(int x, int y, float value)
+ {
+ top[x].setVal(y, value);
+ }
+}
+
+RWStructuredBuffer<Root> sb;
+
+// Check that we don't load the entire `Root` struct, modify it, and then write it back.
+// Instead we should generate a single store instruction to write the single float value
+// directly to the buffer.
+
+// SPV: OpEntryPoint
+// SPV: OpLabel
+// SPV-NEXT: OpAccessChain
+// SPV-NOT: OpCompositeInsert
+// SPV-NOT: OpLoad
+// SPV: OpStore
+// SPV-NOT: OpLoad
+// SPV-NOT: OpCompositeInsert
+// SPV: OpStore
+// SPV: OpReturn
+
+// CUDA: __device__ void Bottom_setVal_0(int [[INDEX0:[A-Za-z0-9_]+]], int [[INDEX1:[A-Za-z0-9_]+]], int [[INDEX2:[A-Za-z0-9_]+]], float [[VAL:[A-Za-z0-9_]+]])
+// CUDA: (&(&(globalParams{{.*}}->sb{{.*}}){{\[}}[[INDEX0]]{{\]}})->top{{.*}}{{\[}}[[INDEX1]]{{\]}})->bigArray{{.*}}{{\[}}[[INDEX2]]{{\]}} = [[VAL]];
+// PTX: compute_main
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ sb[0].setTopVal(1, 2, 100.0f);
+
+ sb[3].top[1].setVal(8, 200.0f);
+}
diff --git a/tests/optimization/defer-structured-buffer-load.slang b/tests/optimization/defer-structured-buffer-load.slang
new file mode 100644
index 000000000..f7f9b1888
--- /dev/null
+++ b/tests/optimization/defer-structured-buffer-load.slang
@@ -0,0 +1,38 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -entry compute_main -stage compute
+//TEST:SIMPLE(filecheck=SPV): -target spirv
+
+// Test that we can defer loading big structured buffer elements.
+
+struct Bottom
+{
+ float bigArray[1024];
+ float bottomGetValue(int index) { return bigArray[index]; }
+}
+
+struct Root
+{
+ Bottom bottom;
+}
+
+StructuredBuffer<Root> sb;
+
+RWStructuredBuffer<float> outputBuffer;
+
+// Check that we don't load the entire `Root` struct and then do ElementExtract to get to `bigArray[0]`.
+// Instead we use access chain all the way to point to the required array element, and load just a single float.
+
+// SPV: OpEntryPoint
+// SPV: %[[SBPTRARRAY:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer__arr_float_int_1024
+// SPV: %[[SBPTR:[A-Za-z0-9_]+]] = OpAccessChain %_ptr_StorageBuffer_float %[[SBPTRARRAY]]
+// SPV: %[[VALUE:[A-Za-z0-9_]+]] = OpLoad %float %[[SBPTR]]
+// SPV: OpStore %{{.*}} %[[VALUE]]
+
+// CUDA: __device__ float Bottom_bottomGetValue{{.*}}(uint [[PARAM0:[A-Za-z0-9_]+]], int [[PARAM1:[A-Za-z0-9_]+]])
+// CUDA: return (&(&(globalParams_0->sb_0){{\[}}[[PARAM0]]{{\]}})->bottom_0)->bigArray_0{{\[}}[[PARAM1]]{{\]}};
+
+[shader("compute")]
+[numthreads(1, 1, 1)]
+void compute_main(uint3 tid: SV_DispatchThreadID)
+{
+ outputBuffer[0] = sb[tid.x].bottom.bottomGetValue(0);
+}
diff --git a/tests/vkray/raygen-trace-ray-param-non-struct.slang b/tests/vkray/raygen-trace-ray-param-non-struct.slang
index b0a129761..72d85ed02 100644
--- a/tests/vkray/raygen-trace-ray-param-non-struct.slang
+++ b/tests/vkray/raygen-trace-ray-param-non-struct.slang
@@ -28,7 +28,7 @@ void main()
// CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
// CHECK: TraceRay(
- // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+ // CHECK: rayPayload{{.*}}.data{{.*}};
TraceRay(as,
1,
0xff,
@@ -39,9 +39,9 @@ void main()
someInData1);
outputBuffer1[0] = outputBuffer1[0]+someInData1;
- // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
+ // CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = rayPayload{{.*}}.data{{.*}};
// CHECK: TraceMotionRay(
- // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+ // CHECK: rayPayload{{.*}}.data{{.*}};
TraceMotionRay(as,
1,
0xff,
@@ -55,7 +55,7 @@ void main()
// CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
// CHECK: NvTraceRayHitObject(
- // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+ // CHECK: rayPayload{{.*}}.data{{.*}};
HitObject::TraceRay(as,
1,
0xff,
@@ -68,7 +68,7 @@ void main()
// CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
// CHECK: TraceMotionRay(
- // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+ // CHECK: rayPayload{{.*}}.data{{.*}};
HitObject::TraceMotionRay(as,
1,
0xff,
@@ -82,7 +82,7 @@ void main()
// CHECK: rayPayload{{_[0-9]}}.data{{_[0-9]}} = {{.*}}
// CHECK: NvInvokeHitObject(
- // CHECK: {{.*}} = rayPayload{{.*}}.data{{.*}};
+ // CHECK: rayPayload{{.*}}.data{{.*}};
HitObject hitObject_HitOrMiss;
HitObject::Invoke(
as,
diff --git a/tests/wgsl/switch-case.slang b/tests/wgsl/switch-case.slang
index c4ff0996e..fc24bd67a 100644
--- a/tests/wgsl/switch-case.slang
+++ b/tests/wgsl/switch-case.slang
@@ -70,17 +70,14 @@ func fs_main(VertexOutput input)->FragmentOutput
return output;
}
-//WGSL: fn _S9( _S10 : Tuple_0) -> f32
-//WGSL-NEXT: {
-//WGSL-NEXT: switch(_S10.value1_0.x)
+//WGSL: switch({{.*}})
//WGSL-NEXT: {
//WGSL-NEXT: case u32(0):
//WGSL-NEXT: {
-//WGSL-NEXT: return Circle_getArea_0(unpackAnyValue16_0(_S10.value2_0));
+//WGSL-NEXT: return Circle_getArea_0
//WGSL-NEXT: }
//WGSL-NEXT: default :
//WGSL-NEXT: {
-//WGSL-NEXT: return Rectangle_getArea_0(unpackAnyValue16_1(_S10.value2_0));
+//WGSL-NEXT: return Rectangle_getArea_0
//WGSL-NEXT: }
//WGSL-NEXT: }
-//WGSL-NEXT: }