summaryrefslogtreecommitdiff
path: root/source/slang/slang-ir-defer-buffer-load.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/slang/slang-ir-defer-buffer-load.cpp')
-rw-r--r--source/slang/slang-ir-defer-buffer-load.cpp326
1 files changed, 201 insertions, 125 deletions
diff --git a/source/slang/slang-ir-defer-buffer-load.cpp b/source/slang/slang-ir-defer-buffer-load.cpp
index 51c6a161b..ccdfe4538 100644
--- a/source/slang/slang-ir-defer-buffer-load.cpp
+++ b/source/slang/slang-ir-defer-buffer-load.cpp
@@ -3,142 +3,211 @@
#include "slang-ir-clone.h"
#include "slang-ir-dominators.h"
#include "slang-ir-insts.h"
+#include "slang-ir-layout.h"
#include "slang-ir-redundancy-removal.h"
#include "slang-ir-util.h"
#include "slang-ir.h"
namespace Slang
{
-struct DeferBufferLoadContext
-{
- // Map an original SSA value to a pointer that can be used to load the value.
- Dictionary<IRInst*, IRInst*> mapValueToPtr;
- // Map an ptr to its loaded value.
- Dictionary<IRInst*, IRInst*> mapPtrToValue;
+// Generally, we want to specialize arguments that are large in size, or arguments that
+// are arrays or composite types that contain arrays.
+// This is because:
+// 1. Struct types without arrays will eventually be SROA'd into registers and then effectively
+// DCE'd, so they usually won't cause performance issues. In fact, front loading structs
+// and reusing the loaded value instead of repetitively loading from constant memory is
+// usually beneficial to performance. However large struct values can be SROA'd into a large
+// number of registers, causing slow downstream compilation. Therefore we should avoid/defer
+// loading them into registers if we can.
+// 2. Arrays usually cannot be SROA'd into individual registers, which usually leads to
+// large register consumption if they ever get loaded, so we want to defer loading array
+// typed values as much as possible.
- IRFunc* currentFunc = nullptr;
+// If the argument data is bigger than this threshold, it is considered a large object
+// and we will try to specialize it even if it doesn't contain arrays.
+static const int kBufferLoadElementSizeSpecializationThreshold = 128;
- // Ensure that for an original SSA value, we have formed a pointer that can be used to load the
- // value.
- IRInst* ensurePtr(IRInst* valueInst)
- {
- IRInst* result = nullptr;
- if (mapValueToPtr.tryGetValue(valueInst, result))
- return result;
+// If the argument data is smaller than this threshold, it is considered a tiny object
+// and we will not consider specializing it, even if it contains arrays.
+static const int kBufferLoadElementSizeSpecializationMinThreshold = 16;
- IRBuilder b(valueInst);
- b.setInsertBefore(valueInst);
-
- switch (valueInst->getOp())
+static bool isCompositeTypeContainingArrays(IRType* type)
+{
+ if (auto structType = as<IRStructType>(type))
+ {
+ for (auto field : structType->getFields())
{
- case kIROp_StructuredBufferLoad:
- case kIROp_StructuredBufferLoadStatus:
- {
- result = b.emitRWStructuredBufferGetElementPtr(
- valueInst->getOperand(0),
- valueInst->getOperand(1));
- break;
- }
- case kIROp_GetElement:
+ if (const auto arrayType = as<IRArrayTypeBase>(field->getFieldType()))
{
- auto ptr = ensurePtr(valueInst->getOperand(0));
- if (!ptr)
- return nullptr;
- result = b.emitElementAddress(ptr, valueInst->getOperand(1));
- break;
+ return true;
}
- case kIROp_FieldExtract:
+ if (auto subStructType = as<IRStructType>(field->getFieldType()))
{
- auto ptr = ensurePtr(valueInst->getOperand(0));
- if (!ptr)
- return nullptr;
- result = b.emitFieldAddress(ptr, valueInst->getOperand(1));
- break;
+ if (isCompositeTypeContainingArrays(subStructType))
+ return true;
}
- case kIROp_Load:
- result = valueInst->getOperand(0);
- break;
- }
- if (result)
- {
- mapValueToPtr[valueInst] = result;
}
- return result;
}
+ else if (as<IRArrayTypeBase>(type))
+ {
+ return true;
+ }
+ return false;
+}
- static bool isImmutableBufferLoad(IRInst* inst)
+bool isTypePreferrableToDeferLoad(CodeGenContext* codeGenContext, IRType* type)
+{
+ // If the parameter is a pointer/reference, we should consider specializing it.
+ if (as<IROutTypeBase>(type) || as<IRRefType>(type) || as<IRConstRefType>(type))
+ return true;
+
+ // We only want to defer loading values that are "large enough" that
+ // we expect them to be expensive to pass by value.
+ //
+ IRSizeAndAlignment sizeAlignment = {};
+ if (SLANG_FAILED(getNaturalSizeAndAlignment(
+ codeGenContext->getTargetProgram()->getOptionSet(),
+ type,
+ &sizeAlignment)))
{
- // Note: we cannot defer loads from RWStructuredBuffer because there can be other
- // instructions that modify the buffer.
+ // If the type contains fields that we don't know how to compute the natural size
+ // for, default to specializing if it contains arrays.
+ return isCompositeTypeContainingArrays(type);
+ }
+
+ // If the argument is very small, don't bother specializing.
+ if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationMinThreshold)
+ return false;
+
+ // If the argument is somewhat small, don't specialize, unless it contains
+ // arrays.
+ if (sizeAlignment.size <= kBufferLoadElementSizeSpecializationThreshold)
+ {
+ // We generally do not specialize for small values, except when they contain
+ // arrays, which usually present a challenge for the SROA pass to eliminate
+ // unnecessary loads.
+ if (!isCompositeTypeContainingArrays(type))
+ return false;
+ }
+ return true;
+}
+
+// Returns true if memory loaded by `loadInst` is not modified before `userInst` after it is
+// loaded.
+// This method is currently implementing a very conservative analysis that only allows
+// `loadInst` to be in the same block as `userInst`, with basic aliasing analysis for any
+// stores in between. In all other cases we conservatively assume the memory location may be
+// modified.
+bool isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ TargetRequest* target,
+ IRInst* loadInst,
+ IRInst* userInst)
+{
+ auto func = getParentFunc(loadInst);
+ if (!func)
+ return false;
+
+ // For now we only check if loadInst and userInst are in the same block.
+ if (loadInst->getParent() != userInst->getParent())
+ return false;
+
+ for (IRInst* inst = loadInst->getNextInst(); inst; inst = inst->getNextInst())
+ {
+ // We reached userInst before hitting any instruction that may modify the memory.
+ if (inst == userInst)
+ return true;
+
+ if (!inst->mightHaveSideEffects())
+ continue;
+
+ // If we see any inst that has side effects, check if it is a simple case where we can rule
+ // out the possibility of it modifying the memory location.
switch (inst->getOp())
{
- case kIROp_StructuredBufferLoad:
- case kIROp_StructuredBufferLoadStatus:
- return true;
- case kIROp_Load:
+ case kIROp_Store:
{
- auto rootAddr = getRootAddr(inst->getOperand(0));
- return isPointerToImmutableLocation(rootAddr);
+ auto storedDest = inst->getOperand(0);
+ if (canAddressesPotentiallyAlias(target, func, loadInst->getOperand(0), storedDest))
+ return false;
+ continue;
}
default:
+ // For any other case, conservatively assume the memory location may be modified.
return false;
}
}
+ // We didn't find userInst after loadInst within the same basic block.
+ // We conservatively assume the memory location may be modified.
+ // This check can be extended to use the dominator tree to allow
+ // loadInst and userInst to be in different blocks.
+ return false;
+}
- // Ensure that for a pointer value, we have created a load instruction to materialize the value.
- IRInst* materializePointer(IRBuilder& builder, IRInst* loadInst)
+struct DeferBufferLoadContext
+{
+ CodeGenContext* codeGenContext;
+
+
+ void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
{
- auto ptr = ensurePtr(loadInst);
- if (!ptr)
- return nullptr;
- IRInst* result = nullptr;
- if (mapPtrToValue.tryGetValue(ptr, result))
- return result;
- IRAlignedAttr* align = nullptr;
- if (auto load = as<IRLoad>(loadInst))
- align = load->findAttr<IRAlignedAttr>();
- if (!as<IRModuleInst>(ptr->getParent()))
+ // Don't defer the load if the type is not worth deferring or the load has an aligned attr.
+ if (!isTypePreferrableToDeferLoad(codeGenContext, loadInst->getDataType()) ||
+ loadInst->findAttr<IRAlignedAttr>())
{
- setInsertAfterOrdinaryInst(&builder, ptr);
- IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
- result = builder.emitLoad(valueType, ptr, align);
- mapPtrToValue[ptr] = result;
+ return;
}
- else
+
+ auto rootAddr = getRootAddr(loadInst->getOperand(0));
+ bool isImmutableBufferLoad = isPointerToImmutableLocation(rootAddr);
+
+ // Don't defer the load if there are uses that are not getElement or fieldExtract.
+ // Because in this case we need to use the entire loaded value, and further deferring
+ // the load down any access chain will introduce redundant loads.
+ for (auto use = loadInst->firstUse; use; use = use->nextUse)
{
- setInsertBeforeOrdinaryInst(&builder, loadInst);
- IRType* valueType = tryGetPointedToType(&builder, ptr->getFullType());
- result = builder.emitLoad(valueType, ptr, align);
- // Since we are inserting the load in a local scope, we can't register
- // the mapping to the pointer, since the global pointer needs to be
- // loaded once per function.
+ auto user = use->getUser();
+ switch (user->getOp())
+ {
+ case kIROp_GetElement:
+ case kIROp_FieldExtract:
+ // Can we defer the load to load only the requested element right before
+ // the element extract inst?
+ // If the buffer is immutable, we can always do that.
+ // If it is not, we need to make sure there are no other instructions that can modify
+ // the buffer between the load and the use.
+ //
+ if (isImmutableBufferLoad)
+ continue;
+ if (isMemoryLocationUnmodifiedBetweenLoadAndUser(
+ codeGenContext->getTargetReq(),
+ loadInst,
+ user))
+ continue;
+ return;
+ default:
+ // If we see any other use of the load instruction, we assume the entire loaded value
+ // is needed, and we can't defer the load anymore.
+ return;
+ }
}
- return result;
- }
- static bool isSimpleType(IRInst* type)
- {
- if (auto modType = as<IRRateQualifiedType>(type))
- type = modType->getValueType();
- if (as<IRStructType>(type))
- return false;
- if (as<IRTupleType>(type))
- return false;
- if (as<IRArrayTypeBase>(type))
- return false;
- return true;
- }
+ // If we reach here, it means all uses are getElement or fieldExtract, and
+ // it is safe to defer the load down the access chain.
- void deferBufferLoadInst(IRBuilder& builder, List<IRInst*>& workList, IRInst* loadInst)
- {
- // Don't defer the load anymore if the type is simple.
- if (isSimpleType(loadInst->getDataType()) || loadInst->findAttr<IRAlignedAttr>())
+ if (loadInst->getOp() == kIROp_StructuredBufferLoad)
{
- auto materializedVal = materializePointer(builder, loadInst);
- loadInst->transferDecorationsTo(materializedVal);
- loadInst->replaceUsesWith(materializedVal);
- return;
+ // Convert the structuredBufferLoad to a regular load to reuse
+ // the same logic for deferring regular loads.
+ builder.setInsertBefore(loadInst);
+ auto bufferPtr = builder.emitRWStructuredBufferGetElementPtr(
+ loadInst->getOperand(0),
+ loadInst->getOperand(1));
+ auto sbLoad = builder.emitLoad(bufferPtr);
+ loadInst->transferDecorationsTo(sbLoad);
+ loadInst->replaceUsesWith(sbLoad);
+ loadInst->removeAndDeallocate();
+ loadInst = sbLoad;
}
// Otherwise, look for all uses and try to defer the load before actual use of the value.
@@ -148,19 +217,29 @@ struct DeferBufferLoadContext
loadInst,
[&](IRUse* use)
{
- if (needMaterialize)
- return;
-
auto user = use->getUser();
+
switch (user->getOp())
{
case kIROp_GetElement:
case kIROp_FieldExtract:
{
- auto basePtr = ensurePtr(loadInst);
- if (!basePtr)
- return;
- pendingWorkList.add(user);
+ // If we see a getElement or fieldExtract, we defer the load by
+ // replacing the getElement/fieldExtract with a load of the
+ // elementAddr/fieldAddr.
+ builder.setInsertBefore(user);
+ auto basePtr = loadInst->getOperand(0);
+ IRInst* gepArg = user->getOperand(1);
+ auto elementPtr = builder.emitElementAddress(
+ basePtr,
+ makeArrayViewSingle<IRInst*>(gepArg));
+ auto newLoad = builder.emitLoad(elementPtr);
+ user->transferDecorationsTo(newLoad);
+ user->replaceUsesWith(newLoad);
+ user->removeAndDeallocate();
+
+ // Now add the new load to work list to try to defer it further.
+ pendingWorkList.add(newLoad);
}
break;
default:
@@ -169,41 +248,37 @@ struct DeferBufferLoadContext
}
});
- if (needMaterialize)
- {
- auto val = materializePointer(builder, loadInst);
- loadInst->transferDecorationsTo(val);
- loadInst->replaceUsesWith(val);
- loadInst->removeAndDeallocate();
- }
- else
- {
- // Append to worklist in reverse order so we process the uses in natural appearance
- // order.
- for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
- workList.add(pendingWorkList[i]);
- }
+ // Append to worklist in reverse order so we process the uses in natural appearance
+ // order.
+ for (Index i = pendingWorkList.getCount() - 1; i >= 0; i--)
+ workList.add(pendingWorkList[i]);
}
void deferBufferLoadInFunc(IRFunc* func)
{
removeRedundancyInFunc(func, false);
- currentFunc = func;
-
List<IRInst*> workList;
+ // Discover all load instructions and add to work list.
+
for (auto block : func->getBlocks())
{
for (auto inst : block->getChildren())
{
- if (isImmutableBufferLoad(inst))
+ switch (inst->getOp())
{
+ case kIROp_Load:
+ case kIROp_StructuredBufferLoad:
+ // Note: We don't handle `kIROp_StructuredBufferLoadStatus` here because
+ // it also writes to the status code out parameter, which we can't defer.
workList.add(inst);
+ break;
}
}
}
+ // Iteratively process the work list until it is empty.
IRBuilder builder(func);
for (Index i = 0; i < workList.getCount(); i++)
{
@@ -227,9 +302,10 @@ struct DeferBufferLoadContext
}
};
-void deferBufferLoad(IRModule* module)
+void deferBufferLoad(CodeGenContext* codeGenContext, IRModule* module)
{
DeferBufferLoadContext context;
+ context.codeGenContext = codeGenContext;
for (auto childInst : module->getGlobalInsts())
{
if (auto code = as<IRGlobalValueWithCode>(childInst))