diff options
| author | Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com> | 2023-05-07 13:35:27 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-05-07 10:35:27 -0700 |
| commit | 89a1234964a1927c4936a2758f72b7d6c9d0bc73 (patch) | |
| tree | a5967857c68419795cdf39cd0eb2e8ade29cf763 | |
| parent | 271dc1b98d3887b6297c5407dc67692716687f4d (diff) | |
Optimize logic around indexed temporary variables (#2873)
| -rw-r--r-- | source/slang/slang-ir-autodiff-primal-hoist.cpp | 58 | ||||
| -rw-r--r-- | tests/autodiff/reverse-loop-checkpoint-test.slang | 95 | ||||
| -rw-r--r-- | tests/autodiff/reverse-loop-checkpoint-test.slang.expected.txt | 6 |
3 files changed, 158 insertions, 1 deletions
diff --git a/source/slang/slang-ir-autodiff-primal-hoist.cpp b/source/slang/slang-ir-autodiff-primal-hoist.cpp index ab23aeb40..353d56cfa 100644 --- a/source/slang/slang-ir-autodiff-primal-hoist.cpp +++ b/source/slang/slang-ir-autodiff-primal-hoist.cpp @@ -36,9 +36,16 @@ static bool isDifferentialBlock(IRBlock* block) return block->findDecoration<IRDifferentialInstDecoration>(); } -static IRBlock* getLoopRegionBodyBlock(IRLoop* loop) +static IRBlock* getLoopConditionBlock(IRLoop* loop) { auto condBlock = as<IRBlock>(loop->getTargetBlock()); + SLANG_ASSERT(as<IRIfElse>(condBlock->getTerminator())); + return condBlock; +} + +static IRBlock* getLoopRegionBodyBlock(IRLoop* loop) +{ + auto condBlock = getLoopConditionBlock(loop); // We assume the loop body always sit at the true side of the if-else. if (auto ifElse = as<IRIfElse>(condBlock->getTerminator())) { @@ -183,6 +190,12 @@ static Dictionary<IRBlock*, IRBlock*> createPrimalRecomputeBlocks( auto bodyRecomputeBlock = createRecomputeBlock(bodyBlock); bodyRecomputeBlock->insertBefore(diffBodyBlock); diffBodyBlock->replaceUsesWith(bodyRecomputeBlock); + + // Map the primal condition block directly to the diff + // conditon block (we won't create a recompute block for this) + // + recomputeBlockMap[getLoopConditionBlock(loop)] = getLoopConditionBlock(diffLoop); + moveParams(bodyRecomputeBlock, diffBodyBlock); { // After CFG normalization, the loop body will contain only jumps to the @@ -1060,6 +1073,37 @@ static int getInstRegionNestLevel( return (int)result; } +// Trim defBlockIndices based on the indices of out of scope uses. +// +static List<IndexTrackingInfo> maybeTrimIndices( + const List<IndexTrackingInfo>& defBlockIndices, + const Dictionary<IRBlock*, List<IndexTrackingInfo>>& indexedBlockInfo, + const List<IRUse*>& outOfScopeUses) +{ + // Go through uses, lookup the defBlockIndices, and remove any indices if they + // are not present in any of the uses. (This is sort of slow...) + // + List<IndexTrackingInfo> result; + for (auto& index : defBlockIndices) + { + bool found = false; + for (auto& use : outOfScopeUses) + { + auto useInst = use->getUser(); + auto useBlock = useInst->getParent(); + auto useBlockIndices = indexedBlockInfo[as<IRBlock>(useBlock)].getValue(); + if (useBlockIndices.contains(index)) + { + found = true; + break; + } + } + if (found) + result.add(index); + } + return result; +} + /// Legalizes all accesses to primal insts from recompute and diff blocks. /// @@ -1230,6 +1274,12 @@ RefPtr<HoistedPrimalsInfo> ensurePrimalAvailability( setInsertAfterOrdinaryInst(&builder, getInstInBlock(storeUse->getUser())); + // There is an edge-case optimization we apply here, + // If none of the out-of-scope uses are actually within the indexed + // region, that means there's no need to allocate a fully indexed var. + // + defBlockIndices = maybeTrimIndices(defBlockIndices, indexedBlockInfo, outOfScopeUses); + IRVar* localVar = storeIndexedValue( &builder, varBlock, @@ -1260,6 +1310,11 @@ RefPtr<HoistedPrimalsInfo> ensurePrimalAvailability( { defBlockIndices.removeAt(0); } + else + { + // For all others, check out of scope uses and trim indices if possible. + defBlockIndices = maybeTrimIndices(defBlockIndices, indexedBlockInfo, outOfScopeUses); + } setInsertAfterOrdinaryInst(&builder, instToStore); auto localVar = storeIndexedValue(&builder, varBlock, instToStore, defBlockIndices); @@ -1650,6 +1705,7 @@ static bool shouldStoreInst(IRInst* inst) case kIROp_BitXor: case kIROp_Lsh: case kIROp_Rsh: + case kIROp_Select: return false; case kIROp_GetElement: diff --git a/tests/autodiff/reverse-loop-checkpoint-test.slang b/tests/autodiff/reverse-loop-checkpoint-test.slang new file mode 100644 index 000000000..732360013 --- /dev/null +++ b/tests/autodiff/reverse-loop-checkpoint-test.slang @@ -0,0 +1,95 @@ +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -output-using-type +//TEST:SIMPLE(filecheck=CHECK): -target hlsl -profile cs_5_0 -entry computeMain -line-directive-mode none + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0], stride=4):out,name=outputBuffer +RWStructuredBuffer<float> outputBuffer; + +typedef DifferentialPair<float> dpfloat; +typedef float.Differential dfloat; + +// Test that compute does not have a context. +// CHECK-NOT: struct {{[a-zA-Z0-9_]*}}_compute_{{[a-zA-Z0-9_]*}} + +[BackwardDifferentiable] +[PreferRecompute] +float compute(float x, float y) +{ + return x * y; +} + +[BackwardDifferentiable] +[ForceInline] +float infinitesimal(float x) +{ + return x - detach(x); +} + +// Test that computeLoop compiles to just return 0. +// CHECK: float3 computeLoop{{[_0-9]*}}(float y{{[_0-9]*}}) +// CHECK-NOT: for{{.*}} +// CHECK: return (float3)0 + +// Test that computeLoop's intermediates have no float sitting +// around (must not cache the outvar from 'compute()') +// CHECK: struct s_bwd_computeLoop_Intermediates +// CHECK-NEXT: { +// CHECK-NOT: {{[A-Za-z0-9_]+}} {{[A-Za-z0-9_]+}}[{{.*}}] +// CHECK: } + +[PreferRecompute] +[BackwardDifferentiable] +[ForceInline] +float3 infinitesimal(float3 x) +{ + return x - detach(x); +} + +[BackwardDifferentiable] +[PreferRecompute] +float3 computeLoop(float y) +{ + float w = 0; + float3 w3 = float3(0, 0, 0); + + for (int i = 0; i < 8; i++) + { + float k = compute(i, y); + float g = select(k > 0.0, k, 0.0); + w += g; + w3 += float3(g) * y; + } + + var p = (w3 / float3(w)); + return infinitesimal(p); +} + +// Since computeLoop is recomputed, test_simple_loop should have nothing to store +// therefore we check that there is no intermediate context type generated for test_simple_loop. + +// CHECK-NOT: struct {{[a-zA-Z0-9_]*}}test_simple_loop{{[a-zA-Z0-9_]*}} +[BackwardDifferentiable] +float test_simple_loop(float y) +{ + float3 x = computeLoop(y); + return y + x.x; +} + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + { + dpfloat dpa = dpfloat(1.0, 0.0); + + __bwd_diff(test_simple_loop)(dpa, 1.0f); + outputBuffer[0] = dpa.d; // Expect: 1.0 + } + + { + dpfloat dpa = dpfloat(0.4, 0.0); + + __bwd_diff(test_simple_loop)(dpa, 0.5f); + outputBuffer[1] = dpa.d; // Expect: 2.0 + } + + outputBuffer[2] = computeLoop(1.0).x; +} diff --git a/tests/autodiff/reverse-loop-checkpoint-test.slang.expected.txt b/tests/autodiff/reverse-loop-checkpoint-test.slang.expected.txt new file mode 100644 index 000000000..86aa47f11 --- /dev/null +++ b/tests/autodiff/reverse-loop-checkpoint-test.slang.expected.txt @@ -0,0 +1,6 @@ +type: float +2.000000 +1.000000 +0.000000 +0.000000 +0.000000 |
