summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
Diffstat (limited to 'source')
-rw-r--r--source/slang/hlsl.meta.slang9
-rw-r--r--source/slang/slang-emit-cuda.cpp25
-rw-r--r--source/slang/slang-emit.cpp26
-rw-r--r--source/slang/slang-ir-dominators.cpp50
-rw-r--r--source/slang/slang-ir-dominators.h4
-rw-r--r--source/slang/slang-ir-inst-defs.h8
-rw-r--r--source/slang/slang-ir-insts.h9
-rw-r--r--source/slang/slang-ir-ssa.cpp167
-rw-r--r--source/slang/slang-ir-synthesize-active-mask.cpp2109
-rw-r--r--source/slang/slang-ir-synthesize-active-mask.h26
-rw-r--r--source/slang/slang-ir.cpp49
-rw-r--r--source/slang/slang-ir.h28
-rw-r--r--source/slang/slang.vcxproj20
-rw-r--r--source/slang/slang.vcxproj.filters15
14 files changed, 2463 insertions, 82 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index fcd028b6e..ac49402a1 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2528,12 +2528,17 @@ __target_intrinsic(cuda, "__activemask()")
__target_intrinsic(hlsl, "WaveActiveBallot(true).x")
WaveMask WaveGetConvergedMask();
+__intrinsic_op($(kIROp_WaveGetActiveMask))
+WaveMask __WaveGetActiveMask();
+
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true).x")
__target_intrinsic(hlsl, "WaveActiveBallot(true).x")
-__target_intrinsic(cuda, "__activemask()") // Note: semantically incorrect, but best we can do for now.
-WaveMask WaveGetActiveMask();
+WaveMask WaveGetActiveMask()
+{
+ return __WaveGetActiveMask();
+}
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index fa63ba255..d8f910faa 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -518,6 +518,31 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
m_writer->emit("\n}");
return true;
}
+ case kIROp_WaveMaskBallot:
+ {
+ m_writer->emit("__ballot_sync(");
+ emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+ m_writer->emit(", ");
+ emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+ m_writer->emit(")");
+ return true;
+ }
+ case kIROp_WaveMaskMatch:
+ {
+ SemanticVersion version;
+ version.set(7, 0);
+ if (version > m_extensionTracker->m_smVersion)
+ {
+ m_extensionTracker->m_smVersion = version;
+ }
+
+ m_writer->emit("__match_any_sync(");
+ emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+ m_writer->emit(", ");
+ emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+ m_writer->emit(")");
+ return true;
+ }
default: break;
}
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index efa56c261..6c29485cd 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -17,6 +17,7 @@
#include "slang-ir-specialize-resources.h"
#include "slang-ir-ssa.h"
#include "slang-ir-strip-witness-tables.h"
+#include "slang-ir-synthesize-active-mask.h"
#include "slang-ir-union.h"
#include "slang-ir-validate.h"
#include "slang-ir-wrap-structured-buffers.h"
@@ -500,6 +501,31 @@ Result linkAndOptimizeIR(
legalizeByteAddressBufferOps(session, irModule, byteAddressBufferOptions);
}
+ // For CUDA targets only, we will need to turn operations
+ // the implicitly reference the "active mask" into ones
+ // that use (and pass around) an explicit mask instead.
+ //
+ switch(target)
+ {
+ case CodeGenTarget::CUDASource:
+ case CodeGenTarget::PTX:
+ {
+ synthesizeActiveMask(
+ irModule,
+ compileRequest->getSink());
+
+#if 0
+ dumpIRIfEnabled(compileRequest, irModule, "AFTER synthesizeActiveMask");
+#endif
+ validateIRModuleIfEnabled(compileRequest, irModule);
+
+ }
+ break;
+
+ default:
+ break;
+ }
+
// For GLSL only, we will need to perform "legalization" of
// the entry point and any entry-point parameters.
//
diff --git a/source/slang/slang-ir-dominators.cpp b/source/slang/slang-ir-dominators.cpp
index 7960bcaf1..bae9f772f 100644
--- a/source/slang/slang-ir-dominators.cpp
+++ b/source/slang/slang-ir-dominators.cpp
@@ -34,6 +34,27 @@ bool IRDominatorTree::immediatelyDominates(IRBlock* dominator, IRBlock* dominate
bool IRDominatorTree::properlyDominates(IRBlock* dominator, IRBlock* dominated)
{
+ // We need to deal with the cases where `dominator` and/or
+ // `dominated` are unreachable, and thus not represtend
+ // in the nodes of the dominator tree we constructed.
+ //
+ // If `dominated` is unreachable, then there are zero
+ // control flow paths that can reach it, so that *all*
+ // of those (zero) control flow paths go through
+ // `dominator`.
+ //
+ if(isUnreachable(dominated))
+ return true;
+
+ // If `dominated` is reachable then there must exist at least
+ // one control-flow path to it. Thus if `dominator` is not
+ // reachable, it cannot be on that path, and thus must
+ // not be a dominator.
+ //
+ if(isUnreachable(dominator))
+ return false;
+
+
// Because of how we laid out the tree, we can test if one node
// properly dominates another in constant time.
//
@@ -67,6 +88,11 @@ bool IRDominatorTree::dominates(IRBlock* dominator, IRBlock* dominated)
IRBlock* IRDominatorTree::getImmediateDominator(IRBlock* block)
{
+ // An unreachable block has no immediate dominator.
+ //
+ if(isUnreachable(block))
+ return nullptr;
+
// The immediate dominator of a block is its parent
// in the dominator tree. Looking this up is straightforward,
// and we just need to be a bit careful to deal with
@@ -83,6 +109,11 @@ IRBlock* IRDominatorTree::getImmediateDominator(IRBlock* block)
IRDominatorTree::DominatedList IRDominatorTree::getImmediatelyDominatedBlocks(IRBlock* block)
{
+ // An unreachable block doesn't immediately dominate anything.
+ //
+ if(isUnreachable(block))
+ return DominatedList();
+
// Because of our representation, the immediately dominated blocks
// for a node are contiguous, and we store their range in the
// node already.
@@ -99,6 +130,13 @@ IRDominatorTree::DominatedList IRDominatorTree::getImmediatelyDominatedBlocks(IR
IRDominatorTree::DominatedList IRDominatorTree::getProperlyDominatedBlocks(IRBlock* block)
{
+ // Technically each unreachable block dominates all the other
+ // unreachable blocks, but setting things up to answer that
+ // query "correctly" would be a hassle.
+ //
+ if(isUnreachable(block))
+ return DominatedList();
+
// Because of our representation, the properly dominated blocks
// for a node are contiguous, and we store their range in the
// node already.
@@ -123,6 +161,12 @@ Int IRDominatorTree::getBlockIndex(IRBlock* block)
return index;
}
+bool IRDominatorTree::isUnreachable(IRBlock* block)
+{
+ return !mapBlockToIndex.ContainsKey(block);
+}
+
+
// IRDominatorTree::DominatedList
IRDominatorTree::DominatedList::DominatedList()
@@ -181,6 +225,12 @@ bool IRDominatorTree::DominatedList::Iterator::operator==(Iterator const& that)
return mIndex == that.mIndex;
}
+bool IRDominatorTree::DominatedList::Iterator::operator!=(Iterator const& that) const
+{
+ SLANG_ASSERT(mTree == that.mTree);
+ return mIndex != that.mIndex;
+}
+
//
// The dominance computation algorithm we are using relies on being able to compute
// a reverse postorder traversal of the nodes in the CFG, which is done using a depth-first
diff --git a/source/slang/slang-ir-dominators.h b/source/slang/slang-ir-dominators.h
index 7ec31b821..f4ecd0cd0 100644
--- a/source/slang/slang-ir-dominators.h
+++ b/source/slang/slang-ir-dominators.h
@@ -54,6 +54,9 @@ namespace Slang
/// These are the descendents of the block in the dominator tree.
DominatedList getProperlyDominatedBlocks(IRBlock* block);
+ /// Is `block` unrechable in the control flow graph?
+ bool isUnreachable(IRBlock* block);
+
struct DominatedList
{
public:
@@ -67,6 +70,7 @@ namespace Slang
IRBlock* operator*() const;
void operator++();
bool operator==(Iterator const& that) const;
+ bool operator!=(Iterator const& that) const;
private:
friend struct DominatedList;
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index 46ad566fd..b07703503 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -417,6 +417,14 @@ INST(Dot, dot, 2, 0)
INST(GetStringHash, getStringHash, 1, 0)
+INST(WaveGetActiveMask, waveGetActiveMask, 0, 0)
+
+ /// trueMask = waveMaskBallot(mask, condition)
+INST(WaveMaskBallot, waveMaskBallot, 2, 0)
+
+ /// matchMask = waveMaskBallot(mask, value)
+INST(WaveMaskMatch, waveMaskMatch, 2, 0)
+
// Texture sampling operation of the form `t.Sample(s,u)`
INST(Sample, sample, 3, 0)
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index 881d7a93b..8b13f48a1 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -1509,6 +1509,9 @@ struct SharedIRBuilder
// TODO: We probably shouldn't use this in the long run.
Dictionary<void*, IRLayout*> layoutMap;
+
+
+ void insertBlockAlongEdge(IREdge const& edge);
};
struct IRBuilderSourceLocRAII;
@@ -2019,6 +2022,12 @@ struct IRBuilder
IRType* type,
IRInst* val);
+ IRInst* emitWaveMaskBallot(IRType* type, IRInst* mask, IRInst* condition);
+ IRInst* emitWaveMaskMatch(IRType* type, IRInst* mask, IRInst* value);
+
+ IRInst* emitBitAnd(IRType* type, IRInst* left, IRInst* right);
+ IRInst* emitBitNot(IRType* type, IRInst* value);
+
//
// Decorations
//
diff --git a/source/slang/slang-ir-ssa.cpp b/source/slang/slang-ir-ssa.cpp
index 08a5ffa67..19f85eaa9 100644
--- a/source/slang/slang-ir-ssa.cpp
+++ b/source/slang/slang-ir-ssa.cpp
@@ -891,63 +891,127 @@ void processBlock(
}
}
-static void breakCriticalEdges(
- ConstructSSAContext* context)
+IRBlock* IREdge::getPredecessor() const
+{
+ return cast<IRBlock>(getUse()->getUser()->parent);
+}
+
+IRBlock* IREdge::getSuccessor() const
+{
+ return cast<IRBlock>(getUse()->get());
+}
+
+void SharedIRBuilder::insertBlockAlongEdge(
+ IREdge const& edge)
+{
+ auto pred = edge.getPredecessor();
+ auto succ = edge.getSuccessor();
+ auto edgeUse = edge.getUse();
+
+ IRBuilder builder;
+ builder.sharedBuilder = this;
+ builder.setInsertInto(pred);
+
+ // Create a new block that will sit "along" the edge
+ IRBlock* edgeBlock = builder.createBlock();
+
+ edgeUse->debugValidate();
+
+ // The predecessor block should now branch to
+ // the edge block.
+ edgeUse->set(edgeBlock);
+
+ // The edge block should branch (unconditionally)
+ // to the successor block.
+ builder.setInsertInto(edgeBlock);
+ builder.emitBranch(succ);
+
+ // Insert the new block into the block list
+ // for the function.
+ //
+ // In principle, the order of this list shouldn't
+ // affect the semantics of a program, but we
+ // might want to be careful about ordering anyway.
+ edgeBlock->insertAfter(pred);
+}
+
+bool IREdge::isCritical() const
{
// A critical edge is an edge P -> S where
// P has multiple sucessors, and S has multiple
// predecessors.
//
- // In the context of our CFG representation, such an edge
- // will be an `IRUse` in the terminator instruction of block P,
- // which refers to block S.
+ auto pred = getPredecessor();
+ auto succ = getSuccessor();
+
+ // If the predecessor block doesn't have multiple successors,
+ // then this can't be a critical edge.
//
- // We will make a pass over the CFG to collect all the critical
- // edges, and then we will break them in a follow-up pass.
+ if(pred->getSuccessors().getCount() <= 1)
+ return false;
- List<IRUse*> criticalEdges;
+ // For the edge to be critical, the successor must have
+ // more than one predecessor.
+ //
+ // More than that, we require that it has more than one
+ // *unique* predecessor, to handle the case where multiple
+ // cases of a `switch` might lead to the same block.
+ //
+ // To implement this, we test if it has any predecessor
+ // other than `pred` which we already know about.
+ //
+ bool multiplePreds = false;
+ for (auto pp : succ->getPredecessors())
+ {
+ if (pp != pred)
+ {
+ multiplePreds = true;
+ break;
+ }
+ }
+ if(!multiplePreds)
+ return false;
+ // At this point we have confirmed that `edgeUse`
+ // leads from a block with multiple successors
+ // to one with multiple (distinct) predecessors,
+ // so we are sure it is a critical edge.
+ //
+ return true;
+}
+
+static void breakCriticalEdges(
+ ConstructSSAContext* context)
+{
auto globalVal = context->globalVal;
+
+ // We will make a pass over the CFG to collect all the critical
+ // edges, and then we will break them in a follow-up pass.
+ //
+ List<IREdge> criticalEdges;
for (auto pred = globalVal->getFirstBlock(); pred; pred = pred->getNextBlock())
{
+ // As an early out: if the predecessor block
+ // has one (or fewer) successors, then no
+ // edge from that block can be critical.
+ //
auto successors = pred->getSuccessors();
if (successors.getCount() <= 1)
continue;
+ // Otherwise, we iterate over its outgoing
+ // edges (represented with its successor list),
+ // and record those that are critical.
+ //
auto succIter = successors.begin();
auto succEnd = successors.end();
-
for (; succIter != succEnd; ++succIter)
{
- auto succ = *succIter;
-
- // For the edge to be critical, the successor must have
- // more than one predecessor.
- // More than that, we require that it has more than one
- // *unique* predecessor, to handle the case where multiple
- // cases of a `switch` might lead to the same block.
- //
- // To implement this, we test if it has any predecessor
- // other than `pred` which we already know about.
-
- bool multiplePreds = false;
- for (auto pp : succ->getPredecessors())
+ auto edge = succIter.getEdge();
+ if( edge.isCritical() )
{
- if (pp != pred)
- {
- multiplePreds = true;
- break;
- }
+ criticalEdges.add(edge);
}
- if (!multiplePreds)
- continue;
-
- // We have found a critical edge from `pred` to `succ`.
- //
- // Furthermore, the `IRUse` embedded in `succIter` represents
- // that edge directly.
- auto edgeUse = succIter.use;
- criticalEdges.add(edgeUse);
}
}
@@ -956,36 +1020,9 @@ static void breakCriticalEdges(
// to break the edges while doing the initial walk, because
// that would change the CFG while we are walking it.
- for (auto edgeUse : criticalEdges)
+ for (auto edge : criticalEdges)
{
- auto pred = cast<IRBlock>(edgeUse->getUser()->parent);
- auto succ = cast<IRBlock>(edgeUse->get());
-
- IRBuilder builder;
- builder.sharedBuilder = &context->sharedBuilder;
- builder.setInsertInto(pred);
-
- // Create a new block that will sit "along" the edge
- IRBlock* edgeBlock = builder.createBlock();
-
- edgeUse->debugValidate();
-
- // The predecessor block should now branch to
- // the edge block.
- edgeUse->set(edgeBlock);
-
- // The edge block should branch (unconditionally)
- // to the successor block.
- builder.setInsertInto(edgeBlock);
- builder.emitBranch(succ);
-
- // Insert the new block into the block list
- // for the function.
- //
- // In principle, the order of this list shouldn't
- // affect the semantics of a program, but we
- // might want to be careful about ordering anyway.
- edgeBlock->insertAfter(pred);
+ context->sharedBuilder.insertBlockAlongEdge(edge);
}
}
diff --git a/source/slang/slang-ir-synthesize-active-mask.cpp b/source/slang/slang-ir-synthesize-active-mask.cpp
new file mode 100644
index 000000000..a92ef2c80
--- /dev/null
+++ b/source/slang/slang-ir-synthesize-active-mask.cpp
@@ -0,0 +1,2109 @@
+// slang-ir-synthesize-active-mask.cpp
+#include "slang-ir-synthesize-active-mask.h"
+
+#include "slang-ir-dominators.h"
+#include "slang-ir-insts.h"
+
+namespace Slang
+{
+
+// This file implements a pass on the IR to make the "active mask" concept
+// from HLSL explicit in the code, so that we can generate code for
+// targets like CUDA where explicit masks are used for warp-/wave-level
+// collective operations.
+//
+// At the time this pass was written, the exact semantics of the implicit
+// active mask in HLSL (and GLSL/SPIR-V) had not been specified, so this
+// file had to define what the semantics ought to be.
+//
+// The main guideline for these semantics is that it should conform to
+// the user's view of control flow based on the high-level language.
+//
+// The intuitive version of the semantics provided by this pass are:
+//
+// * Code is conceptually executed by "shards" of threads/lanes, where
+// each shard consists of a subset of the threads/lanes in the warp/wave
+// that are executing "together."
+//
+// * When a shard encounters a structured control-flow statement, the
+// shard may fragment into sub-shards that partition its set of threads/lanes.
+// E.g. for a two-way `if` we end up with a sub-shard for threads/lanes
+// where the condition was `true`, and another sub-shard for the
+// threads/lanes where the condition was `false`.
+//
+// * When the threads in a shard exit some structured control-flow
+// statement normally (e.g., execution reaches the end of the "then"
+// statement for an `if`), those threads *re-converge* with all the other
+// threads that entered the structured statement together and which
+// exited normally.
+//
+// * When threads in a shard exit some structured control-flow statement
+// "abnormally" (bypassing the statements that logically come after),
+// those threads do *not* re-converge with the other threads that exited
+// normally, although they may still re-converge at the normal exit of
+// some lexically outer statement.
+//
+// Turning these rules into something formal mostly comes down to codifying:
+//
+// 1. when threads exit a structured control flow statement, and
+// 2. which of those exits are "normal" vs. "abnormal."
+//
+// The Slang IR maintains structure information in its IR (similar to the
+// information maintained by SPIR-V for Vulkan). For a structured control
+// flow statement like an `if`, the IR-level representation encodes both
+// the conditional branch for the `if`, but also the label/block representing
+// the code that logically comes after the `if` in the high-level-language
+// code. We can refer to a block like that as a "merge point."
+//
+// Our more formal divergence and re-convergence behavior then depends
+// on two definitions:
+//
+// 1. Code is "inside" a control-flow structure if it is dominated by
+// the entry block, but not dominated by the merge block. Control flow
+// thus exits the structure whenever it branchs from a block inside the
+// structure to some place not inside the structure.
+//
+// 2. An edge that exits a control-flow structure represents a "normal"
+// exit if it branches to the merge block for the structure, and an
+// "abnormal" exit otherwise.
+//
+// These definitions result in divergence and re-convergence behavior
+// that closely matches what programmers expect from their high-level
+// language code.
+//
+// (Note: because the definition of what code is inside a control-flow
+// structure depends on dominance in the CFG, the existence of arbitrary
+// `goto` operations that can jump "into" a control-flow statement would
+// break our assumptions. This pass would only be compatible with
+// `goto` operations that jump "out" of control-flow statements.)
+
+// The translation of code to make masks explicit can be broadly
+// broken up into:
+//
+// * Analysis and transformation of the entire module and its call
+// graph, to identify and update functions that need an explicit
+// active mask.
+//
+// * Analysis and transformation of a single function, that was
+// identified by the above logic.
+//
+// We will start with the whole-module logic.
+//
+// As it typical for our IR passes, we will wrap up the module-level
+// synthesis work in a "context" type.
+//
+struct SynthesizeActiveMaskForModuleContext
+{
+ // The context needs to know the module it will process, and to
+ // have a sink where it can report any errors it might run into
+ // (e.g., any cases where a mask is needed but cannot be synthesized)
+ //
+ IRModule* m_module;
+ DiagnosticSink* m_sink;
+
+ // We use a single shared IR builder for the entire pass, to
+ // make sure we deduplicate types/values as much as possible.
+ //
+ SharedIRBuilder m_sharedBuilder;
+
+ // Because we are introducing explicit masks, it is important
+ // for all the code to agree on the type of mask that will be
+ // used.
+ //
+ IRType* m_maskType;
+
+ void processModule()
+ {
+ // When asked to process an IR module, our pass first
+ // sets up the shared state.
+ //
+ m_sharedBuilder.session = m_module->getSession();
+ m_sharedBuilder.module = m_module;
+
+ // We use the plain 32-bit `uint` type masks we
+ // generate here since it matches the current
+ // definition of `WaveMask` in the standard library.
+ //
+ // TODO: If/when the `WaveMask` type in the stdlib is
+ // made opaque, this should use the opaque type instead,
+ // so that the pass is compatible with all targets that
+ // support a wave mask.
+ //
+ IRBuilder builder;
+ builder.sharedBuilder = &m_sharedBuilder;
+ m_maskType = builder.getBasicType(BaseType::UInt);
+
+ // With the setup out of the way, the job of the module
+ // level pass is simple to describe:
+ //
+ // First we want to identify and mark all of the functions
+ // in the module that use the active mask.
+ //
+ markFuncsUsingActiveMask();
+ //
+ // Then we want to transform all of those functions that
+ // were marked so that they use an explicitly synthesized
+ // value for the active mask.
+ //
+ transformFuncsUsingActiveMask();
+ }
+
+ // During the marking process we will build up a list of functions
+ // that need the active mask, and we will also maintain a set of
+ // those functions so that we don't waste time redundantly considering
+ // functions we've already marked.
+ //
+ List<IRFunc*> m_funcsUsingActiveMask;
+ HashSet<IRFunc*> m_funcsUsingActiveMaskSet;
+
+ // When the code finds an IR function that seems to use the active
+ // mask, we will mark it by adding it to the list and set, but
+ // only if we haven't already done so previously.
+ //
+ void markFuncUsingActiveMask(IRFunc* func)
+ {
+ if(m_funcsUsingActiveMaskSet.Contains(func))
+ return;
+
+ m_funcsUsingActiveMask.add(func);
+ m_funcsUsingActiveMaskSet.Add(func);
+ }
+
+ // The easiest way to know that a function uses the active
+ // mask is if it contains an *instruction* that uses the active
+ // mask.
+ //
+ // Because it is easiest to detect use of the active mask at
+ // an instruciton level, it is convenient to have a utility
+ // that, given an instruction which uses the active mask,
+ // marks the function that contains it (if any) as using
+ // the active mask.
+ //
+ void markInstUsingActiveMask(IRInst* inst)
+ {
+ // We expect the immedaite parent of an ordinary
+ // instruction to be a basic block (although in
+ // unlikely cases an instruction can be directly nested
+ // in the `IRModule`).
+ //
+ auto parent = inst->getParent();
+ if( auto block = as<IRBlock>(parent) )
+ {
+ parent = block->getParent();
+ }
+ //
+ // We expect the immediate parent of that block
+ // to be an IR function (again, in unlikely cases
+ // the block could be nested in an IR generic,
+ // or a global variable with an initializer, etc.).
+ //
+ if( auto func = as<IRFunc>(parent) )
+ {
+ markFuncUsingActiveMask(func);
+ }
+ }
+
+ void markFuncsUsingActiveMask()
+ {
+ // We break down the task of identifying all functions
+ // that use the active mask into two steps.
+ //
+ // First we identify those functions that *directly* use
+ // the active mask, by virtuae of having a `getActiveMask`
+ // instruction in their body.
+ //
+ markFuncsDirectlyUsingActiveMaskRec(m_module->getModuleInst());
+
+ // Second, we identify any function that indirectly use
+ // the active mask.
+ //
+ // The detecting of indirect use is handled by looking
+ // at all the functions we've already marked, and identifying
+ // unmarked functions that call marked functions.
+ //
+ // We also modify these call sites (from unmarked functions
+ // to marked functions) so that they pass along the explicit
+ // active mask value to the callee.
+ //
+ // Note: we are iterating over `m_funcsUsingActiveMask` here
+ // while also invoking code that could add additional
+ // functions to that list. As such it is *intentional* that
+ // we are querying `getCount()` on the list on every iteration
+ // of the loop, rather than querying it once and using a
+ // variable (or using a range-based `for` loop).
+ //
+ for( Index i = 0; i < m_funcsUsingActiveMask.getCount(); ++i )
+ {
+ markAndModifyFuncsIndirectlyUsingActiveMask(m_funcsUsingActiveMask[i]);
+ }
+
+ // TODO: The logic here isn't robust against cases where we
+ // might have indirect calls.
+ //
+ // One possible solution would be to always treat indirect
+ // calls as if they require an active mask, and pass one along.
+ // Then all functions that might be used as the callee of an
+ // indirect call site (those whose values "escape") would need
+ // to have a variant that takes the active mask generated (if
+ // they didn't actually need the active mask based on their body).
+ // All function pointer values would be based on the mask-taking
+ // variant.
+ //
+ // Another alternative would be to make dependency on the
+ // active mask an explicit part of the type/signature of functions
+ // in the case where masks need to be passed, so that even indirect
+ // call sites could be decomposed into mask-taking and non-mask-taking
+ // cases just based on type information.
+ //
+ // There are probably other options to consider, each with
+ // its own trade-offs. This is a design question that we aren't
+ // really equipped to tackle right now, so the easiest solution
+ // is to defer the decision until we actually need an answer.
+ }
+
+ void markFuncsDirectlyUsingActiveMaskRec(IRInst* inst)
+ {
+ // Detecting functions that directly use the active
+ // mask is fairly simple: we recursively walk the
+ // IR module and look for instances of `waveGetActiveMask`
+ // instructions. When we find one we mark the function
+ // that contains it.
+
+ if( inst->op == kIROp_WaveGetActiveMask )
+ {
+ markInstUsingActiveMask(inst);
+ }
+
+ for( auto child : inst->getChildren() )
+ {
+ markFuncsDirectlyUsingActiveMaskRec(child);
+ }
+ }
+
+ void markAndModifyFuncsIndirectlyUsingActiveMask(IRFunc* callee)
+ {
+ // In order to detect functions that indirectly use the active
+ // mask through `callee`, we need to identify call sites.
+ //
+ // We will build up a list of call sites during the marking
+ // step, so that we can also modify those call sites later
+ // in this function.
+
+ List<IRCall*> calls;
+ for( IRUse* use = callee->firstUse; use; use = use->nextUse )
+ {
+ // We are looking for instructions that use `callee`,
+ // where the instruction is a call, and `callee` is used
+ // as, well, the callee.
+ //
+ IRInst* user = use->getUser();
+ IRCall* call = as<IRCall>(user);
+ if(!call)
+ continue;
+ if(call->getCallee() != callee)
+ continue;
+
+ // Once we have found a call site to `callee`,
+ // we mark the function that contains the call
+ // site as one we need to process, and also
+ // add the call site to our list of call sites
+ // we need to modify.
+ //
+ markInstUsingActiveMask(call);
+ calls.add(call);
+ }
+
+ // Once we've discovered all the call sites for `callee`,
+ // we will go ahead and modify them.
+ //
+ // (We don't mix up marking and modification because
+ // changing the use/def list of `callee` while also
+ // walking it would be dangerous)
+ //
+ for(auto call : calls)
+ {
+ // The basic situation here is that we have a call of the form:
+ //
+ // callee(arg0, arg1, arg2, ...);
+ //
+ // and we want to transform it to:
+ //
+ // mask = waveGetActiveMask();
+ // callee(arg0, arg1, arg2, ..., m);
+ //
+ IRBuilder builder;
+ builder.sharedBuilder = &m_sharedBuilder;
+ builder.setInsertBefore(call);
+
+ // First we synthesize the mask to pass down by
+ // directly invoking `waveGetActiveMask()`.
+ //
+ // Note that after this instruction has been inserted,
+ // the function containing `call` is now a function
+ // that directly uses the active mask.
+ //
+ auto mask = builder.emitIntrinsicInst(builder.getBasicType(BaseType::UInt), kIROp_WaveGetActiveMask, 0, nullptr);
+
+ // Next we synthesize the new argument list for the
+ // call, which consists of the original arguments,
+ // followed by the `mask`.
+ //
+ List<IRInst*> newArgs;
+ Int originalArgCount = call->getArgCount();
+ for(Int a = 0; a < originalArgCount; ++a)
+ {
+ newArgs.add(call->getArg(a));
+ }
+ newArgs.add(mask);
+
+ // Finally we emit a new call instruction to the same
+ // callee with our new arguments, and use the new call
+ // to replace the original `call`.
+ //
+ // Note: We can't just modify the `call` in-place because
+ // of the way that the operands to an `IRInst` are allocated
+ // contiguously in the same storage as the `IRInst` itself.
+ //
+ auto newCall = builder.emitCallInst(call->getFullType(), call->getCallee(), newArgs);
+ call->replaceUsesWith(newCall);
+ call->removeAndDeallocate();
+ }
+ }
+
+ void transformFuncsUsingActiveMask()
+ {
+ // Once all the functions that need the active mask have
+ // been marked (and all call sites to them have
+ // been modified), we can transform each of the marked
+ // functions one by one.
+ //
+ for( auto func : m_funcsUsingActiveMask )
+ {
+ transformFuncUsingActiveMask(func);
+ }
+ }
+
+ // We will get to the details of how we transform each
+ // function shortly, but for now we just forward-declare
+ // the operation that makes it happen.
+ //
+ void transformFuncUsingActiveMask(IRFunc* func);
+};
+
+// The module-level pass isn't too complicated, and its main job is
+// to reduce the scope of the problem down to individual functions.
+// As a result the function-level pass, which has to deal with much
+// more complicated issues, at least doesn't have to deal with
+// distractions related to the whole-module processing.
+//
+struct SynthesizeActiveMaskForFunctionContext
+{
+ // The funciton-level pass unsurprisngly operates on one IR function.
+ //
+ IRFunc* m_func;
+
+ // Some of the state used for building up mask operations is
+ // passed down form the module-level pass.
+ //
+ SharedIRBuilder* m_sharedBuilder;
+ IRType* m_maskType;
+
+ void transformFunc()
+ {
+ // The first thing we will do to our function is preprocess it
+ // so that it satisfies several properties that will make our
+ // pass easier to implement correctly.
+ //
+ // Note that given a CFG edge P->S from a predecessor P to
+ // a successor S, we can always rewrite the CFG by removing
+ // the edge P->S, creating a new block E, and then inserting
+ // edges P->E and E->S. That transformation is referred to
+ // as "breaking" the edge P->S, and it always preserves the
+ // semantics of the original program.
+ //
+ // We will start out by breaking several kinds of edges in
+ // the CFG of the function. Specifically, we break:
+ //
+ // * *Critical* edges which are those from a block P with
+ // multiple successors to a block S with multiple predecessors.
+ //
+ // * What we are calling *pseudo-critical* edges, which are
+ // those from a block P with multiple successors to a block
+ // M that is used as the merge point of a structured control-flow
+ // construct.
+ //
+ // The benefit of these transformations is the invariants they
+ // provide us for the rest of this pass:
+ //
+ // * When looking at a conditional branch instruction, we can assume:
+ //
+ // * The block with the branch is the only predecessor of the target blocks
+ //
+ // * The block with the branch is the immediate dominator of its target blocks
+ //
+ // * None of the target blocks is the merge point for the conditional branch
+ //
+ // * As a corallary of the above, all branches to a merge block are uncondtiional branches
+ //
+ // * Any block with multiple predecessors is only targetted by unconditional branches
+ //
+ // We will hightlight our use of these assumptions in the code
+ // that takes advantage of them.
+ //
+ breakCriticalAndPseudoCriticalEdges();
+
+ // Given that we are processing a function that had a `waveGetActiveMask` instruction
+ // in its body, we expect to find an entry block on the function.
+ //
+ auto funcEntryBlock = m_func->getFirstBlock();
+ SLANG_ASSERT(funcEntryBlock);
+
+ // We don't want to inject logic for computing and maintaining the
+ // active mask into parts of a function that don't actually use it,
+ // so one of the first things we do (once we've finished messing with
+ // the CFG of the function) is to mark which blocks (transitively) need
+ // the active mask, and which don't.
+ //
+ markBlocksNeedingActiveMask();
+ //
+ // We expect to find that the entry block to the function needs the
+ // active mask, or else that would imply nothing else in the function
+ // did, and we shouldn't be processing this function at all.
+ //
+ SLANG_ASSERT(m_blocksNeedingActiveMask.Contains(funcEntryBlock));
+
+ // Our basic approach will be to associate an `IRInst*` that represents
+ // the active mask value to use with each basic block of the function.
+ //
+ // The active mask to use for the entry block of a function is a special
+ // case, since it can't be derived from the value in other blocks.
+ //
+ createInitialActiveMask(funcEntryBlock);
+
+ // For blocks wtih multiple predecessors, it is possible that the correct
+ // active mask to use will depend on the predecessor.
+ //
+ // We will thus add an extra parameter (a phi node in more traditional
+ // SSA representations) to any block with multiple distinct predecessors,
+ // that can be used to represent the incoming active mask.
+ //
+ addActiveMaskParametersToBlocksWithMultiplePredecessors();
+
+ // The remaining work of assigning active masks to blocks will
+ // be handled by a recursive traversal of the function's CFG
+ // in terms of control-flow regions, where the entry block
+ // for the function serves as the starting point of the outer-most region.
+ //
+ transformRegions(funcEntryBlock);
+ }
+
+ void breakCriticalAndPseudoCriticalEdges()
+ {
+ // In order to break all the critical pseudo-critical
+ // edges, we will first identify them and build a list
+ // of `IREdge`s, and then go along and break them all.
+ //
+ // This approach ensures things don't get tripped up
+ // by modifying the CFG while also walking its structure.
+ //
+ List<IREdge> edgesToBreak;
+ //
+ // We are going to consider each block in turn as a
+ // candidate predecessor block, and look at its
+ // outgoing edges.
+ //
+ for (auto pred = m_func->getFirstBlock(); pred; pred = pred->getNextBlock())
+ {
+ auto successors = pred->getSuccessors();
+ auto succIter = successors.begin();
+ auto succEnd = successors.end();
+ for (; succIter != succEnd; ++succIter)
+ {
+ auto edge = succIter.getEdge();
+
+ // We want to collect the edges that are critical
+ // or psuedo-critical.
+ //
+ if( edge.isCritical() || isPseudoCriticalEdge(edge) )
+ {
+ edgesToBreak.add(edge);
+ }
+ }
+ }
+
+ // Once we've identified the edges we want to break,
+ // we do so by inserting an edge along them.
+ //
+ for( auto& edge : edgesToBreak )
+ {
+ m_sharedBuilder->insertBlockAlongEdge(edge);
+ }
+ }
+
+ bool isPseudoCriticalEdge(IREdge const& edge)
+ {
+ // Our definition of a pseudo-critical edge is one
+ // where the prececessors is a conditional branch
+ // (meaning it has multiple outgoing edges), and
+ // where the successor is a merge point for some
+ // structured control-flow construct.
+ //
+ auto pred = edge.getPredecessor();
+ auto succ = edge.getSuccessor();
+
+ // The condition on the predecessor is easy
+ // enough to check.
+ //
+ if(pred->getSuccessors().getCount() <= 1)
+ return false;
+
+ // To check if the successor is used as a merge
+ // point, we must iterate over its uses.
+ //
+ for( auto use = succ->firstUse; use; use = use->nextUse )
+ {
+ // For each use, we need to consider if
+ // the user represents a structured control-flow
+ // construct, and then whether the `succ` block
+ // is being used as a merge point in that construct.
+ //
+ auto user = use->getUser();
+ if(auto ifElseInst = as<IRIfElse>(user))
+ {
+ // The merge point for an `if` or `if`/`else`
+ // if the block *after* the statement.
+ //
+ if(ifElseInst->getAfterBlock() == succ)
+ return true;
+ }
+ else if( auto switchInst = as<IRSwitch>(user) )
+ {
+ // The merge point for a `switch` is the block
+ // after the statement, which is also the label
+ // where control flow goes on a `break`.
+ //
+ if(switchInst->getBreakLabel() == succ)
+ return true;
+ }
+ else if( auto loopInst = as<IRLoop>(user) )
+ {
+ // A loop construct can actually have two
+ // merge points.
+ //
+ // First, the merge point for the entire loop
+ // is the block after the loop, which is also
+ // where control flow goes on a `break`.
+ //
+ if(loopInst->getBreakBlock() == succ)
+ return true;
+ //
+ // Second, for a given iteration of the loop,
+ // the "continue clause" of a `for` loop
+ // is a merge point, and is where control flow
+ // goes on a `continue`.
+ //
+ if(loopInst->getContinueBlock() == succ)
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // We want to identify which blocks need the active mask,
+ // and will use a set to keep track of them.
+ //
+ HashSet<IRBlock*> m_blocksNeedingActiveMask;
+
+ // To make things convenient for downstream code, we will
+ // define a simple accessor to check if a block needs the
+ // active mask, as determined by our setup logic.
+ //
+ bool doesBlockNeedActiveMask(IRBlock* block)
+ {
+ return m_blocksNeedingActiveMask.Contains(block);
+ }
+
+ void markBlocksNeedingActiveMask()
+ {
+ // We can start by identifying the blocks that directly
+ // use the active because they contain a `waveGetActiveMask()`
+ // instruciton.
+ //
+ // Note: the module-level pass will have modified any
+ // function that indirectly uses the active mask (that is,
+ // a function that calls another functio nneeding the mask)
+ // so that it uses `waveGetActiveMask` at those call sites,
+ // so there whould always be at least one block that gets
+ // discovered this way, even in functions that did not
+ // initially contain direct uses of `waveGetActiveMask`.
+ //
+ for( auto block : m_func->getBlocks() )
+ {
+ for( auto inst : block->getOrdinaryInsts() )
+ {
+ if( inst->op == kIROp_WaveGetActiveMask )
+ {
+ m_blocksNeedingActiveMask.Add(block);
+ break;
+ }
+ }
+ }
+
+ // Once we've identified an initial set of blocks that need/use
+ // the active mask, we want to mark any blocks that lead to
+ // blocks we've already marked, since they will need to pass
+ // along the active mask.
+ //
+ // We will do this in an iterative fashion, by looping over
+ // all the blocks in the CFG and checking if they branch to
+ // a block that needs the active mask.
+ //
+ {
+ // Once we make a sweep over the CFG and don't see any change
+ // in the decision about what blocks use the active mask, we
+ // know we have converged.
+ //
+ bool change = true;
+ while( change )
+ {
+ change = false;
+
+ // Because we are solving a backwards dataflow problem,
+ // we iterate over the blocks of the function in reverse
+ // order to minimize the number of iterations required
+ // in the common case.
+ //
+ for(auto block = m_func->getLastBlock(); block; block = block->getPrevBlock())
+ {
+ if(m_blocksNeedingActiveMask.Contains(block))
+ continue;
+
+ for( auto successor : block->getSuccessors() )
+ {
+ if( !m_blocksNeedingActiveMask.Contains(successor) )
+ continue;
+
+ // If we get here then `block` has *not* been marked
+ // as needing the active mask, but it branches to
+ // `successor` which *has* been marked, so we need
+ // to mark `block` and keep looking for changes.
+ //
+ m_blocksNeedingActiveMask.Add(block);
+ change = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // As described previously, our main goal in this pass is to associated an
+ // `IRInst*` representing the active mask with each block in the CFG
+ // of the function (or rather, with each block that *needs* the active mask).
+ //
+ // We will keep track of the active mask values that have been registered
+ // so far in a dictionary.
+ //
+ Dictionary<IRBlock*, IRInst*> m_activeMaskForBlock;
+
+ void createInitialActiveMask(IRBlock* funcEntryBlock)
+ {
+ // The active mask value to use on entry to a function depends
+ // on whether the function is an entry point or not.
+ //
+ // The easy case is ordinary functions (ones that aren't entry
+ // points).
+ //
+ if( !m_func->findDecoration<IREntryPointDecoration>() )
+ {
+ // We simplyu need to add a new parameter to the entry block
+ // (which holds the parameters for the function itself).
+ // That parameter will receive the initial of the active
+ // mask as an argument at (modified) call sites to the function.
+ //
+ addActiveMaskParameter(funcEntryBlock);
+ }
+ else
+ {
+ // The case of a shader entry point is a bit trickier.
+ //
+ // We can't change the signature of a shader entry point (e.g., adding
+ // new varying parameters), so we need to compute the initial active
+ // mask value from first principles.
+ //
+ // We will insert the code we generate at the start of the entry block.
+ //
+ IRBuilder builder;
+ builder.sharedBuilder = m_sharedBuilder;
+ builder.setInsertBefore(funcEntryBlock->getFirstOrdinaryInst());
+ //
+ // A naive approach would be to set the active mask to an all-ones
+ // value representing the idea that all threads/lanes in the warp/wave
+ // will be active at the start of execution.
+ //
+ auto allLanesMask = builder.getIntValue(m_maskType, -1);
+ //
+ // That all-ones mask won't actually be correct in any case where
+ // a warp/wave is less than fully occupied (e.g., what if the thread
+ // group size is smaller than the warp/wave size?).
+ //
+ // We can refine that all-ones mask down to a more accurate one by
+ // using a ballot operation:
+ //
+ // initialActiveMask = waveMaskBallot(allOnesMask, true);
+ //
+ auto initialActiveMask = builder.emitWaveMaskBallot(m_maskType, allLanesMask, builder.getBoolValue(true));
+ //
+ // Note: an important detail here is that we are invoking `waveMaskBallot`
+ // with the all-ones mask and *assuming* that a target will properly ignore
+ // the bits in `allOnesMask` that correspond to threads/lanes that aren't
+ // actually executing. Fortunately CUDA at least guarantees this
+ // behavior for `__ballot_sync()` and its other collective operations:
+ // bits corresponding to threads that aren't executing are ignored.
+ //
+ // Once we've computed the mask value to start with, we add it to
+ // our tracking structure so we can remember which value to use.
+ //
+ m_activeMaskForBlock.Add(funcEntryBlock, initialActiveMask);
+ }
+ }
+
+ void addActiveMaskParametersToBlocksWithMultiplePredecessors()
+ {
+ // Blocks with multiple (distinct) predecessors will
+ // recieve their input active mask as a block parameter
+ // (an SSA phi operation in more traditional representations).
+ //
+ for( auto block : m_func->getBlocks() )
+ {
+ // We only care about blocks that want the active mask,
+ // and that have multiple predecessors.
+ //
+ if(!doesBlockNeedActiveMask(block))
+ continue;
+ if(block->getPredecessors().getCount() <= 1)
+ continue;
+
+ // What is more, we only care about blocks with multiple
+ // *distinct* predecessors, which means we need to look
+ // out for things like a `switch` where multiple outgoing
+ // edges from a single block could lead to the same
+ // destination.
+ //
+ // A block with multiple distinct predecessors is one
+ // where the predecessors aren't all the same.
+ //
+ bool allPredsTheSame = true;
+ //
+ // We will check if all the predecessors of `block`
+ // are the same as the first predecessor (which we
+ // know must exist because we dissmissed blocks
+ // with <= 1 predecessor already).
+ //
+ IRBlock* firstPred = *block->getPredecessors().begin();
+ for(auto pred : block->getPredecessors())
+ {
+ if(pred != firstPred)
+ {
+ allPredsTheSame = false;
+ break;
+ }
+ }
+ if(allPredsTheSame)
+ continue;
+
+ // If we have identified a block with multiple distinct
+ // predecessors, then we know that it needs a new
+ // block parameter to be added to represent the active mask.
+ //
+ addActiveMaskParameter(block);
+ }
+ }
+
+ void addActiveMaskParameter(IRBlock* block)
+ {
+ // Adding a paramter to a block to represent the
+ // active mask is a straightforward application
+ // of `IRBuilder`.
+ //
+ // As a small sanity check, we make sure that the
+ // block we are modifying actually wants the active
+ // mask.
+ //
+ SLANG_ASSERT(doesBlockNeedActiveMask(block));
+
+ IRBuilder builder;
+ builder.sharedBuilder = m_sharedBuilder;
+ builder.setInsertBefore(block->getFirstOrdinaryInst());
+
+ auto activeMaskParam = builder.emitParam(m_maskType);
+
+ m_activeMaskForBlock.Add(block, activeMaskParam);
+ }
+
+ // The remainder of the work in this pass is going to be based
+ // on a recursive walk of the CFG for the function in terms of
+ // regions.
+ //
+ // Our definition of regions will rely on having built a dominator
+ // tree for the function.
+ //
+ RefPtr<IRDominatorTree> m_dominatorTree;
+ //
+ // Briefly, a node (basic block) A dominates block B if every
+ // possible path through the graph that ends in B goes through A.
+ //
+ // The dominator tree encodes dominance relationships by making
+ // it so that A is an ancestor node of B in the tree if and only if
+ // A dominates B. Furthermore, if A is the direct parent of B in
+ // the dominator tree, we say that A *immediately dominates* B.
+ //
+ // We will use a few wrapper/helper functions to make working
+ // with the dominator tree more convenient.
+ //
+ // First is a query to check if one block dominates another:
+ //
+ bool dominates(IRBlock* dominator, IRBlock* dominated)
+ {
+ return m_dominatorTree->dominates(dominator, dominated);
+ }
+ //
+ // Next is an operation to check if one block domiantes
+ // another *or* the child region is unreachable in the CFG.
+ //
+ // TODO: The `IRDominatorTree` type should now make this
+ // operation redundant, since it will treat an unreachable
+ // block as being domianted by any other block.
+ //
+ bool dominatesOrIsUnreachable(IRBlock* dominator, IRBlock* dominated)
+ {
+ return m_dominatorTree->isUnreachable(dominated)
+ || dominates(dominator, dominated);
+ }
+
+ // With the definition of dominance in hand, we are now ready
+ // to define the notion of regions that we will use.
+ //
+ // There are many ways to define regions on a CFG, and we are
+ // specifically interested in single-entry, multiple-exit
+ // regions.
+ //
+ struct RegionInfo
+ {
+ // Each region will have a dedicated entry block, which
+ // is the only way into the region (it is single-entry,
+ // after all). This means that the entry block must
+ // dominate everything inside the region.
+ //
+ IRBlock* entryBlock = nullptr;
+
+ // Each region will also have a dedicated *merge* block,
+ // which is where control flow goes when exiting the
+ // region normally. Code after the merge block is considered
+ // to be outside the region.
+ //
+ IRBlock* mergeBlock = nullptr;
+
+ // Regions in the CFG will nest with a parent-child relationship,
+ // and during our processing it will be important to track
+ // the chain of ancestor regions that enclosue a piece of code.
+ //
+ // Note: it is possible that `parentRegion` will have the same
+ // `mergeBlock` as this region, and code that handles regions
+ // needs to deal with that case.
+ //
+ RegionInfo* parentRegion = nullptr;
+
+ // As a convenience, we also make our region type hold
+ // the active mask for the entry block of the region.
+ // This could technically be accessed using `m_activeMaskForBlock`,
+ // but storing it here can avoid some extra lookups and
+ // error checks around them.
+ //
+ IRInst* activeMaskOnEntry = nullptr;
+ };
+
+ // Given the definition of a region, one of the most important
+ // queries to be able to answer is: given a block B and a region R,
+ // is B inside R?
+ //
+ bool isBlockInRegion(IRBlock* block, RegionInfo* region)
+ {
+ // By our definition, a region only contains blocks
+ // that are dominated by its entry block.
+ //
+ // Thus, any block not dominated by the entry block
+ // is outside the region.
+ //
+ if(!m_dominatorTree->dominates(region->entryBlock, block))
+ return false;
+
+ // In addition, if `block` is dominated by the merge
+ // block of `region` then it can only be reached by
+ // going through the merge block, which implies leaving
+ // the region.
+ //
+ // Thus any block that is dominated by the merge block
+ // of `region` (or any of its parents) is not in the region.
+ //
+ // TODO: It may not be necessary to check the mrege block
+ // of parent regions, so that checking just the one merge
+ // block would suffice. We need to double-check that
+ // nothing would be changed before removing that logic...
+ //
+ for( auto r = region; r; r = r->parentRegion )
+ {
+ // Note: It is possible for the merge block of a region to
+ // be null (specifically for the entry block of a function,
+ // where the logical merge point is after the function
+ // returns).
+ //
+ if(r->mergeBlock && dominates(r->mergeBlock, block))
+ return false;
+ }
+
+ return true;
+ }
+
+ void transformRegions(IRBlock* funcEntryBlock)
+ {
+ // Given the definition of regiosn we will use,
+ // we can process the entire function by recursively
+ // carving it up into regions.
+ //
+ // We start by computing a dominator tree for
+ // the function, since that will help us
+ // identify the regions.
+ //
+ m_dominatorTree = computeDominatorTree(m_func);
+
+ // Next we look up th active mask for the function's
+ // entry region, which had better be set before
+ // we run this code.
+ //
+ IRInst* activeMaskOnFuncEntry = nullptr;
+ m_activeMaskForBlock.TryGetValue(funcEntryBlock, activeMaskOnFuncEntry);
+ SLANG_ASSERT(activeMaskOnFuncEntry);
+
+ // The root region of our tree of regions will
+ // start at the entry block of the function, and
+ // its merge block will be left as null (because
+ // we merge upon return from the function, which isn't
+ // represented as an explicit block in the CFG).
+ //
+ RegionInfo rootRegion;
+ rootRegion.entryBlock = funcEntryBlock;
+ rootRegion.activeMaskOnEntry = activeMaskOnFuncEntry;
+
+ // We then kick off transformation of the whole
+ // CFG by transforming the root region.
+ //
+ transformRegion(&rootRegion);
+ }
+
+ void transformRegion(RegionInfo* region)
+ {
+ // The task of transforming a region with a given
+ // entry block and initial active mask comprises
+ // two steps.
+ //
+ IRBlock* regionEntry = region->entryBlock;
+ IRInst* activeMaskOnRegionEntry = region->activeMaskOnEntry;
+ //
+ // The first step is the easy one: any uses of
+ // the `waveGetActiveMask` instruction in the entry
+ // block of the region can be replaced with the value
+ // of `actvieMaskOnRegionEntry`.
+ //
+ // The only wrinkle here is carefully fetching the
+ // next instruction before processing each instruction
+ // so that we can continue to iterative while also
+ // potentially removing and replacing instructions along
+ // the way.
+ //
+ IRInst* nextInst = nullptr;
+ for( IRInst* inst = regionEntry->getFirstInst(); inst; inst = nextInst )
+ {
+ nextInst = inst->getNextInst();
+
+ if( inst->op == kIROp_WaveGetActiveMask )
+ {
+ inst->replaceUsesWith(activeMaskOnRegionEntry);
+ inst->removeAndDeallocate();
+ }
+ }
+
+ // The second and much more involved step is to process all
+ // the other blocks in the region, recursively.
+ //
+ // The way to proceed will be determined by the terminator
+ // instruction of the entry block.
+ //
+ auto terminator = regionEntry->getTerminator();
+ SLANG_ASSERT(terminator);
+ switch( terminator->op )
+ {
+ // There are some cases of terminator instructions that
+ // we explicit do not or cannot handle.
+ //
+ // A `conditionalBranch` instruction represents a two-way
+ // conditional branch *without* structured control-flow
+ // information. Right now our front-end doesn't produce
+ // such instructions.
+ //
+ // TODO: We could theoretically handle unstructured control
+ // flow either by running a restructuring pass, or by saying
+ // that unstructured branches can split the active mask,
+ // without any option to reconverge it.
+ //
+ case kIROp_conditionalBranch:
+ //
+ // A `discard` instruction can only appear in fragment shaders,
+ // and this pass is currently only being applied for the CUDA
+ // target (which doesn't support rasterization stages).
+ //
+ // The difficulty with `discard` is that a `discard` operation
+ // in a subroutine can affect the active mask seen by code
+ // after a call to that subroutine. Thus, the `discard` operation
+ // would violate our assumption that the active mask cannot change
+ // inside of a single basic block.
+ //
+ // Devising a solution to synthesize the active mask in the presence
+ // of `discard` would seem to require a different approach to this
+ // pass *or* some new constraints on the front-end representation
+ // (e.g., a `discard` is really more akin to a `throw` or other
+ // error-propagation operation, and might need to be explicit in
+ // the signature of any function that may `discard`).
+ //
+ case kIROp_discard:
+ //
+ // Finally, we also don't handle any control-flow op we might not
+ // have introduced at the time this pass was created.
+ //
+ default:
+ SLANG_UNEXPECTED("unhandled terminator op");
+ break;
+
+ // Next, there are cases of terminators that represent code that
+ // is or must be be unreachable. The runtime semantics of executing
+ // one of these instructions would be some kind of undefined behavior,
+ // so we can elect to simply do nothing.
+ //
+ case kIROp_MissingReturn:
+ case kIROp_Unreachable:
+ break;
+
+ case kIROp_unconditionalBranch:
+ {
+ auto branch = cast<IRUnconditionalBranch>(terminator);
+
+ // An `unconditionalBranch` instruction represents any kind of
+ // unconditonal branch in the code. This includes cases like:
+ //
+ // * Leaving a control-flow structure normally (e.g., ending the "then"
+ // block of an `if` statement and moving on the code after the `if`;
+ // jumping out of a loop with a `break`)
+ //
+ // * Leaving a control-flow structure abnormally (e.g., leaving the "then"
+ // part of an `if` statement by `break`ing out of an outer loop).
+ //
+ // * Ordinary control flow that doesn't leave any region(s) (e.g., cycling
+ // back to the top of a `while` loop; progressing through straight-line
+ // non-branching control flow that happens to use multiple blocks)
+ //
+ // We factor most of the handling of unconditional branches out into
+ // a subroutine, so we just invoke it here and leave the explanation
+ // to later.
+ //
+ transformUnconditionalEdge(region, terminator, branch->getTargetBlock(), activeMaskOnRegionEntry);
+
+ // Once we've handled the control-flow edge at the end of the entry
+ // block of our region, we need to process any child regions that
+ // might exist.
+ //
+ // E.g. if we had straight-line control flow from block A->B->C->... and
+ // we were processing the region that starts with A, then the previous
+ // code would have handled all work related to the A->B edge, and then
+ // this step would handle recursively processing the B->C->... region.
+ //
+ transformChildRegions(region, region);
+ }
+ break;
+
+ case kIROp_ReturnVal:
+ case kIROp_ReturnVoid:
+ {
+ // A `return` instruction is akin to an unconditional branch,
+ // except that it is guaranteed to exit any structured control
+ // flow regions that we are nested under.
+ //
+ // We thus handle a `return` as a special case of an unconditional
+ // branch where the target block is null (which also happens to
+ // be the value used to represent the merge block for the region
+ // representing the entire function body).
+ //
+ transformUnconditionalEdge(region, terminator, nullptr, activeMaskOnRegionEntry);
+
+ // We also make a call to recusrively process any child regions here,
+ // although in practice there should be no child regions if the
+ // entry block ended with a `return`.
+ //
+ // TODO: Consider eliminating this call if it really isn't needed.
+ //
+ transformChildRegions(region, region);
+ }
+ break;
+
+ case kIROp_ifElse:
+ {
+ // A structured `ifElse` instruction is a two-way branch on a
+ // Boolean coniditon, along with a specific block representing
+ // the code after the high-level-language `if` statement.
+ //
+ auto ifElse = cast<IRIfElse>(terminator);
+ auto condition = ifElse->getCondition();
+ auto trueBlock = ifElse->getTrueBlock();
+ auto falseBlock = ifElse->getFalseBlock();
+ auto afterBlock = ifElse->getAfterBlock();
+ //
+ // We can picture the situation as something like:
+ //
+ // I // block with if/else |
+ // / \ |
+ // T F // true and false blocks |
+ // /|\ /|\ |
+ // |
+ // ... |
+ // |
+ // \|/ |
+ // A // "after" block |
+ // |
+ // ... |
+ //
+ // Note that very few assumptions can or should be
+ // made about the code under T and F. In particular:
+ //
+ // * Code under T or F could exit the region under
+ // consideration without going through A
+ //
+ // * There could exist some block X such that X is in
+ // the region of our if/else, and control flow can reach
+ // A from both T and F *without* going through A
+ //
+ // Our construction doesn't rely on many assumptions.
+ // All we need is that we can construct a sub-region
+ // representing the code dominated by I but not by A
+ // (which serves as the merge point).
+ //
+ RegionInfo ifElseRegion;
+ ifElseRegion.entryBlock = regionEntry;
+ ifElseRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+ ifElseRegion.mergeBlock = afterBlock;
+ ifElseRegion.parentRegion = region;
+ //
+ // This sub-region will be used as a parent region
+ // when recursively processing code reachable through
+ // the `ifElse`.
+
+ // Because we broke all critical edges as a pre-process, we
+ // can be certain that the entry block for `region` dominates
+ // the blocks for the `true` and `false` cases (it should
+ // be their only predecessor, and thus the immediate
+ // dominator).
+ //
+ SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, trueBlock));
+ SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, falseBlock));
+
+ // Because we also broke all pseudo-critical edges, we also
+ // expect taht the blocks we branch to on `true` or `false`
+ // are not the same as the merge block.
+ //
+ SLANG_ASSERT(trueBlock != afterBlock);
+ SLANG_ASSERT(falseBlock != afterBlock);
+
+ // Because of the above assumptions, we know that this
+ // code can take responsibility for computing the mask
+ // value to use for both `trueBlock` and `falseBlock`.
+ //
+ IRBuilder builder;
+ builder.sharedBuilder = m_sharedBuilder;
+
+ // To establish the mask value for `trueBlock` we will
+ // insert a `waveMaskBallot` before the branch:
+ //
+ // trueMask = waveMaskBallot(activeMaskOnRegionEntry, condition);
+ //
+ // That mask weill consist of all threads that entered the
+ // parent region and computed a `condition` value of `true`.
+ //
+ builder.setInsertBefore(ifElse);
+ auto trueMask = builder.emitWaveMaskBallot(m_maskType, activeMaskOnRegionEntry, condition);
+
+ // To establish the mask value for `falseBlock`, we will
+ // insert code into the false block that computes an inverted mask as:
+ //
+ // falseMask = activeMaskOnRegionEntry & ~trueMask;
+ //
+ builder.setInsertBefore(falseBlock->getFirstOrdinaryInst());
+ auto falseMask = builder.emitBitAnd(
+ m_maskType,
+ activeMaskOnRegionEntry,
+ builder.emitBitNot(m_maskType, trueMask));
+
+ // The task of associating the mask value comptued by a conditional branch
+ // with the successor block(s) is delegated to a subroutine, which we will
+ // detail later.
+ //
+ // For now it suffices that we need to transform each of the outgoing
+ // edges from our conditional branch.
+ //
+ // TODO: There is a pathological corner case that is not handled here,
+ // when `trueBlock == falseBlock`. Right now we have no optimizations
+ // that could make this case arise, but we should be careful about
+ // it if we ever add such passes.
+ //
+ transformConditionalEdge(&ifElseRegion, terminator, trueBlock, trueMask);
+ transformConditionalEdge(&ifElseRegion, terminator, falseBlock, falseMask);
+
+ // Next we need to transform any and all child regions of our `if`-`else`
+ // construct as well as the children of the overall `region`.
+ //
+ // Those children chould (at least) include child regions where `trueBlock`
+ // and `falseBlock` are the entry blocks.
+ //
+ transformChildRegions(&ifElseRegion, region);
+ }
+ break;
+
+ case kIROp_loop:
+ {
+ // At the most basic level, a `loop` instruction is just an uncondtional
+ // branch. What is stores above and beyond an `unconditionalBranch`
+ // instruction is the strucrutal information about the loop, including
+ // where the code "after" the loop starts (the same as the `break`
+ // target), and where the code that starts a new loop iteration goes
+ // (the `continue` target).
+ //
+ auto loopInst = cast<IRLoop>(terminator);
+ auto loopHeader = loopInst->getTargetBlock();
+ auto breakBlock = loopInst->getBreakBlock();
+ auto continueBlock = loopInst->getContinueBlock();
+ //
+ // We can visualize it as something like this:
+ //
+ // L // block with `loop` instruction |
+ // | |
+ // | ___ // back edge(s) that start a |
+ // |/ \ // new iteration |
+ // H | // loop header, where control flow |
+ // /|\ | // starts on each iteration |
+ // ... |
+ // |
+ // \|/ |
+ // C // continue label, which leads |
+ // ... // to any/all back edges |
+ // |
+ // \|/ |
+ // B // break label, which is the start |
+ // ... // of code after the loop |
+ //
+ // Much like the case for `ifElse`, we can't make a lot
+ // of definition statements about the structure of the control
+ // flow after the `loop` instruction, so we need to be careful
+ // and construct regions that don't rely on unsafe assumptions.
+ //
+ // We can construct a region that represents all the code that
+ // is logically inside the loop. This regions starts with the
+ // loop header (*not* the block with the `loop` instruction),
+ // and ends at the block B where a `break` goes (to exit
+ // the loop).
+ //
+ RegionInfo loopRegion;
+ loopRegion.entryBlock = loopHeader;
+ loopRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+ loopRegion.mergeBlock = breakBlock;
+ loopRegion.parentRegion = region;
+
+ // In order for our structured control-flow information to
+ // mean anything, the `loop` instruction must dominate
+ // the loop body (starting with the header).
+ //
+ SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, loopHeader));
+ //
+ // We also require that the region with the `loop` either
+ // dominates the `break` block (you can't exit the loop without
+ // first enterting it), *or* that block is unreachable (the
+ // loop never exits normally).
+ //
+ SLANG_ASSERT(dominatesOrIsUnreachable(regionEntry, breakBlock));
+ //
+ // Simlarly, we assume that the `continue` label is also either
+ // dominated by the `loop` instruction or is unreachable (the
+ // loop never actually iterates).
+ //
+ SLANG_ASSERT(dominatesOrIsUnreachable(regionEntry, continueBlock));
+ //
+ // Finally, our pre-pass on the CFG will have ruled out very
+ // silly cases like a `loop` that branches directly to the `break`
+ // label.
+ //
+ SLANG_ASSERT(loopHeader != breakBlock);
+
+ // Each iteration of the loop will execute under an active
+ // mask, and the mask may vary per iteration.
+ //
+ IRInst* iterEntryMask = nullptr;
+ //
+ // For a loop that actually *loops*, the loop header block
+ // must have multiple distinct predecessors: one for the
+ // `loop` instruction and one or more for back edges that
+ // continue execution of the loop.
+ //
+ if( loopHeader->getPredecessors().getCount() > 1 )
+ {
+ iterEntryMask = loopHeader->getLastParam();
+ SLANG_ASSERT(iterEntryMask);
+ }
+ else
+ {
+ // It technically possible that a `loop` instruction
+ // doesn't actually yield a loop. For example, one
+ // can write:
+ //
+ // for(;;)
+ // {
+ // doThings();
+ // if(someCondition) break;
+ // doOtherThings();
+ // break;
+ // }
+ //
+ // There are no cases where this code actually loops,
+ // but the use of a `for` loop is still enabling some
+ // non-trivial control flow with the early `break`.
+ //
+ // We cannot in general eliminate all `loop`s that don't
+ // actually loop as a pre-process, without adding some
+ // alternative kind of control-flow construct that
+ // represents this kind of alternative use of looping
+ // constructs.
+ //
+ // Fortunately, handling this case isn't complicated,
+ // because the mask to use on entry to the (only)
+ // iteration is well known.
+ //
+ iterEntryMask = activeMaskOnRegionEntry;
+ }
+
+ // Along with the outer region that represents the
+ // entire loop (which is exited with `break`) there
+ // is conceptually a nested inner region that represents
+ // the body of a single loop iteration, and which is
+ // exited with a `continue`.
+ //
+ // This detail primarily matters for a `for` loop where
+ // there can be non-trivial code (including code that
+ // depends on the active mask) that executes on a
+ // `continue`.
+ //
+ // The nested region thus begins at the loop header,
+ // and ends at the `continue` label. Note that it
+ // is possible for this region to be empty, in the
+ // case where `loopHeader == continueBlock` (which
+ // will happen for any non-`for` loop).
+ //
+ RegionInfo iterRegion;
+ iterRegion.entryBlock = loopHeader;
+ iterRegion.activeMaskOnEntry = iterEntryMask;
+ iterRegion.mergeBlock = continueBlock;
+ iterRegion.parentRegion = &loopRegion;
+
+ // Once we've computed the child regions that are introduced
+ // by the loop structure, we can move on to processing
+ // its single outgoing edge, and the child control flow
+ // regions.
+ //
+ transformUnconditionalEdge(region, terminator, loopHeader, activeMaskOnRegionEntry);
+ transformChildRegions(&iterRegion, region);
+ }
+ break;
+
+
+ case kIROp_Switch:
+ {
+ // A `switch` instruction represents a structured N-way
+ // branch on an integer condition. The structural information
+ // gives us a block that represents code after the `switch`
+ // statement, and which is also the target for any `break`s
+ // inside the `switch`.
+ //
+ auto switchInst = cast<IRSwitch>(terminator);
+ auto condition = switchInst->getCondition();
+ auto mergeBlock = switchInst->getBreakLabel();
+
+ // A `switch` is mostly just a more involved and complicated
+ // version of an `if`-`else`, so the overall logical is similar
+ // but with a lot more details to handle.
+ //
+ // We can construct a child region that represents the code
+ // that is logically inside the `switch`, consisting of all
+ // code from teh entry block up to the merge (`break`) block.
+ //
+ RegionInfo switchRegion;
+ switchRegion.entryBlock = regionEntry;
+ switchRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+ switchRegion.mergeBlock = mergeBlock;
+ switchRegion.parentRegion = region;
+
+ // Next, we need to establish a mask value that will
+ // represent the active mask on entry to a given `case`.
+ //
+ IRBuilder builder;
+ builder.sharedBuilder = m_sharedBuilder;
+ builder.setInsertBefore(switchInst);
+
+ // For now we are computing a simple-but-inaccurate version
+ // of the active mask. We are using:
+ //
+ // matchingMask = waveMaskMatch(activeMaskOnRegionEntry, condition);
+ //
+ // This value will yield a mask for each thread that consists of
+ // the threads who had exactly the same value for `condition`.
+ //
+ auto matchingMask = builder.emitWaveMaskMatch(m_maskType, activeMaskOnRegionEntry, condition);
+ //
+ // TODO: this mask computation yields a surprising value in cases where
+ // multiple values go to the same code:
+ //
+ // switch(condition)
+ // {
+ // case 0: case 1: A(WaveGetActiveMask()); break;
+ //
+ // case 2: default: B(WaveGetActiveMask()); break;
+ // }
+ //
+ // In this case we want the mask seen by `A()` to be all the threads
+ // where `condition` was `0` *or* `1` (and not to see two
+ // different masks, one for each value).
+ //
+ // Similarly, if we have threads where `condition` is `2`, `3`, and `4`,
+ // we expec them to all see a common mask at `B()`, despite
+ // representing multiple distinct values.
+ //
+ // The desired mask could be synthesized in different ways.
+ //
+ // We could execute an extra `switch` before this one, to reduce
+ // the actual values of `condition` down to an artificial `caseNumber`,
+ // which only represents the unique code blocks being branched to:
+ //
+ // int caseNumber;
+ // switch(condition)
+ // {
+ // case 0: case 1: caseNumber = 0xA; break;
+ //
+ // case 2: default: caseNumber = 0xB; break;
+ // }
+ // switch( caseNumber ) // NOTE: changed condition here
+ // {
+ // case 0xA: A(WaveGetActiveMask()); break;
+ //
+ // case 0xB: B(WaveGetActiveMask()); break;
+ // }
+ //
+ // The main downside of that approach is that it adds additional
+ // control flow that wasn't in the input program (and that new
+ // control flow could complicate our interactions with the
+ // dominator tree and CFG edges).
+ //
+ // Another alternatie would be to move the synthesis of the mask
+ // down into the individual cases:
+ //
+ // switch(condition)
+ // {
+ // case 0: case 1:
+ // mask = WaveMaskMatch(activeMaskOnRegionEntry, 0xA);
+ // A(mask);
+ // break;
+ //
+ // case 2: default:
+ // mask = WaveMaskMatch(activeMaskOnRegionEntry, 0xB);
+ // B(mask);
+ // break;
+ // }
+ //
+ // This second approach avoids adding additional control flow
+ // operations before the `switch`, but does so at the cost of
+ // having distinct textual call sites for a collective that
+ // need to work together.
+ //
+ // We need to pick and implement one of these strategies
+ // (or something else entirely) in a future change.
+
+ // A `switch` instruction always has a `default` label, which
+ // is where control flow goes for values of the condition
+ // that weren't explicitly handled.
+ //
+ // Note: even if the high-level-language `switch` didn't have
+ // a `default` label, the IR one will. During initial IR generation,
+ // the `default` label for such a `switch` will be the same as
+ // the `break` label, but our pass to break pseudo-critical edges
+ // will guarantee that every `switch` has a `default` label distinct
+ // from its `break` label.
+ //
+ // We will handle the conditional edge leading the `default` label
+ // before looking at any of the explicit `case`s.
+ //
+ auto defaultLabel = switchInst->getDefaultLabel();
+ transformConditionalEdge(&switchRegion, terminator, defaultLabel, matchingMask);
+
+ // One wrinkle that arises when processing a `switch` statement is
+ // that multiple `case`s may branch to same target label, and the
+ // label for `case`s can be the same as the `default` label.
+ //
+ // Our approach in `transformConditionalEdge` currently assumes that
+ // for each block reached via a conditional edge, the `transformConditionalEdge`
+ // function is only called once.
+ //
+ // To avoid violating this assumption, we will make sure to only
+ // call `transformConditionalEdge` once for each unique target block
+ // of the `switch`.
+ //
+ // In order to make that guarantee, we rely on the invariant (provided
+ // by our IR generation, maintained by our IR passes, and currently already
+ // assumed by our emit logic) that the `case`s of a `switch` are stored
+ // in an order such that:
+ //
+ // * All cases that branch to the same label are contiguous in the list of cases
+ //
+ // * If the cases with label A can fall through to the cases with label B, then
+ // the A cases immediately precede the B cases in the list of cases.
+ //
+ // We can thus simply track the label of the last case we processed,
+ // and detect duplicates by comparing to it (and the `default` label).
+ //
+ IRBlock* prevLabel = nullptr;
+ UInt caseCount = switchInst->getCaseCount();
+ for( UInt ii = 0; ii < caseCount; ++ii )
+ {
+ auto caseLabel = switchInst->getCaseLabel(ii);
+
+ // As discussed above, we only process the first `case` with
+ // a given label, since all of the edges will pass along the
+ // same mask.
+ //
+ if(caseLabel == prevLabel)
+ continue;
+ prevLabel = caseLabel;
+
+ // Similarly, we skip `case`s that branch to the same label
+ // as a `default`, since the `default` label was already
+ // processed above.
+ //
+ if(caseLabel == defaultLabel)
+ continue;
+
+ transformConditionalEdge(&switchRegion, terminator, caseLabel, matchingMask);
+
+ // TODO: One issue that is getting ignored in the current
+ // code is "non-trivial fall-through," where one `case`
+ // executes some amount of code before falling through to
+ // another:
+ //
+ // switch(x)
+ // {
+ // case 0:
+ // A(WaveGetActiveMask());
+ // case 1:
+ // B(WaveGetActiveMask());
+ // break;
+ // ...
+ // }
+ //
+ // In the scenario above it is clear what the desired
+ // active mask is for the call to `A()` (the set of lanes
+ // that had `x==0`), but the active mask when calling
+ // `B()` is a bit thornier question.
+ //
+ // The easiest implementation choice is to just let
+ // `B()` see two active masks: one for the `x==0` lanes,
+ // and another for the `x==1` lanes, and it could be
+ // argued that this matches the programmer's view of
+ // how things diverged at the `switch`.
+ //
+ // The alternative view is that the lanes with `x==0`
+ // should re-converge with the lanes with `x==1` before
+ // executing the call to `B()`.
+ //
+ // Achieving the more intuitive behavior for fall-through
+ // naively seems to require rewriting the control flow
+ // and introducing nested conditionals (where the nesting
+ // depth relates to the length of the chain of fall-throughs).
+ //
+ // More work is required in order to figure out a good
+ // strategy for dealing with fall-through in the general case.
+ //
+ // For now we can ignore this issue since Slang
+ // does not support fall-through in `switch` statements,
+ // but if/when we do, we will need to make a policy
+ // decision on this issue.
+ }
+
+ // Just like for `ifElse`, once we have processed the outgoing
+ // conditional edges, we need to process all child regions.
+ //
+ transformChildRegions(&switchRegion, region);
+ }
+ break;
+
+ }
+ }
+
+ // During the presentation of `transformRegion` we deferred
+ // the details of how to deal with edges and child regions to
+ // subroutines, which we will begin to work through now.
+ //
+ // The case of conditional edges is the easiest.
+ //
+ void transformConditionalEdge(RegionInfo* fromRegion, IRTerminatorInst* terminator, IRBlock* toBlock, IRInst* fromActiveMask)
+ {
+ SLANG_UNUSED(fromRegion);
+ SLANG_UNUSED(terminator);
+
+ // Because of the way that we broke critical edges, block with the
+ // conditional branch (the entry block to `fromRegion`) had better
+ // be the immediate dominator of the block being branched to.
+ //
+ SLANG_ASSERT(m_dominatorTree->immediatelyDominates(fromRegion->entryBlock, toBlock));
+
+ // If the block being branched from immediately dominates the block being
+ // branched to, that makes it the only predecessor of the `toBlock`.
+ //
+ // As such, the `toBlock` can't have had an SSA parameter/phi introduced
+ // to receive the block, and the only value needed to represent the
+ // active mask on input to `toBlock` is the `fromActiveMask` being
+ // provided as part of the conditional branch.
+ //
+ m_activeMaskForBlock.Add(toBlock, fromActiveMask);
+ }
+
+ // Unconditional edges are more complicated than conditional
+ // edges, because they may branch out of a control-flow
+ // region and/or branch to a block with multiple predecessors.
+ //
+ void transformUnconditionalEdge(RegionInfo* fromRegion, IRTerminatorInst* terminator, IRBlock* toBlock, IRInst* fromActiveMask)
+ {
+ IRBuilder builder;
+ builder.sharedBuilder = m_sharedBuilder;
+ builder.setInsertBefore(terminator);
+
+ // The context here is that the `terminator` instruction,
+ // at the end of the first block of `fromRegion` is
+ // branching to `toBlock`. The `fromActiveMask` is the
+ // active mask that was in place when executing the
+ // first block of `fromRegion`, and thus the active
+ // mask used when executing the `terminator`.
+ //
+ // One task we need to deal with is setting up
+ // the active mask that should be used when executing
+ // the `toBlock`. We start by assuming that the active
+ // mask does not change when control flow moves from
+ // one block to the next (we will then deal with all
+ // the cases where this assuption isn't true).
+ //
+ IRInst* toActiveMask = fromActiveMask;
+
+ // The most important thing we need to deal with is
+ // that an unconditional edge may exit a control-flow
+ // region. We know that the block being branched from
+ // is inside `fromRegion`, and thus recursively inside
+ // all the ancestors of that region.
+ //
+ // We will walk through the regions from the inner-most
+ // to the outer-most and check if we are exiting each
+ // region in turn.
+ //
+ for( RegionInfo* r = fromRegion; r; r = r->parentRegion )
+ {
+ // To know if we are exiting the region `r`, we
+ // need to look at its merge block.
+ //
+ auto mergeBlock = r->mergeBlock;
+
+ // One subtle detail here is that we might technically
+ // be exiting a region A with merge block M, but the
+ // parent of A is a region B that *also* has M as
+ // its merge block.
+ //
+ // In such a case we really don't want/need to deal
+ // with any issues around reconvergence at M for A,
+ // since we can instead rely on the reconvergence logic
+ // for B to do all the work.
+ //
+ // In practical terms, this means that we skipp any
+ // region that has a parent region with the same merge
+ // block.
+ //
+ // TODO: We could try to find ways to eliminate this
+ // situation as part of the structure of regions,
+ // so that code like this doesn't need the ad hoc check.
+ //
+ auto parentRegion = r->parentRegion;
+ if(parentRegion && parentRegion->mergeBlock == mergeBlock)
+ continue;
+
+ // We need to know if the branch exits the region `r`
+ // and, if it does, we need to know whether it exits
+ // the region normally or not.
+ //
+ // If the block being branched to is *inside* region `r`
+ // (which can only be the case for a non-null target
+ // block), then we clearly aren't exiting region `r`.
+ //
+ if(toBlock && isBlockInRegion(toBlock, r))
+ {
+ // Furthermore, if we aren't exiting region `r`,
+ // then we must not be exiting any of its parent
+ // regions, since our regions are strictly
+ // nested.
+ //
+ // We thus don't need to process any more
+ // regions to check if we are exiting them.
+ //
+ break;
+ }
+ //
+ // A normal exit from the region can be detected easily:
+ // it is any case where the destination of the branch is
+ // the merge block for the region.
+ //
+ else if( toBlock == mergeBlock )
+ {
+ // In this case we are jumping to the dedicated
+ // merge point of region `r`, which means we need
+ // to re-converge with all the other threads/lanes
+ // that entered the region together, and which are
+ // also exiting normally.
+ //
+ // It is possible that the `mergeBlock` because
+ // it representing the merge point upon return from
+ // the function, so we guard against that case.
+ //
+ if( mergeBlock )
+ {
+ // Otherwise, if we are branching to a non-null
+ // merge point, then we emit a ballot operation
+ // that will detect all the other threads/lanes:
+ //
+ // toActiveMask = waveMaskBallot(activeMaskOnEntry, true);
+ //
+ // This call will synchronize with all the threads/lanes
+ // that entered region `r`, and will collect the
+ // mask representing the threads/lanes that passed
+ // in `true` because they want to re-converge. That
+ // mask should be used as the new active mask when
+ // executing the merge block.
+ //
+ // TODO: Instead of emitting this `waveMaskBallot`
+ // on every edge that branches to the merge point,
+ // we could instead emit a single `waveMaskBallot`
+ // at the start of the merge point instead. This
+ // would be a good optimization to make, but requires
+ // extra logic throughout this algorithm to manage.
+ //
+ toActiveMask = builder.emitWaveMaskBallot(
+ m_maskType,
+ r->activeMaskOnEntry,
+ builder.getBoolValue(true));
+ }
+
+ // If we are exiting a region normally, then we can't
+ // also be exiting its parent region(s), because we
+ // eliminated the case of regions that have the same
+ // merge point as their parent at the top of our loop.
+ //
+ // Thus we are done checking if regions have been
+ // exited.
+ //
+ break;
+ }
+ else
+ {
+ // Finally, if we aren't staying in the region,
+ // but we aren't jumping to the merge point, then
+ // we must be exiting the region "abnormally."
+ //
+ // In this case, we need to coordinate with the
+ // other threads/lanes that entered the region
+ // to let them know we won't be reconverging
+ // with them.
+ //
+ // The other threads will be executing a `waveMaskBallot`
+ // to compute the active mask to use, so we can
+ // participate in that same ballot:
+ //
+ // waveMaskBallot(activeMaskOnEntry, false);
+ //
+ // We don't care about the mask that is returned from
+ // the ballot operation, since it will only represent
+ // the active mask for threads that wanted to re-converge.
+ //
+ builder.emitWaveMaskBallot(m_maskType,
+ r->activeMaskOnEntry,
+ builder.getBoolValue(false));
+ }
+ }
+
+ // After we've checked all the outer regions to see
+ // which (if any) we are exiting, we should have
+ // a ussable value for `toActiveMask` that we want
+ // to pass along to the target block.
+ //
+ // The way that we pass things along to the target
+ // block will vary a lot based on its form.
+ //
+ if( !toBlock )
+ {
+ // First, if the target block is null, then
+ // the branch is exiting the current function
+ // completely, so we don't need to wire up an
+ // active mask at all.
+ }
+ else if( toBlock->getPredecessors().getCount() > 1 )
+ {
+ // If the target block is one with multiple
+ // predecessors, such that it will have an
+ // added block parameter (phi node) to select
+ // the corect mask value, then we need to
+ // pass along the mask value to use as an
+ // additional argument on the unconditional branch.
+ //
+ // If the old unconditional branch was:
+ //
+ // <op>(arg0, arg1, arg2, ...);
+ //
+ // Then our new branch will be:
+ //
+ // <op>(arg0, arg1, arg2, ..., toActiveMask);
+ //
+ List<IRInst*> newOperands;
+ UInt oldOperandCount = terminator->getOperandCount();
+ for( UInt i = 0; i < oldOperandCount; ++i )
+ {
+ newOperands.add(terminator->getOperand(i));
+ }
+ newOperands.add(toActiveMask);
+
+ IRInst* newTerminator = builder.emitIntrinsicInst(
+ terminator->getFullType(),
+ terminator->op,
+ newOperands.getCount(),
+ newOperands.getBuffer());
+
+ terminator->replaceUsesWith(newTerminator);
+ terminator->removeAndDeallocate();
+ }
+ else
+ {
+ // If the target block has only a single predecessor,
+ // then it means it must be immediately dominated
+ // by the block we are branching from.
+ //
+ // As such, this is the only branch that wants to
+ // establish an active mask value for the target
+ // block, and we can just bind the comptue value
+ // directly.
+ //
+ m_activeMaskForBlock.Add(toBlock, toActiveMask);
+ }
+ }
+
+ // During the course of transforming a control-flow region,
+ // we deferred the processing of its child regions to
+ // a subroutine.
+ //
+ void transformChildRegions(RegionInfo* innerRegion, RegionInfo* outerRegion)
+ {
+ // The input to this function represents a (closed) range of regions
+ // between `innerRegion` and `outerMergePoint`.
+ //
+ // The `innerRegion` represents the body of some control-flow construct,
+ // and can be used to decide which basic blocks are inside the construct
+ // and thus need to be processed as child regions.
+ //
+ // The `outerRegion` represents the outer context in which the
+ // control-flow construct was found, and determines how control-flow
+ // entered the construct.
+ //
+ // If `innerRegion == outerRegion` then there is only one region
+ // in the range, while if `innerRegion != outerREgion` then the range
+ // comprises: `innerRegion`, `innerRegion->parent`, ... `outerRegion`.
+ //
+ // There are two kinds of child regions we need to concern ourselves
+ // with, and they relate to two kinds of blocks:
+ //
+ // * Blocks that are inside the inner region represent the start
+ // of their own nested single-entry multi-exit regions. We need
+ // to process these while building up the correct parent/child hierarchy.
+ //
+ // * The merge block(s) for our range of regions (from inner to
+ // outer), which represent code "after" each of the regions,
+ // that is still nested in the same parent region.
+ //
+ // We start by enumerating the first kind of child region.
+ // To make sure that we only consider blocks that represent the
+ // immediate children of `innerRegion`, and not other descendents,
+ // we will only look for regions that start with blocks that
+ // are children in the dominator tree for the entry block of
+ // our overall range of regions.
+ //
+ IRBlock* entryBlock = outerRegion->entryBlock;
+ for( auto childBlock : m_dominatorTree->getImmediatelyDominatedBlocks(entryBlock) )
+ {
+ // We don't want to consider blocks that are outside of
+ // the inner region here (they could be, e.g., the merge
+ // point of the region, or just blocks inside some
+ // nested control-flow construct).
+ //
+ if(!isBlockInRegion(childBlock, innerRegion))
+ continue;
+
+ // If we do find a child region, then we want to process it as
+ // a direct child of our inner region.
+ //
+ transformChildRegion(childBlock, innerRegion);
+ }
+
+ // Once we've dealt with the child regions that involve
+ // traversing deeper down the tree of regions, we need
+ // to deal with merge blocks, which represent siblings
+ // of regions in our range.
+ //
+ // We will walk the regions in our range from the inner-most
+ // to the outer-most and look at their merge blocks.
+ //
+ for( auto mergeRegion = innerRegion; mergeRegion; mergeRegion = mergeRegion->parentRegion )
+ {
+ // The merge point of a region forms the start of a
+ // sibling region that shares the same parent region.
+ //
+ auto parentRegion = mergeRegion->parentRegion;
+
+ // Well, actually there is one special case we need to
+ // deal with: if the parent region of our region has
+ // the same merge point, then the merge region can't
+ // actually be one of its children.
+ //
+ if( parentRegion && parentRegion->mergeBlock == mergeRegion->mergeBlock)
+ {
+ // In the case where the parent region has the same
+ // merge point, we choose not to process it here
+ // (since that would be inaccurate), and assume that
+ // it will be handled when this loop processes that
+ // parent region (perhaps as part of the same call to
+ // `transformChildRegions()`, and perhaps as part of
+ // another call).
+ }
+ else
+ {
+ // In the ordinary case where the parent region has a different
+ // merge point, then we know that the merge block for our region
+ // needs to be processed as a child of that parent region.
+ //
+ transformChildRegion(mergeRegion->mergeBlock, parentRegion);
+ }
+
+ // We bail out of this loop once we've processed all the regions
+ // in the range we've been asked to handle.
+ //
+ // Note: This check can't just be folded into the `for` loop
+ // condition because we need to make sure that we process
+ // at least one region in the case where `innerRegion == outerRegion`.
+ //
+ if(mergeRegion == outerRegion)
+ break;
+ }
+ }
+
+ void transformChildRegion(IRBlock* regionEntry, RegionInfo* parentRegion)
+ {
+ // This function gets called when we've disocvered a child region
+ // that should be processed recursively. The region that starts
+ // with `regionEntry` needs to be processed as a child of the
+ // given `parentRegion`.
+ //
+ // It is posible for this routine to be called to process the
+ // merge point for the entire function body, which is a null
+ // region. There is nothing that needs to be done in that case.
+ //
+ if(!regionEntry)
+ return;
+
+ // Similarly, we don't need to do anything for regions that don't
+ // need or use the active mask (even to pass along to their
+ // successors).
+ //
+ if(!doesBlockNeedActiveMask(regionEntry))
+ return;
+
+ // An important invariant of our approach is that before a child
+ // region is processed, an active mask value will have been
+ // established for it. The logic can be thought of as follows:
+ //
+ // * If the child region is one with multiple predecessors, then
+ // its active mask value comes in as a block parameter (phi node),
+ // and was set up before the recursive walk even started.
+ //
+ // * If the child region has only one predecessor, then that predecessor
+ // is its immediate dominator, and would be part of a parent region
+ // that has already been processed up the call stack. That parent
+ // region would have established the active mask value to use on input
+ // to each of its successors.
+ //
+ IRInst* activeMaskOnRegionEntry = nullptr;
+ if( !m_activeMaskForBlock.TryGetValue(regionEntry, activeMaskOnRegionEntry) )
+ {
+ SLANG_UNEXPECTED("no active mask registered for block");
+ }
+
+ // Once we've done the sanity checks and looked up
+ // the mask value to use on entry to this region, we
+ // can recursively process it (and by extension, its children).
+ //
+ RegionInfo region;
+ region.entryBlock = regionEntry;
+ region.activeMaskOnEntry = activeMaskOnRegionEntry;
+ region.mergeBlock = parentRegion->mergeBlock;
+ region.parentRegion = parentRegion;
+ transformRegion(&region);
+ }
+};
+
+// Now that we've defined the context for function-level transformation,
+// we can finally circle back and fill in the definition of the function
+// that the module-level pass uses to transform each function.
+//
+void SynthesizeActiveMaskForModuleContext::transformFuncUsingActiveMask(IRFunc* func)
+{
+ SynthesizeActiveMaskForFunctionContext context;
+ context.m_func = func;
+ context.m_sharedBuilder = &m_sharedBuilder;
+ context.m_maskType = m_maskType;
+
+ context.transformFunc();
+}
+
+// The public entry point for this pass is just a wrapper around
+// the context type for the module-level pass.
+//
+void synthesizeActiveMask(
+ IRModule* module,
+ DiagnosticSink* sink)
+{
+ SynthesizeActiveMaskForModuleContext context;
+ context.m_module = module;
+ context.m_sink = sink;
+ context.processModule();
+}
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-synthesize-active-mask.h b/source/slang/slang-ir-synthesize-active-mask.h
new file mode 100644
index 000000000..3eb60b350
--- /dev/null
+++ b/source/slang/slang-ir-synthesize-active-mask.h
@@ -0,0 +1,26 @@
+// slang-ir-synthesize-active-mask.h
+#pragma once
+
+namespace Slang
+{
+
+class Session;
+struct IRModule;
+class DiagnosticSink;
+
+ /// Synthesize values to represent the "active mask" for warp-/wave-level operations.
+ ///
+ /// This pass will transform all* functions in `module` that make use of the active
+ /// mask (directly or indirectly) so that they receive an explicit active mask as
+ /// an input. It will also transform the bodies of those functions to use that input
+ /// mask, or values derived from it, as the explicit mask for wave intrinsics.
+ ///
+ /// * Entry point functions will not be transformed to take an explicit mask, and
+ /// will instead be changed to compute the active mask to use as the first operation
+ /// in their body.
+ ///
+void synthesizeActiveMask(
+ IRModule* module,
+ DiagnosticSink* sink);
+
+}
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index 0a52561c0..57dac80b6 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -3464,6 +3464,55 @@ namespace Slang
return inst;
}
+ IRInst* IRBuilder::emitWaveMaskBallot(IRType* type, IRInst* mask, IRInst* condition)
+ {
+ auto inst = createInst<IRInst>(
+ this,
+ kIROp_WaveMaskBallot,
+ type,
+ mask,
+ condition);
+ addInst(inst);
+ return inst;
+ }
+
+ IRInst* IRBuilder::emitWaveMaskMatch(IRType* type, IRInst* mask, IRInst* value)
+ {
+ auto inst = createInst<IRInst>(
+ this,
+ kIROp_WaveMaskMatch,
+ type,
+ mask,
+ value);
+ addInst(inst);
+ return inst;
+ }
+
+ IRInst* IRBuilder::emitBitAnd(IRType* type, IRInst* left, IRInst* right)
+ {
+ auto inst = createInst<IRInst>(
+ this,
+ kIROp_BitAnd,
+ type,
+ left,
+ right);
+ addInst(inst);
+ return inst;
+ }
+
+ IRInst* IRBuilder::emitBitNot(IRType* type, IRInst* value)
+ {
+ auto inst = createInst<IRInst>(
+ this,
+ kIROp_BitNot,
+ type,
+ value);
+ addInst(inst);
+ return inst;
+ }
+
+
+
//
// Decorations
//
diff --git a/source/slang/slang-ir.h b/source/slang/slang-ir.h
index fbe05e8e8..9799201b3 100644
--- a/source/slang/slang-ir.h
+++ b/source/slang/slang-ir.h
@@ -751,6 +751,31 @@ struct IRParam : IRInst
IR_LEAF_ISA(Param)
};
+ /// A control-flow edge from one basic block to another
+struct IREdge
+{
+public:
+ IREdge()
+ {}
+
+ explicit IREdge(IRUse* use)
+ : m_use(use)
+ {}
+
+ IRBlock* getPredecessor() const;
+ IRBlock* getSuccessor() const;
+
+ IRUse* getUse() const
+ {
+ return m_use;
+ }
+
+ bool isCritical() const;
+
+private:
+ IRUse* m_use = nullptr;
+};
+
// A basic block is a parent instruction that adds the constraint
// that all the children need to be "ordinary" instructions (so
// no function declarations, or nested blocks). We also expect
@@ -849,6 +874,7 @@ struct IRBlock : IRInst
return use != that.use;
}
+ IREdge getEdge() const { return IREdge(use); }
IRUse* use;
};
@@ -878,6 +904,8 @@ struct IRBlock : IRInst
return use != that.use;
}
+ IREdge getEdge() const { return IREdge(use); }
+
IRUse* use;
UInt stride;
};
diff --git a/source/slang/slang.vcxproj b/source/slang/slang.vcxproj
index 1b391cac0..8c815e1e1 100644
--- a/source/slang/slang.vcxproj
+++ b/source/slang/slang.vcxproj
@@ -243,6 +243,7 @@
<ClInclude Include="slang-ir-string-hash.h" />
<ClInclude Include="slang-ir-strip-witness-tables.h" />
<ClInclude Include="slang-ir-strip.h" />
+ <ClInclude Include="slang-ir-synthesize-active-mask.h" />
<ClInclude Include="slang-ir-type-set.h" />
<ClInclude Include="slang-ir-union.h" />
<ClInclude Include="slang-ir-validate.h" />
@@ -324,6 +325,7 @@
<ClCompile Include="slang-ir-string-hash.cpp" />
<ClCompile Include="slang-ir-strip-witness-tables.cpp" />
<ClCompile Include="slang-ir-strip.cpp" />
+ <ClCompile Include="slang-ir-synthesize-active-mask.cpp" />
<ClCompile Include="slang-ir-type-set.cpp" />
<ClCompile Include="slang-ir-union.cpp" />
<ClCompile Include="slang-ir-validate.cpp" />
@@ -378,25 +380,25 @@
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-generate.exe</AdditionalInputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-generate.exe</AdditionalInputs>
</CustomBuild>
- </ItemGroup>
- <ItemGroup>
- <Natvis Include="..\core\core.natvis" />
- <Natvis Include="slang.natvis" />
<CustomBuild Include="slang-ast-reflect.h">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">"../../bin/windows-x86/debug/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"../../bin/windows-x64/debug/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">"../../bin/windows-x86/release/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"../../bin/windows-x64/release/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
- <Outputs>%(RootDir)%(Directory)/slang-ast-generated.h;%(RootDir)%(Directory)/slang-ast-generated-macro.h</Outputs>
+ <Outputs>slang-ast-generated.h;slang-ast-generated-macro.h</Outputs>
<Message>slang-cpp-extractor AST %(Identity)</Message>
- <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">../../bin/windows-x86/debug/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
- <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">../../bin/windows-x64/debug/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
- <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
- <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
+ <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">../../bin/windows-x86/debug/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+ <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">../../bin/windows-x64/debug/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+ <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+ <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
</CustomBuild>
</ItemGroup>
<ItemGroup>
+ <Natvis Include="..\core\core.natvis" />
+ <Natvis Include="slang.natvis" />
+ </ItemGroup>
+ <ItemGroup>
<ProjectReference Include="..\core\core.vcxproj">
<Project>{F9BE7957-8399-899E-0C49-E714FDDD4B65}</Project>
</ProjectReference>
diff --git a/source/slang/slang.vcxproj.filters b/source/slang/slang.vcxproj.filters
index c2c9268dc..31e03d73d 100644
--- a/source/slang/slang.vcxproj.filters
+++ b/source/slang/slang.vcxproj.filters
@@ -180,6 +180,9 @@
<ClInclude Include="slang-ir-strip.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="slang-ir-synthesize-active-mask.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
<ClInclude Include="slang-ir-type-set.h">
<Filter>Header Files</Filter>
</ClInclude>
@@ -419,6 +422,9 @@
<ClCompile Include="slang-ir-strip.cpp">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="slang-ir-synthesize-active-mask.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
<ClCompile Include="slang-ir-type-set.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@@ -505,6 +511,9 @@
<CustomBuild Include="hlsl.meta.slang">
<Filter>Source Files</Filter>
</CustomBuild>
+ <CustomBuild Include="slang-ast-reflect.h">
+ <Filter>Header Files</Filter>
+ </CustomBuild>
</ItemGroup>
<ItemGroup>
<Natvis Include="..\core\core.natvis">
@@ -513,11 +522,5 @@
<Natvis Include="slang.natvis">
<Filter>Source Files</Filter>
</Natvis>
- <CustomBuild Include="hlsl.meta.slang">
- <Filter>Source Files</Filter>
- </CustomBuild>
- <CustomBuild Include="slang-ast-reflect.h">
- <Filter>Header Files</Filter>
- </CustomBuild>
</ItemGroup>
</Project> \ No newline at end of file