39 files changed, 3612 insertions, 171 deletions
diff --git a/slang.sln b/slang.sln
index d710b902e..40e7beaa0 100644
--- a/slang.sln
+++ b/slang.sln
@@ -1,41 +1,41 @@
 
 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 14
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "core", "source\core\core.vcxproj", "{F9BE7957-8399-899E-0C49-E714FDDD4B65}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello-world", "examples\hello-world\hello-world.vcxproj", "{010BE414-ED5B-CF56-16C0-BD18027062C0}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "model-viewer", "examples\model-viewer\model-viewer.vcxproj", "{2F8724C6-1BC3-2730-84D5-3F277030D04A}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu-hello-world", "examples\cpu-hello-world\cpu-hello-world.vcxproj", "{4B47A364-37C4-96A7-6041-97BB4C1D333B}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gpu-printing", "examples\gpu-printing\gpu-printing.vcxproj", "{57C81DD3-4304-213D-AC16-39349871C957}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu-hello-world", "examples\cpu-hello-world\cpu-hello-world.vcxproj", "{4B47A364-37C4-96A7-6041-97BB4C1D333B}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello-world", "examples\hello-world\hello-world.vcxproj", "{010BE414-ED5B-CF56-16C0-BD18027062C0}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "core", "source\core\core.vcxproj", "{F9BE7957-8399-899E-0C49-E714FDDD4B65}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "model-viewer", "examples\model-viewer\model-viewer.vcxproj", "{2F8724C6-1BC3-2730-84D5-3F277030D04A}"
 EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{FD47AE19-69FD-260F-F2F1-20E65EA61D13}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang", "source\slang\slang.vcxproj", "{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}"
+	ProjectSection(ProjectDependencies) = postProject
+		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC} = {CA8A30D1-8FA9-4330-B7F7-84709246D8DC}
+		{66174227-8541-41FC-A6DF-4764FC66F78E} = {66174227-8541-41FC-A6DF-4764FC66F78E}
+	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-cpp-extractor", "tools\slang-cpp-extractor\slang-cpp-extractor.vcxproj", "{CA8A30D1-8FA9-4330-B7F7-84709246D8DC}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slangc", "source\slangc\slangc.vcxproj", "{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-generate", "tools\slang-generate\slang-generate.vcxproj", "{66174227-8541-41FC-A6DF-4764FC66F78E}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test-tool", "test-tool", "{57B5AA5E-C340-1823-CC51-9B17385C7423}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-test", "tools\slang-test\slang-test.vcxproj", "{0C768A18-1D25-4000-9F37-DA5FE99E3B64}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "render-test-tool", "tools\render-test\render-test-tool.vcxproj", "{61F7EB00-7281-4BF3-9470-7C2EA92620C3}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfx", "tools\gfx\gfx.vcxproj", "{222F7498-B40C-4F3F-A704-DDEB91A4484A}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-reflection-test-tool", "tools\slang-reflection-test\slang-reflection-test-tool.vcxproj", "{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}"
 EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test-tool", "test-tool", "{57B5AA5E-C340-1823-CC51-9B17385C7423}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{FD47AE19-69FD-260F-F2F1-20E65EA61D13}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-reflection-test-tool", "tools\slang-reflection-test\slang-reflection-test-tool.vcxproj", "{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfx", "tools\gfx\gfx.vcxproj", "{222F7498-B40C-4F3F-A704-DDEB91A4484A}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "render-test-tool", "tools\render-test\render-test-tool.vcxproj", "{61F7EB00-7281-4BF3-9470-7C2EA92620C3}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-cpp-extractor", "tools\slang-cpp-extractor\slang-cpp-extractor.vcxproj", "{CA8A30D1-8FA9-4330-B7F7-84709246D8DC}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slangc", "source\slangc\slangc.vcxproj", "{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-generate", "tools\slang-generate\slang-generate.vcxproj", "{66174227-8541-41FC-A6DF-4764FC66F78E}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang", "source\slang\slang.vcxproj", "{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}"
-	ProjectSection(ProjectDependencies) = postProject
-		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC} = {CA8A30D1-8FA9-4330-B7F7-84709246D8DC}
-		{66174227-8541-41FC-A6DF-4764FC66F78E} = {66174227-8541-41FC-A6DF-4764FC66F78E}
-	EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang-test", "tools\slang-test\slang-test.vcxproj", "{0C768A18-1D25-4000-9F37-DA5FE99E3B64}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -45,6 +45,30 @@ Global
 		Release|x64 = Release|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|Win32.Build.0 = Debug|Win32
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|x64.ActiveCfg = Debug|x64
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|x64.Build.0 = Debug|x64
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|Win32.ActiveCfg = Release|Win32
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|Win32.Build.0 = Release|Win32
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|x64.ActiveCfg = Release|x64
+		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|x64.Build.0 = Release|x64
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|Win32.ActiveCfg = Debug|Win32
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|Win32.Build.0 = Debug|Win32
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|x64.ActiveCfg = Debug|x64
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|x64.Build.0 = Debug|x64
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|Win32.ActiveCfg = Release|Win32
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|Win32.Build.0 = Release|Win32
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|x64.ActiveCfg = Release|x64
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|x64.Build.0 = Release|x64
+		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|Win32.ActiveCfg = Debug|Win32
+		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|Win32.Build.0 = Debug|Win32
+		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|x64.ActiveCfg = Debug|x64
+		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|x64.Build.0 = Debug|x64
+		{57C81DD3-4304-213D-AC16-39349871C957}.Release|Win32.ActiveCfg = Release|Win32
+		{57C81DD3-4304-213D-AC16-39349871C957}.Release|Win32.Build.0 = Release|Win32
+		{57C81DD3-4304-213D-AC16-39349871C957}.Release|x64.ActiveCfg = Release|x64
+		{57C81DD3-4304-213D-AC16-39349871C957}.Release|x64.Build.0 = Release|x64
 		{010BE414-ED5B-CF56-16C0-BD18027062C0}.Debug|Win32.ActiveCfg = Debug|Win32
 		{010BE414-ED5B-CF56-16C0-BD18027062C0}.Debug|Win32.Build.0 = Debug|Win32
 		{010BE414-ED5B-CF56-16C0-BD18027062C0}.Debug|x64.ActiveCfg = Debug|x64
@@ -61,30 +85,46 @@ Global
 		{2F8724C6-1BC3-2730-84D5-3F277030D04A}.Release|Win32.Build.0 = Release|Win32
 		{2F8724C6-1BC3-2730-84D5-3F277030D04A}.Release|x64.ActiveCfg = Release|x64
 		{2F8724C6-1BC3-2730-84D5-3F277030D04A}.Release|x64.Build.0 = Release|x64
-		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|Win32.ActiveCfg = Debug|Win32
-		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|Win32.Build.0 = Debug|Win32
-		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|x64.ActiveCfg = Debug|x64
-		{57C81DD3-4304-213D-AC16-39349871C957}.Debug|x64.Build.0 = Debug|x64
-		{57C81DD3-4304-213D-AC16-39349871C957}.Release|Win32.ActiveCfg = Release|Win32
-		{57C81DD3-4304-213D-AC16-39349871C957}.Release|Win32.Build.0 = Release|Win32
-		{57C81DD3-4304-213D-AC16-39349871C957}.Release|x64.ActiveCfg = Release|x64
-		{57C81DD3-4304-213D-AC16-39349871C957}.Release|x64.Build.0 = Release|x64
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|Win32.ActiveCfg = Debug|Win32
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|Win32.Build.0 = Debug|Win32
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|x64.ActiveCfg = Debug|x64
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Debug|x64.Build.0 = Debug|x64
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|Win32.ActiveCfg = Release|Win32
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|Win32.Build.0 = Release|Win32
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|x64.ActiveCfg = Release|x64
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B}.Release|x64.Build.0 = Release|x64
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|Win32.ActiveCfg = Debug|Win32
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|Win32.Build.0 = Debug|Win32
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|x64.ActiveCfg = Debug|x64
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Debug|x64.Build.0 = Debug|x64
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|Win32.ActiveCfg = Release|Win32
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|Win32.Build.0 = Release|Win32
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|x64.ActiveCfg = Release|x64
-		{F9BE7957-8399-899E-0C49-E714FDDD4B65}.Release|x64.Build.0 = Release|x64
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|Win32.ActiveCfg = Debug|Win32
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|Win32.Build.0 = Debug|Win32
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|x64.ActiveCfg = Debug|x64
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|x64.Build.0 = Debug|x64
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|Win32.ActiveCfg = Release|Win32
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|Win32.Build.0 = Release|Win32
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|x64.ActiveCfg = Release|x64
+		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|x64.Build.0 = Release|x64
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|Win32.Build.0 = Debug|Win32
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|x64.ActiveCfg = Debug|x64
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|x64.Build.0 = Debug|x64
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|Win32.ActiveCfg = Release|Win32
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|Win32.Build.0 = Release|Win32
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|x64.ActiveCfg = Release|x64
+		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|x64.Build.0 = Release|x64
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|Win32.ActiveCfg = Debug|Win32
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|Win32.Build.0 = Debug|Win32
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|x64.ActiveCfg = Debug|x64
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|x64.Build.0 = Debug|x64
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|Win32.ActiveCfg = Release|Win32
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|Win32.Build.0 = Release|Win32
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|x64.ActiveCfg = Release|x64
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|x64.Build.0 = Release|x64
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|Win32.Build.0 = Debug|Win32
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|x64.ActiveCfg = Debug|x64
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|x64.Build.0 = Debug|x64
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|Win32.ActiveCfg = Release|Win32
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|Win32.Build.0 = Release|Win32
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|x64.ActiveCfg = Release|x64
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|x64.Build.0 = Release|x64
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|Win32.ActiveCfg = Debug|Win32
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|Win32.Build.0 = Debug|Win32
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|x64.ActiveCfg = Debug|x64
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|x64.Build.0 = Debug|x64
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|Win32.ActiveCfg = Release|Win32
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|Win32.Build.0 = Release|Win32
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|x64.ActiveCfg = Release|x64
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|x64.Build.0 = Release|x64
 		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC}.Debug|Win32.ActiveCfg = Debug|Win32
 		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC}.Debug|Win32.Build.0 = Debug|Win32
 		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC}.Debug|x64.ActiveCfg = Debug|x64
@@ -109,60 +149,20 @@ Global
 		{0C768A18-1D25-4000-9F37-DA5FE99E3B64}.Release|Win32.Build.0 = Release|Win32
 		{0C768A18-1D25-4000-9F37-DA5FE99E3B64}.Release|x64.ActiveCfg = Release|x64
 		{0C768A18-1D25-4000-9F37-DA5FE99E3B64}.Release|x64.Build.0 = Release|x64
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|Win32.ActiveCfg = Debug|Win32
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|Win32.Build.0 = Debug|Win32
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|x64.ActiveCfg = Debug|x64
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Debug|x64.Build.0 = Debug|x64
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|Win32.ActiveCfg = Release|Win32
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|Win32.Build.0 = Release|Win32
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|x64.ActiveCfg = Release|x64
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A}.Release|x64.Build.0 = Release|x64
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|Win32.ActiveCfg = Debug|Win32
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|Win32.Build.0 = Debug|Win32
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|x64.ActiveCfg = Debug|x64
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Debug|x64.Build.0 = Debug|x64
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|Win32.ActiveCfg = Release|Win32
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|Win32.Build.0 = Release|Win32
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|x64.ActiveCfg = Release|x64
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F}.Release|x64.Build.0 = Release|x64
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|Win32.ActiveCfg = Debug|Win32
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|Win32.Build.0 = Debug|Win32
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|x64.ActiveCfg = Debug|x64
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Debug|x64.Build.0 = Debug|x64
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|Win32.ActiveCfg = Release|Win32
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|Win32.Build.0 = Release|Win32
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|x64.ActiveCfg = Release|x64
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3}.Release|x64.Build.0 = Release|x64
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|Win32.ActiveCfg = Debug|Win32
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|Win32.Build.0 = Debug|Win32
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|x64.ActiveCfg = Debug|x64
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Debug|x64.Build.0 = Debug|x64
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|Win32.ActiveCfg = Release|Win32
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|Win32.Build.0 = Release|Win32
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|x64.ActiveCfg = Release|x64
-		{D56CBCEB-1EB5-4CA8-AEC4-48EA35ED61C7}.Release|x64.Build.0 = Release|x64
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|Win32.ActiveCfg = Debug|Win32
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|Win32.Build.0 = Debug|Win32
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|x64.ActiveCfg = Debug|x64
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Debug|x64.Build.0 = Debug|x64
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|Win32.ActiveCfg = Release|Win32
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|Win32.Build.0 = Release|Win32
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|x64.ActiveCfg = Release|x64
-		{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
 	GlobalSection(NestedProjects) = preSolution
+		{4B47A364-37C4-96A7-6041-97BB4C1D333B} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
+		{57C81DD3-4304-213D-AC16-39349871C957} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
 		{010BE414-ED5B-CF56-16C0-BD18027062C0} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
 		{2F8724C6-1BC3-2730-84D5-3F277030D04A} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
-		{57C81DD3-4304-213D-AC16-39349871C957} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
-		{4B47A364-37C4-96A7-6041-97BB4C1D333B} = {EB5FC2C6-D72D-B6CC-C0C1-26F3AC2E9231}
+		{61F7EB00-7281-4BF3-9470-7C2EA92620C3} = {57B5AA5E-C340-1823-CC51-9B17385C7423}
+		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F} = {57B5AA5E-C340-1823-CC51-9B17385C7423}
+		{222F7498-B40C-4F3F-A704-DDEB91A4484A} = {FD47AE19-69FD-260F-F2F1-20E65EA61D13}
 		{CA8A30D1-8FA9-4330-B7F7-84709246D8DC} = {FD47AE19-69FD-260F-F2F1-20E65EA61D13}
 		{66174227-8541-41FC-A6DF-4764FC66F78E} = {FD47AE19-69FD-260F-F2F1-20E65EA61D13}
 		{0C768A18-1D25-4000-9F37-DA5FE99E3B64} = {FD47AE19-69FD-260F-F2F1-20E65EA61D13}
-		{222F7498-B40C-4F3F-A704-DDEB91A4484A} = {FD47AE19-69FD-260F-F2F1-20E65EA61D13}
-		{C5ACCA6E-C04D-4B36-8516-3752B3C13C2F} = {57B5AA5E-C340-1823-CC51-9B17385C7423}
-		{61F7EB00-7281-4BF3-9470-7C2EA92620C3} = {57B5AA5E-C340-1823-CC51-9B17385C7423}
 	EndGlobalSection
 EndGlobal
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index fcd028b6e..ac49402a1 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2528,12 +2528,17 @@ __target_intrinsic(cuda, "__activemask()")
 __target_intrinsic(hlsl, "WaveActiveBallot(true).x")
 WaveMask WaveGetConvergedMask();
 
+__intrinsic_op($(kIROp_WaveGetActiveMask))
+WaveMask __WaveGetActiveMask();
+
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBallot(true).x")
 __target_intrinsic(hlsl, "WaveActiveBallot(true).x")
-__target_intrinsic(cuda, "__activemask()") // Note: semantically incorrect, but best we can do for now.
-WaveMask WaveGetActiveMask();
+WaveMask WaveGetActiveMask()
+{
+    return __WaveGetActiveMask();
+}
 
 __glsl_extension(GL_KHR_shader_subgroup_basic)
 __spirv_version(1.3)
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index fa63ba255..d8f910faa 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -518,6 +518,31 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             m_writer->emit("\n}");
             return true;
         }
+        case kIROp_WaveMaskBallot:
+        {
+            m_writer->emit("__ballot_sync(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(", ");
+            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+            m_writer->emit(")");
+            return true;
+        }
+        case kIROp_WaveMaskMatch:
+        {
+            SemanticVersion version;
+            version.set(7, 0);
+            if (version > m_extensionTracker->m_smVersion)
+            {
+                m_extensionTracker->m_smVersion = version;
+            }
+
+            m_writer->emit("__match_any_sync(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(", ");
+            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+            m_writer->emit(")");
+            return true;
+        }
         default: break;
     }
 
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index efa56c261..6c29485cd 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -17,6 +17,7 @@
 #include "slang-ir-specialize-resources.h"
 #include "slang-ir-ssa.h"
 #include "slang-ir-strip-witness-tables.h"
+#include "slang-ir-synthesize-active-mask.h"
 #include "slang-ir-union.h"
 #include "slang-ir-validate.h"
 #include "slang-ir-wrap-structured-buffers.h"
@@ -500,6 +501,31 @@ Result linkAndOptimizeIR(
         legalizeByteAddressBufferOps(session, irModule, byteAddressBufferOptions);
     }
 
+    // For CUDA targets only, we will need to turn operations
+    // the implicitly reference the "active mask" into ones
+    // that use (and pass around) an explicit mask instead.
+    //
+    switch(target)
+    {
+    case CodeGenTarget::CUDASource:
+    case CodeGenTarget::PTX:
+        {
+            synthesizeActiveMask(
+                irModule,
+                compileRequest->getSink());
+
+#if 0
+            dumpIRIfEnabled(compileRequest, irModule, "AFTER synthesizeActiveMask");
+#endif
+            validateIRModuleIfEnabled(compileRequest, irModule);
+
+        }
+        break;
+
+    default:
+        break;
+    }
+
     // For GLSL only, we will need to perform "legalization" of
     // the entry point and any entry-point parameters.
     //
diff --git a/source/slang/slang-ir-dominators.cpp b/source/slang/slang-ir-dominators.cpp
index 7960bcaf1..bae9f772f 100644
--- a/source/slang/slang-ir-dominators.cpp
+++ b/source/slang/slang-ir-dominators.cpp
@@ -34,6 +34,27 @@ bool IRDominatorTree::immediatelyDominates(IRBlock* dominator, IRBlock* dominate
 
 bool IRDominatorTree::properlyDominates(IRBlock* dominator, IRBlock* dominated)
 {
+    // We need to deal with the cases where `dominator` and/or
+    // `dominated` are unreachable, and thus not represtend
+    // in the nodes of the dominator tree we constructed.
+    //
+    // If `dominated` is unreachable, then there are zero
+    // control flow paths that can reach it, so that *all*
+    // of those (zero) control flow paths go through
+    // `dominator`.
+    //
+    if(isUnreachable(dominated))
+        return true;
+
+    // If `dominated` is reachable then there must exist at least
+    // one control-flow path to it. Thus if `dominator` is not
+    // reachable, it cannot be on that path, and thus must
+    // not be a dominator.
+    //
+    if(isUnreachable(dominator))
+        return false;
+
+
     // Because of how we laid out the tree, we can test if one node
     // properly dominates another in constant time.
     //
@@ -67,6 +88,11 @@ bool IRDominatorTree::dominates(IRBlock* dominator, IRBlock* dominated)
 
 IRBlock* IRDominatorTree::getImmediateDominator(IRBlock* block)
 {
+    // An unreachable block has no immediate dominator.
+    //
+    if(isUnreachable(block))
+        return nullptr;
+
     // The immediate dominator of a block is its parent
     // in the dominator tree. Looking this up is straightforward,
     // and we just need to be a bit careful to deal with
@@ -83,6 +109,11 @@ IRBlock* IRDominatorTree::getImmediateDominator(IRBlock* block)
 
 IRDominatorTree::DominatedList IRDominatorTree::getImmediatelyDominatedBlocks(IRBlock* block)
 {
+    // An unreachable block doesn't immediately dominate anything.
+    //
+    if(isUnreachable(block))
+        return DominatedList();
+
     // Because of our representation, the immediately dominated blocks
     // for a node are contiguous, and we store their range in the
     // node already.
@@ -99,6 +130,13 @@ IRDominatorTree::DominatedList IRDominatorTree::getImmediatelyDominatedBlocks(IR
 
 IRDominatorTree::DominatedList IRDominatorTree::getProperlyDominatedBlocks(IRBlock* block)
 {
+    // Technically each unreachable block dominates all the other
+    // unreachable blocks, but setting things up to answer that
+    // query "correctly" would be a hassle.
+    //
+    if(isUnreachable(block))
+        return DominatedList();
+
     // Because of our representation, the properly dominated blocks
     // for a node are contiguous, and we store their range in the
     // node already.
@@ -123,6 +161,12 @@ Int IRDominatorTree::getBlockIndex(IRBlock* block)
     return index;
 }
 
+bool IRDominatorTree::isUnreachable(IRBlock* block)
+{
+    return !mapBlockToIndex.ContainsKey(block);
+}
+
+
 // IRDominatorTree::DominatedList
 
 IRDominatorTree::DominatedList::DominatedList()
@@ -181,6 +225,12 @@ bool IRDominatorTree::DominatedList::Iterator::operator==(Iterator const& that)
     return mIndex == that.mIndex;
 }
 
+bool IRDominatorTree::DominatedList::Iterator::operator!=(Iterator const& that) const
+{
+    SLANG_ASSERT(mTree == that.mTree);
+    return mIndex != that.mIndex;
+}
+
 //
 // The dominance computation algorithm we are using relies on being able to compute
 // a reverse postorder traversal of the nodes in the CFG, which is done using a depth-first
diff --git a/source/slang/slang-ir-dominators.h b/source/slang/slang-ir-dominators.h
index 7ec31b821..f4ecd0cd0 100644
--- a/source/slang/slang-ir-dominators.h
+++ b/source/slang/slang-ir-dominators.h
@@ -54,6 +54,9 @@ namespace Slang
         /// These are the descendents of the block in the dominator tree.
         DominatedList getProperlyDominatedBlocks(IRBlock* block);
 
+            /// Is `block` unrechable in the control flow graph?
+        bool isUnreachable(IRBlock* block);
+
         struct DominatedList
         {
         public:
@@ -67,6 +70,7 @@ namespace Slang
                 IRBlock* operator*() const;
                 void operator++();
                 bool operator==(Iterator const& that) const;
+                bool operator!=(Iterator const& that) const;
 
             private:
                 friend struct DominatedList;
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index 46ad566fd..b07703503 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -417,6 +417,14 @@ INST(Dot, dot, 2, 0)
 
 INST(GetStringHash, getStringHash, 1, 0)
 
+INST(WaveGetActiveMask, waveGetActiveMask, 0, 0)
+
+    /// trueMask = waveMaskBallot(mask, condition)
+INST(WaveMaskBallot, waveMaskBallot, 2, 0)
+
+    /// matchMask = waveMaskBallot(mask, value)
+INST(WaveMaskMatch, waveMaskMatch, 2, 0)
+
 // Texture sampling operation of the form `t.Sample(s,u)`
 INST(Sample, sample, 3, 0)
 
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index 881d7a93b..8b13f48a1 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -1509,6 +1509,9 @@ struct SharedIRBuilder
 
     // TODO: We probably shouldn't use this in the long run.
     Dictionary<void*,           IRLayout*>        layoutMap;
+
+
+    void insertBlockAlongEdge(IREdge const& edge);
 };
 
 struct IRBuilderSourceLocRAII;
@@ -2019,6 +2022,12 @@ struct IRBuilder
         IRType* type,
         IRInst* val);
 
+    IRInst* emitWaveMaskBallot(IRType* type, IRInst* mask, IRInst* condition);
+    IRInst* emitWaveMaskMatch(IRType* type, IRInst* mask, IRInst* value);
+
+    IRInst* emitBitAnd(IRType* type, IRInst* left, IRInst* right);
+    IRInst* emitBitNot(IRType* type, IRInst* value);
+
     //
     // Decorations
     //
diff --git a/source/slang/slang-ir-ssa.cpp b/source/slang/slang-ir-ssa.cpp
index 08a5ffa67..19f85eaa9 100644
--- a/source/slang/slang-ir-ssa.cpp
+++ b/source/slang/slang-ir-ssa.cpp
@@ -891,63 +891,127 @@ void processBlock(
     }
 }
 
-static void breakCriticalEdges(
-    ConstructSSAContext*    context)
+IRBlock* IREdge::getPredecessor() const
+{
+    return cast<IRBlock>(getUse()->getUser()->parent);
+}
+
+IRBlock* IREdge::getSuccessor() const
+{
+    return cast<IRBlock>(getUse()->get());
+}
+
+void SharedIRBuilder::insertBlockAlongEdge(
+    IREdge const&       edge)
+{
+    auto pred = edge.getPredecessor();
+    auto succ = edge.getSuccessor();
+    auto edgeUse = edge.getUse();
+
+    IRBuilder builder;
+    builder.sharedBuilder = this;
+    builder.setInsertInto(pred);
+
+    // Create a new block that will sit "along" the edge
+    IRBlock* edgeBlock = builder.createBlock();
+
+    edgeUse->debugValidate();
+
+    // The predecessor block should now branch to
+    // the edge block.
+    edgeUse->set(edgeBlock);
+
+    // The edge block should branch (unconditionally)
+    // to the successor block.
+    builder.setInsertInto(edgeBlock);
+    builder.emitBranch(succ);
+
+    // Insert the new block into the block list
+    // for the function.
+    //
+    // In principle, the order of this list shouldn't
+    // affect the semantics of a program, but we
+    // might want to be careful about ordering anyway.
+    edgeBlock->insertAfter(pred);
+}
+
+bool IREdge::isCritical() const
 {
     // A critical edge is an edge P -> S where
     // P has multiple sucessors, and S has multiple
     // predecessors.
     //
-    // In the context of our CFG representation, such an edge
-    // will be an `IRUse` in the terminator instruction of block P,
-    // which refers to block S.
+    auto pred = getPredecessor();
+    auto succ = getSuccessor();
+
+    // If the predecessor block doesn't have multiple successors,
+    // then this can't be a critical edge.
     //
-    // We will make a pass over the CFG to collect all the critical
-    // edges, and then we will break them in a follow-up pass.
+    if(pred->getSuccessors().getCount() <= 1)
+        return false;
 
-    List<IRUse*> criticalEdges;
+    // For the edge to be critical, the successor must have
+    // more than one predecessor.
+    //
+    // More than that, we require that it has more than one
+    // *unique* predecessor, to handle the case where multiple
+    // cases of a `switch` might lead to the same block.
+    //
+    // To implement this, we test if it has any predecessor
+    // other than `pred` which we already know about.
+    //
+    bool multiplePreds = false;
+    for (auto pp : succ->getPredecessors())
+    {
+        if (pp != pred)
+        {
+            multiplePreds = true;
+            break;
+        }
+    }
+    if(!multiplePreds)
+        return false;
 
+    // At this point we have confirmed that `edgeUse`
+    // leads from a block with multiple successors
+    // to one with multiple (distinct) predecessors,
+    // so we are sure it is a critical edge.
+    //
+    return true;
+}
+
+static void breakCriticalEdges(
+    ConstructSSAContext*    context)
+{
     auto globalVal = context->globalVal;
+
+    // We will make a pass over the CFG to collect all the critical
+    // edges, and then we will break them in a follow-up pass.
+    //
+    List<IREdge> criticalEdges;
     for (auto pred = globalVal->getFirstBlock(); pred; pred = pred->getNextBlock())
     {
+        // As an early out: if the predecessor block
+        // has one (or fewer) successors, then no
+        // edge from that block can be critical.
+        //
         auto successors = pred->getSuccessors();
         if (successors.getCount() <= 1)
             continue;
 
+        // Otherwise, we iterate over its outgoing
+        // edges (represented with its successor list),
+        // and record those that are critical.
+        //
         auto succIter = successors.begin();
         auto succEnd = successors.end();
-
         for (; succIter != succEnd; ++succIter)
         {
-            auto succ = *succIter;
-
-            // For the edge to be critical, the successor must have
-            // more than one predecessor.
-            // More than that, we require that it has more than one
-            // *unique* predecessor, to handle the case where multiple
-            // cases of a `switch` might lead to the same block.
-            //
-            // To implement this, we test if it has any predecessor
-            // other than `pred` which we already know about.
-
-            bool multiplePreds = false;
-            for (auto pp : succ->getPredecessors())
+            auto edge = succIter.getEdge();
+            if( edge.isCritical() )
             {
-                if (pp != pred)
-                {
-                    multiplePreds = true;
-                    break;
-                }
+                criticalEdges.add(edge);
             }
-            if (!multiplePreds)
-                continue;
-
-            // We have found a critical edge from `pred` to `succ`.
-            //
-            // Furthermore, the `IRUse` embedded in `succIter` represents
-            // that edge directly.
-            auto edgeUse = succIter.use;
-            criticalEdges.add(edgeUse);
         }
     }
 
@@ -956,36 +1020,9 @@ static void breakCriticalEdges(
     // to break the edges while doing the initial walk, because
     // that would change the CFG while we are walking it.
 
-    for (auto edgeUse : criticalEdges)
+    for (auto edge : criticalEdges)
     {
-        auto pred = cast<IRBlock>(edgeUse->getUser()->parent);
-        auto succ = cast<IRBlock>(edgeUse->get());
-
-        IRBuilder builder;
-        builder.sharedBuilder = &context->sharedBuilder;
-        builder.setInsertInto(pred);
-
-        // Create a new block that will sit "along" the edge
-        IRBlock* edgeBlock = builder.createBlock();
-
-        edgeUse->debugValidate();
-
-        // The predecessor block should now branch to
-        // the edge block.
-        edgeUse->set(edgeBlock);
-
-        // The edge block should branch (unconditionally)
-        // to the successor block.
-        builder.setInsertInto(edgeBlock);
-        builder.emitBranch(succ);
-
-        // Insert the new block into the block list
-        // for the function.
-        //
-        // In principle, the order of this list shouldn't
-        // affect the semantics of a program, but we
-        // might want to be careful about ordering anyway.
-        edgeBlock->insertAfter(pred);
+        context->sharedBuilder.insertBlockAlongEdge(edge);
     }
 }
 
diff --git a/source/slang/slang-ir-synthesize-active-mask.cpp b/source/slang/slang-ir-synthesize-active-mask.cpp
new file mode 100644
index 000000000..a92ef2c80
--- /dev/null
+++ b/source/slang/slang-ir-synthesize-active-mask.cpp
@@ -0,0 +1,2109 @@
+// slang-ir-synthesize-active-mask.cpp
+#include "slang-ir-synthesize-active-mask.h"
+
+#include "slang-ir-dominators.h"
+#include "slang-ir-insts.h"
+
+namespace Slang
+{
+
+// This file implements a pass on the IR to make the "active mask" concept
+// from HLSL explicit in the code, so that we can generate code for
+// targets like CUDA where explicit masks are used for warp-/wave-level
+// collective operations.
+//
+// At the time this pass was written, the exact semantics of the implicit
+// active mask in HLSL (and GLSL/SPIR-V) had not been specified, so this
+// file had to define what the semantics ought to be.
+//
+// The main guideline for these semantics is that it should conform to
+// the user's view of control flow based on the high-level language.
+//
+// The intuitive version of the semantics provided by this pass are:
+//
+// * Code is conceptually executed by "shards" of threads/lanes, where
+//   each shard consists of a subset of the threads/lanes in the warp/wave
+//   that are executing "together."
+//
+// * When a shard encounters a structured control-flow statement, the
+//   shard may fragment into sub-shards that partition its set of threads/lanes.
+//   E.g. for a two-way `if` we end up with a sub-shard for threads/lanes
+//   where the condition was `true`, and another sub-shard for the
+//   threads/lanes where the condition was `false`.
+//
+// * When the threads in a shard exit some structured control-flow
+//   statement normally (e.g., execution reaches the end of the "then"
+//   statement for an `if`), those threads *re-converge* with all the other
+//   threads that entered the structured statement together and which
+//   exited normally.
+//
+// * When threads in a shard exit some structured control-flow statement
+//   "abnormally" (bypassing the statements that logically come after),
+//   those threads do *not* re-converge with the other threads that exited
+//   normally, although they may still re-converge at the normal exit of
+//   some lexically outer statement.
+//
+// Turning these rules into something formal mostly comes down to codifying:
+//
+// 1. when threads exit a structured control flow statement, and
+// 2. which of those exits are "normal" vs. "abnormal."
+//
+// The Slang IR maintains structure information in its IR (similar to the
+// information maintained by SPIR-V for Vulkan). For a structured control
+// flow statement like an `if`, the IR-level representation encodes both
+// the conditional branch for the `if`, but also the label/block representing
+// the code that logically comes after the `if` in the high-level-language
+// code. We can refer to a block like that as a "merge point."
+//
+// Our more formal divergence and re-convergence behavior then depends
+// on two definitions:
+//
+// 1. Code is "inside" a control-flow structure if it is dominated by
+// the entry block, but not dominated by the merge block. Control flow
+// thus exits the structure whenever it branchs from a block inside the
+// structure to some place not inside the structure.
+//
+// 2. An edge that exits a control-flow structure represents a "normal"
+// exit if it branches to the merge block for the structure, and an
+// "abnormal" exit otherwise.
+//
+// These definitions result in divergence and re-convergence behavior
+// that closely matches what programmers expect from their high-level
+// language code.
+//
+// (Note: because the definition of what code is inside a control-flow
+// structure depends on dominance in the CFG, the existence of arbitrary
+// `goto` operations that can jump "into" a control-flow statement would
+// break our assumptions. This pass would only be compatible with
+// `goto` operations that jump "out" of control-flow statements.)
+
+// The translation of code to make masks explicit can be broadly
+// broken up into:
+//
+// * Analysis and transformation of the entire module and its call
+//   graph, to identify and update functions that need an explicit
+//   active mask.
+//
+// * Analysis and transformation of a single function, that was
+//   identified by the above logic.
+//
+// We will start with the whole-module logic.
+//
+// As it typical for our IR passes, we will wrap up the module-level
+// synthesis work in a "context" type.
+//
+struct SynthesizeActiveMaskForModuleContext
+{
+    // The context needs to know the module it will process, and to
+    // have a sink where it can report any errors it might run into
+    // (e.g., any cases where a mask is needed but cannot be synthesized)
+    //
+    IRModule* m_module;
+    DiagnosticSink* m_sink;
+
+    // We use a single shared IR builder for the entire pass, to
+    // make sure we deduplicate types/values as much as possible.
+    //
+    SharedIRBuilder m_sharedBuilder;
+
+    // Because we are introducing explicit masks, it is important
+    // for all the code to agree on the type of mask that will be
+    // used.
+    //
+    IRType* m_maskType;
+
+    void processModule()
+    {
+        // When asked to process an IR module, our pass first
+        // sets up the shared state.
+        //
+        m_sharedBuilder.session = m_module->getSession();
+        m_sharedBuilder.module = m_module;
+
+        // We use the plain 32-bit `uint` type masks we
+        // generate here since it matches the current
+        // definition of `WaveMask` in the standard library.
+        //
+        // TODO: If/when the `WaveMask` type in the stdlib is
+        // made opaque, this should use the opaque type instead,
+        // so that the pass is compatible with all targets that
+        // support  a wave mask.
+        //
+        IRBuilder builder;
+        builder.sharedBuilder = &m_sharedBuilder;
+        m_maskType = builder.getBasicType(BaseType::UInt);
+
+        // With the setup out of the way, the job of the module
+        // level pass is simple to describe:
+        //
+        // First we want to identify and mark all of the functions
+        // in the module that use the active mask.
+        //
+        markFuncsUsingActiveMask();
+        //
+        // Then we want to transform all of those functions that
+        // were marked so that they use an explicitly synthesized
+        // value for the active mask.
+        //
+        transformFuncsUsingActiveMask();
+    }
+
+    // During the marking process we will build up a list of functions
+    // that need the active mask, and we will also maintain a set of
+    // those functions so that we don't waste time redundantly considering
+    // functions we've already marked.
+    //
+    List<IRFunc*> m_funcsUsingActiveMask;
+    HashSet<IRFunc*> m_funcsUsingActiveMaskSet;
+
+    // When the code finds an IR function that seems to use the active
+    // mask, we will mark it by adding it to the list and set, but
+    // only if we haven't already done so previously.
+    //
+    void markFuncUsingActiveMask(IRFunc* func)
+    {
+        if(m_funcsUsingActiveMaskSet.Contains(func))
+            return;
+
+        m_funcsUsingActiveMask.add(func);
+        m_funcsUsingActiveMaskSet.Add(func);
+    }
+
+    // The easiest way to know that a function uses the active
+    // mask is if it contains an *instruction* that uses the active
+    // mask.
+    //
+    // Because it is easiest to detect use of the active mask at
+    // an instruciton level, it is convenient to have a utility
+    // that, given an instruction which uses the active mask,
+    // marks the function that contains it (if any) as using
+    // the active mask.
+    //
+    void markInstUsingActiveMask(IRInst* inst)
+    {
+        // We expect the immedaite parent of an ordinary
+        // instruction to be a basic block (although in
+        // unlikely cases an instruction can be directly nested
+        // in the `IRModule`).
+        //
+        auto parent = inst->getParent();
+        if( auto block = as<IRBlock>(parent) )
+        {
+            parent = block->getParent();
+        }
+        //
+        // We expect the immediate parent of that block
+        // to be an IR function (again, in unlikely cases
+        // the block could be nested in an IR generic,
+        // or a global variable with an initializer, etc.).
+        //
+        if( auto func = as<IRFunc>(parent) )
+        {
+            markFuncUsingActiveMask(func);
+        }
+    }
+
+    void markFuncsUsingActiveMask()
+    {
+        // We break down the task of identifying all functions
+        // that use the active mask into two steps.
+        //
+        // First we identify those functions that *directly* use
+        // the active mask, by virtuae of having a `getActiveMask`
+        // instruction in their body.
+        //
+        markFuncsDirectlyUsingActiveMaskRec(m_module->getModuleInst());
+
+        // Second, we identify any function that indirectly use
+        // the active mask.
+        //
+        // The detecting of indirect use is handled by looking
+        // at all the functions we've already marked, and identifying
+        // unmarked functions that call marked functions.
+        //
+        // We also modify these call sites (from unmarked functions
+        // to marked functions) so that they pass along the explicit
+        // active mask value to the callee.
+        //
+        // Note: we are iterating over `m_funcsUsingActiveMask` here
+        // while also invoking code that could add additional
+        // functions to that list. As such it is *intentional* that
+        // we are querying `getCount()` on the list on every iteration
+        // of the loop, rather than querying it once and using a
+        // variable (or using a range-based `for` loop).
+        //
+        for( Index i = 0; i < m_funcsUsingActiveMask.getCount(); ++i )
+        {
+            markAndModifyFuncsIndirectlyUsingActiveMask(m_funcsUsingActiveMask[i]);
+        }
+
+        // TODO: The logic here isn't robust against cases where we
+        // might have indirect calls.
+        //
+        // One possible solution would be to always treat indirect
+        // calls as if they require an active mask, and pass one along.
+        // Then all functions that might be used as the callee of an
+        // indirect call site (those whose values "escape") would need
+        // to have a variant that takes the active mask generated (if
+        // they didn't actually need the active mask based on their body).
+        // All function pointer values would be based on the mask-taking
+        // variant.
+        //
+        // Another alternative would be to make dependency on the
+        // active mask an explicit part of the type/signature of functions
+        // in the case where masks need to be passed, so that even indirect
+        // call sites could be decomposed into mask-taking and non-mask-taking
+        // cases just based on type information.
+        //
+        // There are probably other options to consider, each with
+        // its own trade-offs. This is a design question that we aren't
+        // really equipped to tackle right now, so the easiest solution
+        // is to defer the decision until we actually need an answer.
+    }
+
+    void markFuncsDirectlyUsingActiveMaskRec(IRInst* inst)
+    {
+        // Detecting functions that directly use the active
+        // mask is fairly simple: we recursively walk the
+        // IR module and look for instances of `waveGetActiveMask`
+        // instructions. When we find one we mark the function
+        // that contains it.
+
+        if( inst->op == kIROp_WaveGetActiveMask )
+        {
+            markInstUsingActiveMask(inst);
+        }
+
+        for( auto child : inst->getChildren() )
+        {
+            markFuncsDirectlyUsingActiveMaskRec(child);
+        }
+    }
+
+    void markAndModifyFuncsIndirectlyUsingActiveMask(IRFunc* callee)
+    {
+        // In order to detect functions that indirectly use the active
+        // mask through `callee`, we need to identify call sites.
+        //
+        // We will build up a list of call sites during the marking
+        // step, so that we can also modify those call sites later
+        // in this function.
+
+        List<IRCall*> calls;
+        for( IRUse* use = callee->firstUse; use; use = use->nextUse )
+        {
+            // We are looking for instructions that use `callee`,
+            // where the instruction is a call, and `callee` is used
+            // as, well, the callee.
+            //
+            IRInst* user = use->getUser();
+            IRCall* call = as<IRCall>(user);
+            if(!call)
+                continue;
+            if(call->getCallee() != callee)
+                continue;
+
+            // Once we have found a call site to `callee`,
+            // we mark the function that contains the call
+            // site as one we need to process, and also
+            // add the call site to our list of call sites
+            // we need to modify.
+            //
+            markInstUsingActiveMask(call);
+            calls.add(call);
+        }
+
+        // Once we've discovered all the call sites for `callee`,
+        // we will go ahead and modify them.
+        //
+        // (We don't mix up marking and modification because
+        // changing the use/def list of `callee` while also
+        // walking it would be dangerous)
+        //
+        for(auto call : calls)
+        {
+            // The basic situation here is that we have a call of the form:
+            //
+            //      callee(arg0, arg1, arg2, ...);
+            //
+            // and we want to transform it to:
+            //
+            //      mask = waveGetActiveMask();
+            //      callee(arg0, arg1, arg2, ..., m);
+            //
+            IRBuilder builder;
+            builder.sharedBuilder = &m_sharedBuilder;
+            builder.setInsertBefore(call);
+
+            // First we synthesize the mask to pass down by
+            // directly invoking `waveGetActiveMask()`.
+            //
+            // Note that after this instruction has been inserted,
+            // the function containing `call` is now a function
+            // that directly uses the active mask.
+            //
+            auto mask = builder.emitIntrinsicInst(builder.getBasicType(BaseType::UInt), kIROp_WaveGetActiveMask, 0, nullptr);
+
+            // Next we synthesize the new argument list for the
+            // call, which consists of the original arguments,
+            // followed by the `mask`.
+            //
+            List<IRInst*> newArgs;
+            Int originalArgCount = call->getArgCount();
+            for(Int a = 0; a < originalArgCount; ++a)
+            {
+                newArgs.add(call->getArg(a));
+            }
+            newArgs.add(mask);
+
+            // Finally we emit a new call instruction to the same
+            // callee with our new arguments, and use the new call
+            // to replace the original `call`.
+            //
+            // Note: We can't just modify the `call` in-place because
+            // of the way that the operands to an `IRInst` are allocated
+            // contiguously in the same storage as the `IRInst` itself.
+            //
+            auto newCall = builder.emitCallInst(call->getFullType(), call->getCallee(), newArgs);
+            call->replaceUsesWith(newCall);
+            call->removeAndDeallocate();
+        }
+    }
+
+    void transformFuncsUsingActiveMask()
+    {
+        // Once all the functions that need the active mask have
+        // been marked (and all call sites to them have
+        // been modified), we can transform each of the marked
+        // functions one by one.
+        //
+        for( auto func : m_funcsUsingActiveMask )
+        {
+            transformFuncUsingActiveMask(func);
+        }
+    }
+
+    // We will get to the details of how we transform each
+    // function shortly, but for now we just forward-declare
+    // the operation that makes it happen.
+    //
+    void transformFuncUsingActiveMask(IRFunc* func);
+};
+
+// The module-level pass isn't too complicated, and its main job is
+// to reduce the scope of the problem down to individual functions.
+// As a result the function-level pass, which has to deal with much
+// more complicated issues, at least doesn't have to deal with
+// distractions related to the whole-module processing.
+//
+struct SynthesizeActiveMaskForFunctionContext
+{
+    // The funciton-level pass unsurprisngly operates on one IR function.
+    //
+    IRFunc* m_func;
+
+    // Some of the state used for building up mask operations is
+    // passed down form the module-level pass.
+    //
+    SharedIRBuilder* m_sharedBuilder;
+    IRType* m_maskType;
+
+    void transformFunc()
+    {
+        // The first thing we will do to our function is preprocess it
+        // so that it satisfies several properties that will make our
+        // pass easier to implement correctly.
+        //
+        // Note that given a CFG edge P->S from a predecessor P to
+        // a successor S, we can always rewrite the CFG by removing
+        // the edge P->S, creating a new block E, and then inserting
+        // edges P->E and E->S. That transformation is referred to
+        // as "breaking" the edge P->S, and it always preserves the
+        // semantics of the original program.
+        //
+        // We will start out by breaking several kinds of edges in
+        // the CFG of the function. Specifically, we break:
+        //
+        // * *Critical* edges which are those from a block P with
+        //  multiple successors to a block S with multiple predecessors.
+        //
+        // * What we are calling *pseudo-critical* edges, which are
+        //   those from a block P with multiple successors to a block
+        //   M that is used as the merge point of a structured control-flow
+        //   construct.
+        //
+        // The benefit of these transformations is the invariants they
+        // provide us for the rest of this pass:
+        //
+        // * When looking at a conditional branch instruction, we can assume:
+        //
+        //   * The block with the branch is the only predecessor of the target blocks
+        //
+        //   * The block with the branch is the immediate dominator of its target blocks
+        //
+        //   * None of the target blocks is the merge point for the conditional branch
+        //
+        // * As a corallary of the above, all branches to a merge block are uncondtiional branches
+        //
+        // * Any block with multiple predecessors is only targetted by unconditional branches
+        //
+        // We will hightlight our use of these assumptions in the code
+        // that takes advantage of them.
+        //
+        breakCriticalAndPseudoCriticalEdges();
+
+        // Given that we are processing a function that had a `waveGetActiveMask` instruction
+        // in its body, we expect to find an entry block on the function.
+        //
+        auto funcEntryBlock = m_func->getFirstBlock();
+        SLANG_ASSERT(funcEntryBlock);
+
+        // We don't want to inject logic for computing and maintaining the
+        // active mask into parts of a function that don't actually use it,
+        // so one of the first things we do (once we've finished messing with
+        // the CFG of the function) is to mark which blocks (transitively) need
+        // the active mask, and which don't.
+        //
+        markBlocksNeedingActiveMask();
+        //
+        // We expect to find that the entry block to the function needs the
+        // active mask, or else that would imply nothing else in the function
+        // did, and we shouldn't be processing this function at all.
+        //
+        SLANG_ASSERT(m_blocksNeedingActiveMask.Contains(funcEntryBlock));
+
+        // Our basic approach will be to associate an `IRInst*` that represents
+        // the active mask value to use with each basic block of the function.
+        //
+        // The active mask to use for the entry block of a function is a special
+        // case, since it can't be derived from the value in other blocks.
+        //
+        createInitialActiveMask(funcEntryBlock);
+
+        // For blocks wtih multiple predecessors, it is possible that the correct
+        // active mask to use will depend on the predecessor.
+        //
+        // We will thus add an extra parameter (a phi node in more traditional
+        // SSA representations) to any block with multiple distinct predecessors,
+        // that can be used to represent the incoming active mask.
+        //
+        addActiveMaskParametersToBlocksWithMultiplePredecessors();
+
+        // The remaining work of assigning active masks to blocks will
+        // be handled by a recursive traversal of the function's CFG
+        // in terms of control-flow regions, where the entry block
+        // for the function serves as the starting point of the outer-most region.
+        //
+        transformRegions(funcEntryBlock);
+    }
+
+    void breakCriticalAndPseudoCriticalEdges()
+    {
+        // In order to break all the critical pseudo-critical
+        // edges, we will first identify them and build a list
+        // of `IREdge`s, and then go along and break them all.
+        //
+        // This approach ensures things don't get tripped up
+        // by modifying the CFG while also walking its structure.
+        //
+        List<IREdge> edgesToBreak;
+        //
+        // We are going to consider each block in turn as a
+        // candidate predecessor block, and look at its
+        // outgoing edges.
+        //
+        for (auto pred = m_func->getFirstBlock(); pred; pred = pred->getNextBlock())
+        {
+            auto successors = pred->getSuccessors();
+            auto succIter = successors.begin();
+            auto succEnd = successors.end();
+            for (; succIter != succEnd; ++succIter)
+            {
+                auto edge = succIter.getEdge();
+
+                // We want to collect the edges that are critical
+                // or psuedo-critical.
+                //
+                if( edge.isCritical() || isPseudoCriticalEdge(edge) )
+                {
+                    edgesToBreak.add(edge);
+                }
+            }
+        }
+
+        // Once we've identified the edges we want to break,
+        // we do so by inserting an edge along them.
+        //
+        for( auto& edge : edgesToBreak )
+        {
+            m_sharedBuilder->insertBlockAlongEdge(edge);
+        }
+    }
+
+    bool isPseudoCriticalEdge(IREdge const& edge)
+    {
+        // Our definition of a pseudo-critical edge is one
+        // where the prececessors is a conditional branch
+        // (meaning it has multiple outgoing edges), and
+        // where the successor is a merge point for some
+        // structured control-flow construct.
+        //
+        auto pred = edge.getPredecessor();
+        auto succ = edge.getSuccessor();
+
+        // The condition on the predecessor is easy
+        // enough to check.
+        //
+        if(pred->getSuccessors().getCount() <= 1)
+            return false;
+
+        // To check if the successor is used as a merge
+        // point, we must iterate over its uses.
+        //
+        for( auto use = succ->firstUse; use; use = use->nextUse )
+        {
+            // For each use, we need to consider if
+            // the user represents a structured control-flow
+            // construct, and then whether the `succ` block
+            // is being used as a merge point in that construct.
+            //
+            auto user = use->getUser();
+            if(auto ifElseInst = as<IRIfElse>(user))
+            {
+                // The merge point for an `if` or `if`/`else`
+                // if the block *after* the statement.
+                //
+                if(ifElseInst->getAfterBlock() == succ)
+                    return true;
+            }
+            else if( auto switchInst = as<IRSwitch>(user) )
+            {
+                // The merge point for a `switch` is the block
+                // after the statement, which is also the label
+                // where control flow goes on a `break`.
+                //
+                if(switchInst->getBreakLabel() == succ)
+                    return true;
+            }
+            else if( auto loopInst = as<IRLoop>(user) )
+            {
+                // A loop construct can actually have two
+                // merge points.
+                //
+                // First, the merge point for the entire loop
+                // is the block after the loop, which is also
+                // where control flow goes on a `break`.
+                //
+                if(loopInst->getBreakBlock() == succ)
+                    return true;
+                //
+                // Second, for a given iteration of the loop,
+                // the "continue clause" of a `for` loop
+                // is a merge point, and is where control flow
+                // goes on a `continue`.
+                //
+                if(loopInst->getContinueBlock() == succ)
+                    return true;
+            }
+        }
+
+        return false;
+    }
+
+    // We want to identify which blocks need the active mask,
+    // and will use a set to keep track of them.
+    //
+    HashSet<IRBlock*> m_blocksNeedingActiveMask;
+
+    // To make things convenient for downstream code, we will
+    // define a simple accessor to check if a block needs the
+    // active mask, as determined by our setup logic.
+    //
+    bool doesBlockNeedActiveMask(IRBlock* block)
+    {
+        return m_blocksNeedingActiveMask.Contains(block);
+    }
+
+    void markBlocksNeedingActiveMask()
+    {
+        // We can start by identifying the blocks that directly
+        // use the active because they contain a `waveGetActiveMask()`
+        // instruciton.
+        //
+        // Note: the module-level pass will have modified any
+        // function that indirectly uses the active mask (that is,
+        // a function that calls another functio nneeding the mask)
+        // so that it uses `waveGetActiveMask` at those call sites,
+        // so there whould always be at least one block that gets
+        // discovered this way, even in functions that did not
+        // initially contain direct uses of `waveGetActiveMask`.
+        //
+        for( auto block : m_func->getBlocks() )
+        {
+            for( auto inst : block->getOrdinaryInsts() )
+            {
+                if( inst->op == kIROp_WaveGetActiveMask )
+                {
+                    m_blocksNeedingActiveMask.Add(block);
+                    break;
+                }
+            }
+        }
+
+        // Once we've identified an initial set of blocks that need/use
+        // the active mask, we want to mark any blocks that lead to
+        // blocks we've already marked, since they will need to pass
+        // along the active mask.
+        //
+        // We will do this in an iterative fashion, by looping over
+        // all the blocks in the CFG and checking if they branch to
+        // a block that needs the active mask.
+        //
+        {
+            // Once we make a sweep over the CFG and don't see any change
+            // in the decision about what blocks use the active mask, we
+            // know we have converged.
+            //
+            bool change = true;
+            while( change )
+            {
+                change = false;
+
+                // Because we are solving a backwards dataflow problem,
+                // we iterate over the blocks of the function in reverse
+                // order to minimize the number of iterations required
+                // in the common case.
+                //
+                for(auto block = m_func->getLastBlock(); block; block = block->getPrevBlock())
+                {
+                    if(m_blocksNeedingActiveMask.Contains(block))
+                        continue;
+
+                    for( auto successor : block->getSuccessors() )
+                    {
+                        if( !m_blocksNeedingActiveMask.Contains(successor) )
+                            continue;
+
+                        // If we get here then `block` has *not* been marked
+                        // as needing the active mask, but it branches to
+                        // `successor` which *has* been marked, so we need
+                        // to mark `block` and keep looking for changes.
+                        //
+                        m_blocksNeedingActiveMask.Add(block);
+                        change = true;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    // As described previously, our main goal in this pass is to associated an
+    // `IRInst*` representing the active mask with each block in the CFG
+    // of the function (or rather, with each block that *needs* the active mask).
+    //
+    // We will keep track of the active mask values that have been registered
+    // so far in a dictionary.
+    //
+    Dictionary<IRBlock*, IRInst*> m_activeMaskForBlock;
+
+    void createInitialActiveMask(IRBlock* funcEntryBlock)
+    {
+        // The active mask value to use on entry to a function depends
+        // on whether the function is an entry point or not.
+        //
+        // The easy case is ordinary functions (ones that aren't entry
+        // points).
+        //
+        if( !m_func->findDecoration<IREntryPointDecoration>() )
+        {
+            // We simplyu need to add a new parameter to the entry block
+            // (which holds the parameters for the function itself).
+            // That parameter will receive the initial of the active
+            // mask as an argument at (modified) call sites to the function.
+            //
+            addActiveMaskParameter(funcEntryBlock);
+        }
+        else
+        {
+            // The case of a shader entry point is a bit trickier.
+            //
+            // We can't change the signature of a shader entry point (e.g., adding
+            // new varying parameters), so we need to compute the initial active
+            // mask value from first principles.
+            //
+            // We will insert the code we generate at the start of the entry block.
+            //
+            IRBuilder builder;
+            builder.sharedBuilder = m_sharedBuilder;
+            builder.setInsertBefore(funcEntryBlock->getFirstOrdinaryInst());
+            //
+            // A naive approach would be to set the active mask to an all-ones
+            // value representing the idea that all threads/lanes in the warp/wave
+            // will be active at the start of execution.
+            //
+            auto allLanesMask = builder.getIntValue(m_maskType, -1);
+            //
+            // That all-ones mask won't actually be correct in any case where
+            // a warp/wave is less than fully occupied (e.g., what if the thread
+            // group size is smaller than the warp/wave size?).
+            //
+            // We can refine that all-ones mask down to a more accurate one by
+            // using a ballot operation:
+            //
+            //      initialActiveMask = waveMaskBallot(allOnesMask, true);
+            //
+            auto initialActiveMask = builder.emitWaveMaskBallot(m_maskType, allLanesMask, builder.getBoolValue(true));
+            //
+            // Note: an important detail here is that we are invoking `waveMaskBallot`
+            // with the all-ones mask and *assuming* that a target will properly ignore
+            // the bits in `allOnesMask` that correspond to threads/lanes that aren't
+            // actually executing. Fortunately CUDA at least guarantees this
+            // behavior for `__ballot_sync()` and its other collective operations:
+            // bits corresponding to threads that aren't executing are ignored.
+            //
+            // Once we've computed the mask value to start with, we add it to
+            // our tracking structure so we can remember which value to use.
+            //
+            m_activeMaskForBlock.Add(funcEntryBlock, initialActiveMask);
+        }
+    }
+
+    void addActiveMaskParametersToBlocksWithMultiplePredecessors()
+    {
+        // Blocks with multiple (distinct) predecessors will
+        // recieve their input active mask as a block parameter
+        // (an SSA phi operation in more traditional representations).
+        //
+        for( auto block : m_func->getBlocks() )
+        {
+            // We only care about blocks that want the active mask,
+            // and that have multiple predecessors.
+            //
+            if(!doesBlockNeedActiveMask(block))
+                continue;
+            if(block->getPredecessors().getCount() <= 1)
+                continue;
+
+            // What is more, we only care about blocks with multiple
+            // *distinct* predecessors, which means we need to look
+            // out for things like a `switch` where multiple outgoing
+            // edges from a single block could lead to the same
+            // destination.
+            //
+            // A block with multiple distinct predecessors is one
+            // where the predecessors aren't all the same.
+            //
+            bool allPredsTheSame = true;
+            //
+            // We will check if all the predecessors of `block`
+            // are the same as the first predecessor (which we
+            // know must exist because we dissmissed blocks
+            // with <= 1 predecessor already).
+            //
+            IRBlock* firstPred = *block->getPredecessors().begin();
+            for(auto pred : block->getPredecessors())
+            {
+                if(pred != firstPred)
+                {
+                    allPredsTheSame = false;
+                    break;
+                }
+            }
+            if(allPredsTheSame)
+                continue;
+
+            // If we have identified a block with multiple distinct
+            // predecessors, then we know that it needs a new
+            // block parameter to be added to represent the active mask.
+            //
+            addActiveMaskParameter(block);
+        }
+    }
+
+    void addActiveMaskParameter(IRBlock* block)
+    {
+        // Adding a paramter to a block to represent the
+        // active mask is a straightforward application
+        // of `IRBuilder`.
+        //
+        // As a small sanity check, we make sure that the
+        // block we are modifying actually wants the active
+        // mask.
+        //
+        SLANG_ASSERT(doesBlockNeedActiveMask(block));
+
+        IRBuilder builder;
+        builder.sharedBuilder = m_sharedBuilder;
+        builder.setInsertBefore(block->getFirstOrdinaryInst());
+
+        auto activeMaskParam = builder.emitParam(m_maskType);
+
+        m_activeMaskForBlock.Add(block, activeMaskParam);
+    }
+
+    // The remainder of the work in this pass is going to be based
+    // on a recursive walk of the CFG for the function in terms of
+    // regions.
+    //
+    // Our definition of regions will rely on having built a dominator
+    // tree for the function.
+    //
+    RefPtr<IRDominatorTree> m_dominatorTree;
+    //
+    // Briefly, a node (basic block) A dominates block B if every
+    // possible path through the graph that ends in B goes through A.
+    //
+    // The dominator tree encodes dominance relationships by making
+    // it so that A is an ancestor node of B in the tree if and only if
+    // A dominates B. Furthermore, if A is the direct parent of B in
+    // the dominator tree, we say that A *immediately dominates* B.
+    //
+    // We will use a few wrapper/helper functions to make working
+    // with the dominator tree more convenient.
+    //
+    // First is a query to check if one block dominates another:
+    //
+    bool dominates(IRBlock* dominator, IRBlock* dominated)
+    {
+        return m_dominatorTree->dominates(dominator, dominated);
+    }
+    //
+    // Next is an operation to check if one block domiantes
+    // another *or* the child region is unreachable in the CFG.
+    //
+    // TODO: The `IRDominatorTree` type should now make this
+    // operation redundant, since it will treat an unreachable
+    // block as being domianted by any other block.
+    //
+    bool dominatesOrIsUnreachable(IRBlock* dominator, IRBlock* dominated)
+    {
+        return m_dominatorTree->isUnreachable(dominated)
+            || dominates(dominator, dominated);
+    }
+
+    // With the definition of dominance in hand, we are now ready
+    // to define the notion of regions that we will use.
+    //
+    // There are many ways to define regions on a CFG, and we are
+    // specifically interested in single-entry, multiple-exit
+    // regions.
+    //
+    struct RegionInfo
+    {
+        // Each region will have a dedicated entry block, which
+        // is the only way into the region (it is single-entry,
+        // after all). This means that the entry block must
+        // dominate everything inside the region.
+        //
+        IRBlock* entryBlock = nullptr;
+
+        // Each region will also have a dedicated *merge* block,
+        // which is where control flow goes when exiting the
+        // region normally. Code after the merge block is considered
+        // to be outside the region.
+        //
+        IRBlock* mergeBlock = nullptr;
+
+        // Regions in the CFG will nest with a parent-child relationship,
+        // and during our processing it will be important to track
+        // the chain of ancestor regions that enclosue a piece of code.
+        //
+        // Note: it is possible that `parentRegion` will have the same
+        // `mergeBlock` as this region, and code that handles regions
+        // needs to deal with that case.
+        //
+        RegionInfo* parentRegion = nullptr;
+
+        // As a convenience, we also make our region type hold
+        // the active mask for the entry block of the region.
+        // This could technically be accessed using `m_activeMaskForBlock`,
+        // but storing it here can avoid some extra lookups and
+        // error checks around them.
+        //
+        IRInst* activeMaskOnEntry = nullptr;
+    };
+
+    // Given the definition of a region, one of the most important
+    // queries to be able to answer is: given a block B and a region R,
+    // is B inside R?
+    //
+    bool isBlockInRegion(IRBlock* block, RegionInfo* region)
+    {
+        // By our definition, a region only contains blocks
+        // that are dominated by its entry block.
+        //
+        // Thus, any block not dominated by the entry block
+        // is outside the region.
+        //
+        if(!m_dominatorTree->dominates(region->entryBlock, block))
+            return false;
+
+        // In addition, if `block` is dominated by the merge
+        // block of `region` then it can only be reached by
+        // going through the merge block, which implies leaving
+        // the region.
+        //
+        // Thus any block that is dominated by the merge block
+        // of `region` (or any of its parents) is not in the region.
+        //
+        // TODO: It may not be necessary to check the mrege block
+        // of parent regions, so that checking just the one merge
+        // block would suffice. We need to double-check that
+        // nothing would be changed before removing that logic...
+        //
+        for( auto r = region; r; r = r->parentRegion )
+        {
+            // Note: It is possible for the merge block of a region to
+            // be null (specifically for the entry block of a function,
+            // where the logical merge point is after the function
+            // returns).
+            //
+            if(r->mergeBlock && dominates(r->mergeBlock, block))
+                return false;
+        }
+
+        return true;
+    }
+
+    void transformRegions(IRBlock* funcEntryBlock)
+    {
+        // Given the definition of regiosn we will use,
+        // we can process the entire function by recursively
+        // carving it up into regions.
+        //
+        // We start by computing a dominator tree for
+        // the function, since that will help us
+        // identify the regions.
+        //
+        m_dominatorTree = computeDominatorTree(m_func);
+
+        // Next we look up th active mask for the function's
+        // entry region, which had better be set before
+        // we run this code.
+        //
+        IRInst* activeMaskOnFuncEntry = nullptr;
+        m_activeMaskForBlock.TryGetValue(funcEntryBlock, activeMaskOnFuncEntry);
+        SLANG_ASSERT(activeMaskOnFuncEntry);
+
+        // The root region of our tree of regions will
+        // start at the entry block of the function, and
+        // its merge block will be left as null (because
+        // we merge upon return from the function, which isn't
+        // represented as an explicit block in the CFG).
+        //
+        RegionInfo rootRegion;
+        rootRegion.entryBlock = funcEntryBlock;
+        rootRegion.activeMaskOnEntry = activeMaskOnFuncEntry;
+
+        // We then kick off transformation of the whole
+        // CFG by transforming the root region.
+        //
+        transformRegion(&rootRegion);
+    }
+
+    void transformRegion(RegionInfo* region)
+    {
+        // The task of transforming a region with a given
+        // entry block and initial active mask comprises
+        // two steps.
+        //
+        IRBlock* regionEntry = region->entryBlock;
+        IRInst* activeMaskOnRegionEntry = region->activeMaskOnEntry;
+        //
+        // The first step is the easy one: any uses of
+        // the `waveGetActiveMask` instruction in the entry
+        // block of the region can be replaced with the value
+        // of `actvieMaskOnRegionEntry`.
+        //
+        // The only wrinkle here is carefully fetching the
+        // next instruction before processing each instruction
+        // so that we can continue to iterative while also
+        // potentially removing and replacing instructions along
+        // the way.
+        //
+        IRInst* nextInst = nullptr;
+        for( IRInst* inst = regionEntry->getFirstInst(); inst; inst = nextInst )
+        {
+            nextInst = inst->getNextInst();
+
+            if( inst->op == kIROp_WaveGetActiveMask )
+            {
+                inst->replaceUsesWith(activeMaskOnRegionEntry);
+                inst->removeAndDeallocate();
+            }
+        }
+
+        // The second and much more involved step is to process all
+        // the other blocks in the region, recursively.
+        //
+        // The way to proceed will be determined by the terminator
+        // instruction of the entry block.
+        //
+        auto terminator = regionEntry->getTerminator();
+        SLANG_ASSERT(terminator);
+        switch( terminator->op )
+        {
+            // There are some cases of terminator instructions that
+            // we explicit do not or cannot handle.
+            //
+            // A `conditionalBranch` instruction represents a two-way
+            // conditional branch *without* structured control-flow
+            // information. Right now our front-end doesn't produce
+            // such instructions.
+            //
+            // TODO: We could theoretically handle unstructured control
+            // flow either by running a restructuring pass, or by saying
+            // that unstructured branches can split the active mask,
+            // without any option to reconverge it.
+            //
+        case kIROp_conditionalBranch:
+            //
+            // A `discard` instruction can only appear in fragment shaders,
+            // and this pass is currently only being applied for the CUDA
+            // target (which doesn't support rasterization stages).
+            //
+            // The difficulty with `discard` is that a `discard` operation
+            // in a subroutine can affect the active mask seen by code
+            // after a call to that subroutine. Thus, the `discard` operation
+            // would violate our assumption that the active mask cannot change
+            // inside of a single basic block.
+            //
+            // Devising a solution to synthesize the active mask in the presence
+            // of `discard` would seem to require a different approach to this
+            // pass *or* some new constraints on the front-end representation
+            // (e.g., a `discard` is really more akin to a `throw` or other
+            // error-propagation operation, and might need to be explicit in
+            // the signature of any function that may `discard`).
+            //
+        case kIROp_discard:
+            //
+            // Finally, we also don't handle any control-flow op we might not
+            // have introduced at the time this pass was created.
+            //
+        default:
+            SLANG_UNEXPECTED("unhandled terminator op");
+            break;
+
+            // Next, there are cases of terminators that represent code that
+            // is or must be be unreachable. The runtime semantics of executing
+            // one of these instructions would be some kind of undefined behavior,
+            // so we can elect to simply do nothing.
+            //
+        case kIROp_MissingReturn:
+        case kIROp_Unreachable:
+            break;
+
+        case kIROp_unconditionalBranch:
+            {
+                auto branch = cast<IRUnconditionalBranch>(terminator);
+
+                // An `unconditionalBranch` instruction represents any kind of
+                // unconditonal branch in the code. This includes cases like:
+                //
+                // * Leaving a control-flow structure normally (e.g., ending the "then"
+                //   block of an `if` statement and moving on the code after the `if`;
+                //   jumping out of a loop with a `break`)
+                //
+                // * Leaving a control-flow structure abnormally (e.g., leaving the "then"
+                //   part of an `if` statement by `break`ing out of an outer loop).
+                //
+                // * Ordinary control flow that doesn't leave any region(s) (e.g., cycling
+                //   back to the top of a `while` loop; progressing through straight-line
+                //   non-branching control flow that happens to use multiple blocks)
+                //
+                // We factor most of the handling of unconditional branches out into
+                // a subroutine, so we just invoke it here and leave the explanation
+                // to later.
+                //
+                transformUnconditionalEdge(region, terminator, branch->getTargetBlock(), activeMaskOnRegionEntry);
+
+                // Once we've handled the control-flow edge at the end of the entry
+                // block of our region, we need to process any child regions that
+                // might exist.
+                //
+                // E.g. if we had straight-line control flow from block A->B->C->... and
+                // we were processing the region that starts with A, then the previous
+                // code would have handled all work related to the A->B edge, and then
+                // this step would handle recursively processing the B->C->... region.
+                //
+                transformChildRegions(region, region);
+            }
+            break;
+
+        case kIROp_ReturnVal:
+        case kIROp_ReturnVoid:
+            {
+                // A `return` instruction is akin to an unconditional branch,
+                // except that it is guaranteed to exit any structured control
+                // flow regions that we are nested under.
+                //
+                // We thus handle a `return` as a special case of an unconditional
+                // branch where the target block is null (which also happens to
+                // be the value used to represent the merge block for the region
+                // representing the entire function body).
+                //
+                transformUnconditionalEdge(region, terminator, nullptr, activeMaskOnRegionEntry);
+
+                // We also make a call to recusrively process any child regions here,
+                // although in practice there should be no child regions if the
+                // entry block ended with a `return`.
+                //
+                // TODO: Consider eliminating this call if it really isn't needed.
+                //
+                transformChildRegions(region, region);
+            }
+            break;
+
+        case kIROp_ifElse:
+            {
+                // A structured `ifElse` instruction is a two-way branch on a
+                // Boolean coniditon, along with a specific block representing
+                // the code after the high-level-language `if` statement.
+                //
+                auto ifElse = cast<IRIfElse>(terminator);
+                auto condition = ifElse->getCondition();
+                auto trueBlock = ifElse->getTrueBlock();
+                auto falseBlock = ifElse->getFalseBlock();
+                auto afterBlock = ifElse->getAfterBlock();
+                //
+                // We can picture the situation as something like:
+                //
+                //          I       // block with if/else           |
+                //         / \                                      |
+                //        T   F     // true and false blocks        |
+                //       /|\ /|\                                    |
+                //                                                  |
+                //         ...                                      |
+                //                                                  |
+                //         \|/                                      |
+                //          A       // "after" block                |
+                //                                                  |
+                //         ...                                      |
+                //
+                // Note that very few assumptions can or should be
+                // made about the code under T and F. In particular:
+                //
+                // * Code under T or F could exit the region under
+                //   consideration without going through A
+                //
+                // * There could exist some block X such that X is in
+                //   the region of our if/else, and control flow can reach
+                //   A from both T and F *without* going through A
+                //
+                // Our construction doesn't rely on many assumptions.
+                // All we need is that we can construct a sub-region
+                // representing the code dominated by I but not by A
+                // (which serves as the merge point).
+                //
+                RegionInfo ifElseRegion;
+                ifElseRegion.entryBlock = regionEntry;
+                ifElseRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+                ifElseRegion.mergeBlock = afterBlock;
+                ifElseRegion.parentRegion = region;
+                //
+                // This sub-region will be used as a parent region
+                // when recursively processing code reachable through
+                // the `ifElse`.
+
+                // Because we broke all critical edges as a pre-process, we
+                // can be certain that the entry block for `region` dominates
+                // the blocks for the `true` and `false` cases (it should 
+                // be their only predecessor, and thus the immediate
+                // dominator).
+                //
+                SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, trueBlock));
+                SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, falseBlock));
+
+                // Because we also broke all pseudo-critical edges, we also
+                // expect taht the blocks we branch to on `true` or `false`
+                // are not the same as the merge block.
+                //
+                SLANG_ASSERT(trueBlock != afterBlock);
+                SLANG_ASSERT(falseBlock != afterBlock);
+
+                // Because of the above assumptions, we know that this
+                // code can take responsibility for computing the mask
+                // value to use for both `trueBlock` and `falseBlock`.
+                //
+                IRBuilder builder;
+                builder.sharedBuilder = m_sharedBuilder;
+
+                // To establish the mask value for `trueBlock` we will
+                // insert a `waveMaskBallot` before the branch:
+                //
+                //      trueMask = waveMaskBallot(activeMaskOnRegionEntry, condition);
+                //
+                // That mask weill consist of all threads that entered the
+                // parent region and computed a `condition` value of `true`.
+                //
+                builder.setInsertBefore(ifElse);
+                auto trueMask = builder.emitWaveMaskBallot(m_maskType, activeMaskOnRegionEntry, condition);
+
+                // To establish the mask value for `falseBlock`, we will
+                // insert code into the false block that computes an inverted mask as:
+                //
+                //      falseMask = activeMaskOnRegionEntry & ~trueMask;
+                //
+                builder.setInsertBefore(falseBlock->getFirstOrdinaryInst());
+                auto falseMask = builder.emitBitAnd(
+                    m_maskType,
+                    activeMaskOnRegionEntry,
+                    builder.emitBitNot(m_maskType, trueMask));
+
+                // The task of associating the mask value comptued by a conditional branch
+                // with the successor block(s) is delegated to a subroutine, which we will
+                // detail later.
+                //
+                // For now it suffices that we need to transform each of the outgoing
+                // edges from our conditional branch.
+                //
+                // TODO: There is a pathological corner case that is not handled here,
+                // when `trueBlock == falseBlock`. Right now we have no optimizations
+                // that could make this case arise, but we should be careful about
+                // it if we ever add such passes.
+                //
+                transformConditionalEdge(&ifElseRegion, terminator, trueBlock, trueMask);
+                transformConditionalEdge(&ifElseRegion, terminator, falseBlock, falseMask);
+
+                // Next we need to transform any and all child regions of our `if`-`else`
+                // construct as well as the children of the overall `region`.
+                //
+                // Those children chould (at least) include child regions where `trueBlock`
+                // and `falseBlock` are the entry blocks.
+                //
+                transformChildRegions(&ifElseRegion, region);
+            }
+            break;
+
+        case kIROp_loop:
+            {
+                // At the most basic level, a `loop` instruction is just an uncondtional
+                // branch. What is stores above and beyond an `unconditionalBranch`
+                // instruction is the strucrutal information about the loop, including
+                // where the code "after" the loop starts (the same as the `break`
+                // target), and where the code that starts a new loop iteration goes
+                // (the `continue` target).
+                //
+                auto loopInst = cast<IRLoop>(terminator);
+                auto loopHeader = loopInst->getTargetBlock();
+                auto breakBlock = loopInst->getBreakBlock();
+                auto continueBlock = loopInst->getContinueBlock();
+                //
+                // We can visualize it as something like this:
+                //
+                //          L           // block with `loop` instruction    |
+                //          |                                               |
+                //          |  ___      // back edge(s) that start a        |
+                //          |/    \     //     new iteration                |
+                //          H     |     // loop header, where control flow  |
+                //         /|\    |     //     starts on each iteration     |
+                //         ...                                              |
+                //                                                          |
+                //               \|/                                        |
+                //                C     // continue label, which leads      |
+                //               ...    //     to any/all back edges        |
+                //                                                          |
+                //         \|/                                              |
+                //          B           // break label, which is the start  |
+                //         ...          //     of code after the loop       |
+                //
+                // Much like the case for `ifElse`, we can't make a lot
+                // of definition statements about the structure of the control
+                // flow after the `loop` instruction, so we need to be careful
+                // and construct regions that don't rely on unsafe assumptions.
+                //
+                // We can construct a region that represents all the code that
+                // is logically inside the loop. This regions starts with the
+                // loop header (*not* the block with the `loop` instruction),
+                // and ends at the block B where a `break` goes (to exit
+                // the loop).
+                //
+                RegionInfo loopRegion;
+                loopRegion.entryBlock = loopHeader;
+                loopRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+                loopRegion.mergeBlock = breakBlock;
+                loopRegion.parentRegion = region;
+
+                // In order for our structured control-flow information to
+                // mean anything, the `loop` instruction must dominate
+                // the loop body (starting with the header).
+                //
+                SLANG_ASSERT(m_dominatorTree->dominates(regionEntry, loopHeader));
+                //
+                // We also require that the region with the `loop` either
+                // dominates the `break` block (you can't exit the loop without
+                // first enterting it), *or* that block is unreachable (the
+                // loop never exits normally).
+                //
+                SLANG_ASSERT(dominatesOrIsUnreachable(regionEntry, breakBlock));
+                //
+                // Simlarly, we assume that the `continue` label is also either
+                // dominated by the `loop` instruction or is unreachable (the
+                // loop never actually iterates).
+                //
+                SLANG_ASSERT(dominatesOrIsUnreachable(regionEntry, continueBlock));
+                //
+                // Finally, our pre-pass on the CFG will have ruled out very
+                // silly cases like a `loop` that branches directly to the `break`
+                // label.
+                //
+                SLANG_ASSERT(loopHeader != breakBlock);
+
+                // Each iteration of the loop will execute under an active
+                // mask, and the mask may vary per iteration.
+                //
+                IRInst* iterEntryMask = nullptr;
+                //
+                // For a loop that actually *loops*, the loop header block
+                // must have multiple distinct predecessors: one for the
+                // `loop` instruction and one or more for back edges that
+                // continue execution of the loop.
+                //
+                if( loopHeader->getPredecessors().getCount() > 1 )
+                {
+                    iterEntryMask = loopHeader->getLastParam();
+                    SLANG_ASSERT(iterEntryMask);
+                }
+                else
+                {
+                    // It technically possible that a `loop` instruction
+                    // doesn't actually yield a loop. For example, one
+                    // can write:
+                    //
+                    //      for(;;)
+                    //      {
+                    //          doThings();
+                    //          if(someCondition) break;
+                    //          doOtherThings();
+                    //          break;
+                    //      }
+                    //
+                    // There are no cases where this code actually loops,
+                    // but the use of a `for` loop is still enabling some
+                    // non-trivial control flow with the early `break`.
+                    //
+                    // We cannot in general eliminate all `loop`s that don't
+                    // actually loop as a pre-process, without adding some
+                    // alternative kind of control-flow construct that
+                    // represents this kind of alternative use of looping
+                    // constructs.
+                    //
+                    // Fortunately, handling this case isn't complicated,
+                    // because the mask to use on entry to the (only)
+                    // iteration is well known.
+                    //
+                    iterEntryMask = activeMaskOnRegionEntry;
+                }
+
+                // Along with the outer region that represents the
+                // entire loop (which is exited with `break`) there
+                // is conceptually a nested inner region that represents
+                // the body of a single loop iteration, and which is
+                // exited with a `continue`.
+                //
+                // This detail primarily matters for a `for` loop where
+                // there can be non-trivial code (including code that
+                // depends on the active mask) that executes on a
+                // `continue`.
+                //
+                // The nested region thus begins at the loop header,
+                // and ends at the `continue` label. Note that it 
+                // is possible for this region to be empty, in the
+                // case where `loopHeader == continueBlock` (which
+                // will happen for any non-`for` loop).
+                //
+                RegionInfo iterRegion;
+                iterRegion.entryBlock = loopHeader;
+                iterRegion.activeMaskOnEntry = iterEntryMask;
+                iterRegion.mergeBlock = continueBlock;
+                iterRegion.parentRegion = &loopRegion;
+
+                // Once we've computed the child regions that are introduced
+                // by the loop structure, we can move on to processing
+                // its single outgoing edge, and the child control flow
+                // regions.
+                //
+                transformUnconditionalEdge(region, terminator, loopHeader, activeMaskOnRegionEntry);
+                transformChildRegions(&iterRegion, region);
+            }
+            break;
+
+
+        case kIROp_Switch:
+            {
+                // A `switch` instruction represents a structured N-way
+                // branch on an integer condition. The structural information
+                // gives us a block that represents code after the `switch`
+                // statement, and which is also the target for any `break`s
+                // inside the `switch`.
+                //
+                auto switchInst = cast<IRSwitch>(terminator);
+                auto condition = switchInst->getCondition();
+                auto mergeBlock = switchInst->getBreakLabel();
+
+                // A `switch` is mostly just a more involved and complicated
+                // version of an `if`-`else`, so the overall logical is similar
+                // but with a lot more details to handle.
+                //
+                // We can construct a child region that represents the code
+                // that is logically inside the `switch`, consisting of all
+                // code from teh entry block up to the merge (`break`) block.
+                //
+                RegionInfo switchRegion;
+                switchRegion.entryBlock = regionEntry;
+                switchRegion.activeMaskOnEntry = activeMaskOnRegionEntry;
+                switchRegion.mergeBlock = mergeBlock;
+                switchRegion.parentRegion = region;
+
+                // Next, we need to establish a mask value that will
+                // represent the active mask on entry to a given `case`.
+                //
+                IRBuilder builder;
+                builder.sharedBuilder = m_sharedBuilder;
+                builder.setInsertBefore(switchInst);
+
+                // For now we are computing a simple-but-inaccurate version
+                // of the active mask. We are using:
+                //
+                //      matchingMask = waveMaskMatch(activeMaskOnRegionEntry, condition);
+                //
+                // This value will yield a mask for each thread that consists of
+                // the threads who had exactly the same value for `condition`.
+                //
+                auto matchingMask = builder.emitWaveMaskMatch(m_maskType, activeMaskOnRegionEntry, condition);
+                //
+                // TODO: this mask computation yields a surprising value in cases where
+                // multiple values go to the same code:
+                //
+                //      switch(condition)
+                //      {
+                //      case 0: case 1:  A(WaveGetActiveMask()); break;
+                //
+                //      case 2: default: B(WaveGetActiveMask()); break;
+                //      }
+                //
+                // In this case we want the mask seen by `A()` to be all the threads
+                // where `condition` was `0` *or* `1` (and not to see two
+                // different masks, one for each value).
+                //
+                // Similarly, if we have threads where `condition` is `2`, `3`, and `4`,
+                // we expec them to all see a common mask at `B()`, despite
+                // representing multiple distinct values.
+                //
+                // The desired mask could be synthesized in different ways.
+                //
+                // We could execute an extra `switch` before this one, to reduce
+                // the actual values of `condition` down to an artificial `caseNumber`,
+                // which only represents the unique code blocks being branched to:
+                //
+                //      int caseNumber;
+                //      switch(condition)
+                //      {
+                //      case 0: case 1:  caseNumber = 0xA; break;
+                //
+                //      case 2: default: caseNumber = 0xB; break;
+                //      }
+                //      switch( caseNumber ) // NOTE: changed condition here
+                //      {
+                //      case 0xA: A(WaveGetActiveMask()); break;
+                //
+                //      case 0xB: B(WaveGetActiveMask()); break;
+                //      }
+                //
+                // The main downside of that approach is that it adds additional
+                // control flow that wasn't in the input program (and that new
+                // control flow could complicate our interactions with the
+                // dominator tree and CFG edges).
+                //
+                // Another alternatie would be to move the synthesis of the mask
+                // down into the individual cases:
+                //
+                //      switch(condition)
+                //      {
+                //      case 0: case 1:
+                //          mask = WaveMaskMatch(activeMaskOnRegionEntry, 0xA);
+                //          A(mask);
+                //          break;
+                //
+                //      case 2: default:
+                //          mask = WaveMaskMatch(activeMaskOnRegionEntry, 0xB);
+                //          B(mask);
+                //          break;
+                //      }
+                //
+                // This second approach avoids adding additional control flow
+                // operations before the `switch`, but does so at the cost of
+                // having distinct textual call sites for a collective that
+                // need to work together.
+                //
+                // We need to pick and implement one of these strategies
+                // (or something else entirely) in a future change.
+
+                // A `switch` instruction always has a `default` label, which
+                // is where control flow goes for values of the condition
+                // that weren't explicitly handled.
+                //
+                // Note: even if the high-level-language `switch` didn't have
+                // a `default` label, the IR one will. During initial IR generation,
+                // the `default` label for such a `switch` will be the same as
+                // the `break` label, but our pass to break pseudo-critical edges
+                // will guarantee that every `switch` has a `default` label distinct
+                // from its `break` label.
+                //
+                // We will handle the conditional edge leading the `default` label
+                // before looking at any of the explicit `case`s.
+                //
+                auto defaultLabel = switchInst->getDefaultLabel();
+                transformConditionalEdge(&switchRegion, terminator, defaultLabel, matchingMask);
+
+                // One wrinkle that arises when processing a `switch` statement is
+                // that multiple `case`s may branch to same target label, and the
+                // label for `case`s can be the same as the `default` label.
+                //
+                // Our approach in `transformConditionalEdge` currently assumes that
+                // for each block reached via a conditional edge, the `transformConditionalEdge`
+                // function is only called once.
+                //
+                // To avoid violating this assumption, we will make sure to only
+                // call `transformConditionalEdge` once for each unique target block
+                // of the `switch`.
+                //
+                // In order to make that guarantee, we rely on the invariant (provided
+                // by our IR generation, maintained by our IR passes, and currently already
+                // assumed by our emit logic) that the `case`s of a `switch` are stored
+                // in an order such that:
+                //
+                // * All cases that branch to the same label are contiguous in the list of cases
+                //
+                // * If the cases with label A can fall through to the cases with label B, then
+                //   the A cases immediately precede the B cases in the list of cases.
+                // 
+                // We can thus simply track the label of the last case we processed,
+                // and detect duplicates by comparing to it (and the `default` label).
+                //
+                IRBlock* prevLabel = nullptr;
+                UInt caseCount = switchInst->getCaseCount();
+                for( UInt ii = 0; ii < caseCount; ++ii )
+                {
+                    auto caseLabel = switchInst->getCaseLabel(ii);
+
+                    // As discussed above, we only process the first `case` with
+                    // a given label, since all of the edges will pass  along the
+                    // same mask.
+                    //
+                    if(caseLabel == prevLabel)
+                        continue;
+                    prevLabel = caseLabel;
+
+                    // Similarly, we skip `case`s that branch to the same label
+                    // as a `default`, since the `default` label was already
+                    // processed above.
+                    //
+                    if(caseLabel == defaultLabel)
+                        continue;
+
+                    transformConditionalEdge(&switchRegion, terminator, caseLabel, matchingMask);
+
+                    // TODO: One issue that is getting ignored in the current
+                    // code is "non-trivial fall-through," where one `case`
+                    // executes some amount of code before falling through to
+                    // another:
+                    //
+                    //      switch(x)
+                    //      {
+                    //      case 0:
+                    //          A(WaveGetActiveMask());
+                    //      case 1:
+                    //          B(WaveGetActiveMask());
+                    //          break;
+                    //      ...
+                    //      }
+                    //
+                    // In the scenario above it is clear what the desired
+                    // active mask is for the call to `A()` (the set of lanes
+                    // that had `x==0`), but the active mask when calling
+                    // `B()` is a bit thornier question.
+                    //
+                    // The easiest implementation choice is to just let
+                    // `B()` see two active masks: one for the `x==0` lanes,
+                    // and another for the `x==1` lanes, and it could be
+                    // argued that this matches the programmer's view of
+                    // how things diverged at the `switch`.
+                    //
+                    // The alternative view is that the lanes with `x==0`
+                    // should re-converge with the lanes with `x==1` before
+                    // executing the call to `B()`.
+                    //
+                    // Achieving the more intuitive behavior for fall-through
+                    // naively seems to require rewriting the control flow
+                    // and introducing nested conditionals (where the nesting
+                    // depth relates to the length of the chain of fall-throughs).
+                    //
+                    // More work is required in order to figure out a good
+                    // strategy for dealing with fall-through in the general case.
+                    //
+                    // For now we can ignore this issue since Slang
+                    // does not support fall-through in `switch` statements,
+                    // but if/when we do, we will need to make a policy
+                    // decision on this issue.
+                }
+
+                // Just like for `ifElse`, once we have processed the outgoing
+                // conditional edges, we need to process all child regions.
+                //
+                transformChildRegions(&switchRegion, region);
+            }
+            break;
+
+        }
+    }
+
+    // During the presentation of `transformRegion` we deferred
+    // the details of how to deal with edges and child regions to
+    // subroutines, which we will begin to work through now.
+    //
+    // The case of conditional edges is the easiest.
+    //
+    void transformConditionalEdge(RegionInfo* fromRegion, IRTerminatorInst* terminator, IRBlock* toBlock, IRInst* fromActiveMask)
+    {
+        SLANG_UNUSED(fromRegion);
+        SLANG_UNUSED(terminator);
+
+        // Because of the way that we broke critical edges, block with the
+        // conditional branch (the entry block to `fromRegion`) had better
+        // be the immediate dominator of the block being branched to.
+        //
+        SLANG_ASSERT(m_dominatorTree->immediatelyDominates(fromRegion->entryBlock, toBlock));
+
+        // If the block being branched from immediately dominates the block being
+        // branched to, that makes it the only predecessor of the `toBlock`.
+        //
+        // As such, the `toBlock` can't have had an SSA parameter/phi introduced
+        // to receive the block, and the only value needed to represent the
+        // active mask on input to `toBlock` is the `fromActiveMask` being
+        // provided as part of the conditional branch.
+        //
+        m_activeMaskForBlock.Add(toBlock, fromActiveMask);
+    }
+
+    // Unconditional edges are more complicated than conditional
+    // edges, because they may branch out of a control-flow
+    // region and/or branch to a block with multiple predecessors.
+    //
+    void transformUnconditionalEdge(RegionInfo* fromRegion, IRTerminatorInst* terminator, IRBlock* toBlock, IRInst* fromActiveMask)
+    {
+        IRBuilder builder;
+        builder.sharedBuilder = m_sharedBuilder;
+        builder.setInsertBefore(terminator);
+
+        // The context here is that the `terminator` instruction,
+        // at the end of the first block of `fromRegion` is
+        // branching to `toBlock`. The `fromActiveMask` is the
+        // active mask that was in place when executing the
+        // first block of `fromRegion`, and thus the active
+        // mask used when executing the `terminator`.
+        //
+        // One task we need to deal with is setting up
+        // the active mask that should be used when executing
+        // the `toBlock`. We start by assuming that the active
+        // mask does not change when control flow moves from
+        // one block to the next (we will then deal with all
+        // the cases where this assuption isn't true).
+        //
+        IRInst* toActiveMask = fromActiveMask;
+
+        // The most important thing we need to deal with is
+        // that an unconditional edge may exit a control-flow
+        // region. We know that the block being branched from
+        // is inside `fromRegion`, and thus recursively inside
+        // all the ancestors of that region.
+        //
+        // We will walk through the regions from the inner-most
+        // to the outer-most and check if we are exiting each
+        // region in turn.
+        //
+        for( RegionInfo* r = fromRegion; r; r = r->parentRegion )
+        {
+            // To know if we are exiting the region `r`, we
+            // need to look at its merge block.
+            //
+            auto mergeBlock = r->mergeBlock;
+
+            // One subtle detail here is that we might technically
+            // be exiting a region A with merge block M, but the
+            // parent of A is a region B that *also* has M as
+            // its merge block.
+            //
+            // In such a case we really don't want/need to deal
+            // with any issues around reconvergence at M for A,
+            // since we can instead rely on the reconvergence logic
+            // for B to do all the work.
+            //
+            // In practical terms, this means that we skipp any
+            // region that has a parent region with the same merge
+            // block.
+            //
+            // TODO: We could try to find ways to eliminate this
+            // situation as part of the structure of regions,
+            // so that code like this doesn't need the ad hoc check.
+            //
+            auto parentRegion = r->parentRegion;
+            if(parentRegion && parentRegion->mergeBlock == mergeBlock)
+                continue;
+
+            // We need to know if the branch exits the region `r`
+            // and, if it does, we need to know whether it exits
+            // the region normally or not.
+            //
+            // If the block being branched to is *inside* region `r`
+            // (which can only be the case for a non-null target
+            // block), then we clearly aren't exiting region `r`.
+            //
+            if(toBlock && isBlockInRegion(toBlock, r))
+            {
+                // Furthermore, if we aren't exiting region `r`,
+                // then we must not be exiting any of its parent
+                // regions, since our regions are strictly
+                // nested.
+                //
+                // We thus don't need to process any more
+                // regions to check if we are exiting them.
+                //
+                break;
+            }
+            //
+            // A normal exit from the region can be detected easily:
+            // it is any case where the destination of the branch is
+            // the merge block for the region.
+            //
+            else if( toBlock == mergeBlock )
+            {
+                // In this case we are jumping to the dedicated
+                // merge point of region `r`, which means we need
+                // to re-converge with all the other threads/lanes
+                // that entered the region together, and which are
+                // also exiting normally.
+                //
+                // It is possible that the `mergeBlock` because
+                // it representing the merge point upon return from
+                // the function, so we guard against that case.
+                //
+                if( mergeBlock )
+                {
+                    // Otherwise, if we are branching to a non-null
+                    // merge point, then we emit a ballot operation
+                    // that will detect all the other threads/lanes:
+                    //
+                    //      toActiveMask = waveMaskBallot(activeMaskOnEntry, true);
+                    //
+                    // This call will synchronize with all the threads/lanes
+                    // that entered region `r`, and will collect the
+                    // mask representing the threads/lanes that passed
+                    // in `true` because they want to re-converge. That
+                    // mask should be used as the new active mask when
+                    // executing the merge block.
+                    //
+                    // TODO: Instead of emitting this `waveMaskBallot`
+                    // on every edge that branches to the merge point,
+                    // we could instead emit a single `waveMaskBallot`
+                    // at the start of the merge point instead. This
+                    // would be a good optimization to make, but requires
+                    // extra logic throughout this algorithm to manage.
+                    //
+                    toActiveMask = builder.emitWaveMaskBallot(
+                        m_maskType,
+                        r->activeMaskOnEntry,
+                        builder.getBoolValue(true));
+                }
+
+                // If we are exiting a region normally, then we can't
+                // also be exiting its parent region(s), because we
+                // eliminated the case of regions that have the same
+                // merge point as their parent at the top of our loop.
+                //
+                // Thus we are done checking if regions have been
+                // exited.
+                //
+                break;
+            }
+            else
+            {
+                // Finally, if we aren't staying in the region,
+                // but we aren't jumping to the merge point, then
+                // we must be exiting the region "abnormally."
+                //
+                // In this case, we need to coordinate with the
+                // other threads/lanes that entered the region
+                // to let them know we won't be reconverging
+                // with them.
+                //
+                // The other threads will be executing a `waveMaskBallot`
+                // to compute the active mask to use, so we can
+                // participate in that same ballot:
+                //
+                //      waveMaskBallot(activeMaskOnEntry, false);
+                //
+                // We don't care about the mask that is returned from
+                // the ballot operation, since it will only represent
+                // the active mask for threads that wanted to re-converge.
+                //
+                builder.emitWaveMaskBallot(m_maskType,
+                    r->activeMaskOnEntry,
+                    builder.getBoolValue(false));
+            }
+        }
+
+        // After we've checked all the outer regions to see
+        // which (if any) we are exiting, we should have
+        // a ussable value for `toActiveMask` that we want
+        // to pass along to the target block.
+        //
+        // The way that we pass things along to the target
+        // block will vary a lot based on its form.
+        //
+        if( !toBlock )
+        {
+            // First, if the target block is null, then
+            // the branch is exiting the current function
+            // completely, so we don't need to wire up an
+            // active mask at all.
+        }
+        else if( toBlock->getPredecessors().getCount() > 1 )
+        {
+            // If the target block is one with multiple
+            // predecessors, such that it will have an
+            // added block parameter (phi node) to select
+            // the corect mask value, then we need to
+            // pass along the mask value to use as an
+            // additional argument on the unconditional branch.
+            //
+            // If the old unconditional branch was:
+            //
+            //      <op>(arg0, arg1, arg2, ...);
+            //
+            // Then our new branch will be:
+            //
+            //      <op>(arg0, arg1, arg2, ..., toActiveMask);
+            //
+            List<IRInst*> newOperands;
+            UInt oldOperandCount = terminator->getOperandCount();
+            for( UInt i = 0; i < oldOperandCount; ++i )
+            {
+                newOperands.add(terminator->getOperand(i));
+            }
+            newOperands.add(toActiveMask);
+
+            IRInst* newTerminator = builder.emitIntrinsicInst(
+                terminator->getFullType(),
+                terminator->op,
+                newOperands.getCount(),
+                newOperands.getBuffer());
+
+            terminator->replaceUsesWith(newTerminator);
+            terminator->removeAndDeallocate();
+        }
+        else
+        {
+            // If the target block has only a single predecessor,
+            // then it means it must be immediately dominated
+            // by the block we are branching from.
+            //
+            // As such, this is the only branch that wants to
+            // establish an active mask value for the target
+            // block, and we can just bind the comptue value
+            // directly.
+            //
+            m_activeMaskForBlock.Add(toBlock, toActiveMask);
+        }
+    }
+
+    // During the course of transforming a control-flow region,
+    // we deferred the processing of its child regions to
+    // a subroutine.
+    //
+    void transformChildRegions(RegionInfo* innerRegion, RegionInfo* outerRegion)
+    {
+        // The input to this function represents a (closed) range of regions
+        // between `innerRegion` and `outerMergePoint`.
+        //
+        // The `innerRegion` represents the body of some control-flow construct,
+        // and can be used to decide which basic blocks are inside the construct
+        // and thus need to be processed as child regions.
+        //
+        // The `outerRegion` represents the outer context in which the
+        // control-flow construct was found, and determines how control-flow
+        // entered the construct.
+        //
+        // If `innerRegion == outerRegion` then there is only one region
+        // in the range, while if `innerRegion != outerREgion` then the range
+        // comprises: `innerRegion`, `innerRegion->parent`, ... `outerRegion`.
+        //
+        // There are two kinds of child regions we need to concern ourselves
+        // with, and they relate to two kinds of blocks:
+        //
+        // * Blocks that are inside the inner region represent the start
+        //   of their own nested single-entry multi-exit regions. We need
+        //   to process these while building up the correct parent/child hierarchy.
+        //
+        // * The merge block(s) for our range of regions (from inner to
+        //   outer), which represent code "after" each of the regions,
+        //   that is still nested in the same parent region.
+        //
+        // We start by enumerating the first kind of child region.
+        // To make sure that we only consider blocks that represent the
+        // immediate children of `innerRegion`, and not other descendents,
+        // we will only look for regions that start with blocks that
+        // are children in the dominator tree for the entry block of
+        // our overall range of regions.
+        //
+        IRBlock* entryBlock = outerRegion->entryBlock;
+        for( auto childBlock : m_dominatorTree->getImmediatelyDominatedBlocks(entryBlock) )
+        {
+            // We don't want to consider blocks that are outside of
+            // the inner region here (they could be, e.g., the merge
+            // point of the region, or just blocks inside some
+            // nested control-flow construct).
+            //
+            if(!isBlockInRegion(childBlock, innerRegion))
+                continue;
+
+            // If we do find a child region, then we want to process it as
+            // a direct child of our inner region.
+            //
+            transformChildRegion(childBlock, innerRegion);
+        }
+
+        // Once we've dealt with the child regions that involve
+        // traversing deeper down the tree of regions, we need
+        // to deal with merge blocks, which represent siblings
+        // of regions in our range.
+        //
+        // We will walk the regions in our range from the inner-most
+        // to the outer-most and look at their merge blocks.
+        //
+        for( auto mergeRegion = innerRegion; mergeRegion; mergeRegion = mergeRegion->parentRegion )
+        {
+            // The merge point of a region forms the start of a
+            // sibling region that shares the same parent region.
+            //
+            auto parentRegion = mergeRegion->parentRegion;
+
+            // Well, actually there is one special case we need to
+            // deal with: if the parent region of our region has
+            // the same merge point, then  the merge region can't
+            // actually be one of its children.
+            //
+            if( parentRegion && parentRegion->mergeBlock == mergeRegion->mergeBlock)
+            {
+                // In the case where the parent region has the same
+                // merge point, we choose not to process it here
+                // (since that would be inaccurate), and assume that
+                // it will be handled when this loop processes that
+                // parent region (perhaps as part of the same call to
+                // `transformChildRegions()`, and perhaps as part of
+                // another call).
+            }
+            else
+            {
+                // In the ordinary case where the parent region has a different
+                // merge point, then we know that the merge block for our region
+                // needs to be processed as a child of that parent region.
+                //
+                transformChildRegion(mergeRegion->mergeBlock, parentRegion);
+            }
+
+            // We bail out of this loop once we've processed all the regions
+            // in the range we've been asked to handle.
+            //
+            // Note: This check can't just be folded into the `for` loop
+            // condition because we need to make sure that we process
+            // at least one region in the case where `innerRegion == outerRegion`.
+            //
+            if(mergeRegion == outerRegion)
+                break;
+        }
+    }
+
+    void transformChildRegion(IRBlock* regionEntry, RegionInfo* parentRegion)
+    {
+        // This function gets called when we've disocvered a child region
+        // that should be processed recursively. The region that starts
+        // with `regionEntry` needs to be processed as a child of the
+        // given `parentRegion`.
+        //
+        // It is posible for this routine to be called to process the
+        // merge point for the entire function body, which is a null
+        // region. There is nothing that needs to be done in that case.
+        //
+        if(!regionEntry)
+            return;
+
+        // Similarly, we don't need to do anything for regions that don't
+        // need or use the active mask (even to pass along to their
+        // successors).
+        //
+        if(!doesBlockNeedActiveMask(regionEntry))
+            return;
+
+        // An important invariant of our approach is that before a child
+        // region is processed, an active mask value will have been
+        // established for it. The logic can be thought of as follows:
+        //
+        // * If the child region is one with multiple predecessors, then
+        //   its active mask value comes in as a block parameter (phi node),
+        //   and was set up before the recursive walk even started.
+        //
+        // * If the child region has only one predecessor, then that predecessor
+        //   is its immediate dominator, and would be part of a parent region
+        //   that has already been processed up the call stack. That parent
+        //   region would have established the active mask value to use on input
+        //   to each of its successors.
+        //
+        IRInst* activeMaskOnRegionEntry = nullptr;
+        if( !m_activeMaskForBlock.TryGetValue(regionEntry, activeMaskOnRegionEntry) )
+        {
+            SLANG_UNEXPECTED("no active mask registered for block");
+        }
+
+        // Once we've done the sanity checks and looked up
+        // the mask value to use on entry to this region, we
+        // can recursively process it (and by extension, its children).
+        //
+        RegionInfo region;
+        region.entryBlock = regionEntry;
+        region.activeMaskOnEntry = activeMaskOnRegionEntry;
+        region.mergeBlock = parentRegion->mergeBlock;
+        region.parentRegion = parentRegion;
+        transformRegion(&region);
+    }
+};
+
+// Now that we've defined the context for function-level transformation,
+// we can finally circle back and fill in the definition of the function
+// that the module-level pass uses to transform each function.
+//
+void SynthesizeActiveMaskForModuleContext::transformFuncUsingActiveMask(IRFunc* func)
+{
+    SynthesizeActiveMaskForFunctionContext context;
+    context.m_func = func;
+    context.m_sharedBuilder = &m_sharedBuilder;
+    context.m_maskType = m_maskType;
+
+    context.transformFunc();
+}
+
+// The public entry point for this pass is just a wrapper around
+// the context type for the module-level pass.
+//
+void synthesizeActiveMask(
+    IRModule*       module,
+    DiagnosticSink* sink)
+{
+    SynthesizeActiveMaskForModuleContext context;
+    context.m_module = module;
+    context.m_sink = sink;
+    context.processModule();
+}
+
+} // namespace Slang
diff --git a/source/slang/slang-ir-synthesize-active-mask.h b/source/slang/slang-ir-synthesize-active-mask.h
new file mode 100644
index 000000000..3eb60b350
--- /dev/null
+++ b/source/slang/slang-ir-synthesize-active-mask.h
@@ -0,0 +1,26 @@
+// slang-ir-synthesize-active-mask.h
+#pragma once
+
+namespace Slang
+{
+
+class Session;
+struct IRModule;
+class DiagnosticSink;
+
+    /// Synthesize values to represent the "active mask" for warp-/wave-level operations.
+    ///
+    /// This pass will transform all* functions in `module` that make use of the active
+    /// mask (directly or indirectly) so that they receive an explicit active mask as
+    /// an input. It will also transform the bodies of those functions to use that input
+    /// mask, or values derived from it, as the explicit mask for wave intrinsics.
+    ///
+    /// * Entry point functions will not be transformed to take an explicit mask, and
+    /// will instead be changed to compute the active mask to use as the first operation
+    /// in their body.
+    ///
+void synthesizeActiveMask(
+    IRModule*       module,
+    DiagnosticSink* sink);
+
+}
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index 0a52561c0..57dac80b6 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -3464,6 +3464,55 @@ namespace Slang
         return inst;
     }
 
+    IRInst* IRBuilder::emitWaveMaskBallot(IRType* type, IRInst* mask, IRInst* condition)
+    {
+        auto inst = createInst<IRInst>(
+            this,
+            kIROp_WaveMaskBallot,
+            type,
+            mask,
+            condition);
+        addInst(inst);
+        return inst;
+    }
+
+    IRInst* IRBuilder::emitWaveMaskMatch(IRType* type, IRInst* mask, IRInst* value)
+    {
+        auto inst = createInst<IRInst>(
+            this,
+            kIROp_WaveMaskMatch,
+            type,
+            mask,
+            value);
+        addInst(inst);
+        return inst;
+    }
+
+    IRInst* IRBuilder::emitBitAnd(IRType* type, IRInst* left, IRInst* right)
+    {
+        auto inst = createInst<IRInst>(
+            this,
+            kIROp_BitAnd,
+            type,
+            left,
+            right);
+        addInst(inst);
+        return inst;
+    }
+
+    IRInst* IRBuilder::emitBitNot(IRType* type, IRInst* value)
+    {
+        auto inst = createInst<IRInst>(
+            this,
+            kIROp_BitNot,
+            type,
+            value);
+        addInst(inst);
+        return inst;
+    }
+
+
+
     //
     // Decorations
     //
diff --git a/source/slang/slang-ir.h b/source/slang/slang-ir.h
index fbe05e8e8..9799201b3 100644
--- a/source/slang/slang-ir.h
+++ b/source/slang/slang-ir.h
@@ -751,6 +751,31 @@ struct IRParam : IRInst
     IR_LEAF_ISA(Param)
 };
 
+    /// A control-flow edge from one basic block to another
+struct IREdge
+{
+public:
+    IREdge()
+    {}
+
+    explicit IREdge(IRUse* use)
+        : m_use(use)
+    {}
+
+    IRBlock* getPredecessor() const;
+    IRBlock* getSuccessor() const;
+
+    IRUse* getUse() const
+    {
+        return m_use;
+    }
+
+    bool isCritical() const;
+
+private:
+    IRUse* m_use = nullptr;
+};
+
 // A basic block is a parent instruction that adds the constraint
 // that all the children need to be "ordinary" instructions (so
 // no function declarations, or nested blocks). We also expect
@@ -849,6 +874,7 @@ struct IRBlock : IRInst
                 return use != that.use;
             }
 
+            IREdge getEdge() const { return IREdge(use); }
             IRUse* use;
         };
 
@@ -878,6 +904,8 @@ struct IRBlock : IRInst
                 return use != that.use;
             }
 
+            IREdge getEdge() const { return IREdge(use); }
+
             IRUse* use;
             UInt stride;
         };
diff --git a/source/slang/slang.vcxproj b/source/slang/slang.vcxproj
index 1b391cac0..8c815e1e1 100644
--- a/source/slang/slang.vcxproj
+++ b/source/slang/slang.vcxproj
@@ -243,6 +243,7 @@
     <ClInclude Include="slang-ir-string-hash.h" />
     <ClInclude Include="slang-ir-strip-witness-tables.h" />
     <ClInclude Include="slang-ir-strip.h" />
+    <ClInclude Include="slang-ir-synthesize-active-mask.h" />
     <ClInclude Include="slang-ir-type-set.h" />
     <ClInclude Include="slang-ir-union.h" />
     <ClInclude Include="slang-ir-validate.h" />
@@ -324,6 +325,7 @@
     <ClCompile Include="slang-ir-string-hash.cpp" />
     <ClCompile Include="slang-ir-strip-witness-tables.cpp" />
     <ClCompile Include="slang-ir-strip.cpp" />
+    <ClCompile Include="slang-ir-synthesize-active-mask.cpp" />
     <ClCompile Include="slang-ir-type-set.cpp" />
     <ClCompile Include="slang-ir-union.cpp" />
     <ClCompile Include="slang-ir-validate.cpp" />
@@ -378,25 +380,25 @@
       <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-generate.exe</AdditionalInputs>
       <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-generate.exe</AdditionalInputs>
     </CustomBuild>
-  </ItemGroup>
-  <ItemGroup>
-    <Natvis Include="..\core\core.natvis" />
-    <Natvis Include="slang.natvis" />
     <CustomBuild Include="slang-ast-reflect.h">
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">"../../bin/windows-x86/debug/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"../../bin/windows-x64/debug/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">"../../bin/windows-x86/release/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"../../bin/windows-x64/release/slang-cpp-extractor" -d %(RootDir)%(Directory)/ slang-ast-base.h slang-ast-decl.h slang-ast-expr.h slang-ast-modifier.h slang-ast-stmt.h slang-ast-type.h slang-ast-val.h -strip-prefix slang-ast- -o slang-ast-generated -output-fields</Command>
-      <Outputs>%(RootDir)%(Directory)/slang-ast-generated.h;%(RootDir)%(Directory)/slang-ast-generated-macro.h</Outputs>
+      <Outputs>slang-ast-generated.h;slang-ast-generated-macro.h</Outputs>
       <Message>slang-cpp-extractor AST %(Identity)</Message>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">../../bin/windows-x86/debug/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">../../bin/windows-x64/debug/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-cpp-extractor.exe;%(RootDir)%(Directory)/slang-ast-base.h;%(RootDir)%(Directory)/slang-ast-decl.h;%(RootDir)%(Directory)/slang-ast-expr.h;%(RootDir)%(Directory)/slang-ast-modifier.h;%(RootDir)%(Directory)/slang-ast-stmt.h;%(RootDir)%(Directory)/slang-ast-type.h;%(RootDir)%(Directory)/slang-ast-val.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">../../bin/windows-x86/debug/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">../../bin/windows-x64/debug/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">../../bin/windows-x86/release/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">../../bin/windows-x64/release/slang-cpp-extractor.exe;slang-ast-base.h;slang-ast-decl.h;slang-ast-expr.h;slang-ast-modifier.h;slang-ast-stmt.h;slang-ast-type.h;slang-ast-val.h</AdditionalInputs>
     </CustomBuild>
   </ItemGroup>
   <ItemGroup>
+    <Natvis Include="..\core\core.natvis" />
+    <Natvis Include="slang.natvis" />
+  </ItemGroup>
+  <ItemGroup>
     <ProjectReference Include="..\core\core.vcxproj">
       <Project>{F9BE7957-8399-899E-0C49-E714FDDD4B65}</Project>
     </ProjectReference>
diff --git a/source/slang/slang.vcxproj.filters b/source/slang/slang.vcxproj.filters
index c2c9268dc..31e03d73d 100644
--- a/source/slang/slang.vcxproj.filters
+++ b/source/slang/slang.vcxproj.filters
@@ -180,6 +180,9 @@
     <ClInclude Include="slang-ir-strip.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="slang-ir-synthesize-active-mask.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="slang-ir-type-set.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -419,6 +422,9 @@
     <ClCompile Include="slang-ir-strip.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="slang-ir-synthesize-active-mask.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="slang-ir-type-set.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -505,6 +511,9 @@
     <CustomBuild Include="hlsl.meta.slang">
       <Filter>Source Files</Filter>
     </CustomBuild>
+    <CustomBuild Include="slang-ast-reflect.h">
+      <Filter>Header Files</Filter>
+    </CustomBuild>
   </ItemGroup>
   <ItemGroup>
     <Natvis Include="..\core\core.natvis">
@@ -513,11 +522,5 @@
     <Natvis Include="slang.natvis">
       <Filter>Source Files</Filter>
     </Natvis>
-    <CustomBuild Include="hlsl.meta.slang">
-      <Filter>Source Files</Filter>
-    </CustomBuild>
-    <CustomBuild Include="slang-ast-reflect.h">
-      <Filter>Header Files</Filter>
-    </CustomBuild>
   </ItemGroup>
 </Project>
 \ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/README.md b/tests/hlsl-intrinsic/active-mask/README.md
new file mode 100644
index 000000000..862630433
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/README.md
@@ -0,0 +1,14 @@
+Active Mask Tests
+=================
+
+The tests in this directory are designed to ensure that the "active mask" used by HLSL wave-level operations matches what is expected, even on targets where the active mask must be synthesized.
+
+Note that the exact active mask that should be used on wave operations isn't precisely defined in documentation for HLSL. The nearest thing to a public statement of the intended behavior is this statement on the [wiki for dxc](https://github.com/Microsoft/DirectXShaderCompiler/wiki/Wave-Intrinsics) (emphasis ours):
+
+> These intrinsics are dependent on active lanes and therefore flow control. In the model of this document, implementations must enforce that the number of active lanes exactly corresponds to the *programmer’s view of flow control*. In a future version, there may be a compiler flag to relax this requirement as a default, but also enable applications to be explicit about the exact set of lanes to be used in a particular wave operation (see section Wave Handles in the Future Features section below).
+
+The requirement is then to compute an explicit mask that matches the "programmers view" of control flow, which is arguably something up to interpretation.
+
+The GLSL "subgroup" operations are slightly more precise in the language they use, but ultimately leaves the expected value of the active mask under-specified in many cases.
+
+The goal of these tests is to establish some empirical results for what the active mask is expected/required to be in various cases. We will do our best to match the observed behavior of APIs where the implicit "active mask" is an existing feature, but we also reserve the right to take a stand and define what the behavior *ought* to be based on the necessarily more precise definitions that we use in the Slang implementation.
diff --git a/tests/hlsl-intrinsic/active-mask/for-break.slang b/tests/hlsl-intrinsic/active-mask/for-break.slang
new file mode 100644
index 000000000..e9976afa8
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-break.slang
@@ -0,0 +1,71 @@
+// for-break.slang
+
+// Test active mask synthesis for a `for` loop that
+// has no ordinary exit condition and can thus
+// only be exited via a (single) `break`.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 4
+#define ITER_COUNT THREAD_COUNT
+#define WRITE(LOC, ITER) buffer[tid + (LOC)*THREAD_COUNT + (ITER)*THREAD_COUNT*LOC_COUNT] = 0xA0000000 | (tid << 24) | (ITER << 16) | (LOC << 8) | WaveGetActiveMask()
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+void test(int tid)
+{
+	int ii = 0;
+	for(;;)
+	{
+		WRITE(0, ii);
+		if(ii >= tid)
+		{
+			WRITE(1, ii);
+
+			// Note: This flag has been included to force
+			// D3D and Vulkan implementations to provide
+			// the expected/desired behavior for the active
+			// mask in the preceding code.
+			//
+			// It seems that without making the `break`
+			// conditional, some implementation will see that
+			// this block post-dominates the entire loop,
+			// and thus decide that the code is semantically
+			// "outside" the loop, despite the fact that
+			// it is clearly lexically *inside* the loop.
+			//
+			// Making the `break` conditional introduces
+			// a new control-flow edge that changes the
+			// post-dominator relationship and thus makes
+			// such implementations see this code as being
+			// "inside" the loop again.
+			//
+		#ifdef HACK
+			if(alwaysTrue)
+		#endif
+			break;
+		}
+		WRITE(2, ii);
+		ii++;
+	}
+	WRITE(3, ITER_COUNT);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/for-break.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/for-break.slang.expected.txt
new file mode 100644
index 000000000..9ddb7cd23
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-break.slang.expected.txt
@@ -0,0 +1,108 @@
+A000000F
+A100000F
+A200000F
+A300000F
+A0000101
+0
+0
+0
+0
+A100020E
+A200020E
+A300020E
+0
+0
+0
+0
+0
+A101000E
+A201000E
+A301000E
+0
+A1010102
+0
+0
+0
+0
+A201020C
+A301020C
+0
+0
+0
+0
+0
+0
+A202000C
+A302000C
+0
+0
+A2020104
+0
+0
+0
+0
+A3020208
+0
+0
+0
+0
+0
+0
+0
+A3030008
+0
+0
+0
+A3030108
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+A004030F
+A104030F
+A204030F
+A304030F
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang b/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang
new file mode 100644
index 000000000..8b8ff5540
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang
@@ -0,0 +1,74 @@
+// for-continue-ext.slang
+
+// Test case of a `for` loop that
+// has multiple paths to continue
+// the loop (both the ordinary one
+// and an explicit `continue`)
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 4
+#define ITER_COUNT THREAD_COUNT
+#define WRITE_VAL(LOC, ITER, VAL) buffer[tid + (LOC)*THREAD_COUNT + (ITER)*THREAD_COUNT*LOC_COUNT] = 0xA0000000 | (tid << 24) | (ITER << 16) | (LOC << 8) | VAL
+#define WRITE(LOC, ITER) WRITE_VAL(LOC, ITER, WaveGetActiveMask())
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+void test(uint tid)
+{
+	for(int ii = 0; ii < tid; ++ii)
+	{
+		WRITE(0, ii);
+		if(tid & 1)
+		{
+			if(tid & 2)
+			{
+				// Note: because of the two `if(alwaysFalse) break;` branches
+				// here, the `WRITE(3,ii)` at the end of the loop body no
+				// longer post-dominates the entry to the `if`, so that
+				// implemenations that use immediate post-dominator
+				// reconvergence will not properly reconvergence lanes
+				// one and two at that point.
+				//
+				// The Slang synthesis pass for the active mask produces
+				// the expected/desired result even in the presence of
+				// these additional control-flow edges.
+			#ifndef HACK
+				if(alwaysFalse) break;
+			#endif
+
+				WRITE(1, ii);
+				continue;
+			}
+			else
+			{
+			#ifndef HACK
+				if(alwaysFalse) break;
+			#endif
+
+				WRITE(2, ii);
+			}
+		}
+		WRITE(3, ii);
+	}
+	WRITE(4, 0);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang.expected.txt
new file mode 100644
index 000000000..41c2e2591
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-continue-ext.slang.expected.txt
@@ -0,0 +1,108 @@
+0
+A100000E
+A200000E
+A300000E
+0
+0
+0
+A3000108
+0
+A1000202
+0
+0
+0
+A1000306
+A2000306
+0
+A000040F
+A100040F
+A200040F
+A300040F
+0
+0
+0
+A3010108
+0
+0
+0
+0
+0
+0
+A2010304
+0
+0
+0
+0
+A3020008
+0
+0
+0
+A3020108
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/active-mask/for-continue.slang b/tests/hlsl-intrinsic/active-mask/for-continue.slang
new file mode 100644
index 000000000..a117832f7
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-continue.slang
@@ -0,0 +1,83 @@
+// for-continue.slang
+
+// Test case of a `for` loop that
+// has multiple paths to continue
+// the loop (both the ordinary one
+// and an explicit `continue`)
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 4
+#define ITER_COUNT THREAD_COUNT
+#define WRITE_VAL(LOC, ITER, VAL) buffer[tid + (LOC)*THREAD_COUNT + (ITER)*THREAD_COUNT*LOC_COUNT] = 0xA0000000 | (tid << 24) | (ITER << 16) | (LOC << 8) | VAL
+#define WRITE(LOC, ITER) WRITE_VAL(LOC, ITER, WaveGetActiveMask())
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+// In order to make a test of `continue` behavior
+// adversarial, we need to observe the value of
+// the active mask during the code that executes
+// on a `continue`.
+//
+// We therefore define a subroutine that
+// will perform the increment action for an
+// ordinary counted loop, allowing us to
+// observe the value of the active mask inside
+// the function.
+//
+void inc(uint tid, in out int ii)
+{
+	// NOTE: For our current HLSL and GLSL output
+	// strategies, we end up duplicating the
+	// "continue clause" of a `for` loop into
+	// each of the sites in the code where control
+	// flow continues the loop. Unsurprisingly,
+	// those copies mean that the active mask
+	// seen on those platforms is not the expected
+	// one.
+	//
+	// We will therefore write out the expected
+	// value directly instead of using `WaveGetActiveMask()`
+	// on those targets.
+	//
+#ifdef HACK
+	WRITE_VAL(3, ii, (0xE << ii) & 0xF);
+#else
+	WRITE(3, ii);
+#endif
+	ii++;
+}
+
+void test(uint tid)
+{
+	for(int ii = 0; ii < tid; inc(tid, ii))
+	{
+		WRITE(0, ii);
+		if(tid & 1)
+		{
+			WRITE(1, ii);
+			continue;
+		}
+		WRITE(2, ii);
+	}
+	WRITE(4, 0);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/for-continue.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/for-continue.slang.expected.txt
new file mode 100644
index 000000000..ad1ca4559
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for-continue.slang.expected.txt
@@ -0,0 +1,108 @@
+0
+A100000E
+A200000E
+A300000E
+0
+A100010A
+0
+A300010A
+0
+0
+A2000204
+0
+0
+A100030E
+A200030E
+A300030E
+A000040F
+A100040F
+A200040F
+A300040F
+0
+0
+0
+A3010108
+0
+0
+A2010204
+0
+0
+0
+A201030C
+A301030C
+0
+0
+0
+A3020008
+0
+0
+0
+A3020108
+0
+0
+0
+0
+0
+0
+0
+A3020308
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/active-mask/for.slang b/tests/hlsl-intrinsic/active-mask/for.slang
new file mode 100644
index 000000000..d620818f8
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for.slang
@@ -0,0 +1,32 @@
+// for.slang
+
+// Test active mask synthesis for a standard `for`
+// loop over an integer, with no `break` or `continue`
+// logic.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define WRITE(IDX) buffer[IDX*THREAD_COUNT + tid] = WaveGetActiveMask()
+
+void test(int tid)
+{
+	for(int ii = 0; ii < tid; ++ii)
+	{
+		WRITE(ii);
+	}
+	WRITE(THREAD_COUNT);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/for.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/for.slang.expected.txt
new file mode 100644
index 000000000..9d3c89743
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/for.slang.expected.txt
@@ -0,0 +1,27 @@
+0
+E
+E
+E
+0
+0
+C
+C
+0
+0
+0
+8
+0
+0
+0
+0
+F
+F
+F
+F
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang b/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang
new file mode 100644
index 000000000..99a400f96
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang
@@ -0,0 +1,45 @@
+// if-conditional-exit.slang
+
+// Test active mask synthesis for an `if` where one side
+// conditionally exits the current function/scope,
+// and thus may or may not "re-converge" after the
+// conditional.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define WRITE(IDX) buffer[IDX*THREAD_COUNT + tid] = WaveGetActiveMask()
+
+void test(int tid)
+{
+	WRITE(0);
+	if(tid & 1)
+	{
+		WRITE(1);
+		if(tid & 2)
+		{
+			WRITE(2);
+			return;
+		}
+		WRITE(3);
+	}
+	else
+	{
+		WRITE(4);
+	}
+	WRITE(5);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang.expected.txt
new file mode 100644
index 000000000..2880950d5
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-conditional-exit.slang.expected.txt
@@ -0,0 +1,24 @@
+F
+F
+F
+F
+0
+A
+0
+A
+0
+0
+0
+8
+0
+2
+0
+0
+5
+0
+5
+0
+7
+7
+7
+0
diff --git a/tests/hlsl-intrinsic/active-mask/if-early-exit.slang b/tests/hlsl-intrinsic/active-mask/if-early-exit.slang
new file mode 100644
index 000000000..75f438905
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-early-exit.slang
@@ -0,0 +1,38 @@
+// if-early-exit.slang
+
+// Test active mask synthesis for an `if` where one side
+// unconditionally exits the current function/scope,
+// and thus cannot "re-converge" after the conditional.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define WRITE(IDX) buffer[IDX*THREAD_COUNT + tid] = WaveGetActiveMask()
+
+void test(int tid)
+{
+	WRITE(0);
+	if(tid & 1)
+	{
+		WRITE(1);
+		return;
+	}
+	else
+	{
+		WRITE(2);
+	}
+	WRITE(3);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/if-early-exit.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/if-early-exit.slang.expected.txt
new file mode 100644
index 000000000..e4244443a
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-early-exit.slang.expected.txt
@@ -0,0 +1,16 @@
+F
+F
+F
+F
+0
+A
+0
+A
+5
+0
+5
+0
+5
+0
+5
+0
diff --git a/tests/hlsl-intrinsic/active-mask/if-one-sided.slang b/tests/hlsl-intrinsic/active-mask/if-one-sided.slang
new file mode 100644
index 000000000..760512863
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-one-sided.slang
@@ -0,0 +1,31 @@
+// if-one-sided.slang
+
+// Test active mask synthesis in the "easy case" of a one-sided `if`.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define WRITE(IDX) buffer[IDX*THREAD_COUNT + tid] = WaveGetActiveMask()
+
+void test(int tid)
+{
+	WRITE(0);
+	if(tid & 1)
+	{
+		WRITE(1);
+	}
+	WRITE(2);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/if-one-sided.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/if-one-sided.slang.expected.txt
new file mode 100644
index 000000000..637cfb771
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if-one-sided.slang.expected.txt
@@ -0,0 +1,12 @@
+F
+F
+F
+F
+0
+A
+0
+A
+F
+F
+F
+F
diff --git a/tests/hlsl-intrinsic/active-mask/if.slang b/tests/hlsl-intrinsic/active-mask/if.slang
new file mode 100644
index 000000000..b8aa30b03
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if.slang
@@ -0,0 +1,35 @@
+// if.slang
+
+// Test active mask synthesis in the "easy case" of a two-sided `if`.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define WRITE(IDX) buffer[IDX*THREAD_COUNT + tid] = WaveGetActiveMask()
+
+void test(int tid)
+{
+	WRITE(0);
+	if(tid & 1)
+	{
+		WRITE(1);
+	}
+	else
+	{
+		WRITE(2);
+	}
+	WRITE(3);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/if.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/if.slang.expected.txt
new file mode 100644
index 000000000..4f9fabfa6
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/if.slang.expected.txt
@@ -0,0 +1,16 @@
+F
+F
+F
+F
+0
+A
+0
+A
+5
+0
+5
+0
+F
+F
+F
+F
diff --git a/tests/hlsl-intrinsic/active-mask/switch-no-default.slang b/tests/hlsl-intrinsic/active-mask/switch-no-default.slang
new file mode 100644
index 000000000..8afc5ed68
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch-no-default.slang
@@ -0,0 +1,53 @@
+// switch-no-default.slang
+
+// Test active mask synthesis for a `switch` statement
+// with no `default` case (such that unhandled values
+// branch directly to the `break` target after the
+// `switch`).
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 5
+#define WRITE(LOC) buffer[tid + (LOC)*THREAD_COUNT] = 0xA0000000 | (tid << 24) | (LOC << 8) | WaveGetActiveMask()
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+void test(int tid)
+{
+	switch(tid)
+	{
+	case 0:
+		WRITE(0);
+		break;
+
+	case 1:
+		WRITE(1);
+		break;
+
+	case 2:
+		WRITE(2);
+		break;
+
+	// NOTE: no `default:`
+	}
+	WRITE(4);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/switch-no-default.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/switch-no-default.slang.expected.txt
new file mode 100644
index 000000000..f529cc9b2
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch-no-default.slang.expected.txt
@@ -0,0 +1,20 @@
+A0000001
+0
+0
+0
+0
+A1000102
+0
+0
+0
+0
+A2000204
+0
+0
+0
+0
+0
+A000040F
+A100040F
+A200040F
+A300040F
diff --git a/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang b/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang
new file mode 100644
index 000000000..ceb7d236d
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang
@@ -0,0 +1,53 @@
+// switch-trivial-fallthrough.slang
+
+// Test active mask synthesis for a `switch` statement
+// that exhibits "trivial fall-through" from one `case`
+// to another.
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+
+// Note: this test is currently disabled on the CUDA
+// target because we do not synthesize the active
+// mask value we want/expect to see.
+//
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 5
+#define WRITE(LOC) buffer[tid + (LOC)*THREAD_COUNT] = 0xA0000000 | (tid << 24) | (LOC << 8) | WaveGetActiveMask()
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+void test(int tid)
+{
+	switch(tid)
+	{
+	case 0:
+	case 1:
+		WRITE(0);
+		break;
+
+	case 2:
+	default:
+		WRITE(2);
+		break;
+	}
+	WRITE(4);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang.expected.txt
new file mode 100644
index 000000000..8d144548e
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch-trivial-fallthrough.slang.expected.txt
@@ -0,0 +1,20 @@
+A0000003
+A1000003
+0
+0
+0
+0
+0
+0
+0
+0
+A200020C
+A300020C
+0
+0
+0
+0
+A000040F
+A100040F
+A200040F
+A300040F
diff --git a/tests/hlsl-intrinsic/active-mask/switch.slang b/tests/hlsl-intrinsic/active-mask/switch.slang
new file mode 100644
index 000000000..cd959c59f
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch.slang
@@ -0,0 +1,52 @@
+// switch.slang
+
+// Test active mask synthesis for a trivial `switch` statement
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -xslang -DHACK
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -xslang -DHACK
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name buffer
+RWStructuredBuffer<int> buffer;
+
+#define THREAD_COUNT 4
+#define LOC_COUNT 5
+#define WRITE(LOC) buffer[tid + (LOC)*THREAD_COUNT] = 0xA0000000 | (tid << 24) | (LOC << 8) | WaveGetActiveMask()
+
+//TEST_INPUT:cbuffer(data=[0 1]):name C
+cbuffer C
+{
+	int alwaysFalse;
+	int alwaysTrue;
+}
+
+void test(int tid)
+{
+	switch(tid)
+	{
+	case 0:
+		WRITE(0);
+		break;
+
+	case 1:
+		WRITE(1);
+		break;
+
+	case 2:
+		WRITE(2);
+		break;
+
+	default:
+		WRITE(3);
+		break;
+	}
+	WRITE(4);
+}
+
+[numthreads(THREAD_COUNT, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	test(dispatchThreadID.x);
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/active-mask/switch.slang.expected.txt b/tests/hlsl-intrinsic/active-mask/switch.slang.expected.txt
new file mode 100644
index 000000000..d661b0bcb
--- /dev/null
+++ b/tests/hlsl-intrinsic/active-mask/switch.slang.expected.txt
@@ -0,0 +1,20 @@
+A0000001
+0
+0
+0
+0
+A1000102
+0
+0
+0
+0
+A2000204
+0
+0
+0
+0
+A3000308
+A000040F
+A100040F
+A200040F
+A300040F
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index e90b4c541..304784518 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -344,7 +344,7 @@ static int _calcSMCountPerMultiProcessor(int major, int minor)
     const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
 
     // It must be newer presumably
-    SLANG_ASSERT(sm > last.coreCount );
+    SLANG_ASSERT(sm > last.sm);
 
     // Default to the last entry
     return last.coreCount;