Initial partial support for WaveXXX intrinsics on CUDA (#1228)

* Start work on wave intrinsics for CUDA. * Add preliminary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
author: jsmall-nvidia <jsmall@nvidia.com> 2020-02-19 14:16:38 -0500
committer: GitHub <noreply@github.com> 2020-02-19 11:16:38 -0800
commit: 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree: f6581768499187bf1572079d65b99c15a95e7743 /source
parent: 1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
2 files changed, 33 insertions, 1 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
 
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Wave intrinsics there is no mask parameter, though; they act on just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
 bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)") // CUDA warp vote: true if the condition is non-zero on any active lane
 bool WaveActiveAnyTrue(bool condition);
 
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
 uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
 uint WaveActiveCountBits(bool value);
 
+__target_intrinsic(cuda, "(warpSize)")
 uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
 uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
 bool WaveIsFirstLane();
 
 __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index c72374786..b39c4bbcb 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
+SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
+SLANG_RAW("// With the Wave intrinsics there is no mask parameter, though; they act on just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// seems to be appropriate.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
 SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\") // CUDA warp vote: true if the condition is non-zero on any active lane\n")
 SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
 SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n")
 SLANG_RAW("uint WaveActiveCountBits(bool value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n")
 SLANG_RAW("uint WaveGetLaneCount();\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
 SLANG_RAW("uint WaveGetLaneIndex();\n")
+SLANG_RAW("\n")
+SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
+SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
 SLANG_RAW("bool WaveIsFirstLane();\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
@@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
author	jsmall-nvidia <jsmall@nvidia.com>	2020-02-19 14:16:38 -0500
committer	GitHub <noreply@github.com>	2020-02-19 11:16:38 -0800
commit	46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree	f6581768499187bf1572079d65b99c15a95e7743 /source
parent	1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)