Initial partial support for WaveXXX intrinsics on CUDA (#1228)

* Start work on wave intrinsics for CUDA. * Add preliminary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
author: jsmall-nvidia <jsmall@nvidia.com> 2020-02-19 14:16:38 -0500
committer: GitHub <noreply@github.com> 2020-02-19 11:16:38 -0800
commit: 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree: f6581768499187bf1572079d65b99c15a95e7743 /source/slang/hlsl.meta.slang
parent: 1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
1 files changed, 16 insertions, 0 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
 
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// The Wave intrinsics, though, take no mask - they act on just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
 bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
 bool WaveActiveAnyTrue(bool condition);
 
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
 uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
 uint WaveActiveCountBits(bool value);
 
+__target_intrinsic(cuda, "(warpSize)")
 uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
 uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
 bool WaveIsFirstLane();
 
 __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
author	jsmall-nvidia <jsmall@nvidia.com>	2020-02-19 14:16:38 -0500
committer	GitHub <noreply@github.com>	2020-02-19 11:16:38 -0800
commit	46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree	f6581768499187bf1572079d65b99c15a95e7743 /source/slang/hlsl.meta.slang
parent	1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)