summaryrefslogtreecommitdiff
path: root/source/slang/hlsl.meta.slang
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-02-19 14:16:38 -0500
committerGitHub <noreply@github.com>2020-02-19 11:16:38 -0800
commit46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
treef6581768499187bf1572079d65b99c15a95e7743 /source/slang/hlsl.meta.slang
parent1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
Initial partial support for WaveXXX intrinsics on CUDA (#1228)
* Start work on wave intrinsics for CUDA. * Add prelimary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
Diffstat (limited to 'source/slang/hlsl.meta.slang')
-rw-r--r--source/slang/hlsl.meta.slang16
1 files changed, 16 insertions, 0 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)")
bool WaveActiveAnyTrue(bool condition);
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
uint WaveActiveCountBits(bool value);
+__target_intrinsic(cuda, "(warpSize)")
uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
bool WaveIsFirstLane();
__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);