From 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Wed, 19 Feb 2020 14:16:38 -0500
Subject: Initial partial support for WaveXXX intrinsics on CUDA (#1228)

* Start work on wave intrinsics for CUDA.

* Add preliminary CUDA support for some Wave intrinsics.
Document the issue around WaveGetLaneIndex
---
 source/slang/hlsl.meta.slang   | 16 ++++++++++++++++
 source/slang/hlsl.meta.slang.h | 18 +++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'source')
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
 
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Wave intrinsics there is no mask, though - it's just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
 bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
 bool WaveActiveAnyTrue(bool condition);
 
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
 uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
 uint WaveActiveCountBits(bool value);
 
+__target_intrinsic(cuda, "(warpSize)")
 uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
 uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
 bool WaveIsFirstLane();
 
 __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index c72374786..b39c4bbcb 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
+SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
+SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// seems to be appropriate.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
 SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n")
 SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
 SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n")
 SLANG_RAW("uint WaveActiveCountBits(bool value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n")
 SLANG_RAW("uint WaveGetLaneCount();\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
 SLANG_RAW("uint WaveGetLaneIndex();\n")
+SLANG_RAW("\n")
+SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
+SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
 SLANG_RAW("bool WaveIsFirstLane();\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
@@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
-- 
cgit v1.2.3