summaryrefslogtreecommitdiffstats
path: root/source
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-02-19 14:16:38 -0500
committerGitHub <noreply@github.com>2020-02-19 11:16:38 -0800
commit46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
treef6581768499187bf1572079d65b99c15a95e7743 /source
parent1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
Initial partial support for WaveXXX intrinsics on CUDA (#1228)
* Start work on wave intrinsics for CUDA. * Add preliminary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
Diffstat (limited to 'source')
-rw-r--r--source/slang/hlsl.meta.slang16
-rw-r--r--source/slang/hlsl.meta.slang.h18
2 files changed, 33 insertions, 1 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
bool WaveActiveAnyTrue(bool condition);
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
uint WaveActiveCountBits(bool value);
+__target_intrinsic(cuda, "(warpSize)")
uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
bool WaveIsFirstLane();
__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index c72374786..b39c4bbcb 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n")
SLANG_RAW("\n")
+SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
+SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
+SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// seems to be appropriate.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n")
SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n")
SLANG_RAW("uint WaveActiveCountBits(bool value);\n")
SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n")
SLANG_RAW("uint WaveGetLaneCount();\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
SLANG_RAW("uint WaveGetLaneIndex();\n")
+SLANG_RAW("\n")
+SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
+SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
SLANG_RAW("bool WaveIsFirstLane();\n")
SLANG_RAW("\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
@@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
sb << "};\n";
}
-SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("\n")