From 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Wed, 19 Feb 2020 14:16:38 -0500 Subject: Initial partial support for WaveXXX intrinsics on CUDA (#1228) * Start work on wave intrinsics for CUDA. * Add preliminary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex --- source/slang/hlsl.meta.slang | 16 ++++++++++++++++ source/slang/hlsl.meta.slang.h | 18 +++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 8fd2a272a..988c6f69c 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1431,14 +1431,30 @@ __generic uint4 WaveMatch(T value); __generic uint4 WaveMatch(vector value); __generic uint4 WaveMatch(matrix value); +// TODO(JS): For CUDA the article claims mask has to be used carefully +// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ +// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask() +// seems to be appropriate. 
+ +__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") bool WaveActiveAllTrue(bool condition); +__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)") bool WaveActiveAnyTrue(bool condition); +__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") uint4 WaveActiveBallot(bool condition); + +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))") uint WaveActiveCountBits(bool value); +__target_intrinsic(cuda, "(warpSize)") uint WaveGetLaneCount(); + +__target_intrinsic(cuda, "_getLaneId()") uint WaveGetLaneIndex(); + +// If there are no *active* lanes less than this one, we must be the lowest lane +__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)") bool WaveIsFirstLane(); __generic T WavePrefixProduct(T expr); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index c72374786..b39c4bbcb 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -1507,14 +1507,30 @@ SLANG_RAW("__generic uint4 WaveMatch(T value);\n") SLANG_RAW("__generic uint4 WaveMatch(vector value);\n") SLANG_RAW("__generic uint4 WaveMatch(matrix value);\n") SLANG_RAW("\n") +SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n") +SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n") +SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. 
So __activemask()\n") +SLANG_RAW("// seems to be appropriate.\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n") SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n") SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n") SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n") SLANG_RAW("uint WaveActiveCountBits(bool value);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n") SLANG_RAW("uint WaveGetLaneCount();\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n") SLANG_RAW("uint WaveGetLaneIndex();\n") +SLANG_RAW("\n") +SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n") +SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n") SLANG_RAW("bool WaveIsFirstLane();\n") SLANG_RAW("\n") SLANG_RAW("__generic T WavePrefixProduct(T expr);\n") @@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1569 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1585 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") -- cgit v1.2.3