diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-02-19 14:16:38 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-02-19 11:16:38 -0800 |
| commit | 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch) | |
| tree | f6581768499187bf1572079d65b99c15a95e7743 /source | |
| parent | 1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff) | |
Initial partial support for WaveXXX intrinsics on CUDA (#1228)
* Start work on wave intrinsics for CUDA.
* Add preliminary CUDA support for some Wave intrinsics.
Document the issue around WaveGetLaneIndex
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 16 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 18 |
2 files changed, 33 insertions, 1 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 8fd2a272a..988c6f69c 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value); __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value); +// TODO(JS): For CUDA the article claims mask has to be used carefully +// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ +// The HLSL Wave intrinsics take no mask, though; they act on just the 'active lanes'. So __activemask() +// seems to be appropriate. + +__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") bool WaveActiveAllTrue(bool condition); +__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)") bool WaveActiveAnyTrue(bool condition); +__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") uint4 WaveActiveBallot(bool condition); + +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))") uint WaveActiveCountBits(bool value); +__target_intrinsic(cuda, "(warpSize)") uint WaveGetLaneCount(); + +__target_intrinsic(cuda, "_getLaneId()") uint WaveGetLaneIndex(); + +// If there are no *active* lanes less than this one, we must be the lowest lane +__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)") bool WaveIsFirstLane(); __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index c72374786..b39c4bbcb 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> 
value);\n") SLANG_RAW("\n") +SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n") +SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n") +SLANG_RAW("// The HLSL Wave intrinsics take no mask, though; they act on just the 'active lanes'. So __activemask()\n") +SLANG_RAW("// seems to be appropriate.\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n") SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n") SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n") SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n") SLANG_RAW("uint WaveActiveCountBits(bool value);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n") SLANG_RAW("uint WaveGetLaneCount();\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n") SLANG_RAW("uint WaveGetLaneIndex();\n") +SLANG_RAW("\n") +SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n") +SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n") SLANG_RAW("bool WaveIsFirstLane();\n") SLANG_RAW("\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n") @@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1569 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1585 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") |
