diff options
| -rw-r--r-- | docs/cuda-target.md | 12 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 36 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 16 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 18 |
4 files changed, 81 insertions, 1 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md index 743e9830c..a9b35d735 100644 --- a/docs/cuda-target.md +++ b/docs/cuda-target.md @@ -18,6 +18,7 @@ These limitations apply to Slang transpiling to CUDA. * Samplers are not separate objects in CUDA - they are combined into a single 'TextureObject'. So samplers are effectively ignored on CUDA targets. * Whilst there is tex1Dfetch there are no equivalents for higher dimensions - so such accesses are not currently supported * When using a TextureArray (layered texture in CUDA) - the index will be treated as an int, as this is all CUDA allows +* Care must be used in using `WaveGetLaneIndex` wave intrinsic - it will only give the right results for appopriate launches The following are a work in progress or not implmented but are planned to be so in the future @@ -143,6 +144,17 @@ void setDownstreamCompilerPrelude(SlangPassThrough passThrough, const char* prel The code that sets up the prelude for the test infrastucture and command line usage can be found in ```TestToolUtil::setSessionDefaultPrelude```. Essentially this determines what the absolute path is to `slang-cpp-prelude.h` is and then just makes the prelude `#include "the absolute path"`. +Limitations +=========== + +## WaveGetLaneIndex + +This defaults to `threadIdx.x & SLANG_CUDA_WARP_MASK`. Depending on how the kernel is launched this could be incorrect. + +There other ways to get lane index, for example using inline assembly. This mechanism though is apparently slower than the simple method used here. + +There is potential to calculate the lane id using the [numthreads] markup in Slang/HLSL, but that also requires some assumptions of how that maps to a lane index. + Language aspects ================ diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 768e33d13..b81acba1e 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -7,6 +7,12 @@ // For now we'll disable any asserts in this prelude #define SLANG_PRELUDE_ASSERT(x) +#ifndef SLANG_CUDA_WARP_SIZE +# define SLANG_CUDA_WARP_SIZE 32 +#endif + +#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1) + // #define SLANG_FORCE_INLINE inline @@ -112,6 +118,36 @@ union Union64 double d; }; +// ---------------------- Miscellaneous -------------------------------------- + +// TODO(JS): It appears that cuda does not have a simple way to get a lane index. +// +// Another approach could be... +// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK +// If that is really true another way to do this, would be for code generator to add this function +// with the [numthreads] baked in. +// +// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() +#ifndef SLANG_USE_ASM_LANE_ID + __forceinline__ __device__ uint32_t _getLaneId() +{ + // If the launch is (or I guess some multiple of the warp size) + // we try this mechanism, which is apparently faster. + return threadIdx.x & SLANG_CUDA_WARP_MASK; +} +#else +__forceinline__ __device__ uint32_t _getLaneId() +{ + // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# + // This mechanism is not the fastest way to do it, and that is why the other mechanism + // is the default. But the other mechanism relies on a launch that makes the assumption + // true. + unsigned ret; + asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} +#endif + // ----------------------------- F32 ----------------------------------------- // Unary diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 8fd2a272a..988c6f69c 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value); __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value); +// TODO(JS): For CUDA the article claims mask has to be used carefully +// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ +// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask() +// seems to be appropriate. + +__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") bool WaveActiveAllTrue(bool condition); +__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)") bool WaveActiveAnyTrue(bool condition); +__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") uint4 WaveActiveBallot(bool condition); + +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))") uint WaveActiveCountBits(bool value); +__target_intrinsic(cuda, "(warpSize)") uint WaveGetLaneCount(); + +__target_intrinsic(cuda, "_getLaneId()") uint WaveGetLaneIndex(); + +// If there are no *active* lanes less than this one, we must be the lowest lane +__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)") bool WaveIsFirstLane(); __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index c72374786..b39c4bbcb 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n") SLANG_RAW("\n") +SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n") +SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n") +SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n") +SLANG_RAW("// seems to be appropriate.\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n") SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n") +SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n") SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n") SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n") SLANG_RAW("uint WaveActiveCountBits(bool value);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n") SLANG_RAW("uint WaveGetLaneCount();\n") +SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n") SLANG_RAW("uint WaveGetLaneIndex();\n") +SLANG_RAW("\n") +SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n") +SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n") SLANG_RAW("bool WaveIsFirstLane();\n") SLANG_RAW("\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n") @@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1569 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1585 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") |
