diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-03-02 16:18:20 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-02 16:18:20 -0500 |
| commit | 8899c149b05def1cce626ea649012c4c974861de (patch) | |
| tree | 77e97c2997a653ba9262b32f55e9e3f37e166653 /prelude | |
| parent | b85ca6f86d46ee3c4d5784d0bd4ebc8509e2a9bd (diff) | |
Additional Wave Intrinsic Support (#1252)
* Test for some wave intrinsics.
More wave intrinsic support on CUDA.
* Use shfl_xor_sync.
* Improvements around wave intrinsics.
Fix built in integer types belong to __BuiltinIntegerType.
* Improvements and fixes around Wave intrinsics.
* Added WaveIsFirstLane test.
No longer use __wavemask_lt, as appears not available as an intrinsic.
* Small fixes to CUDA prelude.
* Add wave-active-product test.
Handle the special case for arbitray sums.
* Used macro to implement CUDA wave intrinsics.
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 216 |
1 files changed, 186 insertions, 30 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 1938e3dc1..1ca93d9d1 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -128,36 +128,6 @@ union Union64 double d; }; -// ---------------------- Miscellaneous -------------------------------------- - -// TODO(JS): It appears that cuda does not have a simple way to get a lane index. -// -// Another approach could be... -// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK -// If that is really true another way to do this, would be for code generator to add this function -// with the [numthreads] baked in. -// -// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() -#ifndef SLANG_USE_ASM_LANE_ID - __forceinline__ __device__ uint32_t _getLaneId() -{ - // If the launch is (or I guess some multiple of the warp size) - // we try this mechanism, which is apparently faster. - return threadIdx.x & SLANG_CUDA_WARP_MASK; -} -#else -__forceinline__ __device__ uint32_t _getLaneId() -{ - // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# - // This mechanism is not the fastest way to do it, and that is why the other mechanism - // is the default. But the other mechanism relies on a launch that makes the assumption - // true. - unsigned ret; - asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} -#endif - // ----------------------------- F32 ----------------------------------------- // Unary @@ -440,6 +410,192 @@ struct RWByteAddressBuffer size_t sizeInBytes; //< Must be multiple of 4 }; + +// ---------------------- Wave -------------------------------------- + +// TODO(JS): It appears that cuda does not have a simple way to get a lane index. +// +// Another approach could be... +// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK +// If that is really true another way to do this, would be for code generator to add this function +// with the [numthreads] baked in. +// +// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() +#ifndef SLANG_USE_ASM_LANE_ID + __forceinline__ __device__ uint32_t _getLaneId() +{ + // If the launch is (or I guess some multiple of the warp size) + // we try this mechanism, which is apparently faster. + return threadIdx.x & SLANG_CUDA_WARP_MASK; +} +#else +__forceinline__ __device__ uint32_t _getLaneId() +{ + // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# + // This mechanism is not the fastest way to do it, and that is why the other mechanism + // is the default. But the other mechanism relies on a launch that makes the assumption + // true. + unsigned ret; + asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} +#endif + +// Note! Note will return true if mask is 0, but thats okay, because there must be one +// lane active to execute anything +__inline__ __device__ bool _waveIsSingleLane(int mask) +{ + return (mask & (mask - 1)) == 0; +} + +// Returns the power of 2 size of run of set bits. Returns 0 if not a suitable run. +__inline__ __device__ int _waveCalcPow2Offset(int mask) +{ + // This should be the most common case, so fast path it + if (mask == SLANG_CUDA_WARP_MASK) + { + return SLANG_CUDA_WARP_SIZE; + } + // Is it a contiguous run of bits? + if ((mask & (mask + 1)) == 0) + { + // const int offsetSize = __ffs(mask + 1) - 1; + const int offset = 32 - __clz(mask); + // Is it a power of 2 size + if ((offset & (offset - 1)) == 0) + { + return offset; + } + } + return 0; +} + +__inline__ __device__ bool _waveIsFirstLane() +{ + const int mask = __activemask(); + // We special case bit 0, as that most warps are expected to be fully active. + + // mask & -mask, isolates the lowest set bit. + //return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId())); + + // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. + return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId()); +} + +// TODO(JS): NOTE! These functions only work across all lanes. +// Special handling will be needed if only some lanes are active. + +#define SLANG_CUDA_REDUCE_OP(INIT_VALUE, OP) \ + const int offsetSize = _waveCalcPow2Offset(mask); \ + if (offsetSize > 0) \ + { \ + for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \ + { \ + val = val OP __shfl_xor_sync( mask, val, offset); \ + } \ + return val; \ + } \ + else if (_waveIsSingleLane(mask)) \ + { \ + return val; \ + } \ + else \ + { \ + int result = INIT_VALUE; \ + int remaining = mask; \ + while (remaining) \ + { \ + const int laneBit = remaining & -remaining; \ + /* Get the sourceLane */ \ + const int srcLane = __ffs(laneBit) - 1; \ + /* Broadcast (can also broadcast to self) */ \ + result = result OP __shfl_sync(mask, val, srcLane); \ + remaining &= ~laneBit; \ + } \ + return result; \ + } + +#define SLANG_CUDA_REDUCE_FUNC(INIT_VALUE, FUNC) \ + const int offsetSize = _waveCalcPow2Offset(mask); \ + if (offsetSize > 0) \ + { \ + for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \ + { \ + val = FUNC(val, __shfl_xor_sync( mask, val, offset)); \ + } \ + return val; \ + } \ + else if (_waveIsSingleLane(mask)) \ + { \ + return val; \ + } \ + else \ + { \ + int result = INIT_VALUE; \ + int remaining = mask; \ + while (remaining) \ + { \ + const int laneBit = remaining & -remaining; \ + /* Get the sourceLane */ \ + const int srcLane = __ffs(laneBit) - 1; \ + /* Broadcast (can also broadcast to self) */ \ + result = FUNC(result, __shfl_sync(mask, val, srcLane)); \ + remaining &= ~laneBit; \ + } \ + return result; \ + } + + +__inline__ __device__ int _waveOr(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, |) +} + +__inline__ __device__ int _waveAnd(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(~int(0), &) +} + +__inline__ __device__ int _waveXor(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, ^) +} + +__inline__ __device__ int _waveProduct(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(1, *) +} + +__inline__ __device__ int _waveSum(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, +) +} + +__inline__ __device__ int _waveMin(int mask, int val) +{ + SLANG_CUDA_REDUCE_FUNC(val, I32_min) +} + +__inline__ __device__ int _waveMax(int mask, int val) +{ + SLANG_CUDA_REDUCE_FUNC(val, I32_max) +} + +__inline__ __device__ bool _waveAllEqual(int mask, int val) +{ + int pred; + __match_all_sync(mask, val, &pred); + return pred != 0; +} + +__inline__ __device__ int _waveReadFirst(int val) +{ + const int mask = __activemask(); + const int lowestLaneId = __ffs(mask) - 1; + return __shfl_sync(mask, val, lowestLaneId); +} + + /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ |
