summaryrefslogtreecommitdiffstats
path: root/prelude
diff options
context:
space:
mode:
authorNeil Bickford <57467222+NeilBickford-NV@users.noreply.github.com>2023-11-07 11:47:18 -0800
committerGitHub <noreply@github.com>2023-11-07 14:47:18 -0500
commit421941993d169c943f2c364bfe9c48b603339fd1 (patch)
tree29babcb8ffc0643f495f6eab75cb79455b9db82a /prelude
parent46529df2c4f73a4655bdd79d48004f29374a99a8 (diff)
CUDA: Fixes for NVRTC 12.x and warp mask ambiguity; adds CC 8.x warp reduction intrinsics. (#3314)
* CUDA: Fixes for NVRTC 12.x, warp mask ambiguity; add reduction partial specializations. * Fixes running NVRTC on CUDA 12 without a specified profile (used in testing, e.g. `slang-test -api cuda -category wave`) * Fixes mask ambiguity between getting the lane index from threadId.x and a full mask of threads. * Adds partial specializations for compute capability 8.x warp reduction intrinsics. * Fix formatting
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h41
1 files changed, 39 insertions, 2 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 24e400d2d..38f8a721a 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -53,7 +53,8 @@
# define SLANG_CUDA_WARP_SIZE 32
#endif
-#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1)
+#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1) // Used for masking threadIdx.x to the warp lane index
+#define SLANG_CUDA_WARP_BITMASK (~int(0))
//
#define SLANG_FORCE_INLINE inline
@@ -1418,10 +1419,16 @@ __inline__ __device__ bool _waveIsSingleLane(WarpMask mask)
}
// Returns the power of 2 size of run of set bits. Returns 0 if not a suitable run.
+// Examples:
+// 0b00000000'00000000'00000000'11111111 -> 8
+// 0b11111111'11111111'11111111'11111111 -> 32
+// 0b00000000'00000000'00000000'00011111 -> 0 (since 5 is not a power of 2)
+// 0b00000000'00000000'00000000'11110000 -> 0 (since the run of bits does not start at the LSB)
+// 0b00000000'00000000'00000000'00100111 -> 0 (since it is not a single contiguous run)
__inline__ __device__ int _waveCalcPow2Offset(WarpMask mask)
{
// This should be the most common case, so fast path it
- if (mask == SLANG_CUDA_WARP_MASK)
+ if (mask == SLANG_CUDA_WARP_BITMASK)
{
return SLANG_CUDA_WARP_SIZE;
}
@@ -1647,6 +1654,36 @@ __inline__ __device__ T _waveMin(WarpMask mask, T val) { return _waveReduceScala
template <typename T>
__inline__ __device__ T _waveMax(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMax<T>, T>(mask, val); }
+// Fast-path specializations when CUDA warp reduce operators are available
+#if __CUDA_ARCH__ >= 800 // 8.x or higher
+template<>
+__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val) { return __reduce_or_sync(mask, val); }
+
+template<>
+__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val) { return __reduce_and_sync(mask, val); }
+
+template<>
+__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val) { return __reduce_xor_sync(mask, val); }
+
+template<>
+__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val) { return __reduce_add_sync(mask, val); }
+
+template<>
+__inline__ __device__ int _waveSum<int>(WarpMask mask, int val) { return __reduce_add_sync(mask, val); }
+
+template<>
+__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val) { return __reduce_min_sync(mask, val); }
+
+template<>
+__inline__ __device__ int _waveMin<int>(WarpMask mask, int val) { return __reduce_min_sync(mask, val); }
+
+template<>
+__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val) { return __reduce_max_sync(mask, val); }
+
+template<>
+__inline__ __device__ int _waveMax<int>(WarpMask mask, int val) { return __reduce_max_sync(mask, val); }
+#endif
+
// Multiple