diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-03-10 12:31:25 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-10 12:31:25 -0400 |
| commit | a10d9cd8767e88a064719d71cc97144ba8b112d1 (patch) | |
| tree | c54745fb698c8cacfeb1c4440261eb899338f20e /prelude | |
| parent | 721d2e8a2d457081cd3d9b081979d436b7002c2c (diff) | |
WIP Prefix Sum for CUDA (#1268)
* Fix some typos.
* Add wave-prefix-sum.slang test
* First pass at implementing prefixSum.
* Small improvments to prefixSum CUDA.
* Small improvement to prefix sum.
* Enable prefix sum in stdlib.
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index c764afba1..6f2122934 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -823,6 +823,47 @@ __inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane) return outVal; } +__device__ int _wavePrefixSum(int val) +{ + const int mask = __activemask(); + const int offsetSize = _waveCalcPow2Offset(mask); + + const int laneId = _getLaneId(); + if (offsetSize > 0) + { + int sum = val; + for (int i = 1; i < offsetSize; i += i) + { + const int readVal = __shfl_up_sync(mask, sum, i, offsetSize); + if (laneId >= i) + { + sum += readVal; + } + } + return sum - val; + } + else + { + int result = 0; + int remaining = mask; + while (remaining) + { + const int laneBit = remaining & -remaining; + // Get the sourceLane + const int srcLane = __ffs(laneBit) - 1; + // Broadcast (can also broadcast to self) + int readValue = __shfl_sync(mask, val, srcLane); + // Only accumulate if srcLane is less than this lane + if (srcLane < laneId) + { + result += readValue; + } + remaining &= ~laneBit; + } + return result; + } +} + /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ |
