summaryrefslogtreecommitdiff
path: root/prelude
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-03-09 12:40:04 -0400
committerGitHub <noreply@github.com>2020-03-09 12:40:04 -0400
commit7e0aa9315f7f65033229c1f76d7df47ccd2da3d0 (patch)
tree28ca885d901526ae548895f354626844d305d16f /prelude
parentb1317cd16ab9c827596a28ccf4258ef1bb672d92 (diff)
CUDA support for vector/matrix Wave intrinsics (#1266)
* Distinguish between __activeMask and _getConvergedMask(). Remove need to pass in mask to CUDA wave impls. * Add support for vector/matrix Wave intrinsics for CUDA. Fix issue with CUDA parsing of errors. * Fix typo.
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h179
1 files changed, 163 insertions, 16 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 90e00a631..5f0dffd5c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -441,6 +441,33 @@ __forceinline__ __device__ uint32_t _getLaneId()
}
#endif
+// It appears that the __activemask() cannot always be used because
+// threads need to be converged.
+//
+// For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is no mask, and it's just the 'active lanes'.
+// __activemask() though does not require there is convergence, so that doesn't work.
+//
+// '__ballot_sync' produces a convergance.
+//
+// From the CUDA docs:
+// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the threads
+// participating in the call. A bit, representing the thread's lane ID, must be set for each participating thread
+// to ensure they are properly converged before the intrinsic is executed by the hardware. All active threads named
+// in mask must execute the same intrinsic with the same mask, or the result is undefined.```
+//
+// To get the right results we need to use the __activemask() within _ballot_sync it seems.
+//
+// Also note that __all_sync and __any_sync are listed with __ballot_sync. That if they have a similar synchronizing behavior
+// we can use __activemask() there (instead of _getConvergedMask), because they will converge too.
+__forceinline__ __device__ int _getConvergedMask()
+{
+ //return __activemask();
+ //return __ballot_sync(SLANG_CUDA_WARP_MASK, true);
+ return __ballot_sync(__activemask(), true);
+}
+
// Return mask of all the lanes less than the current lane
__forceinline__ __device__ int _getLaneLtMask()
{
@@ -488,7 +515,6 @@ __inline__ __device__ bool _waveIsFirstLane()
return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId());
}
-
template <typename T>
struct WaveOpOr
{
@@ -538,24 +564,63 @@ struct WaveOpMin
__inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; }
};
+template <typename T>
+struct ElementTypeTrait;
+
+// Scalar
+template <> struct ElementTypeTrait<int> { typedef int Type; };
+template <> struct ElementTypeTrait<uint> { typedef uint Type; };
+template <> struct ElementTypeTrait<float> { typedef float Type; };
+template <> struct ElementTypeTrait<double> { typedef double Type; };
+template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; };
+template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; };
+
+// Vector
+template <> struct ElementTypeTrait<int1> { typedef int Type; };
+template <> struct ElementTypeTrait<int2> { typedef int Type; };
+template <> struct ElementTypeTrait<int3> { typedef int Type; };
+template <> struct ElementTypeTrait<int4> { typedef int Type; };
+
+template <> struct ElementTypeTrait<uint1> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint2> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint3> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint4> { typedef uint Type; };
+
+template <> struct ElementTypeTrait<float1> { typedef float Type; };
+template <> struct ElementTypeTrait<float2> { typedef float Type; };
+template <> struct ElementTypeTrait<float3> { typedef float Type; };
+template <> struct ElementTypeTrait<float4> { typedef float Type; };
+
+template <> struct ElementTypeTrait<double1> { typedef double Type; };
+template <> struct ElementTypeTrait<double2> { typedef double Type; };
+template <> struct ElementTypeTrait<double3> { typedef double Type; };
+template <> struct ElementTypeTrait<double4> { typedef double Type; };
+
+// Matrix
+template <typename T, int ROWS, int COLS>
+struct ElementTypeTrait<Matrix<T, ROWS, COLS> >
+{
+ typedef T Type;
+};
+
// Scalar
template <typename INTF, typename T>
-__device__ T _waveReduce(int mask, T val)
+__device__ T _waveReduceScalar(T val)
{
+ // The shuffles appear to converge on set bits, so it appears ok to use __activemask()
+ //const int mask = _getConvergedMask();
+ const int mask = __activemask();
+
const int offsetSize = _waveCalcPow2Offset(mask);
if (offsetSize > 0)
{
+ // Fast path O(log2(activeLanes))
for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
{
val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset));
}
- return val;
- }
- else if (_waveIsSingleLane(mask))
- {
- return val;
}
- else
+ else if (!_waveIsSingleLane(mask))
{
T result = INTF::getInitial(val);
int remaining = mask;
@@ -570,33 +635,115 @@ __device__ T _waveReduce(int mask, T val)
}
return result;
}
+ return val;
+}
+
+
+// Multiple values
+template <typename INTF, typename T, size_t COUNT>
+__device__ void _waveReduceMultiple(T* val)
+{
+ // The shuffles appear to converge on set bits, so it appears ok to use __activemask()
+ //const int mask = _getConvergedMask();
+ const int mask = __activemask();
+
+ const int offsetSize = _waveCalcPow2Offset(mask);
+ if (offsetSize > 0)
+ {
+ // Fast path O(log2(activeLanes))
+ for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
+ {
+ for (size_t i = 0; i < COUNT; ++i)
+ {
+ val[i] = INTF::doOp(val[i], __shfl_xor_sync(mask, val[i], offset));
+ }
+ }
+ }
+ else if (!_waveIsSingleLane(mask))
+ {
+ // Copy the original
+ T originalVal[COUNT];
+ for (size_t i = 0; i < COUNT; ++i)
+ {
+ const T v = val[i];
+ originalVal[i] = v;
+ val[i] = INTF::getInitial(v);
+ }
+
+ int remaining = mask;
+ while (remaining)
+ {
+ const int laneBit = remaining & -remaining;
+ /* Get the sourceLane */
+ const int srcLane = __ffs(laneBit) - 1;
+ /* Broadcast (can also broadcast to self) */
+ for (size_t i = 0; i < COUNT; ++i)
+ {
+ val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane));
+ }
+ remaining &= ~laneBit;
+ }
+ }
+}
+
+template <typename INTF, typename T>
+__device__ void _waveReduceMultiple(T* val)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)val);
}
template <typename T>
-__inline__ __device__ T _waveOr(int mask, T val) { return _waveReduce<WaveOpOr<T>, T>(mask, val); }
+__inline__ __device__ T _waveOr(T val) { return _waveReduceScalar<WaveOpOr<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveAnd(T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveXor(T val) { return _waveReduceScalar<WaveOpXor<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveProduct(T val) { return _waveReduceScalar<WaveOpMul<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveSum(T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveMin(T val) { return _waveReduceScalar<WaveOpMin<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveMax(T val) { return _waveReduceScalar<WaveOpMax<T>, T>(val); }
+
+
+// Multiple
+
+template <typename T>
+__inline__ __device__ T _waveOrMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveAnd(int mask, T val) { return _waveReduce<WaveOpAnd<T>, T>(mask, val); }
+__inline__ __device__ T _waveAndMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveXor(int mask, T val) { return _waveReduce<WaveOpXor<T>, T>(mask, val); }
+__inline__ __device__ T _waveXorMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveProduct(int mask, T val) { return _waveReduce<WaveOpMul<T>, T>(mask, val); }
+__inline__ __device__ T _waveProductMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveSum(int mask, T val) { return _waveReduce<WaveOpAdd<T>, T>(mask, val); }
+__inline__ __device__ T _waveSumMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveMin(int mask, T val) { return _waveReduce<WaveOpMin<T>, T>(mask, val); }
+__inline__ __device__ T _waveMinMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ T _waveMax(int mask, T val) { return _waveReduce<WaveOpMax<T>, T>(mask, val); }
+__inline__ __device__ T _waveMaxMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(&val); return val; }
template <typename T>
-__inline__ __device__ bool _waveAllEqual(int mask, T val)
+__inline__ __device__ bool _waveAllEqual(T val)
{
+ // __match_all_sync is a synchronises so can use __activemask()
+ const int mask = __activemask();
int pred;
__match_all_sync(mask, val, &pred);
return pred != 0;