summaryrefslogtreecommitdiffstats
path: root/prelude
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-03-16 15:01:21 -0400
committerGitHub <noreply@github.com>2020-03-16 15:01:21 -0400
commit76b9ff6e65b4bd2be04a5bab0eb1464455c4b3ff (patch)
treea4aa6e8560984cbcbfd2c33df666b144b93418e8 /prelude
parent256a20a163ef6ee93a817472adcb24c076b0c0dc (diff)
CUDA support of MultiPrefix Wave intrinsics. (#1275)
Support for cs_6_5 cand cs_6_4 in profile Added wave-multi-prefix.slang etst
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h139
1 files changed, 97 insertions, 42 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 6a1d87183..dcc585b9c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -513,6 +513,18 @@ __forceinline__ __device__ int _getLaneLtMask()
return (int(1) << _getLaneId()) - 1;
}
+// Return a mask suitable for the straight 'Prefix' style ops
+__forceinline__ __device__ int _getPrefixMask()
+{
+ return __activemask();
+}
+
+// Return a mask suitable for the 'MultiPrefix' style functions
+__forceinline__ __device__ int _getMultiPrefixMask(int mask)
+{
+ return mask;
+}
+
// Note! Note will return true if mask is 0, but thats okay, because there must be one
// lane active to execute anything
__inline__ __device__ bool _waveIsSingleLane(int mask)
@@ -671,9 +683,9 @@ __device__ T _waveReduceScalar(T val)
while (remaining)
{
const int laneBit = remaining & -remaining;
- /* Get the sourceLane */
+ // Get the sourceLane
const int srcLane = __ffs(laneBit) - 1;
- /* Broadcast (can also broadcast to self) */
+ // Broadcast (can also broadcast to self)
result = INTF::doOp(result, __shfl_sync(mask, val, srcLane));
remaining &= ~laneBit;
}
@@ -718,9 +730,9 @@ __device__ void _waveReduceMultiple(T* val)
while (remaining)
{
const int laneBit = remaining & -remaining;
- /* Get the sourceLane */
+ // Get the sourceLane
const int srcLane = __ffs(laneBit) - 1;
- /* Broadcast (can also broadcast to self) */
+ // Broadcast (can also broadcast to self)
for (size_t i = 0; i < COUNT; ++i)
{
val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane));
@@ -786,7 +798,7 @@ __inline__ __device__ T _waveMaxMultiple(T val) { typedef typename ElementTypeT
template <typename T>
__inline__ __device__ bool _waveAllEqual(T val)
{
- // __match_all_sync is a synchronises so can use __activemask()
+ // __match_all_sync synchronizes so can use __activemask()
const int mask = __activemask();
int pred;
__match_all_sync(mask, val, &pred);
@@ -798,13 +810,10 @@ __inline__ __device__ bool _waveAllEqualMultiple(T inVal)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
const size_t count = sizeof(T) / sizeof(ElemType);
-
- // __match_all_sync is a synchronises so can use __activemask()
+ // __match_all_sync synchronizes so can use __activemask()
const int mask = __activemask();
int pred;
-
const ElemType* src = (const ElemType*)&inVal;
-
for (size_t i = 0; i < count; ++i)
{
__match_all_sync(mask, src[i], &pred);
@@ -829,20 +838,15 @@ __inline__ __device__ T _waveReadFirstMultiple(T inVal)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
const size_t count = sizeof(T) / sizeof(ElemType);
-
T outVal;
-
const ElemType* src = (const ElemType*)&inVal;
ElemType* dst = (ElemType*)&outVal;
-
const int mask = __activemask();
const int lowestLaneId = __ffs(mask) - 1;
-
for (size_t i = 0; i < count; ++i)
{
dst[i] = __shfl_sync(mask, src[i], lowestLaneId);
}
-
return outVal;
}
@@ -851,19 +855,14 @@ __inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
const size_t count = sizeof(T) / sizeof(ElemType);
-
T outVal;
-
const ElemType* src = (const ElemType*)&inVal;
ElemType* dst = (ElemType*)&outVal;
-
const int mask = __activemask();
-
for (size_t i = 0; i < count; ++i)
{
dst[i] = __shfl_sync(mask, src[i], lane);
}
-
return outVal;
}
@@ -872,9 +871,8 @@ __inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane)
// Invertable means that when we get to the end of the reduce, we can remove val (to make exclusive), using
// the inverse of the op.
template <typename INTF, typename T>
-__device__ T _wavePrefixInvertableScalar(T val)
+__device__ T _wavePrefixInvertableScalar(T val, const int mask)
{
- const int mask = __activemask();
const int offsetSize = _waveCalcPow2Offset(mask);
const int laneId = _getLaneId();
@@ -923,9 +921,8 @@ __device__ T _wavePrefixInvertableScalar(T val)
// This implementation separately tracks the value to be propogated, and the value
// that is the final result
template <typename INTF, typename T>
-__device__ T _wavePrefixScalar(T val)
+__device__ T _wavePrefixScalar(T val, const int mask)
{
- const int mask = __activemask();
const int offsetSize = _waveCalcPow2Offset(mask);
const int laneId = _getLaneId();
@@ -971,7 +968,7 @@ __device__ T _wavePrefixScalar(T val)
template <typename INTF, typename T, size_t COUNT>
-__device__ T _copy(T* dst, const T* src)
+__device__ T _waveOpCopy(T* dst, const T* src)
{
for (size_t j = 0; j < COUNT; ++j)
{
@@ -981,7 +978,7 @@ __device__ T _copy(T* dst, const T* src)
template <typename INTF, typename T, size_t COUNT>
-__device__ T _doInverse(T* inOut, const T* val)
+__device__ T _waveOpDoInverse(T* inOut, const T* val)
{
for (size_t j = 0; j < COUNT; ++j)
{
@@ -990,7 +987,7 @@ __device__ T _doInverse(T* inOut, const T* val)
}
template <typename INTF, typename T, size_t COUNT>
-__device__ T _setInitial(T* out, const T* val)
+__device__ T _waveOpSetInitial(T* out, const T* val)
{
for (size_t j = 0; j < COUNT; ++j)
{
@@ -999,14 +996,13 @@ __device__ T _setInitial(T* out, const T* val)
}
template <typename INTF, typename T, size_t COUNT>
-__device__ T _wavePrefixInvertableMultiple(T* val)
+__device__ T _wavePrefixInvertableMultiple(T* val, const int mask)
{
- const int mask = __activemask();
const int offsetSize = _waveCalcPow2Offset(mask);
const int laneId = _getLaneId();
T originalVal[COUNT];
- _copy<INTF, T, COUNT>(originalVal, val);
+ _waveOpCopy<INTF, T, COUNT>(originalVal, val);
if (offsetSize > 0)
{
@@ -1027,11 +1023,11 @@ __device__ T _wavePrefixInvertableMultiple(T* val)
}
}
// Remove originalVal from the result, by applyin inverse
- _doInverse<INTF, T, COUNT>(val, originalVal);
+ _waveOpDoInverse<INTF, T, COUNT>(val, originalVal);
}
else
{
- _setInitial<INTF, T, COUNT>(val, val);
+ _waveOpSetInitial<INTF, T, COUNT>(val, val);
if (!_waveIsSingleLane(mask))
{
int remaining = mask;
@@ -1058,16 +1054,15 @@ __device__ T _wavePrefixInvertableMultiple(T* val)
}
template <typename INTF, typename T, size_t COUNT>
-__device__ T _wavePrefixMultiple(T* val)
+__device__ T _wavePrefixMultiple(T* val, const int mask)
{
- const int mask = __activemask();
const int offsetSize = _waveCalcPow2Offset(mask);
const int laneId = _getLaneId();
T work[COUNT];
- _copy<INTF, T, COUNT>(work, val);
- _setInitial<INTF, T, COUNT>(val, val);
+ _waveOpCopy<INTF, T, COUNT>(work, val);
+ _waveOpSetInitial<INTF, T, COUNT>(val, val);
if (offsetSize > 0)
{
@@ -1113,29 +1108,89 @@ __device__ T _wavePrefixMultiple(T* val)
}
}
}
-
+
template <typename T>
-__inline__ __device__ T _wavePrefixProduct(T val) { return _wavePrefixScalar<WaveOpMul<T>, T>(val); }
+__inline__ __device__ T _wavePrefixProduct(T val, const int mask = _getPrefixMask()) { return _wavePrefixScalar<WaveOpMul<T>, T>(val, mask); }
template <typename T>
-__inline__ __device__ T _wavePrefixSum(T val) { return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(val); }
+__inline__ __device__ T _wavePrefixSum(T val, const int mask = _getPrefixMask()) { return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(val, mask); }
+
+template <typename T>
+__inline__ __device__ T _wavePrefixXor(T val, const int mask = _getPrefixMask()) { return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(val, mask); }
+
+template <typename T>
+__inline__ __device__ T _wavePrefixOr(T val, const int mask = _getPrefixMask()) { return _wavePrefixScalar<WaveOpOr<T>, T>(val, mask); }
+
+template <typename T>
+__inline__ __device__ T _wavePrefixAnd(T val, const int mask = _getPrefixMask()) { return _wavePrefixScalar<WaveOpAnd<T>, T>(val, mask); }
+
template <typename T>
-__inline__ __device__ T _wavePrefixProductMultiple(T val)
+__inline__ __device__ T _wavePrefixProductMultiple(T val, const int mask = _getPrefixMask())
{
typedef typename ElementTypeTrait<T>::Type ElemType;
- _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val);
+ _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val, mask);
return val;
}
template <typename T>
-__inline__ __device__ T _wavePrefixSumMultiple(T val)
+__inline__ __device__ T _wavePrefixSumMultiple(T val, const int mask = _getPrefixMask())
{
typedef typename ElementTypeTrait<T>::Type ElemType;
- _wavePrefixMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val);
+ _wavePrefixInvertableMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val, mask);
return val;
}
+template <typename T>
+__inline__ __device__ T _wavePrefixXorMultiple(T val, const int mask = _getPrefixMask())
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixInvertableMultiple<WaveOpXor<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val, mask);
+ return val;
+}
+
+template <typename T>
+__inline__ __device__ T _wavePrefixOrMultiple(T val, const int mask = _getPrefixMask())
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpOr<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val, mask);
+ return val;
+}
+
+template <typename T>
+__inline__ __device__ T _wavePrefixAndMultiple(T val, const int mask = _getPrefixMask())
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpAnd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)&val, mask);
+ return val;
+}
+
+template <typename T>
+__inline__ __device__ uint4 _waveMatchScalar(T val)
+{
+ // __match_all_sync synchronizes so can use __activemask()
+ const int mask = __activemask();
+ int pred;
+ return make_uint4(__match_all_sync(mask, val, &pred), 0, 0, 0);
+}
+
+template <typename T>
+__inline__ __device__ uint4 _waveMatchMultiple(const T& inVal)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ const size_t count = sizeof(T) / sizeof(ElemType);
+ // __match_all_sync synchronizes so can use __activemask()
+ const int mask = __activemask();
+ int pred;
+ const ElemType* src = (const ElemType*)&inVal;
+ uint matchBits = 0xffffffff;
+ for (size_t i = 0; i < count && matchBits; ++i)
+ {
+ matchBits = matchBits & __match_all_sync(mask, src[i], &pred);
+ }
+ return make_uint4(matchBits, 0, 0, 0);
+}
+
/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */