diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-03-09 12:40:04 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-09 12:40:04 -0400 |
| commit | 7e0aa9315f7f65033229c1f76d7df47ccd2da3d0 (patch) | |
| tree | 28ca885d901526ae548895f354626844d305d16f | |
| parent | b1317cd16ab9c827596a28ccf4258ef1bb672d92 (diff) | |
CUDA support for vector/matrix Wave intrinsics (#1266)
* Distinguish between __activeMask and _getConvergedMask().
Remove need to pass in mask to CUDA wave impls.
* Add support for vector/matrix Wave intrinsics for CUDA.
Fix issue with CUDA parsing of errors.
* Fix typo.
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 179 | ||||
| -rw-r--r-- | source/core/slang-nvrtc-compiler.cpp | 2 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 42 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-matrix.slang | 2 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-vector.slang | 2 |
5 files changed, 191 insertions, 36 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 90e00a631..5f0dffd5c 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -441,6 +441,33 @@ __forceinline__ __device__ uint32_t _getLaneId() } #endif +// It appears that the __activemask() cannot always be used because +// threads need to be converged. +// +// For CUDA the article claims mask has to be used carefully +// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ +// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. +// __activemask() though does not require there is convergence, so that doesn't work. +// +// '__ballot_sync' produces a convergance. +// +// From the CUDA docs: +// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the threads +// participating in the call. A bit, representing the thread's lane ID, must be set for each participating thread +// to ensure they are properly converged before the intrinsic is executed by the hardware. All active threads named +// in mask must execute the same intrinsic with the same mask, or the result is undefined.``` +// +// To get the right results we need to use the __activemask() within _ballot_sync it seems. +// +// Also note that __all_sync and __any_sync are listed with __ballot_sync. That if they have a similar synchronizing behavior +// we can use __activemask() there (instead of _getConvergedMask), because they will converge too. +__forceinline__ __device__ int _getConvergedMask() +{ + //return __activemask(); + //return __ballot_sync(SLANG_CUDA_WARP_MASK, true); + return __ballot_sync(__activemask(), true); +} + // Return mask of all the lanes less than the current lane __forceinline__ __device__ int _getLaneLtMask() { @@ -488,7 +515,6 @@ __inline__ __device__ bool _waveIsFirstLane() return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId()); } - template <typename T> struct WaveOpOr { @@ -538,24 +564,63 @@ struct WaveOpMin __inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; } }; +template <typename T> +struct ElementTypeTrait; + +// Scalar +template <> struct ElementTypeTrait<int> { typedef int Type; }; +template <> struct ElementTypeTrait<uint> { typedef uint Type; }; +template <> struct ElementTypeTrait<float> { typedef float Type; }; +template <> struct ElementTypeTrait<double> { typedef double Type; }; +template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; }; +template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; }; + +// Vector +template <> struct ElementTypeTrait<int1> { typedef int Type; }; +template <> struct ElementTypeTrait<int2> { typedef int Type; }; +template <> struct ElementTypeTrait<int3> { typedef int Type; }; +template <> struct ElementTypeTrait<int4> { typedef int Type; }; + +template <> struct ElementTypeTrait<uint1> { typedef uint Type; }; +template <> struct ElementTypeTrait<uint2> { typedef uint Type; }; +template <> struct ElementTypeTrait<uint3> { typedef uint Type; }; +template <> struct ElementTypeTrait<uint4> { typedef uint Type; }; + +template <> struct ElementTypeTrait<float1> { typedef float Type; }; +template <> struct ElementTypeTrait<float2> { typedef float Type; }; +template <> struct ElementTypeTrait<float3> { typedef float Type; }; +template <> struct ElementTypeTrait<float4> { typedef float Type; }; + +template <> struct ElementTypeTrait<double1> { typedef double Type; }; +template <> struct ElementTypeTrait<double2> { typedef double Type; }; +template <> struct ElementTypeTrait<double3> { typedef double Type; }; +template <> struct ElementTypeTrait<double4> { typedef double Type; }; + +// Matrix +template <typename T, int ROWS, int COLS> +struct ElementTypeTrait<Matrix<T, ROWS, COLS> > +{ + typedef T Type; +}; + // Scalar template <typename INTF, typename T> -__device__ T _waveReduce(int mask, T val) +__device__ T _waveReduceScalar(T val) { + // The shuffles appear to converge on set bits, so it appears ok to use __activemask() + //const int mask = _getConvergedMask(); + const int mask = __activemask(); + const int offsetSize = _waveCalcPow2Offset(mask); if (offsetSize > 0) { + // Fast path O(log2(activeLanes)) for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) { val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset)); } - return val; - } - else if (_waveIsSingleLane(mask)) - { - return val; } - else + else if (!_waveIsSingleLane(mask)) { T result = INTF::getInitial(val); int remaining = mask; @@ -570,33 +635,115 @@ __device__ T _waveReduce(int mask, T val) } return result; } + return val; +} + + +// Multiple values +template <typename INTF, typename T, size_t COUNT> +__device__ void _waveReduceMultiple(T* val) +{ + // The shuffles appear to converge on set bits, so it appears ok to use __activemask() + //const int mask = _getConvergedMask(); + const int mask = __activemask(); + + const int offsetSize = _waveCalcPow2Offset(mask); + if (offsetSize > 0) + { + // Fast path O(log2(activeLanes)) + for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) + { + for (size_t i = 0; i < COUNT; ++i) + { + val[i] = INTF::doOp(val[i], __shfl_xor_sync(mask, val[i], offset)); + } + } + } + else if (!_waveIsSingleLane(mask)) + { + // Copy the original + T originalVal[COUNT]; + for (size_t i = 0; i < COUNT; ++i) + { + const T v = val[i]; + originalVal[i] = v; + val[i] = INTF::getInitial(v); + } + + int remaining = mask; + while (remaining) + { + const int laneBit = remaining & -remaining; + /* Get the sourceLane */ + const int srcLane = __ffs(laneBit) - 1; + /* Broadcast (can also broadcast to self) */ + for (size_t i = 0; i < COUNT; ++i) + { + val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane)); + } + remaining &= ~laneBit; + } + } +} + +template <typename INTF, typename T> +__device__ void _waveReduceMultiple(T* val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)val); } template <typename T> -__inline__ __device__ T _waveOr(int mask, T val) { return _waveReduce<WaveOpOr<T>, T>(mask, val); } +__inline__ __device__ T _waveOr(T val) { return _waveReduceScalar<WaveOpOr<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveAnd(T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveXor(T val) { return _waveReduceScalar<WaveOpXor<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveProduct(T val) { return _waveReduceScalar<WaveOpMul<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveSum(T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveMin(T val) { return _waveReduceScalar<WaveOpMin<T>, T>(val); } + +template <typename T> +__inline__ __device__ T _waveMax(T val) { return _waveReduceScalar<WaveOpMax<T>, T>(val); } + + +// Multiple + +template <typename T> +__inline__ __device__ T _waveOrMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveAnd(int mask, T val) { return _waveReduce<WaveOpAnd<T>, T>(mask, val); } +__inline__ __device__ T _waveAndMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveXor(int mask, T val) { return _waveReduce<WaveOpXor<T>, T>(mask, val); } +__inline__ __device__ T _waveXorMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveProduct(int mask, T val) { return _waveReduce<WaveOpMul<T>, T>(mask, val); } +__inline__ __device__ T _waveProductMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveSum(int mask, T val) { return _waveReduce<WaveOpAdd<T>, T>(mask, val); } +__inline__ __device__ T _waveSumMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveMin(int mask, T val) { return _waveReduce<WaveOpMin<T>, T>(mask, val); } +__inline__ __device__ T _waveMinMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ T _waveMax(int mask, T val) { return _waveReduce<WaveOpMax<T>, T>(mask, val); } +__inline__ __device__ T _waveMaxMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(&val); return val; } template <typename T> -__inline__ __device__ bool _waveAllEqual(int mask, T val) +__inline__ __device__ bool _waveAllEqual(T val) { + // __match_all_sync is a synchronises so can use __activemask() + const int mask = __activemask(); int pred; __match_all_sync(mask, val, &pred); return pred != 0; diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp index 2f9944786..db4e4f32f 100644 --- a/source/core/slang-nvrtc-compiler.cpp +++ b/source/core/slang-nvrtc-compiler.cpp @@ -204,7 +204,7 @@ static SlangResult _parseNVRTCLine(const UnownedStringSlice& line, DownstreamDia StringUtil::split(line, ':', split); } - if (split.getCount() == 3) + if (split.getCount() >= 3) { // tests/cuda/cuda-compile.cu(7): warning: variable "c" is used before its value is set diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index d9e40dd4f..39cea9ba3 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2322,123 +2322,134 @@ __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)") +__target_intrinsic(cuda, "_waveAnd($0)") T WaveActiveBitAnd(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") +__target_intrinsic(cuda, "_waveAndMultiple($0)") vector<T,N> WaveActiveBitAnd(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveAndMultiple($0)") matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr); __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOr(__activemask(), $0)") +__target_intrinsic(cuda, "_waveOr($0)") T WaveActiveBitOr(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") +__target_intrinsic(cuda, "_waveOrMultiple($0)") vector<T,N> WaveActiveBitOr(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveOrMultiple($0)") matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr); __generic<T : __BuiltinIntegerType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXor(__activemask(), $0)") +__target_intrinsic(cuda, "_waveXor($0)") T WaveActiveBitXor(T expr); __generic<T : __BuiltinIntegerType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") +__target_intrinsic(cuda, "_waveXorMultiple($0)") vector<T,N> WaveActiveBitXor(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveXorMultiple($0)") matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMax(__activemask(), $0)") +__target_intrinsic(cuda, "_waveMax($0)") T WaveActiveMax(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") +__target_intrinsic(cuda, "_waveMaxMultiple($0)") vector<T,N> WaveActiveMax(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveMaxMultiple($0)") matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMin(__activemask(), $0)") +__target_intrinsic(cuda, "_waveMin($0)") T WaveActiveMin(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") +__target_intrinsic(cuda, "_waveMinMultiple($0)") vector<T,N> WaveActiveMin(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveMinMultiple($0)") matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)") +__target_intrinsic(cuda, "_waveProduct($0)") T WaveActiveProduct(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") +__target_intrinsic(cuda, "_waveProductMultiple($0)") vector<T,N> WaveActiveProduct(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveProductMultiple($0)") matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr); __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSum(__activemask(), $0)") +__target_intrinsic(cuda, "_waveSum($0)") T WaveActiveSum(T expr); __generic<T : __BuiltinArithmeticType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") +__target_intrinsic(cuda, "_waveSumMultiple($0)") vector<T,N> WaveActiveSum(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveSumMultiple($0)") matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr); __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") -__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)") +__target_intrinsic(cuda, "_waveAllEqual($0)") bool WaveActiveAllEqual(T value); __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") +__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") vector<bool,N> WaveActiveAllEqual(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value); __generic<T : __BuiltinType> uint4 WaveMatch(T value); __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value); -// TODO(JS): For CUDA the article claims mask has to be used carefully -// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ -// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask() -// seems to be appropriate. - __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAll($0)") @@ -2483,9 +2494,6 @@ __target_intrinsic(glsl, "subgroupElect()") __target_intrinsic(cuda, "_waveIsFirstLane()") bool WaveIsFirstLane(); -// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons) -// that would mean different lanes having a different mask, and they all have to have the same mask. - __generic<T : __BuiltinArithmeticType> __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -2585,7 +2593,7 @@ __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)") +__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)") T WaveReadLaneAt(T value, int lane); __generic<T : __BuiltinType, let N : int> __spirv_version(1.3) diff --git a/tests/hlsl-intrinsic/wave-matrix.slang b/tests/hlsl-intrinsic/wave-matrix.slang index 022182164..b5af69f5d 100644 --- a/tests/hlsl-intrinsic/wave-matrix.slang +++ b/tests/hlsl-intrinsic/wave-matrix.slang @@ -2,7 +2,7 @@ //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 //DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer RWStructuredBuffer<int> outputBuffer; diff --git a/tests/hlsl-intrinsic/wave-vector.slang b/tests/hlsl-intrinsic/wave-vector.slang index 808f0c5f6..8d2868600 100644 --- a/tests/hlsl-intrinsic/wave-vector.slang +++ b/tests/hlsl-intrinsic/wave-vector.slang @@ -2,7 +2,7 @@ //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer RWStructuredBuffer<int> outputBuffer; |
