diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-03-02 16:18:20 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-02 16:18:20 -0500 |
| commit | 8899c149b05def1cce626ea649012c4c974861de (patch) | |
| tree | 77e97c2997a653ba9262b32f55e9e3f37e166653 | |
| parent | b85ca6f86d46ee3c4d5784d0bd4ebc8509e2a9bd (diff) | |
Additional Wave Intrinsic Support (#1252)
* Test for some wave intrinsics.
More wave intrinsic support on CUDA.
* Use shfl_xor_sync.
* Improvements around wave intrinsics.
Fix built in integer types belong to __BuiltinIntegerType.
* Improvements and fixes around Wave intrinsics.
* Added WaveIsFirstLane test.
No longer use __wavemask_lt, as appears not available as an intrinsic.
* Small fixes to CUDA prelude.
* Add wave-active-product test.
Handle the special case for arbitray sums.
* Used macro to implement CUDA wave intrinsics.
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 216 | ||||
| -rw-r--r-- | source/core/slang-nvrtc-compiler.cpp | 3 | ||||
| -rw-r--r-- | source/slang/core.meta.slang | 8 | ||||
| -rw-r--r-- | source/slang/core.meta.slang.h | 16 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 58 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 62 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-active-product.slang | 31 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-active-product.slang.expected.txt | 16 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-is-first-lane.slang | 24 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt | 16 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave.slang | 36 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave.slang.expected.txt | 4 |
12 files changed, 421 insertions, 69 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 1938e3dc1..1ca93d9d1 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -128,36 +128,6 @@ union Union64 double d; }; -// ---------------------- Miscellaneous -------------------------------------- - -// TODO(JS): It appears that cuda does not have a simple way to get a lane index. -// -// Another approach could be... -// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK -// If that is really true another way to do this, would be for code generator to add this function -// with the [numthreads] baked in. -// -// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() -#ifndef SLANG_USE_ASM_LANE_ID - __forceinline__ __device__ uint32_t _getLaneId() -{ - // If the launch is (or I guess some multiple of the warp size) - // we try this mechanism, which is apparently faster. - return threadIdx.x & SLANG_CUDA_WARP_MASK; -} -#else -__forceinline__ __device__ uint32_t _getLaneId() -{ - // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# - // This mechanism is not the fastest way to do it, and that is why the other mechanism - // is the default. But the other mechanism relies on a launch that makes the assumption - // true. - unsigned ret; - asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} -#endif - // ----------------------------- F32 ----------------------------------------- // Unary @@ -440,6 +410,192 @@ struct RWByteAddressBuffer size_t sizeInBytes; //< Must be multiple of 4 }; + +// ---------------------- Wave -------------------------------------- + +// TODO(JS): It appears that cuda does not have a simple way to get a lane index. +// +// Another approach could be... +// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK +// If that is really true another way to do this, would be for code generator to add this function +// with the [numthreads] baked in. +// +// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() +#ifndef SLANG_USE_ASM_LANE_ID + __forceinline__ __device__ uint32_t _getLaneId() +{ + // If the launch is (or I guess some multiple of the warp size) + // we try this mechanism, which is apparently faster. + return threadIdx.x & SLANG_CUDA_WARP_MASK; +} +#else +__forceinline__ __device__ uint32_t _getLaneId() +{ + // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# + // This mechanism is not the fastest way to do it, and that is why the other mechanism + // is the default. But the other mechanism relies on a launch that makes the assumption + // true. + unsigned ret; + asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} +#endif + +// Note! Note will return true if mask is 0, but thats okay, because there must be one +// lane active to execute anything +__inline__ __device__ bool _waveIsSingleLane(int mask) +{ + return (mask & (mask - 1)) == 0; +} + +// Returns the power of 2 size of run of set bits. Returns 0 if not a suitable run. +__inline__ __device__ int _waveCalcPow2Offset(int mask) +{ + // This should be the most common case, so fast path it + if (mask == SLANG_CUDA_WARP_MASK) + { + return SLANG_CUDA_WARP_SIZE; + } + // Is it a contiguous run of bits? + if ((mask & (mask + 1)) == 0) + { + // const int offsetSize = __ffs(mask + 1) - 1; + const int offset = 32 - __clz(mask); + // Is it a power of 2 size + if ((offset & (offset - 1)) == 0) + { + return offset; + } + } + return 0; +} + +__inline__ __device__ bool _waveIsFirstLane() +{ + const int mask = __activemask(); + // We special case bit 0, as that most warps are expected to be fully active. + + // mask & -mask, isolates the lowest set bit. + //return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId())); + + // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. + return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId()); +} + +// TODO(JS): NOTE! These functions only work across all lanes. +// Special handling will be needed if only some lanes are active. + +#define SLANG_CUDA_REDUCE_OP(INIT_VALUE, OP) \ + const int offsetSize = _waveCalcPow2Offset(mask); \ + if (offsetSize > 0) \ + { \ + for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \ + { \ + val = val OP __shfl_xor_sync( mask, val, offset); \ + } \ + return val; \ + } \ + else if (_waveIsSingleLane(mask)) \ + { \ + return val; \ + } \ + else \ + { \ + int result = INIT_VALUE; \ + int remaining = mask; \ + while (remaining) \ + { \ + const int laneBit = remaining & -remaining; \ + /* Get the sourceLane */ \ + const int srcLane = __ffs(laneBit) - 1; \ + /* Broadcast (can also broadcast to self) */ \ + result = result OP __shfl_sync(mask, val, srcLane); \ + remaining &= ~laneBit; \ + } \ + return result; \ + } + +#define SLANG_CUDA_REDUCE_FUNC(INIT_VALUE, FUNC) \ + const int offsetSize = _waveCalcPow2Offset(mask); \ + if (offsetSize > 0) \ + { \ + for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \ + { \ + val = FUNC(val, __shfl_xor_sync( mask, val, offset)); \ + } \ + return val; \ + } \ + else if (_waveIsSingleLane(mask)) \ + { \ + return val; \ + } \ + else \ + { \ + int result = INIT_VALUE; \ + int remaining = mask; \ + while (remaining) \ + { \ + const int laneBit = remaining & -remaining; \ + /* Get the sourceLane */ \ + const int srcLane = __ffs(laneBit) - 1; \ + /* Broadcast (can also broadcast to self) */ \ + result = FUNC(result, __shfl_sync(mask, val, srcLane)); \ + remaining &= ~laneBit; \ + } \ + return result; \ + } + + +__inline__ __device__ int _waveOr(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, |) +} + +__inline__ __device__ int _waveAnd(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(~int(0), &) +} + +__inline__ __device__ int _waveXor(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, ^) +} + +__inline__ __device__ int _waveProduct(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(1, *) +} + +__inline__ __device__ int _waveSum(int mask, int val) +{ + SLANG_CUDA_REDUCE_OP(0, +) +} + +__inline__ __device__ int _waveMin(int mask, int val) +{ + SLANG_CUDA_REDUCE_FUNC(val, I32_min) +} + +__inline__ __device__ int _waveMax(int mask, int val) +{ + SLANG_CUDA_REDUCE_FUNC(val, I32_max) +} + +__inline__ __device__ bool _waveAllEqual(int mask, int val) +{ + int pred; + __match_all_sync(mask, val, &pred); + return pred != 0; +} + +__inline__ __device__ int _waveReadFirst(int val) +{ + const int mask = __activemask(); + const int lowestLaneId = __ffs(mask) - 1; + return __shfl_sync(mask, val, lowestLaneId); +} + + /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp index f68c4dc01..27d269125 100644 --- a/source/core/slang-nvrtc-compiler.cpp +++ b/source/core/slang-nvrtc-compiler.cpp @@ -307,6 +307,9 @@ SlangResult NVRTCDownstreamCompiler::compile(const CompileOptions& options, RefP // This is arguably too much - but nvrtc does not appear to have a mechanism to switch off individual warnings. // I tried the -Xcudafe mechanism but that does not appear to work for nvrtc cmdLine.addArg("-w"); + + // + cmdLine.addArg("-arch=compute_70"); } nvrtcProgram program = nullptr; diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index 70bc90392..6822d304b 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -132,9 +132,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt) case BaseType::Half: case BaseType::Float: case BaseType::Double: - sb << "\n , __BuiltinFloatingPointType\n"; + sb << "\n , __BuiltinFloatingPointType\n"; sb << "\n , __BuiltinRealType\n"; - ; // fall through to: + sb << "\n , __BuiltinSignedArithmeticType\n"; + sb << "\n , __BuiltinArithmeticType\n"; + sb << "\n , __BuiltinType\n"; + break; case BaseType::Int8: case BaseType::Int16: case BaseType::Int: @@ -146,6 +149,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt) case BaseType::UInt: case BaseType::UInt64: sb << "\n , __BuiltinArithmeticType\n"; + sb << "\n , __BuiltinIntegerType\n"; ; // fall through to: case BaseType::Bool: sb << "\n , __BuiltinType\n"; diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h index 4c8da2a9a..3ff1fd243 100644 --- a/source/slang/core.meta.slang.h +++ b/source/slang/core.meta.slang.h @@ -135,9 +135,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt) case BaseType::Half: case BaseType::Float: case BaseType::Double: - sb << "\n , __BuiltinFloatingPointType\n"; + sb << "\n , __BuiltinFloatingPointType\n"; sb << "\n , __BuiltinRealType\n"; - ; // fall through to: + sb << "\n , __BuiltinSignedArithmeticType\n"; + sb << "\n , __BuiltinArithmeticType\n"; + sb << "\n , __BuiltinType\n"; + break; case BaseType::Int8: case BaseType::Int16: case BaseType::Int: @@ -149,6 +152,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt) case BaseType::UInt: case BaseType::UInt64: sb << "\n , __BuiltinArithmeticType\n"; + sb << "\n , __BuiltinIntegerType\n"; ; // fall through to: case BaseType::Bool: sb << "\n , __BuiltinType\n"; @@ -195,7 +199,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt) // TODO: should this cover the full gamut of integer types? case BaseType::Int: case BaseType::UInt: -SLANG_RAW("#line 195 \"core.meta.slang\"") +SLANG_RAW("#line 199 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW(" __generic<T:__EnumType>\n") SLANG_RAW(" __init(T value);\n") @@ -211,7 +215,7 @@ SLANG_RAW(" __init(T value);\n") // Declare built-in pointer type // (eventually we can have the traditional syntax sugar for this) -SLANG_RAW("#line 210 \"core.meta.slang\"") +SLANG_RAW("#line 214 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("__generic<T>\n") @@ -273,7 +277,7 @@ sb << " __init(T value);\n"; sb << " __init(vector<T,N> value);\n"; sb << "};\n"; -SLANG_RAW("#line 256 \"core.meta.slang\"") +SLANG_RAW("#line 260 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("__generic<T = float, let R : int = 4, let C : int = 4>\n") @@ -1509,7 +1513,7 @@ for (auto op : binaryOps) sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n"; } } -SLANG_RAW("#line 1491 \"core.meta.slang\"") +SLANG_RAW("#line 1495 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("// Specialized function\n") diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index c8aae9158..edb678ad6 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1395,35 +1395,51 @@ __generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue); __generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue); __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue); -__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr); +__generic<T : __BuiltinIntegerType> +__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)") +T WaveActiveBitAnd(T expr); __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr); -__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr); +__generic<T : __BuiltinIntegerType> +__target_intrinsic(cuda, "_waveOr(__activemask(), $0)") +T WaveActiveBitOr(T expr); __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr); -__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr); +__generic<T : __BuiltinIntegerType> +__target_intrinsic(cuda, "_waveXor(__activemask(), $0)") +T WaveActiveBitXor(T expr); __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr); __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr); +__generic<T : __BuiltinArithmeticType> +__target_intrinsic(cuda, "_waveMax(__activemask(), $0)") +T WaveActiveMax(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr); +__generic<T : __BuiltinArithmeticType> +__target_intrinsic(cuda, "_waveMin(__activemask(), $0)") +T WaveActiveMin(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr); +__generic<T : __BuiltinArithmeticType> +__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)") +T WaveActiveProduct(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr); +__generic<T : __BuiltinArithmeticType> +__target_intrinsic(cuda, "_waveSum(__activemask(), $0)") +T WaveActiveSum(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr); -__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value); +__generic<T : __BuiltinType> +__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)") +bool WaveActiveAllEqual(T value); __generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value); @@ -1438,7 +1454,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T, __target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") bool WaveActiveAllTrue(bool condition); -__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)") +__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)") bool WaveActiveAnyTrue(bool condition); __target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)") @@ -1454,14 +1470,19 @@ __target_intrinsic(cuda, "_getLaneId()") uint WaveGetLaneIndex(); // If there are no *active* lanes less than this one, we must be the lowest lane -__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)") +__target_intrinsic(cuda, "_waveIsFirstLane()") bool WaveIsFirstLane(); -__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr); +// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons) +// that would mean different lanes having a different mask, and they all have to have the same mask. + +__generic<T : __BuiltinArithmeticType> +T WavePrefixProduct(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr); +__generic<T : __BuiltinArithmeticType> +T WavePrefixSum(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr); @@ -1473,11 +1494,14 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr); -__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr); +__generic<T : __BuiltinArithmeticType> +T WaveMultiPrefixBitXor(T expr); __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr); +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())") uint WavePrefixCountBits(bool value); + uint WaveMultiPrefixCountBits(bool value, uint4 mask); __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask); @@ -1488,11 +1512,15 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, uint4 mask) __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask); __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask); -__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr); +__generic<T : __BuiltinType> +__target_intrinsic(cuda, "_waveReadFirst($0)") +T WaveReadLaneFirst(T expr); __generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr); __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr); -__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane); +__generic<T : __BuiltinType> +__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)") +T WaveReadLaneAt(T value, int lane); __generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane); __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index 69349d9dc..16a3244ab 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -1471,35 +1471,51 @@ SLANG_RAW("__generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);\ SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinIntegerType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveAnd(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveBitAnd(T expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinIntegerType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveOr(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveBitOr(T expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinIntegerType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveXor(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveBitXor(T expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveMax(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveMax(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveMin(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveMin(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveProduct(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveProduct(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveSum(__activemask(), $0)\")\n") +SLANG_RAW("T WaveActiveSum(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value);\n") +SLANG_RAW("__generic<T : __BuiltinType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveAllEqual(__activemask(), $0)\")\n") +SLANG_RAW("bool WaveActiveAllEqual(T value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);\n") SLANG_RAW("\n") @@ -1514,7 +1530,7 @@ SLANG_RAW("// seems to be appropriate.\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n") SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n") -SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n") SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n") @@ -1530,14 +1546,19 @@ SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n") SLANG_RAW("uint WaveGetLaneIndex();\n") SLANG_RAW("\n") SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n") -SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveIsFirstLane()\")\n") SLANG_RAW("bool WaveIsFirstLane();\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n") +SLANG_RAW("// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)\n") +SLANG_RAW("// that would mean different lanes having a different mask, and they all have to have the same mask.\n") +SLANG_RAW("\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("T WavePrefixProduct(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("T WavePrefixSum(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);\n") SLANG_RAW("\n") @@ -1549,11 +1570,14 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr) SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n") +SLANG_RAW("T WaveMultiPrefixBitXor(T expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);\n") SLANG_RAW("\n") +SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())\")\n") SLANG_RAW("uint WavePrefixCountBits(bool value);\n") +SLANG_RAW("\n") SLANG_RAW("uint WaveMultiPrefixCountBits(bool value, uint4 mask);\n") SLANG_RAW("\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);\n") @@ -1564,11 +1588,15 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);\n") SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr);\n") +SLANG_RAW("__generic<T : __BuiltinType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"_waveReadFirst($0)\")\n") +SLANG_RAW("T WaveReadLaneFirst(T expr);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);\n") SLANG_RAW("\n") -SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane);\n") +SLANG_RAW("__generic<T : __BuiltinType>\n") +SLANG_RAW("__target_intrinsic(cuda, \"__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)\")\n") +SLANG_RAW("T WaveReadLaneAt(T value, int lane);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);\n") SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);\n") SLANG_RAW("\n") @@ -1658,7 +1686,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1585 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1613 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") @@ -1682,6 +1710,8 @@ SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x1 SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20;\n") SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_OPAQUE = 0x40;\n") SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_NON_OPAQUE = 0x80;\n") +SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_TRIANGLES = 0x100;\n") +SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES = 0x200;\n") SLANG_RAW("\n") SLANG_RAW("// 10.1.2 - Ray Description Structure\n") SLANG_RAW("\n") diff --git a/tests/hlsl-intrinsic/wave-active-product.slang b/tests/hlsl-intrinsic/wave-active-product.slang new file mode 100644 index 000000000..cacc0a539 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-active-product.slang @@ -0,0 +1,31 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +[numthreads(8, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + const int idx = int(dispatchThreadID.x); + +#if 1 + if (idx < 3) + { + // Diverge!! + outputBuffer[idx] = -1; + return; + } + outputBuffer[idx] = WaveActiveProduct(idx); +#else + + /// NOTE! Can't say I totally understand WaveActiveProduct. + /// The following returns 0x240 on CUDA - which is what I'd expect + /// On DX12, it returns 0 + + outputBuffer[idx] = WaveActiveProduct((idx & 3) + 1); +#endif +}
\ No newline at end of file diff --git a/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt b/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt new file mode 100644 index 000000000..dbe392009 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt @@ -0,0 +1,16 @@ +FFFFFFFF +FFFFFFFF +FFFFFFFF +9D8 +9D8 +9D8 +9D8 +9D8 +0 +0 +0 +0 +0 +0 +0 +0 diff --git a/tests/hlsl-intrinsic/wave-is-first-lane.slang b/tests/hlsl-intrinsic/wave-is-first-lane.slang new file mode 100644 index 000000000..39a19336d --- /dev/null +++ b/tests/hlsl-intrinsic/wave-is-first-lane.slang @@ -0,0 +1,24 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +[numthreads(8, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int idx = int(dispatchThreadID.x); + + if (idx < 3) + { + // Diverge!! + outputBuffer[idx] = -1; + return; + } + + int value = 0; + outputBuffer[idx] = WaveIsFirstLane(); +}
\ No newline at end of file diff --git a/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt b/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt new file mode 100644 index 000000000..43debbc9d --- /dev/null +++ b/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt @@ -0,0 +1,16 @@ +FFFFFFFF +FFFFFFFF +FFFFFFFF +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 diff --git a/tests/hlsl-intrinsic/wave.slang b/tests/hlsl-intrinsic/wave.slang new file mode 100644 index 000000000..bc30da4ad --- /dev/null +++ b/tests/hlsl-intrinsic/wave.slang @@ -0,0 +1,36 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int idx = int(dispatchThreadID.x); + + int value = 0; + + value |= WaveActiveAllTrue(idx < 4 ) ? 1 : 0; + value |= WaveActiveAnyTrue(idx == 2) ? 2 : 0; + value |= WaveActiveAnyTrue(idx == -1) ? 4 : 0; + value |= WaveActiveAllTrue(idx == 3) ? 8 : 0; + + int sum = WaveActiveSum(idx); + value |= (sum << 4); + + // TODO(JS): + // This result is unexpected. I expect 1 * 2 * 1 * 2 = 4. But we get 0 on DX (so disable for now). On CUDA I get 4. + // int product = WaveActiveProduct((idx & 1) + 1); + /// value |= (product << 8); + + // TODO(JS): NOTE! This only works with uint, *NOT* int on HLSL/DXC. + // We need to update the stdlib to reflect this. + uint xor = WaveActiveBitXor(uint(idx + 1)); + value |= int(xor << 12); + + outputBuffer[idx] = value; +}
\ No newline at end of file diff --git a/tests/hlsl-intrinsic/wave.slang.expected.txt b/tests/hlsl-intrinsic/wave.slang.expected.txt new file mode 100644 index 000000000..a3dff7d2d --- /dev/null +++ b/tests/hlsl-intrinsic/wave.slang.expected.txt @@ -0,0 +1,4 @@ +4063 +4063 +4063 +4063 |
