diff options
| author | Darren Wihandi <65404740+fairywreath@users.noreply.github.com> | 2025-05-25 12:58:08 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-05-25 09:58:08 -0700 |
| commit | 0476b57faad96bee61f59f27ddd48c6cb067cfa2 (patch) | |
| tree | d3fe49cd906c29b03b2a840dd2c057ccc331b4f7 /tests | |
| parent | 554be7a5f990df19a21db10b4e5dc0285cbe8168 (diff) | |
Add full support for SPV_NV_shader_subgroup_partitioned (#7103)
* Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics
* More partitioned intrinsics
* More partitioned intrinsics and cleaned up non-prefixed WaveMask* implementations
* Refactor HLSL WaveMultiPrefix* implementations
* fix cap atoms
* Clean up implementation
* Add GLSL intrinsics and cleanup
* Add tests
* Fix affected capability test
* Update and fix tests
* Move expected.txt file
* Refactor WaveMask* to call WaveMulti*
* Refactor SPIRV/GLSL preamble code
* Enable emit-via-glsl tests
* remove wave_multi_prefix capability in favor of subgroup_partitioned
* Update docs
* Update cap atoms doc
Diffstat (limited to 'tests')
17 files changed, 981 insertions, 13 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang index 8a47c5733..da94ad794 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -26,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) const WaveMask mask2 = mask0 & ~mask1; outputBuffer[idx] = WaveMaskProduct(mask2, idx); -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang index 3dd33f150..3a1c26f8e 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -30,4 +30,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) value = WaveMaskMin(mask2, idx + 1); outputBuffer[idx] = value; -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang index f333a59fb..fb5573bd1 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //DISABLE_TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -37,4 +37,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) matrix<int, 2, 2> r = r0 + matrix<int, 2, 2>(r1) + r6; outputBuffer[idx] = r[0][0] + r[0][1] + r[1][0] + r[1][1]; -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang index b12e9c1b3..e32524b1e 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -25,4 +25,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) outputBuffer[idx] = r0 + (r2 << 16); -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang index 51e9b7600..2e0fba746 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -23,4 +23,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) int r2 = int(r1.x) + int(r1.y) - idx; outputBuffer[idx] = r0 + (r2 << 16); -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang index b1f44f4fb..7c326e0f3 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -29,4 +29,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) int2 r = r0 + int2(r1) + r2 + r3 + r4; outputBuffer[idx] = r.x + r.y; -}
\ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang new file mode 100644 index 000000000..c2a292c14 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang @@ -0,0 +1,139 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedAnd subgroupPartitionedAndNV +#define __partitionedOr subgroupPartitionedOrNV +#define __partitionedXor subgroupPartitionedXorNV +#else +#define __partitionedAnd WaveMultiBitAnd +#define __partitionedOr WaveMultiBitOr +#define __partitionedXor WaveMultiBitXor +#endif + +static uint gAndValue = 0; +static uint gOrValue = 0; +static uint gOrResult = 0; +static uint gXorValue = 0; +static uint gXorResult = 0; + +__generic<T : __BuiltinLogicalType> +bool test1Bitwise(uint4 mask) +{ + let andValue = T(gAndValue); + let orValue = T(gOrValue); + let orResult = T(gOrResult); + let xorValue = T(gXorValue); + let xorResult = T(gXorResult); + + return true + & (__partitionedAnd(andValue, mask) == andValue) + & (__partitionedOr(orValue, mask) == orResult) + & (__partitionedXor(xorValue, mask) == xorResult) + ; +} + +__generic<T : __BuiltinLogicalType, let N : int> +bool testVBitwise(uint4 mask) { + typealias GVec = vector<T, N>; + + let andValue = GVec(T(gAndValue)); + let orValue = GVec(T(gOrValue)); + let orResult = 
GVec(T(gOrResult)); + let xorValue = GVec(T(gXorValue)); + let xorResult = GVec(T(gXorResult)); + + return true + & all(__partitionedAnd(andValue, mask) == andValue) + & all(__partitionedOr(orValue, mask) == orResult) + & all(__partitionedXor(xorValue, mask) == xorResult) + ; +} + +bool testBitwise(uint4 mask) +{ + return true + & test1Bitwise<int>(mask) + & testVBitwise<int, 2>(mask) + & testVBitwise<int, 3>(mask) + & testVBitwise<int, 4>(mask) + & test1Bitwise<uint>(mask) + & testVBitwise<uint, 2>(mask) + & testVBitwise<uint, 3>(mask) + & testVBitwise<uint, 4>(mask) + + // TODO: these are failing SPIRV validation and should be fixed. + // SPIRV's ops do not directly accept/return bool. + // & test1Bitwise<bool>(mask) + // & testVBitwise<bool, 2>(mask) + // & testVBitwise<bool, 3>(mask) + // & testVBitwise<bool, 4>(mask) + +#if !defined(CUDA) + & test1Bitwise<int8_t>(mask) + & testVBitwise<int8_t, 2>(mask) + & testVBitwise<int8_t, 3>(mask) + & testVBitwise<int8_t, 4>(mask) + & test1Bitwise<int16_t>(mask) + & testVBitwise<int16_t, 2>(mask) + & testVBitwise<int16_t, 3>(mask) + & testVBitwise<int16_t, 4>(mask) + & test1Bitwise<int64_t>(mask) + & testVBitwise<int64_t, 2>(mask) + & testVBitwise<int64_t, 3>(mask) + & testVBitwise<int64_t, 4>(mask) + & test1Bitwise<uint8_t>(mask) + & testVBitwise<uint8_t, 2>(mask) + & testVBitwise<uint8_t, 3>(mask) + & testVBitwise<uint8_t, 4>(mask) + & test1Bitwise<uint16_t>(mask) + & testVBitwise<uint16_t, 2>(mask) + & testVBitwise<uint16_t, 3>(mask) + & testVBitwise<uint16_t, 4>(mask) + & test1Bitwise<uint64_t>(mask) + & testVBitwise<uint64_t, 2>(mask) + & testVBitwise<uint64_t, 3>(mask) + & testVBitwise<uint64_t, 4>(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + let isSecondGroup = index >= 15; + let mask = isSecondGroup ? 
uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + // One invocation in second group is different from others to test or and xor operations. + let isOrSet = (index == 15); + + gAndValue = isSecondGroup ? uint(1) : uint(0); + gOrValue = isOrSet ? uint(1) : uint(0); + gOrResult = isSecondGroup ? uint(1) : uint(0); + + // Alternate 0s and 1s for xor. + gXorValue = (index % 2 == 0) ? uint(0) : uint(1); + if (isOrSet) + { + // This is in second group - disrupt the alternating sequence. + gXorValue = uint(0); + } + gXorResult = isSecondGroup ? uint(0) : uint(1); + + bool result = true + & testBitwise(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang new file mode 100644 index 000000000..419ffecc5 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang @@ -0,0 +1,127 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedMin subgroupPartitionedMinNV +#define __partitionedMax subgroupPartitionedMaxNV +#else +#define __partitionedMin WaveMultiMin +#define __partitionedMax WaveMultiMax +#endif + + +static uint gMinResult = 0; +static uint gMaxResult = 0; +static uint gMinMaxValue = 0; + +__generic<T : __BuiltinArithmeticType> +bool test1MinMax(uint4 mask) +{ + let minResult = 
T(gMinResult); + let maxResult = T(gMaxResult); + let minMaxValue = T(gMinMaxValue); + + return true + & all(__partitionedMin(minMaxValue, mask) == minResult) + & all(__partitionedMax(minMaxValue, mask) == maxResult) + ; +} + +__generic<T : __BuiltinArithmeticType, let N : int> +bool testVMinMax(uint4 mask) { + typealias GVec = vector<T, N>; + + let minResult = GVec(T(gMinResult)); + let maxResult = GVec(T(gMaxResult)); + let minMaxValue = GVec(T(gMinMaxValue)); + + return true + & all(__partitionedMin(minMaxValue, mask) == minResult) + & all(__partitionedMax(minMaxValue, mask) == maxResult) + ; +} + +bool testMinMax(uint4 mask) +{ + return true + & test1MinMax<int>(mask) + & testVMinMax<int, 2>(mask) + & testVMinMax<int, 3>(mask) + & testVMinMax<int, 4>(mask) + & test1MinMax<uint>(mask) + & testVMinMax<uint, 2>(mask) + & testVMinMax<uint, 3>(mask) + & testVMinMax<uint, 4>(mask) + & test1MinMax<float>(mask) + & testVMinMax<float, 2>(mask) + & testVMinMax<float, 3>(mask) + & testVMinMax<float, 4>(mask) + & test1MinMax<double>(mask) + & testVMinMax<double, 2>(mask) + & testVMinMax<double, 3>(mask) + & testVMinMax<double, 4>(mask) + +#if !defined(CUDA) + & test1MinMax<int8_t>(mask) + & testVMinMax<int8_t, 2>(mask) + & testVMinMax<int8_t, 3>(mask) + & testVMinMax<int8_t, 4>(mask) + & test1MinMax<int16_t>(mask) + & testVMinMax<int16_t, 2>(mask) + & testVMinMax<int16_t, 3>(mask) + & testVMinMax<int16_t, 4>(mask) + & test1MinMax<int64_t>(mask) + & testVMinMax<int64_t, 2>(mask) + & testVMinMax<int64_t, 3>(mask) + & testVMinMax<int64_t, 4>(mask) + & test1MinMax<uint8_t>(mask) + & testVMinMax<uint8_t, 2>(mask) + & testVMinMax<uint8_t, 3>(mask) + & testVMinMax<uint8_t, 4>(mask) + & test1MinMax<uint16_t>(mask) + & testVMinMax<uint16_t, 2>(mask) + & testVMinMax<uint16_t, 3>(mask) + & testVMinMax<uint16_t, 4>(mask) + & test1MinMax<uint64_t>(mask) + & testVMinMax<uint64_t, 2>(mask) + & testVMinMax<uint64_t, 3>(mask) + & testVMinMax<uint64_t, 4>(mask) + & test1MinMax<half>(mask) + 
& testVMinMax<half, 2>(mask) + & testVMinMax<half, 3>(mask) + & testVMinMax<half, 4>(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + // Set min value on one invocation on each partition/mask. + let isMinInvocation = (index == 0) || (index == 15); + + gMinResult = isSecondGroup ? uint(2) : uint(0); + gMaxResult = isSecondGroup ? uint(3) : uint(1); + gMinMaxValue = isMinInvocation ? gMinResult : gMaxResult; + + bool result = true + && testMinMax(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang new file mode 100644 index 000000000..bb1182e5e --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang @@ -0,0 +1,163 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveAnd subgroupPartitionedInclusiveAndNV +#define 
__partitionedInclusiveOr subgroupPartitionedInclusiveOrNV +#define __partitionedInclusiveXor subgroupPartitionedInclusiveXorNV +#define __partitionedExclusiveAnd subgroupPartitionedExclusiveAndNV +#define __partitionedExclusiveOr subgroupPartitionedExclusiveOrNV +#define __partitionedExclusiveXor subgroupPartitionedExclusiveXorNV +#else +#define __partitionedInclusiveAnd WaveMultiPrefixInclusiveBitAnd +#define __partitionedInclusiveOr WaveMultiPrefixInclusiveBitOr +#define __partitionedInclusiveXor WaveMultiPrefixInclusiveBitXor +#define __partitionedExclusiveAnd WaveMultiPrefixExclusiveBitAnd +#define __partitionedExclusiveOr WaveMultiPrefixExclusiveBitOr +#define __partitionedExclusiveXor WaveMultiPrefixExclusiveBitXor +#endif + + +static uint gAndValue = 0; +static uint gAndResultExclusive = 0; +static uint gOrValue = 0; +static uint gOrResult = 0; +static uint gXorValue = 0; +static uint gXorResultInclusive = 0; +static uint gXorResultExclusive = 0; + +__generic<T : __BuiltinLogicalType> +bool test1Bitwise(uint4 mask) +{ + let andValue = T(gAndValue); + let orValue = T(gOrValue); + let xorValue = T(gXorValue); + + return true + & (__partitionedInclusiveAnd(andValue, mask) == andValue) + & (__partitionedExclusiveAnd(andValue, mask) == T(gAndResultExclusive)) + & (__partitionedInclusiveOr(orValue, mask) == orValue) + & (__partitionedExclusiveOr(orValue, mask) == T(0)) + & (__partitionedInclusiveXor(xorValue, mask) == T(gXorResultInclusive)) + & (__partitionedExclusiveXor(xorValue, mask) == T(gXorResultExclusive)) + ; +} + +__generic<T : __BuiltinLogicalType, let N : int> +bool testVBitwise(uint4 mask) { + typealias GVec = vector<T, N>; + + let andValue = GVec(T(gAndValue)); + let orValue = GVec(T(gOrValue)); + let xorValue = GVec(T(gXorValue)); + + return true + & all(__partitionedInclusiveAnd(andValue, mask) == andValue) + & all(__partitionedExclusiveAnd(andValue, mask) == GVec(T(gAndResultExclusive))) + & all(__partitionedInclusiveOr(orValue, mask) == orValue) 
+ & all(__partitionedExclusiveOr(orValue, mask) == GVec(T(0))) + & all(__partitionedInclusiveXor(xorValue, mask) == GVec(T(gXorResultInclusive))) + & all(__partitionedExclusiveXor(xorValue, mask) == GVec(T(gXorResultExclusive))) + ; +} + +bool testBitwise(uint4 mask) +{ + return true + & test1Bitwise<int>(mask) + & testVBitwise<int, 2>(mask) + & testVBitwise<int, 3>(mask) + & testVBitwise<int, 4>(mask) + & test1Bitwise<uint>(mask) + & testVBitwise<uint, 2>(mask) + & testVBitwise<uint, 3>(mask) + & testVBitwise<uint, 4>(mask) + + // TODO: these are failing SPIRV validation and should be fixed. + // SPIRV's ops do not directly accept/return bool. + // & test1Bitwise<bool>(mask) + // & testVBitwise<bool, 2>(mask) + // & testVBitwise<bool, 3>(mask) + // & testVBitwise<bool, 4>(mask) + +#if defined(VK) + & test1Bitwise<int8_t>(mask) + & testVBitwise<int8_t, 2>(mask) + & testVBitwise<int8_t, 3>(mask) + & testVBitwise<int8_t, 4>(mask) + & test1Bitwise<uint8_t>(mask) + & testVBitwise<uint8_t, 2>(mask) + & testVBitwise<uint8_t, 3>(mask) + & testVBitwise<uint8_t, 4>(mask) +#endif + +#if !defined(CUDA) + & test1Bitwise<int16_t>(mask) + & testVBitwise<int16_t, 2>(mask) + & testVBitwise<int16_t, 3>(mask) + & testVBitwise<int16_t, 4>(mask) + & test1Bitwise<int64_t>(mask) + & testVBitwise<int64_t, 2>(mask) + & testVBitwise<int64_t, 3>(mask) + & testVBitwise<int64_t, 4>(mask) + & test1Bitwise<uint16_t>(mask) + & testVBitwise<uint16_t, 2>(mask) + & testVBitwise<uint16_t, 3>(mask) + & testVBitwise<uint16_t, 4>(mask) + & test1Bitwise<uint64_t>(mask) + & testVBitwise<uint64_t, 2>(mask) + & testVBitwise<uint64_t, 3>(mask) + & testVBitwise<uint64_t, 4>(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + let isSecondGroup = index >= 15; + let mask = isSecondGroup ? 
uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + let isLastInvocation = (index == 31); + let isLastInPartition = (index == 14) || (index == 31); + let isFirstInPartition = (index == 0) || (index == 15); + + // + // Prefix and. + // - Both groups use 1 except for the last invocation in each partition where input is 0. + // - For inclusive ops, result is 1 except for last invocation in each partition. + // - For exclusive ops, the first invocation in each partition always yields ~0 (the identity); every other invocation yields 1. + gAndValue = isLastInPartition ? uint(0) : uint(1); + gAndResultExclusive = isFirstInPartition ? uint(~0) : uint(1); + + // + // Prefix or. + // - Both groups use 0 except for the last invocation in each partition where input is 1. + // - For inclusive ops, result is 0 except for last invocation in each partition. + // - For exclusive ops, result is always 0. + gOrValue = isLastInPartition ? uint(1) : uint(0); + + // Prefix xor. + // - First group input is always 1. Inclusive results alternate between 1 and 0, starting at 1. Exclusive results also alternate but start at 0 (the opposite of the inclusive result). + // - Second group input is always 0. Results are all 0. + gXorValue = isSecondGroup ? uint(0) : uint(1); + gXorResultInclusive = (isSecondGroup || (index % 2 != 0)) ? uint(0) : uint(1); + gXorResultExclusive = isSecondGroup ? 
uint(0) : (uint(1) - gXorResultInclusive); + + bool result = true + & testBitwise(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang new file mode 100644 index 000000000..654fd6130 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang @@ -0,0 +1,144 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveMax subgroupPartitionedInclusiveMaxNV +#define __partitionedExclusiveMax subgroupPartitionedExclusiveMaxNV +#else +#define __partitionedInclusiveMax WaveMultiPrefixInclusiveMax +#define __partitionedExclusiveMax WaveMultiPrefixExclusiveMax +#endif + +static bool isFirstInPartition = false; +static uint gSmaller = 0; +static uint gLarger = 0; +static uint gMaxValue = 0; + +__generic<T : __BuiltinArithmeticType> +bool test1MinMax(uint4 mask) +{ + let smaller = T(gSmaller); + let maxValue = T(gMaxValue); + + // The larger values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & (__partitionedExclusiveMax(maxValue, mask) == smaller) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. 
+ if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & (__partitionedInclusiveMax(maxValue, mask) == maxValue) + & exclusiveRes + ; +} +// Vector variant of test1MinMax: runs the same inclusive/exclusive prefix-max checks on vector<T, N> values whose components are all set to the scalar inputs. +__generic<T : __BuiltinArithmeticType, let N : int> +bool testVMinMax(uint4 mask) { + typealias GVec = vector<T, N>; + + let smaller = GVec(T(gSmaller)); + let maxValue = GVec(T(gMaxValue)); + + // The larger values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & all(__partitionedExclusiveMax(maxValue, mask) == smaller) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. + if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & all(__partitionedInclusiveMax(maxValue, mask) == maxValue) + & exclusiveRes + ; +} + +bool testMinMax(uint4 mask) +{ + return true + & test1MinMax<int>(mask) + & testVMinMax<int, 2>(mask) + & testVMinMax<int, 3>(mask) + & testVMinMax<int, 4>(mask) + & test1MinMax<uint>(mask) + & testVMinMax<uint, 2>(mask) + & testVMinMax<uint, 3>(mask) + & testVMinMax<uint, 4>(mask) + & test1MinMax<float>(mask) + & testVMinMax<float, 2>(mask) + & testVMinMax<float, 3>(mask) + & testVMinMax<float, 4>(mask) + & test1MinMax<double>(mask) + & testVMinMax<double, 2>(mask) + & testVMinMax<double, 3>(mask) + & testVMinMax<double, 4>(mask) + & test1MinMax<int8_t>(mask) + & testVMinMax<int8_t, 2>(mask) + & testVMinMax<int8_t, 3>(mask) + & testVMinMax<int8_t, 4>(mask) + & test1MinMax<int16_t>(mask) + & testVMinMax<int16_t, 2>(mask) + & testVMinMax<int16_t, 3>(mask) + & testVMinMax<int16_t, 4>(mask) + & test1MinMax<int64_t>(mask) + & testVMinMax<int64_t, 2>(mask) + & testVMinMax<int64_t, 3>(mask) + & testVMinMax<int64_t, 4>(mask) + & test1MinMax<uint8_t>(mask) + & testVMinMax<uint8_t, 2>(mask) + & testVMinMax<uint8_t, 3>(mask) + & testVMinMax<uint8_t, 4>(mask) + & 
test1MinMax<uint16_t>(mask) + & testVMinMax<uint16_t, 2>(mask) + & testVMinMax<uint16_t, 3>(mask) + & testVMinMax<uint16_t, 4>(mask) + & test1MinMax<uint64_t>(mask) + & testVMinMax<uint64_t, 2>(mask) + & testVMinMax<uint64_t, 3>(mask) + & testVMinMax<uint64_t, 4>(mask) + & test1MinMax<half>(mask) + & testVMinMax<half, 2>(mask) + & testVMinMax<half, 3>(mask) + & testVMinMax<half, 4>(mask) + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +[MaximallyReconverges] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + isFirstInPartition = (index == 0) || (index == 15); + let isLastInPartition = (index == 14) || (index == 31); + + gSmaller = isSecondGroup ? 2 : 0; + gLarger = isSecondGroup ? 3 : 1; + gMaxValue = isLastInPartition ? 
gLarger : gSmaller; + + bool result = true + & testMinMax(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang new file mode 100644 index 000000000..68e1e9c05 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang @@ -0,0 +1,144 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveMin subgroupPartitionedInclusiveMinNV +#define __partitionedExclusiveMin subgroupPartitionedExclusiveMinNV +#else +#define __partitionedInclusiveMin WaveMultiPrefixInclusiveMin +#define __partitionedExclusiveMin WaveMultiPrefixExclusiveMin +#endif + +static bool isFirstInPartition = false; +static uint gSmaller = 0; +static uint gLarger = 0; +static uint gMaxValue = 0; + +__generic<T : __BuiltinArithmeticType> +bool test1Min(uint4 mask) +{ + let larger = T(gLarger); + let minValue = T(gMaxValue); + + // The smaller values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & (__partitionedExclusiveMin(minValue, mask) == larger) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. 
+ if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & (__partitionedInclusiveMin(minValue, mask) == minValue) + & exclusiveRes + ; +} + +__generic<T : __BuiltinArithmeticType, let N : int> +bool testVMin(uint4 mask) { + typealias GVec = vector<T, N>; + + let larger = GVec(T(gLarger)); + let minValue = GVec(T(gMaxValue)); + + // The smaller values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & all(__partitionedExclusiveMin(minValue, mask) == larger) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. + if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & all(__partitionedInclusiveMin(minValue, mask) == minValue) + & exclusiveRes + ; +} + +bool testMin(uint4 mask) +{ + return true + & test1Min<int>(mask) + & testVMin<int, 2>(mask) + & testVMin<int, 3>(mask) + & testVMin<int, 4>(mask) + & test1Min<uint>(mask) + & testVMin<uint, 2>(mask) + & testVMin<uint, 3>(mask) + & testVMin<uint, 4>(mask) + & test1Min<float>(mask) + & testVMin<float, 2>(mask) + & testVMin<float, 3>(mask) + & testVMin<float, 4>(mask) + & test1Min<double>(mask) + & testVMin<double, 2>(mask) + & testVMin<double, 3>(mask) + & testVMin<double, 4>(mask) + & test1Min<int8_t>(mask) + & testVMin<int8_t, 2>(mask) + & testVMin<int8_t, 3>(mask) + & testVMin<int8_t, 4>(mask) + & test1Min<int16_t>(mask) + & testVMin<int16_t, 2>(mask) + & testVMin<int16_t, 3>(mask) + & testVMin<int16_t, 4>(mask) + & test1Min<int64_t>(mask) + & testVMin<int64_t, 2>(mask) + & testVMin<int64_t, 3>(mask) + & testVMin<int64_t, 4>(mask) + & test1Min<uint8_t>(mask) + & testVMin<uint8_t, 2>(mask) + & testVMin<uint8_t, 3>(mask) + & testVMin<uint8_t, 4>(mask) + & test1Min<uint16_t>(mask) + & testVMin<uint16_t, 2>(mask) + & testVMin<uint16_t, 3>(mask) + & testVMin<uint16_t, 
4>(mask) + & test1Min<uint64_t>(mask) + & testVMin<uint64_t, 2>(mask) + & testVMin<uint64_t, 3>(mask) + & testVMin<uint64_t, 4>(mask) + & test1Min<half>(mask) + & testVMin<half, 2>(mask) + & testVMin<half, 3>(mask) + & testVMin<half, 4>(mask) + ; +} +// Entry point: the 32 lanes are split into two partitions (lanes 0-14 and 15-31); each lane writes 1 to outputBuffer when every prefix-min check passes. +[numthreads(32, 1, 1)] +[shader("compute")] +[MaximallyReconverges] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + isFirstInPartition = (index == 0) || (index == 15); + let isLastInPartition = (index == 14) || (index == 31); + // The inputs must be assigned before testMin reads them. The smaller value goes on the last lane of each partition so exclusive scans never see it (gMaxValue holds this test's per-lane input). + gSmaller = isSecondGroup ? 2 : 0; + gLarger = isSecondGroup ? 3 : 1; + gMaxValue = isLastInPartition ? gSmaller : gLarger; + + bool result = true + & testMin(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang index 69240198e..5de34b20a 100644 --- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang @@ -10,6 +10,7 @@ RWStructuredBuffer<uint> outputBuffer; [numthreads(8, 1, 1)] +[shader("compute")] void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) { uint index = int(dispatchThreadID.x); diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt index c80baa5b1..c80baa5b1 100644 --- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang 
b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
new file mode 100644
index 000000000..bb641cab1
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
@@ -0,0 +1,136 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX) // Selects the GLSL subgroupPartitioned*NV spellings; otherwise the WaveMultiPrefix* names are used.
+#define __partitionedInclusiveSum subgroupPartitionedInclusiveAddNV
+#define __partitionedInclusiveProduct subgroupPartitionedInclusiveMulNV
+#define __partitionedExclusiveSum subgroupPartitionedExclusiveAddNV
+#define __partitionedExclusiveProduct subgroupPartitionedExclusiveMulNV
+#else
+#define __partitionedInclusiveSum WaveMultiPrefixInclusiveSum
+#define __partitionedInclusiveProduct WaveMultiPrefixInclusiveProduct
+#define __partitionedExclusiveSum WaveMultiPrefixExclusiveSum
+#define __partitionedExclusiveProduct WaveMultiPrefixExclusiveProduct
+#endif
+
+static uint partitionedIndex = 0; // Lane index relative to the start of its partition; set per invocation in computeMain.
+static uint gProductValue = 0; // Product operand for this lane; computeMain sets 0 on the last lane of a partition and 1 elsewhere.
+
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask) // Scalar check of the four partitioned prefix sum/product operations for one arithmetic type T.
+{
+    let productValue = T(gProductValue);
+
+    return true
+        & (__partitionedInclusiveSum(T(1), mask) == T(partitionedIndex + 1)) // Every lane contributes 1, so the inclusive sum is the lane's 1-based position in its partition.
+        & (__partitionedInclusiveProduct(productValue, mask) == productValue) // Prefixes are all 1 until the last lane, whose own 0 makes its inclusive product 0.
+        & (__partitionedExclusiveSum(T(1), mask) == T(partitionedIndex)) // Exclusive sum counts only the lanes before this one.
+        & (__partitionedExclusiveProduct(productValue, mask) == T(1)) // The 0 sits on the last lane only, so no exclusive prefix ever includes it.
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask) { // Same four checks as test1SumProduct, applied to an N-component vector with identical components.
+    typealias GVec = vector<T, N>;
+
+    let productValue = GVec(T(gProductValue));
+
+    return true
+        & all(__partitionedInclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex + 1)))
+        & all(__partitionedInclusiveProduct(productValue, mask) == productValue)
+        & all(__partitionedExclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex)))
+        & all(__partitionedExclusiveProduct(productValue, mask) == GVec(T(1)))
+        ;
+}
+
+bool testSumProduct(uint4 mask) // Runs the scalar and 2/3/4-component vector checks for each builtin type enabled for the current target.
+{
+    return true
+        & test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask)
+
+#if defined(VK) // NOTE(review): no TEST line in this file passes -DVK, so unless the test harness defines VK itself this 8-bit section is compiled out on every target - confirm.
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+#endif
+
+#if !defined(CUDA) // Compiled out for the CUDA TEST line, which passes -DCUDA.
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) // Entry point: one invocation per lane; writes 1 to outputBuffer[index] when every check passes.
+{
+    uint index = dispatchThreadID.x;
+    partitionedIndex = index;
+    bool isSecondGroup = false; // NOTE(review): assigned below but never read in this function.
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    uint4 mask = uint4(0x0007FFF, 0, 0, 0);
+    if (index >= 15)
+    {
+        isSecondGroup = true;
+        mask = uint4(0xFFFF8000, 0, 0, 0);
+        partitionedIndex -= 15; // Rebase so the second partition's lanes are numbered from 0.
+    }
+
+    let isLastInPartition = (index == 14) || (index == 31);
+    gProductValue = isLastInPartition ? uint(0) : uint(1);
+
+    bool result = true
+        & testSumProduct(mask)
+        ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
index 99698e497..99698e497 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
new file mode 100644
index 000000000..b40b014f4
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
@@ -0,0 +1,114 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX) // Selects the GLSL subgroupPartitioned*NV spellings; otherwise the WaveMulti* names are used.
+#define __partitionedSum subgroupPartitionedAddNV
+#define __partitionedProduct subgroupPartitionedMulNV
+#else
+#define __partitionedSum WaveMultiSum
+#define __partitionedProduct WaveMultiProduct
+#endif
+
+static uint gSumResult = 0; // Expected partition-wide sum; computeMain sets it to the lane count of this lane's partition (15 or 17).
+
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask) // Scalar check: every lane contributes T(1) to the partitioned sum and product.
+{
+    let sumResult = T(gSumResult);
+
+    return true
+        & (__partitionedSum(T(1), mask) == sumResult) // Summing 1 per lane is expected to equal the partition's lane count.
+        & (__partitionedProduct(T(1), mask) == T(1)) // A product of ones is expected to stay 1.
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask) { // Same two checks as test1SumProduct, applied to an N-component vector with identical components.
+    typealias GVec = vector<T, N>;
+
+    let sumResult = GVec(T(gSumResult));
+
+    return true
+        & all(__partitionedSum(GVec(T(1)), mask) == sumResult)
+        & all(__partitionedProduct(GVec(T(1)), mask) == GVec(T(1)))
+        ;
+}
+
+bool testSumProduct(uint4 mask) // Runs the scalar and 2/3/4-component vector checks for each builtin type enabled for the current target.
+{
+    return true
+        & test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask)
+
+#if !defined(CUDA) // Compiled out for the CUDA TEST line, which passes -DCUDA.
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) // Entry point: one invocation per lane; writes 1 to outputBuffer[index] when every check passes.
+{
+    uint index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); // Bits 15..31 select the second group, bits 0..14 the first.
+    gSumResult = isSecondGroup ? uint(17) : uint(15);
+
+    bool result = true
+        & testSumProduct(mask)
+        ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/language-feature/capability/testing-framework-with-profiles.slang b/tests/language-feature/capability/testing-framework-with-profiles.slang
index 215ba887e..97ff32a9d 100644
--- a/tests/language-feature/capability/testing-framework-with-profiles.slang
+++ b/tests/language-feature/capability/testing-framework-with-profiles.slang
@@ -17,5 +17,5 @@ RWStructuredBuffer<uint> outputBuffer;
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
     // BUF: 1
-    outputBuffer[0] = WaveMaskSum(0xFF, 1);
+    outputBuffer[0] = WaveActiveSum(1);
 }
|
