summaryrefslogtreecommitdiffstats
path: root/tests
diff options
context:
space:
mode:
authorDarren Wihandi <65404740+fairywreath@users.noreply.github.com>2025-05-25 12:58:08 -0400
committerGitHub <noreply@github.com>2025-05-25 09:58:08 -0700
commit0476b57faad96bee61f59f27ddd48c6cb067cfa2 (patch)
treed3fe49cd906c29b03b2a840dd2c057ccc331b4f7 /tests
parent554be7a5f990df19a21db10b4e5dc0285cbe8168 (diff)
Add full support for SPV_NV_shader_subgroup_partitioned (#7103)
* Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics * More partitioned intrinsics * More partitioned intrinsics and cleaned up non-prefixed WaveMask* implementations * Refactor HLSL WaveMultiPrefix* implementations * fix cap atoms * Clean up implementation * Add GLSL intrinsics and cleanup * Add tests * Fix affected capability test * Update and fix tests * Move expected.txt file * Refactor WaveMask* to call WaveMulti* * Refactor SPIRV/GLSL preamble code * Enable emit-via-glsl tests * remove wave_multi_prefix capability in favor of subgroup_partitioned * Update docs * Update cap atoms doc
Diffstat (limited to 'tests')
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-active-product.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-diverge.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-matrix.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-mask/wave-vector.slang4
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang139
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang127
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang163
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang144
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang144
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang (renamed from tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang)1
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt (renamed from tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt)0
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang136
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang (renamed from tests/hlsl-intrinsic/wave-multi-prefix.slang)0
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang114
-rw-r--r--tests/language-feature/capability/testing-framework-with-profiles.slang2
17 files changed, 981 insertions, 13 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
index 8a47c5733..da94ad794 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -26,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
const WaveMask mask2 = mask0 & ~mask1;
outputBuffer[idx] = WaveMaskProduct(mask2, idx);
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
index 3dd33f150..3a1c26f8e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -30,4 +30,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
value = WaveMaskMin(mask2, idx + 1);
outputBuffer[idx] = value;
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
index f333a59fb..fb5573bd1 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
//DISABLE_TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -37,4 +37,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
matrix<int, 2, 2> r = r0 + matrix<int, 2, 2>(r1) + r6;
outputBuffer[idx] = r[0][0] + r[0][1] + r[1][0] + r[1][1];
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
index b12e9c1b3..e32524b1e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -25,4 +25,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
outputBuffer[idx] = r0 + (r2 << 16);
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
index 51e9b7600..2e0fba746 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -23,4 +23,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
int r2 = int(r1.x) + int(r1.y) - idx;
outputBuffer[idx] = r0 + (r2 << 16);
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
index b1f44f4fb..7c326e0f3 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
@@ -29,4 +29,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
int2 r = r0 + int2(r1) + r2 + r3 + r4;
outputBuffer[idx] = r.x + r.y;
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
new file mode 100644
index 000000000..c2a292c14
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
@@ -0,0 +1,139 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedAnd subgroupPartitionedAndNV
+#define __partitionedOr subgroupPartitionedOrNV
+#define __partitionedXor subgroupPartitionedXorNV
+#else
+#define __partitionedAnd WaveMultiBitAnd
+#define __partitionedOr WaveMultiBitOr
+#define __partitionedXor WaveMultiBitXor
+#endif
+
+static uint gAndValue = 0;
+static uint gOrValue = 0;
+static uint gOrResult = 0;
+static uint gXorValue = 0;
+static uint gXorResult = 0;
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask)
+{
+ let andValue = T(gAndValue);
+ let orValue = T(gOrValue);
+ let orResult = T(gOrResult);
+ let xorValue = T(gXorValue);
+ let xorResult = T(gXorResult);
+
+ return true
+ & (__partitionedAnd(andValue, mask) == andValue)
+ & (__partitionedOr(orValue, mask) == orResult)
+ & (__partitionedXor(xorValue, mask) == xorResult)
+ ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) {
+ typealias GVec = vector<T, N>;
+
+ let andValue = GVec(T(gAndValue));
+ let orValue = GVec(T(gOrValue));
+ let orResult = GVec(T(gOrResult));
+ let xorValue = GVec(T(gXorValue));
+ let xorResult = GVec(T(gXorResult));
+
+ return true
+ & all(__partitionedAnd(andValue, mask) == andValue)
+ & all(__partitionedOr(orValue, mask) == orResult)
+ & all(__partitionedXor(xorValue, mask) == xorResult)
+ ;
+}
+
+bool testBitwise(uint4 mask)
+{
+ return true
+ & test1Bitwise<int>(mask)
+ & testVBitwise<int, 2>(mask)
+ & testVBitwise<int, 3>(mask)
+ & testVBitwise<int, 4>(mask)
+ & test1Bitwise<uint>(mask)
+ & testVBitwise<uint, 2>(mask)
+ & testVBitwise<uint, 3>(mask)
+ & testVBitwise<uint, 4>(mask)
+
+ // TODO: these are failing SPIRV validation and should be fixed.
+ // SPIRV's ops do not directly accept/return bool.
+ // & test1Bitwise<bool>(mask)
+ // & testVBitwise<bool, 2>(mask)
+ // & testVBitwise<bool, 3>(mask)
+ // & testVBitwise<bool, 4>(mask)
+
+#if !defined(CUDA)
+ & test1Bitwise<int8_t>(mask)
+ & testVBitwise<int8_t, 2>(mask)
+ & testVBitwise<int8_t, 3>(mask)
+ & testVBitwise<int8_t, 4>(mask)
+ & test1Bitwise<int16_t>(mask)
+ & testVBitwise<int16_t, 2>(mask)
+ & testVBitwise<int16_t, 3>(mask)
+ & testVBitwise<int16_t, 4>(mask)
+ & test1Bitwise<int64_t>(mask)
+ & testVBitwise<int64_t, 2>(mask)
+ & testVBitwise<int64_t, 3>(mask)
+ & testVBitwise<int64_t, 4>(mask)
+ & test1Bitwise<uint8_t>(mask)
+ & testVBitwise<uint8_t, 2>(mask)
+ & testVBitwise<uint8_t, 3>(mask)
+ & testVBitwise<uint8_t, 4>(mask)
+ & test1Bitwise<uint16_t>(mask)
+ & testVBitwise<uint16_t, 2>(mask)
+ & testVBitwise<uint16_t, 3>(mask)
+ & testVBitwise<uint16_t, 4>(mask)
+ & test1Bitwise<uint64_t>(mask)
+ & testVBitwise<uint64_t, 2>(mask)
+ & testVBitwise<uint64_t, 3>(mask)
+ & testVBitwise<uint64_t, 4>(mask)
+#endif
+ ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ let index = dispatchThreadID.x;
+
+ let isSecondGroup = index >= 15;
+ let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+ // One invocation in second group is different from others to test or and xor operations.
+ let isOrSet = (index == 15);
+
+ gAndValue = isSecondGroup ? uint(1) : uint(0);
+ gOrValue = isOrSet ? uint(1) : uint(0);
+ gOrResult = isSecondGroup ? uint(1) : uint(0);
+
+ // Alternate 0s and 1s for xor.
+ gXorValue = (index % 2 == 0) ? uint(0) : uint(1);
+ if (isOrSet)
+ {
+ // This is in second group - disrupt the alternating sequence.
+ gXorValue = uint(0);
+ }
+ gXorResult = isSecondGroup ? uint(0) : uint(1);
+
+ bool result = true
+ & testBitwise(mask)
+ ;
+
+ // CHECK-COUNT-32: 1
+ outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
new file mode 100644
index 000000000..419ffecc5
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
@@ -0,0 +1,127 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedMin subgroupPartitionedMinNV
+#define __partitionedMax subgroupPartitionedMaxNV
+#else
+#define __partitionedMin WaveMultiMin
+#define __partitionedMax WaveMultiMax
+#endif
+
+
+static uint gMinResult = 0;
+static uint gMaxResult = 0;
+static uint gMinMaxValue = 0;
+
+// Scalar check: verifies that the partitioned min/max of `gMinMaxValue` (converted to `T`)
+// over the lanes selected by `mask` equals the expected per-partition results
+// `gMinResult` / `gMaxResult`. Returns true only when both comparisons hold.
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask)
+{
+    let minResult = T(gMinResult);
+    let maxResult = T(gMaxResult);
+    let minMaxValue = T(gMinMaxValue);
+
+    // The comparisons are scalar here, so no `all(...)` reduction is needed
+    // (that is only required by the vector variant).
+    return true
+        & (__partitionedMin(minMaxValue, mask) == minResult)
+        & (__partitionedMax(minMaxValue, mask) == maxResult)
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) {
+ typealias GVec = vector<T, N>;
+
+ let minResult = GVec(T(gMinResult));
+ let maxResult = GVec(T(gMaxResult));
+ let minMaxValue = GVec(T(gMinMaxValue));
+
+ return true
+ & all(__partitionedMin(minMaxValue, mask) == minResult)
+ & all(__partitionedMax(minMaxValue, mask) == maxResult)
+ ;
+}
+
+bool testMinMax(uint4 mask)
+{
+ return true
+ & test1MinMax<int>(mask)
+ & testVMinMax<int, 2>(mask)
+ & testVMinMax<int, 3>(mask)
+ & testVMinMax<int, 4>(mask)
+ & test1MinMax<uint>(mask)
+ & testVMinMax<uint, 2>(mask)
+ & testVMinMax<uint, 3>(mask)
+ & testVMinMax<uint, 4>(mask)
+ & test1MinMax<float>(mask)
+ & testVMinMax<float, 2>(mask)
+ & testVMinMax<float, 3>(mask)
+ & testVMinMax<float, 4>(mask)
+ & test1MinMax<double>(mask)
+ & testVMinMax<double, 2>(mask)
+ & testVMinMax<double, 3>(mask)
+ & testVMinMax<double, 4>(mask)
+
+#if !defined(CUDA)
+ & test1MinMax<int8_t>(mask)
+ & testVMinMax<int8_t, 2>(mask)
+ & testVMinMax<int8_t, 3>(mask)
+ & testVMinMax<int8_t, 4>(mask)
+ & test1MinMax<int16_t>(mask)
+ & testVMinMax<int16_t, 2>(mask)
+ & testVMinMax<int16_t, 3>(mask)
+ & testVMinMax<int16_t, 4>(mask)
+ & test1MinMax<int64_t>(mask)
+ & testVMinMax<int64_t, 2>(mask)
+ & testVMinMax<int64_t, 3>(mask)
+ & testVMinMax<int64_t, 4>(mask)
+ & test1MinMax<uint8_t>(mask)
+ & testVMinMax<uint8_t, 2>(mask)
+ & testVMinMax<uint8_t, 3>(mask)
+ & testVMinMax<uint8_t, 4>(mask)
+ & test1MinMax<uint16_t>(mask)
+ & testVMinMax<uint16_t, 2>(mask)
+ & testVMinMax<uint16_t, 3>(mask)
+ & testVMinMax<uint16_t, 4>(mask)
+ & test1MinMax<uint64_t>(mask)
+ & testVMinMax<uint64_t, 2>(mask)
+ & testVMinMax<uint64_t, 3>(mask)
+ & testVMinMax<uint64_t, 4>(mask)
+ & test1MinMax<half>(mask)
+ & testVMinMax<half, 2>(mask)
+ & testVMinMax<half, 3>(mask)
+ & testVMinMax<half, 4>(mask)
+#endif
+ ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint index = dispatchThreadID.x;
+
+ // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+ let isSecondGroup = index >= 15;
+ uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+ // Set min value on one invocation on each partition/mask.
+ let isMinInvocation = (index == 0) || (index == 15);
+
+ gMinResult = isSecondGroup ? uint(2) : uint(0);
+ gMaxResult = isSecondGroup ? uint(3) : uint(1);
+ gMinMaxValue = isMinInvocation ? gMinResult : gMaxResult;
+
+ bool result = true
+ && testMinMax(mask)
+ ;
+
+ // CHECK-COUNT-32: 1
+ outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
new file mode 100644
index 000000000..bb1182e5e
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
@@ -0,0 +1,163 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveAnd subgroupPartitionedInclusiveAndNV
+#define __partitionedInclusiveOr subgroupPartitionedInclusiveOrNV
+#define __partitionedInclusiveXor subgroupPartitionedInclusiveXorNV
+#define __partitionedExclusiveAnd subgroupPartitionedExclusiveAndNV
+#define __partitionedExclusiveOr subgroupPartitionedExclusiveOrNV
+#define __partitionedExclusiveXor subgroupPartitionedExclusiveXorNV
+#else
+#define __partitionedInclusiveAnd WaveMultiPrefixInclusiveBitAnd
+#define __partitionedInclusiveOr WaveMultiPrefixInclusiveBitOr
+#define __partitionedInclusiveXor WaveMultiPrefixInclusiveBitXor
+#define __partitionedExclusiveAnd WaveMultiPrefixExclusiveBitAnd
+#define __partitionedExclusiveOr WaveMultiPrefixExclusiveBitOr
+#define __partitionedExclusiveXor WaveMultiPrefixExclusiveBitXor
+#endif
+
+
+static uint gAndValue = 0;
+static uint gAndResultExclusive = 0;
+static uint gOrValue = 0;
+static uint gOrResult = 0;
+static uint gXorValue = 0;
+static uint gXorResultInclusive = 0;
+static uint gXorResultExclusive = 0;
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask)
+{
+ let andValue = T(gAndValue);
+ let orValue = T(gOrValue);
+ let xorValue = T(gXorValue);
+
+ return true
+ & (__partitionedInclusiveAnd(andValue, mask) == andValue)
+ & (__partitionedExclusiveAnd(andValue, mask) == T(gAndResultExclusive))
+ & (__partitionedInclusiveOr(orValue, mask) == orValue)
+ & (__partitionedExclusiveOr(orValue, mask) == T(0))
+ & (__partitionedInclusiveXor(xorValue, mask) == T(gXorResultInclusive))
+ & (__partitionedExclusiveXor(xorValue, mask) == T(gXorResultExclusive))
+ ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) {
+ typealias GVec = vector<T, N>;
+
+ let andValue = GVec(T(gAndValue));
+ let orValue = GVec(T(gOrValue));
+ let xorValue = GVec(T(gXorValue));
+
+ return true
+ & all(__partitionedInclusiveAnd(andValue, mask) == andValue)
+ & all(__partitionedExclusiveAnd(andValue, mask) == GVec(T(gAndResultExclusive)))
+ & all(__partitionedInclusiveOr(orValue, mask) == orValue)
+ & all(__partitionedExclusiveOr(orValue, mask) == GVec(T(0)))
+ & all(__partitionedInclusiveXor(xorValue, mask) == GVec(T(gXorResultInclusive)))
+ & all(__partitionedExclusiveXor(xorValue, mask) == GVec(T(gXorResultExclusive)))
+ ;
+}
+
+// Runs the scalar and vector (2/3/4 component) prefix-bitwise checks for every integer
+// type under test and returns true only if all of them pass for the given partition `mask`.
+bool testBitwise(uint4 mask)
+{
+ return true
+ & test1Bitwise<int>(mask)
+ & testVBitwise<int, 2>(mask)
+ & testVBitwise<int, 3>(mask)
+ & testVBitwise<int, 4>(mask)
+ & test1Bitwise<uint>(mask)
+ & testVBitwise<uint, 2>(mask)
+ & testVBitwise<uint, 3>(mask)
+ & testVBitwise<uint, 4>(mask)
+
+ // TODO: these are failing SPIRV validation and should be fixed.
+ // SPIRV's ops do not directly accept/return bool.
+ // & test1Bitwise<bool>(mask)
+ // & testVBitwise<bool, 2>(mask)
+ // & testVBitwise<bool, 3>(mask)
+ // & testVBitwise<bool, 4>(mask)
+
+ // NOTE(review): the TEST directives visible at the top of this file only pass -DCUDA and
+ // -DUSE_GLSL_SYNTAX; none of them passes -DVK. Unless the test harness defines VK
+ // implicitly, the 8-bit cases below are compiled out on every configuration - confirm.
+#if defined(VK)
+ & test1Bitwise<int8_t>(mask)
+ & testVBitwise<int8_t, 2>(mask)
+ & testVBitwise<int8_t, 3>(mask)
+ & testVBitwise<int8_t, 4>(mask)
+ & test1Bitwise<uint8_t>(mask)
+ & testVBitwise<uint8_t, 2>(mask)
+ & testVBitwise<uint8_t, 3>(mask)
+ & testVBitwise<uint8_t, 4>(mask)
+#endif
+
+ // 16-bit and 64-bit integer cases are skipped only when CUDA is defined.
+#if !defined(CUDA)
+ & test1Bitwise<int16_t>(mask)
+ & testVBitwise<int16_t, 2>(mask)
+ & testVBitwise<int16_t, 3>(mask)
+ & testVBitwise<int16_t, 4>(mask)
+ & test1Bitwise<int64_t>(mask)
+ & testVBitwise<int64_t, 2>(mask)
+ & testVBitwise<int64_t, 3>(mask)
+ & testVBitwise<int64_t, 4>(mask)
+ & test1Bitwise<uint16_t>(mask)
+ & testVBitwise<uint16_t, 2>(mask)
+ & testVBitwise<uint16_t, 3>(mask)
+ & testVBitwise<uint16_t, 4>(mask)
+ & test1Bitwise<uint64_t>(mask)
+ & testVBitwise<uint64_t, 2>(mask)
+ & testVBitwise<uint64_t, 3>(mask)
+ & testVBitwise<uint64_t, 4>(mask)
+#endif
+ ;
+}
+
+// Compute entry point. Splits the 32 lanes into two partitions (lanes 0-14 and lanes 15-31),
+// sets up the per-lane inputs and expected results consumed by the prefix and/or/xor checks,
+// and writes 1 to outputBuffer[index] when every check in testBitwise passes.
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    let isSecondGroup = index >= 15;
+    let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+    let isLastInPartition = (index == 14) || (index == 31);
+    let isFirstInPartition = (index == 0) || (index == 15);
+
+    //
+    // Prefix and.
+    // - Both groups use 1, except for the last invocation in each partition where the input is 0.
+    // - For inclusive ops, the result is 1 except for the last invocation in each partition.
+    // - For exclusive ops, the first invocation in each partition always yields ~0 (the identity).
+    //   All other invocations yield 1.
+    gAndValue = isLastInPartition ? uint(0) : uint(1);
+    gAndResultExclusive = isFirstInPartition ? uint(~0) : uint(1);
+
+    //
+    // Prefix or.
+    // - Both groups use 0, except for the last invocation in each partition where the input is 1.
+    // - For inclusive ops, the result is 0 except for the last invocation in each partition.
+    // - For exclusive ops, the result is always 0.
+    gOrValue = isLastInPartition ? uint(1) : uint(0);
+
+    //
+    // Prefix xor.
+    // - First group: the input is always 1. Inclusive results alternate between 1 and 0, starting
+    //   at 1. Exclusive results also alternate, but start at 0 (the opposite of the inclusive result).
+    // - Second group: the input is always 0, so all results are 0.
+    gXorValue = isSecondGroup ? uint(0) : uint(1);
+    gXorResultInclusive = (isSecondGroup || (index % 2 != 0)) ? uint(0) : uint(1);
+    gXorResultExclusive = isSecondGroup ? uint(0) : (uint(1) - gXorResultInclusive);
+
+    bool result = true
+        & testBitwise(mask)
+        ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
new file mode 100644
index 000000000..654fd6130
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMax subgroupPartitionedInclusiveMaxNV
+#define __partitionedExclusiveMax subgroupPartitionedExclusiveMaxNV
+#else
+#define __partitionedInclusiveMax WaveMultiPrefixInclusiveMax
+#define __partitionedExclusiveMax WaveMultiPrefixExclusiveMax
+#endif
+
+static bool isFirstInPartition = false;
+static uint gSmaller = 0;
+static uint gLarger = 0;
+static uint gMaxValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask)
+{
+ let smaller = T(gSmaller);
+ let maxValue = T(gMaxValue);
+
+ // The larger values are set to be the last in the partition, exclusive variants will never get these values.
+ bool exclusiveRes = true
+ & (__partitionedExclusiveMax(maxValue, mask) == smaller)
+ ;
+ // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be
+ // nice to have something like T::min or T::max.
+ if (isFirstInPartition)
+ {
+ exclusiveRes = true;
+ }
+
+ return true
+ & (__partitionedInclusiveMax(maxValue, mask) == maxValue)
+ & exclusiveRes
+ ;
+}
+
+// Vector check: verifies the inclusive and exclusive partitioned prefix-max of a
+// `vector<T, N>` splat of `gMaxValue` over the lanes selected by `mask`.
+// Returns true when the inclusive result equals the lane's own input and the exclusive
+// result equals the splat of `gSmaller` (the exclusive check is skipped on the first
+// lane of each partition).
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) {
+    typealias GVec = vector<T, N>;
+
+    let smaller = GVec(T(gSmaller));
+    let maxValue = GVec(T(gMaxValue));
+
+    // The larger values are set to be the last in the partition, exclusive variants will never get these values.
+    bool exclusiveRes = true
+        & all(__partitionedExclusiveMax(maxValue, mask) == smaller)
+        ;
+    // Do not check the exclusive prefix for the first invocation in a partition: its value is the
+    // identity, which depends on the builtin type `T`. It would be nice to have something like
+    // T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & all(__partitionedInclusiveMax(maxValue, mask) == maxValue)
+        & exclusiveRes
+        ;
+}
+
+bool testMinMax(uint4 mask)
+{
+ return true
+ & test1MinMax<int>(mask)
+ & testVMinMax<int, 2>(mask)
+ & testVMinMax<int, 3>(mask)
+ & testVMinMax<int, 4>(mask)
+ & test1MinMax<uint>(mask)
+ & testVMinMax<uint, 2>(mask)
+ & testVMinMax<uint, 3>(mask)
+ & testVMinMax<uint, 4>(mask)
+ & test1MinMax<float>(mask)
+ & testVMinMax<float, 2>(mask)
+ & testVMinMax<float, 3>(mask)
+ & testVMinMax<float, 4>(mask)
+ & test1MinMax<double>(mask)
+ & testVMinMax<double, 2>(mask)
+ & testVMinMax<double, 3>(mask)
+ & testVMinMax<double, 4>(mask)
+ & test1MinMax<int8_t>(mask)
+ & testVMinMax<int8_t, 2>(mask)
+ & testVMinMax<int8_t, 3>(mask)
+ & testVMinMax<int8_t, 4>(mask)
+ & test1MinMax<int16_t>(mask)
+ & testVMinMax<int16_t, 2>(mask)
+ & testVMinMax<int16_t, 3>(mask)
+ & testVMinMax<int16_t, 4>(mask)
+ & test1MinMax<int64_t>(mask)
+ & testVMinMax<int64_t, 2>(mask)
+ & testVMinMax<int64_t, 3>(mask)
+ & testVMinMax<int64_t, 4>(mask)
+ & test1MinMax<uint8_t>(mask)
+ & testVMinMax<uint8_t, 2>(mask)
+ & testVMinMax<uint8_t, 3>(mask)
+ & testVMinMax<uint8_t, 4>(mask)
+ & test1MinMax<uint16_t>(mask)
+ & testVMinMax<uint16_t, 2>(mask)
+ & testVMinMax<uint16_t, 3>(mask)
+ & testVMinMax<uint16_t, 4>(mask)
+ & test1MinMax<uint64_t>(mask)
+ & testVMinMax<uint64_t, 2>(mask)
+ & testVMinMax<uint64_t, 3>(mask)
+ & testVMinMax<uint64_t, 4>(mask)
+ & test1MinMax<half>(mask)
+ & testVMinMax<half, 2>(mask)
+ & testVMinMax<half, 3>(mask)
+ & testVMinMax<half, 4>(mask)
+ ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ let index = dispatchThreadID.x;
+
+ // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+ let isSecondGroup = index >= 15;
+ uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+ isFirstInPartition = (index == 0) || (index == 15);
+ let isLastInPartition = (index == 14) || (index == 31);
+
+ gSmaller = isSecondGroup ? 2 : 0;
+ gLarger = isSecondGroup ? 3 : 1;
+ gMaxValue = isLastInPartition ? gLarger : gSmaller;
+
+ bool result = true
+ & testMinMax(mask)
+ ;
+
+ // CHECK-COUNT-32: 1
+ outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
new file mode 100644
index 000000000..68e1e9c05
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMin subgroupPartitionedInclusiveMinNV
+#define __partitionedExclusiveMin subgroupPartitionedExclusiveMinNV
+#else
+#define __partitionedInclusiveMin WaveMultiPrefixInclusiveMin
+#define __partitionedExclusiveMin WaveMultiPrefixExclusiveMin
+#endif
+
+static bool isFirstInPartition = false;
+static uint gSmaller = 0;
+static uint gLarger = 0;
+static uint gMaxValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1Min(uint4 mask)
+{
+ let larger = T(gLarger);
+ let minValue = T(gMaxValue);
+
+ // The smaller values are set to be the last in the partition, exclusive variants will never get these values.
+ bool exclusiveRes = true
+ & (__partitionedExclusiveMin(minValue, mask) == larger)
+ ;
+ // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be
+ // nice to have something like T::min or T::max.
+ if (isFirstInPartition)
+ {
+ exclusiveRes = true;
+ }
+
+ return true
+ & (__partitionedInclusiveMin(minValue, mask) == minValue)
+ & exclusiveRes
+ ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMin(uint4 mask) {
+ typealias GVec = vector<T, N>;
+
+ let larger = GVec(T(gLarger));
+ let minValue = GVec(T(gMaxValue));
+
+ // The smaller values are set to be the last in the partition, exclusive variants will never get these values.
+ bool exclusiveRes = true
+ & all(__partitionedExclusiveMin(minValue, mask) == larger)
+ ;
+ // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be
+ // nice to have something like T::min or T::max.
+ if (isFirstInPartition)
+ {
+ exclusiveRes = true;
+ }
+
+ return true
+ & all(__partitionedInclusiveMin(minValue, mask) == minValue)
+ & exclusiveRes
+ ;
+}
+
+bool testMin(uint4 mask)
+{
+ return true
+ & test1Min<int>(mask)
+ & testVMin<int, 2>(mask)
+ & testVMin<int, 3>(mask)
+ & testVMin<int, 4>(mask)
+ & test1Min<uint>(mask)
+ & testVMin<uint, 2>(mask)
+ & testVMin<uint, 3>(mask)
+ & testVMin<uint, 4>(mask)
+ & test1Min<float>(mask)
+ & testVMin<float, 2>(mask)
+ & testVMin<float, 3>(mask)
+ & testVMin<float, 4>(mask)
+ & test1Min<double>(mask)
+ & testVMin<double, 2>(mask)
+ & testVMin<double, 3>(mask)
+ & testVMin<double, 4>(mask)
+ & test1Min<int8_t>(mask)
+ & testVMin<int8_t, 2>(mask)
+ & testVMin<int8_t, 3>(mask)
+ & testVMin<int8_t, 4>(mask)
+ & test1Min<int16_t>(mask)
+ & testVMin<int16_t, 2>(mask)
+ & testVMin<int16_t, 3>(mask)
+ & testVMin<int16_t, 4>(mask)
+ & test1Min<int64_t>(mask)
+ & testVMin<int64_t, 2>(mask)
+ & testVMin<int64_t, 3>(mask)
+ & testVMin<int64_t, 4>(mask)
+ & test1Min<uint8_t>(mask)
+ & testVMin<uint8_t, 2>(mask)
+ & testVMin<uint8_t, 3>(mask)
+ & testVMin<uint8_t, 4>(mask)
+ & test1Min<uint16_t>(mask)
+ & testVMin<uint16_t, 2>(mask)
+ & testVMin<uint16_t, 3>(mask)
+ & testVMin<uint16_t, 4>(mask)
+ & test1Min<uint64_t>(mask)
+ & testVMin<uint64_t, 2>(mask)
+ & testVMin<uint64_t, 3>(mask)
+ & testVMin<uint64_t, 4>(mask)
+ & test1Min<half>(mask)
+ & testVMin<half, 2>(mask)
+ & testVMin<half, 3>(mask)
+ & testVMin<half, 4>(mask)
+ ;
+}
+
+// Compute entry point. Splits the 32 lanes into two partitions (lanes 0-14 and lanes 15-31),
+// sets up the per-lane input for the prefix-min checks, and writes 1 to outputBuffer[index]
+// when every check in testMin passes.
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    isFirstInPartition = (index == 0) || (index == 15);
+    let isLastInPartition = (index == 14) || (index == 31);
+
+    // The inputs must be assigned before testMin runs; otherwise the helpers only ever see
+    // the zero-initialized globals and the test passes trivially.
+    gSmaller = isSecondGroup ? 2 : 0;
+    gLarger = isSecondGroup ? 3 : 1;
+    // Only the last lane of each partition holds the smaller value, so every exclusive prefix-min
+    // is `gLarger` while the inclusive prefix-min always equals the lane's own input.
+    // (`gMaxValue` is the shared input global read by test1Min/testVMin; the name is historical.)
+    gMaxValue = isLastInPartition ? gSmaller : gLarger;
+
+    bool result = true
+        & testMin(mask)
+        ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
index 69240198e..5de34b20a 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
@@ -10,6 +10,7 @@
RWStructuredBuffer<uint> outputBuffer;
[numthreads(8, 1, 1)]
+[shader("compute")]
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
uint index = int(dispatchThreadID.x);
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
index c80baa5b1..c80baa5b1 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
new file mode 100644
index 000000000..bb641cab1
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
@@ -0,0 +1,136 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+// The checks below are written once against the `__partitioned*` aliases.
+// When USE_GLSL_SYNTAX is defined the aliases name the GLSL
+// `subgroupPartitioned*NV` built-ins; otherwise they name the
+// `WaveMultiPrefix*` intrinsics.
+#ifdef USE_GLSL_SYNTAX
+#define __partitionedInclusiveSum subgroupPartitionedInclusiveAddNV
+#define __partitionedInclusiveProduct subgroupPartitionedInclusiveMulNV
+#define __partitionedExclusiveSum subgroupPartitionedExclusiveAddNV
+#define __partitionedExclusiveProduct subgroupPartitionedExclusiveMulNV
+#else
+#define __partitionedInclusiveSum WaveMultiPrefixInclusiveSum
+#define __partitionedInclusiveProduct WaveMultiPrefixInclusiveProduct
+#define __partitionedExclusiveSum WaveMultiPrefixExclusiveSum
+#define __partitionedExclusiveProduct WaveMultiPrefixExclusiveProduct
+#endif
+
+// Both values are assigned by computeMain before the checks run.
+// Index of the current invocation counted from the start of its partition.
+static uint partitionedIndex = 0;
+// Operand this invocation feeds to the prefix-product checks.
+static uint gProductValue = 0;
+
+// Scalar check for element type T.
+// Every invocation contributes 1 to the prefix sums and gProductValue
+// (converted to T) to the prefix products. Returns true only when all four
+// partitioned prefix results match the values expected for this invocation.
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask)
+{
+    let one = T(1);
+    let factor = T(gProductValue);
+
+    // Each intrinsic is evaluated unconditionally; the results are combined
+    // with `&`, so nothing is short-circuited.
+    let inclusiveSumOk = __partitionedInclusiveSum(one, mask) == T(partitionedIndex + 1);
+    let inclusiveProductOk = __partitionedInclusiveProduct(factor, mask) == factor;
+    let exclusiveSumOk = __partitionedExclusiveSum(one, mask) == T(partitionedIndex);
+    let exclusiveProductOk = __partitionedExclusiveProduct(factor, mask) == one;
+
+    return inclusiveSumOk & inclusiveProductOk & exclusiveSumOk & exclusiveProductOk;
+}
+
+// Vector check for vector<T, N>: the same four comparisons as the scalar
+// check, with every component holding the scalar operand. A comparison
+// passes only when all components match.
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask)
+{
+    typealias Vec = vector<T, N>;
+
+    let ones = Vec(T(1));
+    let factors = Vec(T(gProductValue));
+
+    // Each intrinsic is evaluated unconditionally; the results are combined
+    // with `&`, so nothing is short-circuited.
+    let inclusiveSumOk = all(__partitionedInclusiveSum(ones, mask) == Vec(T(partitionedIndex + 1)));
+    let inclusiveProductOk = all(__partitionedInclusiveProduct(factors, mask) == factors);
+    let exclusiveSumOk = all(__partitionedExclusiveSum(ones, mask) == Vec(T(partitionedIndex)));
+    let exclusiveProductOk = all(__partitionedExclusiveProduct(factors, mask) == ones);
+
+    return inclusiveSumOk & inclusiveProductOk & exclusiveSumOk & exclusiveProductOk;
+}
+
+// Runs the scalar check followed by the 2-, 3- and 4-component vector checks
+// for a single element type T, in that order.
+__generic<T : __BuiltinArithmeticType>
+bool testSumProductForType(uint4 mask)
+{
+    return true
+        & test1SumProduct<T>(mask)
+        & testVSumProduct<T, 2>(mask)
+        & testVSumProduct<T, 3>(mask)
+        & testVSumProduct<T, 4>(mask)
+        ;
+}
+
+// Runs the prefix sum/product checks for every element type enabled for the
+// current build and returns true only if all of them pass.
+bool testSumProduct(uint4 mask)
+{
+    return true
+        & testSumProductForType<int>(mask)
+        & testSumProductForType<uint>(mask)
+        & testSumProductForType<float>(mask)
+        & testSumProductForType<double>(mask)
+
+        // 8-bit types are only checked when VK is defined.
+        // NOTE(review): none of the TEST command lines at the top of this file
+        // pass -DVK, so this section looks like it is never compiled - confirm
+        // whether the Vulkan runs were meant to add `-xslang -DVK`.
+#if defined(VK)
+        & testSumProductForType<int8_t>(mask)
+        & testSumProductForType<uint8_t>(mask)
+#endif
+
+        // 16/64-bit integers and half are skipped when CUDA is defined.
+#if !defined(CUDA)
+        & testSumProductForType<int16_t>(mask)
+        & testSumProductForType<int64_t>(mask)
+        & testSumProductForType<uint16_t>(mask)
+        & testSumProductForType<uint64_t>(mask)
+        & testSumProductForType<half>(mask)
+#endif
+        ;
+}
+
+// Compute entry point: one 32-invocation group. Each invocation sets up its
+// partition mask and expected-value inputs, runs the checks and stores
+// 1 (pass) or 0 (fail) in outputBuffer at its own index.
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x00007FFF, 0, 0, 0);
+
+    // Position of this invocation counted from the first lane of its own group.
+    partitionedIndex = isSecondGroup ? index - 15 : index;
+
+    // Only the last lane of each group contributes 0 to the products; every
+    // other lane contributes 1.
+    let isLastInPartition = (index == 14) || (index == 31);
+    gProductValue = isLastInPartition ? uint(0) : uint(1);
+
+    let passed = testSumProduct(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(passed);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
index 99698e497..99698e497 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
new file mode 100644
index 000000000..b40b014f4
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
@@ -0,0 +1,114 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+// The checks below are written once against the `__partitioned*` aliases.
+// When USE_GLSL_SYNTAX is defined the aliases name the GLSL
+// `subgroupPartitioned*NV` built-ins; otherwise they name the
+// `WaveMulti*` intrinsics.
+#ifdef USE_GLSL_SYNTAX
+#define __partitionedSum subgroupPartitionedAddNV
+#define __partitionedProduct subgroupPartitionedMulNV
+#else
+#define __partitionedSum WaveMultiSum
+#define __partitionedProduct WaveMultiProduct
+#endif
+
+// Expected partitioned sum for the current invocation; assigned by
+// computeMain before the checks run.
+static uint gSumResult = 0;
+
+// Scalar check for element type T.
+// Every invocation contributes 1 to both reductions. The partitioned sum must
+// equal gSumResult (converted to T) and the partitioned product must equal 1.
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask)
+{
+    let one = T(1);
+    let expectedSum = T(gSumResult);
+
+    // Both intrinsics are evaluated unconditionally; the results are combined
+    // with `&`, so nothing is short-circuited.
+    let sumOk = __partitionedSum(one, mask) == expectedSum;
+    let productOk = __partitionedProduct(one, mask) == one;
+
+    return sumOk & productOk;
+}
+
+// Vector check for vector<T, N>: the same two comparisons as the scalar
+// check, with every component holding the scalar operand. A comparison
+// passes only when all components match.
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask)
+{
+    typealias Vec = vector<T, N>;
+
+    let ones = Vec(T(1));
+    let expectedSum = Vec(T(gSumResult));
+
+    // Both intrinsics are evaluated unconditionally; the results are combined
+    // with `&`, so nothing is short-circuited.
+    let sumOk = all(__partitionedSum(ones, mask) == expectedSum);
+    let productOk = all(__partitionedProduct(ones, mask) == ones);
+
+    return sumOk & productOk;
+}
+
+// Runs the partitioned sum/product checks for every element type enabled for
+// the current build and returns true only if all of them pass. Each check is
+// executed unconditionally; results are folded with `&`, never short-circuited.
+bool testSumProduct(uint4 mask)
+{
+    bool ok = true;
+
+    ok = ok & test1SumProduct<int>(mask);
+    ok = ok & testVSumProduct<int, 2>(mask);
+    ok = ok & testVSumProduct<int, 3>(mask);
+    ok = ok & testVSumProduct<int, 4>(mask);
+    ok = ok & test1SumProduct<uint>(mask);
+    ok = ok & testVSumProduct<uint, 2>(mask);
+    ok = ok & testVSumProduct<uint, 3>(mask);
+    ok = ok & testVSumProduct<uint, 4>(mask);
+    ok = ok & test1SumProduct<float>(mask);
+    ok = ok & testVSumProduct<float, 2>(mask);
+    ok = ok & testVSumProduct<float, 3>(mask);
+    ok = ok & testVSumProduct<float, 4>(mask);
+    ok = ok & test1SumProduct<double>(mask);
+    ok = ok & testVSumProduct<double, 2>(mask);
+    ok = ok & testVSumProduct<double, 3>(mask);
+    ok = ok & testVSumProduct<double, 4>(mask);
+
+    // 8/16/64-bit integers and half are skipped when CUDA is defined.
+#if !defined(CUDA)
+    ok = ok & test1SumProduct<int8_t>(mask);
+    ok = ok & testVSumProduct<int8_t, 2>(mask);
+    ok = ok & testVSumProduct<int8_t, 3>(mask);
+    ok = ok & testVSumProduct<int8_t, 4>(mask);
+    ok = ok & test1SumProduct<int16_t>(mask);
+    ok = ok & testVSumProduct<int16_t, 2>(mask);
+    ok = ok & testVSumProduct<int16_t, 3>(mask);
+    ok = ok & testVSumProduct<int16_t, 4>(mask);
+    ok = ok & test1SumProduct<int64_t>(mask);
+    ok = ok & testVSumProduct<int64_t, 2>(mask);
+    ok = ok & testVSumProduct<int64_t, 3>(mask);
+    ok = ok & testVSumProduct<int64_t, 4>(mask);
+    ok = ok & test1SumProduct<uint8_t>(mask);
+    ok = ok & testVSumProduct<uint8_t, 2>(mask);
+    ok = ok & testVSumProduct<uint8_t, 3>(mask);
+    ok = ok & testVSumProduct<uint8_t, 4>(mask);
+    ok = ok & test1SumProduct<uint16_t>(mask);
+    ok = ok & testVSumProduct<uint16_t, 2>(mask);
+    ok = ok & testVSumProduct<uint16_t, 3>(mask);
+    ok = ok & testVSumProduct<uint16_t, 4>(mask);
+    ok = ok & test1SumProduct<uint64_t>(mask);
+    ok = ok & testVSumProduct<uint64_t, 2>(mask);
+    ok = ok & testVSumProduct<uint64_t, 3>(mask);
+    ok = ok & testVSumProduct<uint64_t, 4>(mask);
+    ok = ok & test1SumProduct<half>(mask);
+    ok = ok & testVSumProduct<half, 2>(mask);
+    ok = ok & testVSumProduct<half, 3>(mask);
+    ok = ok & testVSumProduct<half, 4>(mask);
+#endif
+
+    return ok;
+}
+
+// Compute entry point: one 32-invocation group. Each invocation selects its
+// partition mask and the sum expected for that partition, runs the checks and
+// stores 1 (pass) or 0 (fail) in outputBuffer at its own index.
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let inSecondPartition = index >= 15;
+    let firstPartitionMask = uint4(0x00007FFF, 0, 0, 0);
+    let secondPartitionMask = uint4(0xFFFF8000, 0, 0, 0);
+    uint4 mask = inSecondPartition ? secondPartitionMask : firstPartitionMask;
+
+    // Every lane contributes 1, so the expected sum is the partition's lane count.
+    gSumResult = inSecondPartition ? uint(17) : uint(15);
+
+    let passed = testSumProduct(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(passed);
+}
diff --git a/tests/language-feature/capability/testing-framework-with-profiles.slang b/tests/language-feature/capability/testing-framework-with-profiles.slang
index 215ba887e..97ff32a9d 100644
--- a/tests/language-feature/capability/testing-framework-with-profiles.slang
+++ b/tests/language-feature/capability/testing-framework-with-profiles.slang
@@ -17,5 +17,5 @@ RWStructuredBuffer<uint> outputBuffer;
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
// BUF: 1
- outputBuffer[0] = WaveMaskSum(0xFF, 1);
+ outputBuffer[0] = WaveActiveSum(1);
}