Add full support for SPV_NV_shader_subgroup_partitioned (#7103)

* Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics
* More partitioned intrinsics
* More partitioned intrinsics and cleaned up non-prefixed WaveMask* implementations
* Refactor HLSL WaveMultiPrefix* implementations
* fix cap atoms
* Clean up implementation
* Add GLSL intrinsics and cleanup
* Add tests
* Fix affected capability test
* Update and fix tests
* Move expected.txt file
* Refactor WaveMask* to call WaveMulti*
* Refactor SPIRV/GLSL preamble code
* Enable emit-via-glsl tests
* remove wave_multi_prefix capability in favor of subgroup_partitioned
* Update docs
* Update cap atoms doc
author: Darren Wihandi <65404740+fairywreath@users.noreply.github.com> 2025-05-25 12:58:08 -0400
committer: GitHub <noreply@github.com> 2025-05-25 09:58:08 -0700
commit: 0476b57faad96bee61f59f27ddd48c6cb067cfa2 (patch)
tree: d3fe49cd906c29b03b2a840dd2c057ccc331b4f7 /tests
parent: 554be7a5f990df19a21db10b4e5dc0285cbe8168 (diff)
17 files changed, 981 insertions, 13 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
index 8a47c5733..da94ad794 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -26,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     const WaveMask mask2 = mask0 & ~mask1;
         
     outputBuffer[idx] = WaveMaskProduct(mask2, idx);
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
index 3dd33f150..3a1c26f8e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -30,4 +30,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     value = WaveMaskMin(mask2, idx + 1);
     
     outputBuffer[idx] = value;
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
index f333a59fb..fb5573bd1 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //DISABLE_TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -37,4 +37,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     matrix<int, 2, 2> r = r0 + matrix<int, 2, 2>(r1) + r6;
    
     outputBuffer[idx] = r[0][0] + r[0][1] + r[1][0] + r[1][1];
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
index b12e9c1b3..e32524b1e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -25,4 +25,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     
     outputBuffer[idx] = r0 + (r2 << 16);
     
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
index 51e9b7600..2e0fba746 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -23,4 +23,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int r2 = int(r1.x) + int(r1.y) - idx;
     
     outputBuffer[idx] = r0 + (r2 << 16);
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
index b1f44f4fb..7c326e0f3 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -29,4 +29,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int2 r = r0 + int2(r1) + r2 + r3 + r4;
    
     outputBuffer[idx] = r.x + r.y;
-}
-\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
new file mode 100644
index 000000000..c2a292c14
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
@@ -0,0 +1,139 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedAnd subgroupPartitionedAndNV
+#define __partitionedOr subgroupPartitionedOrNV
+#define __partitionedXor subgroupPartitionedXorNV
+#else
+#define __partitionedAnd WaveMultiBitAnd
+#define __partitionedOr WaveMultiBitOr
+#define __partitionedXor WaveMultiBitXor
+#endif
+
+static uint gAndValue = 0;
+static uint gOrValue = 0;
+static uint gOrResult = 0;
+static uint gXorValue = 0;
+static uint gXorResult = 0;
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask)
+{
+    let andValue = T(gAndValue);
+    let orValue = T(gOrValue);
+    let orResult = T(gOrResult);
+    let xorValue = T(gXorValue);
+    let xorResult = T(gXorResult);
+
+    return true
+        & (__partitionedAnd(andValue, mask) == andValue)
+        & (__partitionedOr(orValue, mask) == orResult)
+        & (__partitionedXor(xorValue, mask) == xorResult)
+        ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) {
+    typealias GVec = vector<T, N>;
+
+    let andValue = GVec(T(gAndValue));
+    let orValue = GVec(T(gOrValue));
+    let orResult = GVec(T(gOrResult));
+    let xorValue = GVec(T(gXorValue));
+    let xorResult = GVec(T(gXorResult));
+
+    return true
+        & all(__partitionedAnd(andValue, mask) == andValue)
+        & all(__partitionedOr(orValue, mask) == orResult)
+        & all(__partitionedXor(xorValue, mask) == xorResult)
+        ;
+}
+
+bool testBitwise(uint4 mask)
+{
+    return true
+        & test1Bitwise<int>(mask)
+        & testVBitwise<int, 2>(mask)
+        & testVBitwise<int, 3>(mask)
+        & testVBitwise<int, 4>(mask)
+        & test1Bitwise<uint>(mask)
+        & testVBitwise<uint, 2>(mask)
+        & testVBitwise<uint, 3>(mask)
+        & testVBitwise<uint, 4>(mask)
+
+        // TODO: these are failing SPIRV validation and should be fixed.
+        // SPIRV's ops do not directly accept/return bool.
+        // & test1Bitwise<bool>(mask)
+        // & testVBitwise<bool, 2>(mask)
+        // & testVBitwise<bool, 3>(mask)
+        // & testVBitwise<bool, 4>(mask)
+
+#if !defined(CUDA)
+        & test1Bitwise<int8_t>(mask)
+        & testVBitwise<int8_t, 2>(mask)
+        & testVBitwise<int8_t, 3>(mask)
+        & testVBitwise<int8_t, 4>(mask)
+        & test1Bitwise<int16_t>(mask)
+        & testVBitwise<int16_t, 2>(mask)
+        & testVBitwise<int16_t, 3>(mask)
+        & testVBitwise<int16_t, 4>(mask)
+        & test1Bitwise<int64_t>(mask)
+        & testVBitwise<int64_t, 2>(mask)
+        & testVBitwise<int64_t, 3>(mask)
+        & testVBitwise<int64_t, 4>(mask)
+        & test1Bitwise<uint8_t>(mask)
+        & testVBitwise<uint8_t, 2>(mask)
+        & testVBitwise<uint8_t, 3>(mask)
+        & testVBitwise<uint8_t, 4>(mask)
+        & test1Bitwise<uint16_t>(mask)
+        & testVBitwise<uint16_t, 2>(mask)
+        & testVBitwise<uint16_t, 3>(mask)
+        & testVBitwise<uint16_t, 4>(mask)
+        & test1Bitwise<uint64_t>(mask)
+        & testVBitwise<uint64_t, 2>(mask)
+        & testVBitwise<uint64_t, 3>(mask)
+        & testVBitwise<uint64_t, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    let isSecondGroup = index >= 15;
+    let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    // One invocation in second group is different from others to test or and xor operations.
+    let isOrSet = (index == 15);
+
+    gAndValue = isSecondGroup ? uint(1) : uint(0);
+    gOrValue = isOrSet ? uint(1) : uint(0);
+    gOrResult = isSecondGroup ? uint(1) : uint(0);
+
+    // Alternate 0s and 1s for xor.
+    gXorValue = (index % 2 == 0) ? uint(0) : uint(1);
+    if (isOrSet)
+    {
+        // This is in second group - disrupt the alternating sequence.
+        gXorValue = uint(0);
+    }
+    gXorResult = isSecondGroup ? uint(0) : uint(1);
+
+    bool result = true
+            & testBitwise(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
new file mode 100644
index 000000000..419ffecc5
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
@@ -0,0 +1,127 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedMin subgroupPartitionedMinNV
+#define __partitionedMax subgroupPartitionedMaxNV
+#else
+#define __partitionedMin WaveMultiMin
+#define __partitionedMax WaveMultiMax
+#endif
+
+
+static uint gMinResult = 0;
+static uint gMaxResult = 0;
+static uint gMinMaxValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask)
+{
+    let minResult = T(gMinResult);
+    let maxResult = T(gMaxResult);
+    let minMaxValue = T(gMinMaxValue);
+
+    return true
+        & all(__partitionedMin(minMaxValue, mask) == minResult)
+        & all(__partitionedMax(minMaxValue, mask) == maxResult)
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) {
+    typealias GVec = vector<T, N>;
+
+    let minResult = GVec(T(gMinResult));
+    let maxResult = GVec(T(gMaxResult));
+    let minMaxValue = GVec(T(gMinMaxValue));
+
+    return true
+        & all(__partitionedMin(minMaxValue, mask) == minResult)
+        & all(__partitionedMax(minMaxValue, mask) == maxResult)
+        ;
+}
+
+bool testMinMax(uint4 mask)
+{
+    return true
+        & test1MinMax<int>(mask)
+        & testVMinMax<int, 2>(mask)
+        & testVMinMax<int, 3>(mask)
+        & testVMinMax<int, 4>(mask)
+        & test1MinMax<uint>(mask)
+        & testVMinMax<uint, 2>(mask)
+        & testVMinMax<uint, 3>(mask)
+        & testVMinMax<uint, 4>(mask)
+        & test1MinMax<float>(mask)
+        & testVMinMax<float, 2>(mask)
+        & testVMinMax<float, 3>(mask)
+        & testVMinMax<float, 4>(mask)
+        & test1MinMax<double>(mask)
+        & testVMinMax<double, 2>(mask)
+        & testVMinMax<double, 3>(mask)
+        & testVMinMax<double, 4>(mask)
+
+#if !defined(CUDA)
+        & test1MinMax<int8_t>(mask)
+        & testVMinMax<int8_t, 2>(mask)
+        & testVMinMax<int8_t, 3>(mask)
+        & testVMinMax<int8_t, 4>(mask)
+        & test1MinMax<int16_t>(mask)
+        & testVMinMax<int16_t, 2>(mask)
+        & testVMinMax<int16_t, 3>(mask)
+        & testVMinMax<int16_t, 4>(mask)
+        & test1MinMax<int64_t>(mask)
+        & testVMinMax<int64_t, 2>(mask)
+        & testVMinMax<int64_t, 3>(mask)
+        & testVMinMax<int64_t, 4>(mask)
+        & test1MinMax<uint8_t>(mask)
+        & testVMinMax<uint8_t, 2>(mask)
+        & testVMinMax<uint8_t, 3>(mask)
+        & testVMinMax<uint8_t, 4>(mask)
+        & test1MinMax<uint16_t>(mask)
+        & testVMinMax<uint16_t, 2>(mask)
+        & testVMinMax<uint16_t, 3>(mask)
+        & testVMinMax<uint16_t, 4>(mask)
+        & test1MinMax<uint64_t>(mask)
+        & testVMinMax<uint64_t, 2>(mask)
+        & testVMinMax<uint64_t, 3>(mask)
+        & testVMinMax<uint64_t, 4>(mask)
+        & test1MinMax<half>(mask)
+        & testVMinMax<half, 2>(mask)
+        & testVMinMax<half, 3>(mask)
+        & testVMinMax<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    // Set min value on one invocation on each partition/mask.
+    let isMinInvocation = (index == 0) || (index == 15);
+
+    gMinResult = isSecondGroup ? uint(2) : uint(0);
+    gMaxResult = isSecondGroup ? uint(3) : uint(1);
+    gMinMaxValue = isMinInvocation ? gMinResult : gMaxResult;
+
+    bool result = true
+            && testMinMax(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
new file mode 100644
index 000000000..bb1182e5e
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
@@ -0,0 +1,163 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DVK
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DVK
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DVK -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveAnd subgroupPartitionedInclusiveAndNV
+#define __partitionedInclusiveOr subgroupPartitionedInclusiveOrNV
+#define __partitionedInclusiveXor subgroupPartitionedInclusiveXorNV
+#define __partitionedExclusiveAnd subgroupPartitionedExclusiveAndNV
+#define __partitionedExclusiveOr subgroupPartitionedExclusiveOrNV
+#define __partitionedExclusiveXor subgroupPartitionedExclusiveXorNV
+#else
+#define __partitionedInclusiveAnd WaveMultiPrefixInclusiveBitAnd
+#define __partitionedInclusiveOr WaveMultiPrefixInclusiveBitOr
+#define __partitionedInclusiveXor WaveMultiPrefixInclusiveBitXor
+#define __partitionedExclusiveAnd WaveMultiPrefixExclusiveBitAnd
+#define __partitionedExclusiveOr WaveMultiPrefixExclusiveBitOr
+#define __partitionedExclusiveXor WaveMultiPrefixExclusiveBitXor
+#endif
+
+
+static uint gAndValue = 0;
+static uint gAndResultExclusive = 0;
+static uint gOrValue = 0;
+static uint gOrResult = 0;
+static uint gXorValue = 0;
+static uint gXorResultInclusive = 0;
+static uint gXorResultExclusive = 0;
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask)
+{
+    let andValue = T(gAndValue);
+    let orValue = T(gOrValue);
+    let xorValue = T(gXorValue);
+
+    return true
+        & (__partitionedInclusiveAnd(andValue, mask) == andValue)
+        & (__partitionedExclusiveAnd(andValue, mask) == T(gAndResultExclusive))
+        & (__partitionedInclusiveOr(orValue, mask) == orValue)
+        & (__partitionedExclusiveOr(orValue, mask) == T(0))
+        & (__partitionedInclusiveXor(xorValue, mask) == T(gXorResultInclusive))
+        & (__partitionedExclusiveXor(xorValue, mask) == T(gXorResultExclusive))
+        ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) {
+    typealias GVec = vector<T, N>;
+
+    let andValue = GVec(T(gAndValue));
+    let orValue = GVec(T(gOrValue));
+    let xorValue = GVec(T(gXorValue));
+
+    return true
+        & all(__partitionedInclusiveAnd(andValue, mask) == andValue)
+        & all(__partitionedExclusiveAnd(andValue, mask) == GVec(T(gAndResultExclusive)))
+        & all(__partitionedInclusiveOr(orValue, mask) == orValue)
+        & all(__partitionedExclusiveOr(orValue, mask) == GVec(T(0)))
+        & all(__partitionedInclusiveXor(xorValue, mask) == GVec(T(gXorResultInclusive)))
+        & all(__partitionedExclusiveXor(xorValue, mask) == GVec(T(gXorResultExclusive)))
+        ;
+}
+
+bool testBitwise(uint4 mask)
+{
+    return true
+        & test1Bitwise<int>(mask)
+        & testVBitwise<int, 2>(mask)
+        & testVBitwise<int, 3>(mask)
+        & testVBitwise<int, 4>(mask)
+        & test1Bitwise<uint>(mask)
+        & testVBitwise<uint, 2>(mask)
+        & testVBitwise<uint, 3>(mask)
+        & testVBitwise<uint, 4>(mask)
+
+        // TODO: these are failing SPIRV validation and should be fixed.
+        // SPIRV's ops do not directly accept/return bool.
+        // & test1Bitwise<bool>(mask)
+        // & testVBitwise<bool, 2>(mask)
+        // & testVBitwise<bool, 3>(mask)
+        // & testVBitwise<bool, 4>(mask)
+
+#if defined(VK)
+        & test1Bitwise<int8_t>(mask)
+        & testVBitwise<int8_t, 2>(mask)
+        & testVBitwise<int8_t, 3>(mask)
+        & testVBitwise<int8_t, 4>(mask)
+        & test1Bitwise<uint8_t>(mask)
+        & testVBitwise<uint8_t, 2>(mask)
+        & testVBitwise<uint8_t, 3>(mask)
+        & testVBitwise<uint8_t, 4>(mask)
+#endif
+
+#if !defined(CUDA)
+        & test1Bitwise<int16_t>(mask)
+        & testVBitwise<int16_t, 2>(mask)
+        & testVBitwise<int16_t, 3>(mask)
+        & testVBitwise<int16_t, 4>(mask)
+        & test1Bitwise<int64_t>(mask)
+        & testVBitwise<int64_t, 2>(mask)
+        & testVBitwise<int64_t, 3>(mask)
+        & testVBitwise<int64_t, 4>(mask)
+        & test1Bitwise<uint16_t>(mask)
+        & testVBitwise<uint16_t, 2>(mask)
+        & testVBitwise<uint16_t, 3>(mask)
+        & testVBitwise<uint16_t, 4>(mask)
+        & test1Bitwise<uint64_t>(mask)
+        & testVBitwise<uint64_t, 2>(mask)
+        & testVBitwise<uint64_t, 3>(mask)
+        & testVBitwise<uint64_t, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    let isSecondGroup = index >= 15; // lanes 0-14 form the first partition, lanes 15-31 the second
+    let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+    let isLastInvocation = (index == 31); // NOTE(review): not referenced anywhere else in this function
+    let isLastInPartition = (index == 14) || (index == 31);
+    let isFirstInPartition = (index == 0) || (index == 15);
+
+    //
+    // Prefix and.
+    // - Both groups use 1 as input, except for the last invocation in each partition, whose input is 0.
+    // - For inclusive ops, the result is 1 except for the last invocation in each partition.
+    // - For exclusive ops, the first invocation in each partition always results in ~0 (the identity). Otherwise, exclusive ops result in 1.
+    gAndValue = isLastInPartition ? uint(0) : uint(1);
+    gAndResultExclusive = isFirstInPartition ? uint(~0) : uint(1);
+
+    //
+    // Prefix or.
+    // - Both groups use 0 as input, except for the last invocation in each partition, whose input is 1.
+    // - For inclusive ops, the result is 0 except for the last invocation in each partition.
+    // - For exclusive ops, the result is always 0.
+    gOrValue = isLastInPartition ? uint(1) : uint(0);
+
+    // Prefix xor.
+    // - First group input is always 1. Inclusive results alternate between 1 and 0, starting at 1. The exclusive result also alternates but starts at 0 (opposite of the inclusive result).
+    // - Second group input is always 0. Results are all 0.
+    gXorValue = isSecondGroup ? uint(0) : uint(1);
+    gXorResultInclusive = (isSecondGroup || (index % 2 != 0)) ? uint(0) : uint(1);
+    gXorResultExclusive = isSecondGroup ? uint(0) : (uint(1) - gXorResultInclusive);
+
+    bool result = true
+            & testBitwise(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
new file mode 100644
index 000000000..654fd6130
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMax subgroupPartitionedInclusiveMaxNV
+#define __partitionedExclusiveMax subgroupPartitionedExclusiveMaxNV
+#else
+#define __partitionedInclusiveMax WaveMultiPrefixInclusiveMax
+#define __partitionedExclusiveMax WaveMultiPrefixExclusiveMax
+#endif
+
+static bool isFirstInPartition = false;
+static uint gSmaller = 0;
+static uint gLarger = 0;
+static uint gMaxValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask)
+{
+    let smaller = T(gSmaller);
+    let maxValue = T(gMaxValue);
+
+    // Only the last invocation in each partition holds the larger value, so an exclusive prefix max never sees it.
+    bool exclusiveRes = true
+                        & (__partitionedExclusiveMax(maxValue, mask) == smaller)
+                        ;
+    // Skip the exclusive comparison on the first invocation in each partition: its result is the identity value,
+    // which depends on the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & (__partitionedInclusiveMax(maxValue, mask) == maxValue)
+        & exclusiveRes
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) {
+    typealias GVec = vector<T, N>;
+
+    let smaller = GVec(T(gSmaller));
+    let maxValue = GVec(T(gMaxValue));
+
+    // Only the last invocation in each partition holds the larger value, so an exclusive prefix max never sees it.
+    bool exclusiveRes = true
+                        & all(__partitionedExclusiveMax(maxValue, mask) == smaller)
+                        ;
+    // Skip the exclusive comparison on the first invocation in each partition: its result is the identity value,
+    // which depends on the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & all(__partitionedInclusiveMax(maxValue, mask) == maxValue)
+        & exclusiveRes
+        ;
+}
+
+bool testMinMax(uint4 mask)
+{
+    return true
+        & test1MinMax<int>(mask)
+        & testVMinMax<int, 2>(mask)
+        & testVMinMax<int, 3>(mask)
+        & testVMinMax<int, 4>(mask)
+        & test1MinMax<uint>(mask)
+        & testVMinMax<uint, 2>(mask)
+        & testVMinMax<uint, 3>(mask)
+        & testVMinMax<uint, 4>(mask)
+        & test1MinMax<float>(mask)
+        & testVMinMax<float, 2>(mask)
+        & testVMinMax<float, 3>(mask)
+        & testVMinMax<float, 4>(mask)
+        & test1MinMax<double>(mask)
+        & testVMinMax<double, 2>(mask)
+        & testVMinMax<double, 3>(mask)
+        & testVMinMax<double, 4>(mask)
+        & test1MinMax<int8_t>(mask)
+        & testVMinMax<int8_t, 2>(mask)
+        & testVMinMax<int8_t, 3>(mask)
+        & testVMinMax<int8_t, 4>(mask)
+        & test1MinMax<int16_t>(mask)
+        & testVMinMax<int16_t, 2>(mask)
+        & testVMinMax<int16_t, 3>(mask)
+        & testVMinMax<int16_t, 4>(mask)
+        & test1MinMax<int64_t>(mask)
+        & testVMinMax<int64_t, 2>(mask)
+        & testVMinMax<int64_t, 3>(mask)
+        & testVMinMax<int64_t, 4>(mask)
+        & test1MinMax<uint8_t>(mask)
+        & testVMinMax<uint8_t, 2>(mask)
+        & testVMinMax<uint8_t, 3>(mask)
+        & testVMinMax<uint8_t, 4>(mask)
+        & test1MinMax<uint16_t>(mask)
+        & testVMinMax<uint16_t, 2>(mask)
+        & testVMinMax<uint16_t, 3>(mask)
+        & testVMinMax<uint16_t, 4>(mask)
+        & test1MinMax<uint64_t>(mask)
+        & testVMinMax<uint64_t, 2>(mask)
+        & testVMinMax<uint64_t, 3>(mask)
+        & testVMinMax<uint64_t, 4>(mask)
+        & test1MinMax<half>(mask)
+        & testVMinMax<half, 2>(mask)
+        & testVMinMax<half, 3>(mask)
+        & testVMinMax<half, 4>(mask)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    isFirstInPartition = (index == 0) || (index == 15);
+    let isLastInPartition = (index == 14) || (index == 31);
+
+    gSmaller = isSecondGroup ? 2 : 0;
+    gLarger = isSecondGroup ? 3 : 1;
+    gMaxValue = isLastInPartition ? gLarger : gSmaller;
+
+    bool result = true
+            & testMinMax(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
new file mode 100644
index 000000000..68e1e9c05
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+// The run below defines USE_GLSL_SYNTAX, which switches the aliases further down to the GLSL spellings.
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+// Aliases so one test body can call either the subgroupPartitioned*NV names or the WaveMultiPrefix* names.
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMin subgroupPartitionedInclusiveMinNV
+#define __partitionedExclusiveMin subgroupPartitionedExclusiveMinNV
+#else
+#define __partitionedInclusiveMin WaveMultiPrefixInclusiveMin
+#define __partitionedExclusiveMin WaveMultiPrefixExclusiveMin
+#endif
+// Per-lane test state. The helpers read gLarger and gMaxValue; despite its name, gMaxValue is the lane's input to the min scans.
+static bool isFirstInPartition = false;
+static uint gSmaller = 0;
+static uint gLarger = 0;
+static uint gMaxValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1Min(uint4 mask)
+{
+    // Scalar case: this lane contributes T(gMaxValue); T(gLarger) is the expected exclusive result.
+    let laneInput = T(gMaxValue);
+    let expectedExclusive = T(gLarger);
+
+    // Both scans are issued unconditionally by every lane, exclusive first, before any result is inspected.
+    let exclusiveScan = __partitionedExclusiveMin(laneInput, mask);
+    let inclusiveScan = __partitionedInclusiveMin(laneInput, mask);
+
+    // The smaller value is meant to sit on the last lane of each partition, so it never appears in an
+    // exclusive prefix and the exclusive result should be the larger value.
+    // The first lane of a partition is exempt: its exclusive result is the identity value, which depends on
+    // the builtin type `T` (something like T::min or T::max would be needed to spell it).
+    bool exclusiveOk = isFirstInPartition | (exclusiveScan == expectedExclusive);
+    // The inclusive scan is expected to hand every lane back its own input.
+    return true
+        & (inclusiveScan == laneInput)
+        & exclusiveOk
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMin(uint4 mask) {
+    typealias Vec = vector<T, N>;
+
+    // Vector case: every component carries the same per-lane input / expected value as the scalar test.
+    let laneInput = Vec(T(gMaxValue));
+    let expectedExclusive = Vec(T(gLarger));
+
+    // Both scans are issued unconditionally by every lane, exclusive first, before any result is inspected.
+    let exclusiveScan = __partitionedExclusiveMin(laneInput, mask);
+    let inclusiveScan = __partitionedInclusiveMin(laneInput, mask);
+
+    // The smaller value is meant to sit on the last lane of each partition, so exclusive prefixes never see it.
+    // The first lane of a partition is exempt because its exclusive result is the identity value, which
+    // depends on the builtin type `T` (something like T::min or T::max would be needed to spell it).
+    bool exclusiveOk = isFirstInPartition | all(exclusiveScan == expectedExclusive);
+
+    // The inclusive scan is expected to hand every lane back its own input in all components.
+    return true
+        & all(inclusiveScan == laneInput)
+        & exclusiveOk
+        ;
+}
+
+bool testMin(uint4 mask) // Runs the scalar check and the 2/3/4-component vector checks for every listed element type.
+{
+    return true // Results are combined with `&`, so the chain is written without short-circuit operators.
+        & test1Min<int>(mask) // 32-bit signed
+        & testVMin<int, 2>(mask)
+        & testVMin<int, 3>(mask)
+        & testVMin<int, 4>(mask)
+        & test1Min<uint>(mask) // 32-bit unsigned
+        & testVMin<uint, 2>(mask)
+        & testVMin<uint, 3>(mask)
+        & testVMin<uint, 4>(mask)
+        & test1Min<float>(mask) // floating point: float, then double
+        & testVMin<float, 2>(mask)
+        & testVMin<float, 3>(mask)
+        & testVMin<float, 4>(mask)
+        & test1Min<double>(mask)
+        & testVMin<double, 2>(mask)
+        & testVMin<double, 3>(mask)
+        & testVMin<double, 4>(mask)
+        & test1Min<int8_t>(mask) // signed 8/16/64-bit
+        & testVMin<int8_t, 2>(mask)
+        & testVMin<int8_t, 3>(mask)
+        & testVMin<int8_t, 4>(mask)
+        & test1Min<int16_t>(mask)
+        & testVMin<int16_t, 2>(mask)
+        & testVMin<int16_t, 3>(mask)
+        & testVMin<int16_t, 4>(mask)
+        & test1Min<int64_t>(mask)
+        & testVMin<int64_t, 2>(mask)
+        & testVMin<int64_t, 3>(mask)
+        & testVMin<int64_t, 4>(mask)
+        & test1Min<uint8_t>(mask) // unsigned 8/16/64-bit
+        & testVMin<uint8_t, 2>(mask)
+        & testVMin<uint8_t, 3>(mask)
+        & testVMin<uint8_t, 4>(mask)
+        & test1Min<uint16_t>(mask)
+        & testVMin<uint16_t, 2>(mask)
+        & testVMin<uint16_t, 3>(mask)
+        & testVMin<uint16_t, 4>(mask)
+        & test1Min<uint64_t>(mask)
+        & testVMin<uint64_t, 2>(mask)
+        & testVMin<uint64_t, 3>(mask)
+        & testVMin<uint64_t, 4>(mask)
+        & test1Min<half>(mask) // 16-bit floating point
+        & testVMin<half, 2>(mask)
+        & testVMin<half, 3>(mask)
+        & testVMin<half, 4>(mask)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    isFirstInPartition = (index == 0) || (index == 15);
+    let isLastInPartition = (index == 14) || (index == 31);
+
+    // The inputs must be assigned BEFORE testMin() reads them; otherwise every lane scans 0 and the test is vacuous.
+    // gMaxValue is the lane's input: only the last lane of a partition holds the smaller value (see the helpers).
+    gSmaller = isSecondGroup ? 2 : 0;
+    gLarger = isSecondGroup ? 3 : 1;
+    gMaxValue = isLastInPartition ? gSmaller : gLarger;
+
+    bool result = true & testMin(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
index 69240198e..5de34b20a 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
@@ -10,6 +10,7 @@
 RWStructuredBuffer<uint> outputBuffer;
 
 [numthreads(8, 1, 1)]
+[shader("compute")]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
     uint index = int(dispatchThreadID.x);
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
index c80baa5b1..c80baa5b1 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
new file mode 100644
index 000000000..bb641cab1
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
@@ -0,0 +1,136 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DVK
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DVK
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+// The Vulkan runs pass -DVK explicitly (no other option in this file defines it); it gates the 8-bit cases in testSumProduct.
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DVK -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+// Aliases so one test body can call either the subgroupPartitioned*NV names or the WaveMultiPrefix* names.
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveSum subgroupPartitionedInclusiveAddNV
+#define __partitionedInclusiveProduct subgroupPartitionedInclusiveMulNV
+#define __partitionedExclusiveSum subgroupPartitionedExclusiveAddNV
+#define __partitionedExclusiveProduct subgroupPartitionedExclusiveMulNV
+#else
+#define __partitionedInclusiveSum WaveMultiPrefixInclusiveSum
+#define __partitionedInclusiveProduct WaveMultiPrefixInclusiveProduct
+#define __partitionedExclusiveSum WaveMultiPrefixExclusiveSum
+#define __partitionedExclusiveProduct WaveMultiPrefixExclusiveProduct
+#endif
+// Set by computeMain: the lane's 0-based position inside its partition, and the lane's input to the product scans.
+static uint partitionedIndex = 0;
+static uint gProductValue = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask)
+{
+    // Scalar case: the sum scans use a constant 1, the product scans use T(gProductValue); all four run on every lane.
+    let one = T(1);
+    let factor = T(gProductValue);
+    let inclusiveSumOk = __partitionedInclusiveSum(one, mask) == T(partitionedIndex + 1);
+    let inclusiveProductOk = __partitionedInclusiveProduct(factor, mask) == factor;
+    let exclusiveSumOk = __partitionedExclusiveSum(one, mask) == T(partitionedIndex);
+    let exclusiveProductOk = __partitionedExclusiveProduct(factor, mask) == one;
+    return true & inclusiveSumOk & inclusiveProductOk & exclusiveSumOk & exclusiveProductOk;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask) {
+    // Vector case: each component repeats the scalar inputs and expectations; all four scans run on every lane.
+    typealias Vec = vector<T, N>;
+    let ones = Vec(T(1));
+    let factor = Vec(T(gProductValue));
+
+    let inclusiveSumOk = all(__partitionedInclusiveSum(ones, mask) == Vec(T(partitionedIndex + 1)));
+    let inclusiveProductOk = all(__partitionedInclusiveProduct(factor, mask) == factor);
+    let exclusiveSumOk = all(__partitionedExclusiveSum(ones, mask) == Vec(T(partitionedIndex)));
+    let exclusiveProductOk = all(__partitionedExclusiveProduct(factor, mask) == ones);
+    return true & inclusiveSumOk & inclusiveProductOk & exclusiveSumOk & exclusiveProductOk;
+}
+
+bool testSumProduct(uint4 mask) // Runs the scalar check and the 2/3/4-component vector checks for every enabled element type.
+{
+    return true // Results are combined with `&`, so the chain is written without short-circuit operators.
+        & test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask)
+        // The 8-bit cases are compiled only when VK is defined (for example via -xslang -DVK).
+#if defined(VK)
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+#endif
+        // The 16-bit, 64-bit and half cases are skipped when CUDA is defined (the CUDA run passes -DCUDA).
+#if !defined(CUDA)
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    // partitionedIndex is the lane's 0-based position inside its own group.
+    uint4 mask = uint4(0x0007FFF, 0, 0, 0);
+    partitionedIndex = index;
+    if (index >=  15)
+    {
+        mask = uint4(0xFFFF8000, 0, 0, 0);
+        partitionedIndex -= 15;
+    }
+
+    // Only the last lane of each group scans a 0, so exclusive products stay 1 and only that lane's inclusive product is 0.
+    let isLastInPartition = (index == 14) || (index == 31);
+    gProductValue = isLastInPartition ? uint(0) : uint(1);
+
+    bool result = true
+            & testSumProduct(mask)
+            ;
+    // One result per lane: 1 when every comparison in testSumProduct held.
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
index 99698e497..99698e497 100644
--- a/tests/hlsl-intrinsic/wave-multi-prefix.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
new file mode 100644
index 000000000..b40b014f4
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
@@ -0,0 +1,114 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+// The run below defines USE_GLSL_SYNTAX, which switches the aliases further down to the GLSL spellings.
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+// Aliases so one test body can call either the subgroupPartitioned*NV names or the WaveMulti* names.
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedSum subgroupPartitionedAddNV
+#define __partitionedProduct subgroupPartitionedMulNV
+#else
+#define __partitionedSum WaveMultiSum
+#define __partitionedProduct WaveMultiProduct
+#endif
+// Expected partition-wide sum for the current lane; computeMain assigns it before running the checks.
+static uint gSumResult = 0;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask)
+{
+    // Scalar case: every lane contributes 1; the sum is compared with T(gSumResult) and the product with 1.
+    let one = T(1);
+    let expectedSum = T(gSumResult);
+    let sumOk = __partitionedSum(one, mask) == expectedSum;
+    let productOk = __partitionedProduct(one, mask) == one;
+    return true & sumOk & productOk;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask) {
+    // Vector case: each component repeats the scalar input (1) and the scalar expectations.
+    typealias Vec = vector<T, N>;
+    let ones = Vec(T(1));
+    let expectedSum = Vec(T(gSumResult));
+
+    let sumOk = all(__partitionedSum(ones, mask) == expectedSum);
+    let productOk = all(__partitionedProduct(ones, mask) == ones);
+    return true & sumOk & productOk;
+}
+
+bool testSumProduct(uint4 mask) // Runs the scalar check and the 2/3/4-component vector checks for every enabled element type.
+{
+    return true // Results are combined with `&`, so the chain is written without short-circuit operators.
+        & test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask)
+        // The 8-bit, 16-bit, 64-bit and half cases are skipped when CUDA is defined (the CUDA run passes -DCUDA).
+#if !defined(CUDA)
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let lane = dispatchThreadID.x;
+    let inUpperGroup = lane >= 15;
+
+    // Two partitions: lanes 0..14 (15 lanes) use the low mask, lanes 15..31 (17 lanes) use the high mask.
+    let lowerMask = uint4(0x0007FFF, 0, 0, 0);
+    let upperMask = uint4(0xFFFF8000, 0, 0, 0);
+    uint4 mask = inUpperGroup ? upperMask : lowerMask;
+    // Every lane contributes 1, so the expected sum is the lane count of the lane's own partition.
+    gSumResult = inUpperGroup ? uint(17) : uint(15);
+    let passed = testSumProduct(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[lane] = uint(passed);
+}
diff --git a/tests/language-feature/capability/testing-framework-with-profiles.slang b/tests/language-feature/capability/testing-framework-with-profiles.slang
index 215ba887e..97ff32a9d 100644
--- a/tests/language-feature/capability/testing-framework-with-profiles.slang
+++ b/tests/language-feature/capability/testing-framework-with-profiles.slang
@@ -17,5 +17,5 @@ RWStructuredBuffer<uint> outputBuffer;
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
     // BUF: 1
-    outputBuffer[0] = WaveMaskSum(0xFF, 1);
+    outputBuffer[0] = WaveActiveSum(1);
 }
author	Darren Wihandi <65404740+fairywreath@users.noreply.github.com>	2025-05-25 12:58:08 -0400
committer	GitHub <noreply@github.com>	2025-05-25 09:58:08 -0700
commit	0476b57faad96bee61f59f27ddd48c6cb067cfa2 (patch)
tree	d3fe49cd906c29b03b2a840dd2c057ccc331b4f7 /tests
parent	554be7a5f990df19a21db10b4e5dc0285cbe8168 (diff)