From 0476b57faad96bee61f59f27ddd48c6cb067cfa2 Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Sun, 25 May 2025 12:58:08 -0400
Subject: Add full support for SPV_NV_shader_subgroup_partitioned (#7103)

* Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics

* More partitioned intrinsics

* More partitioned intrinsics and cleaned up non-prefixed WaveMask* implementations

* Refactor HLSL WaveMultiPrefix* implementations

* Fix capability atoms

* Clean up implementation

* Add GLSL intrinsics and cleanup

* Add tests

* Fix affected capability test

* Update and fix tests

* Move expected.txt file

* Refactor WaveMask* to call WaveMulti*

* Refactor SPIRV/GLSL preamble code

* Enable emit-via-glsl tests

* Remove wave_multi_prefix capability in favor of subgroup_partitioned

* Update docs

* Update cap atoms doc
---
 .../wave-mask/wave-active-product.slang            |   4 +-
 tests/hlsl-intrinsic/wave-mask/wave-diverge.slang  |   4 +-
 tests/hlsl-intrinsic/wave-mask/wave-matrix.slang   |   4 +-
 .../wave-mask/wave-prefix-product.slang            |   4 +-
 .../hlsl-intrinsic/wave-mask/wave-prefix-sum.slang |   4 +-
 tests/hlsl-intrinsic/wave-mask/wave-vector.slang   |   4 +-
 .../wave-multi-prefix-scalar-functional.slang      |  74 ----------
 ...lti-prefix-scalar-functional.slang.expected.txt |  40 -----
 tests/hlsl-intrinsic/wave-multi-prefix.slang       | 146 ------------------
 .../wave-multi/wave-multi-bitwise.slang            | 139 ++++++++++++++++++
 .../wave-multi/wave-multi-min-max.slang            | 127 ++++++++++++++++
 .../wave-multi/wave-multi-prefix-bitwise.slang     | 163 +++++++++++++++++++++
 .../wave-multi/wave-multi-prefix-max.slang         | 144 ++++++++++++++++++
 .../wave-multi/wave-multi-prefix-min.slang         | 144 ++++++++++++++++++
 .../wave-multi-prefix-scalar-functional.slang      |  75 ++++++++++
 ...lti-prefix-scalar-functional.slang.expected.txt |  40 +++++
 .../wave-multi/wave-multi-prefix-sum-product.slang | 136 +++++++++++++++++
 .../wave-multi/wave-multi-prefix.slang             | 146 ++++++++++++++++++
 .../wave-multi/wave-multi-sum-product.slang        | 114 ++++++++++++++
 .../testing-framework-with-profiles.slang          |   2 +-
 20 files changed, 1241 insertions(+), 273 deletions(-)
 delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang
 delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt
 delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
 create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang

(limited to 'tests')

diff --git a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
index 8a47c5733..da94ad794 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -26,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     const WaveMask mask2 = mask0 & ~mask1;
         
     outputBuffer[idx] = WaveMaskProduct(mask2, idx);
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
index 3dd33f150..3a1c26f8e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -30,4 +30,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     value = WaveMaskMin(mask2, idx + 1);
     
     outputBuffer[idx] = value;
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
index f333a59fb..fb5573bd1 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //DISABLE_TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -37,4 +37,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     matrix<int, 2, 2> r = r0 + matrix<int, 2, 2>(r1) + r6;
    
     outputBuffer[idx] = r[0][0] + r[0][1] + r[1][0] + r[1][1];
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
index b12e9c1b3..e32524b1e 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -25,4 +25,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     
     outputBuffer[idx] = r0 + (r2 << 16);
     
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
index 51e9b7600..2e0fba746 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -23,4 +23,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int r2 = int(r1.x) + int(r1.y) - idx;
     
     outputBuffer[idx] = r0 + (r2 << 16);
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
index b1f44f4fb..7c326e0f3 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang
@@ -1,7 +1,7 @@
 //TEST_CATEGORY(wave-mask, compute)
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device
 //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
 //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
@@ -29,4 +29,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int2 r = r0 + int2(r1) + r2 + r3 + r4;
    
     outputBuffer[idx] = r.x + r.y;
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang
deleted file mode 100644
index 69240198e..000000000
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang
+++ /dev/null
@@ -1,74 +0,0 @@
-//TEST_CATEGORY(wave, compute)
-//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
-//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-
-//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-vk -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
-
-//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
-RWStructuredBuffer<uint> outputBuffer;
-
-[numthreads(8, 1, 1)]
-void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
-{
-    uint index = int(dispatchThreadID.x);
-
-    // Split into two groups.
-    uint4 mask = 0b00001111;
-    if (index >=  4)
-    {
-        mask = 0b11110000;
-    }
-
-    //
-    // WaveMultiPrefixSum.
-    // Results in hex: [0 1 3 7], [0 10 30 70]
-    //
-    uint sumValue = WaveMultiPrefixSum(1 << index, mask);
-    const uint sumBaseIndex = 0;
-    outputBuffer[sumBaseIndex + index] = sumValue;
-
-    //
-    // WaveMultiPrefixProduct.
-    // Results in hex: [1 1 2 8], [1 10 200 8000]
-    //
-    uint productValue = WaveMultiPrefixProduct(1 << index, mask);
-    const uint productBaseIndex = 8;
-    outputBuffer[productBaseIndex + index] = productValue;
-
-    //
-    // WaveMultiPrefixBitAnd.
-    // This prefix operation starts with all bits set.
-    // Results in hex: [FFFFFFFF 1 1 1], [FFFFFFFF F F F]
-    //
-    uint andBits = 0b1;
-    if (index >= 4)
-    {
-        andBits = 0b1111;
-    }
-    uint andValue = WaveMultiPrefixBitAnd(andBits, mask);
-    const uint andBaseIndex = 16;
-    outputBuffer[andBaseIndex + index] = andValue;
-
-    //
-    // WaveMultiPrefixBitOr.
-    // Results in hex: [0 1 3 7], [0 10 30 70]
-    //
-    uint orValue = WaveMultiPrefixBitOr(1 << index, mask);
-    const uint orBaseIndex = 24;
-    outputBuffer[orBaseIndex + index] = orValue;
-
-    //
-    // WaveMultiPrefixBitXor.
-    // Results in hex: [0 1 3 7], [0 F 0 F]
-    //
-    uint xorBits = (1 << index);
-    if (index >= 4)
-    {
-        xorBits = 0b1111;
-    }
-    uint xorValue = WaveMultiPrefixBitXor(xorBits, mask);
-    const uint xorBaseIndex = 32;
-    outputBuffer[xorBaseIndex + index] = xorValue;
-}
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt
deleted file mode 100644
index c80baa5b1..000000000
--- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-0
-1
-3
-7
-0
-10
-30
-70
-1
-1
-2
-8
-1
-10
-200
-8000
-FFFFFFFF
-1
-1
-1
-FFFFFFFF
-F
-F
-F
-0
-1
-3
-7
-0
-10
-30
-70
-0
-1
-3
-7
-0
-F
-0
-F
diff --git a/tests/hlsl-intrinsic/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi-prefix.slang
deleted file mode 100644
index 99698e497..000000000
--- a/tests/hlsl-intrinsic/wave-multi-prefix.slang
+++ /dev/null
@@ -1,146 +0,0 @@
-//TEST:SIMPLE(filecheck=CHECK_SPIRV): -stage compute -entry computeMain -target spirv -DNO_INTEGER_MATRIX
-//TEST:SIMPLE(filecheck=CHECK_GLSL): -stage compute -entry computeMain -target glsl -DNO_INTEGER_MATRIX
-//TEST:SIMPLE(filecheck=CHECK_CUDA): -stage compute -entry computeMain -target cuda
-//TEST:SIMPLE(filecheck=CHECK_HLSL): -stage compute -entry computeMain -target hlsl
-
-//
-// Tests all variants and overloads of WaveMultiPrefix* arithmetic intrinsics.
-//
-
-struct OutputData
-{
-    int scalarSum;
-    int scalarProduct;
-    int scalarBitAnd;
-    int scalarBitOr;
-    int scalarBitXor;
-    int vectorSum;
-    int vectorProduct;
-    int vectorBitAnd;
-    int vectorBitOr;
-    int vectorBitXor;
-    int matrixSum;
-    int matrixProduct;
-    int matrixBitAnd;
-    int matrixBitOr;
-    int matrixBitXor;
-    float floatScalarSum;
-    float floatScalarProduct;
-    float floatVectorSum;
-    float floatVectorProduct;
-    float floatMatrixSum;
-    float floatMatrixProduct;
-};
-
-RWStructuredBuffer<OutputData> outputBuffer;
-
-// CHECK_SPIRV: OpCapability GroupNonUniformPartitionedNV
-// CHECK_SPIRV: OpExtension "SPV_NV_shader_subgroup_partitioned"
-// CHECK_SPIRV: OpGroupNonUniformIAdd{{.*}}PartitionedExclusiveScanNV
-// CHECK_SPIRV: OpGroupNonUniformIMul{{.*}}PartitionedExclusiveScanNV
-// CHECK_SPIRV: OpGroupNonUniformBitwiseAnd{{.*}}PartitionedExclusiveScanNV
-// CHECK_SPIRV: OpGroupNonUniformBitwiseOr{{.*}}PartitionedExclusiveScanNV
-// CHECK_SPIRV: OpGroupNonUniformBitwiseXor{{.*}}PartitionedExclusiveScanNV
-// CHECK_SPIRV: OpGroupNonUniformFAdd{{.*}}PartitionedExclusiveScanNV
-
-// CHECK_GLSL: GL_NV_shader_subgroup_partitioned
-// CHECK_GLSL: subgroupPartitionedExclusiveAddNV
-// CHECK_GLSL: subgroupPartitionedExclusiveMulNV
-// CHECK_GLSL: subgroupPartitionedExclusiveAndNV
-// CHECK_GLSL: subgroupPartitionedExclusiveOrNV
-// CHECK_GLSL: subgroupPartitionedExclusiveXorNV
-
-// CHECK_CUDA: _wavePrefixSum
-// CHECK_CUDA: _wavePrefixProduct
-// CHECK_CUDA: _wavePrefixAnd
-// CHECK_CUDA: _wavePrefixOr
-// CHECK_CUDA: _wavePrefixXor
-// CHECK_CUDA: _wavePrefixSumMultiple
-// CHECK_CUDA: _wavePrefixProductMultiple
-// CHECK_CUDA: _wavePrefixAndMultiple
-// CHECK_CUDA: _wavePrefixOrMultiple
-// CHECK_CUDA: _wavePrefixXorMultiple
-
-// CHECK_HLSL: WaveMultiPrefixSum
-// CHECK_HLSL: WaveMultiPrefixProduct
-// CHECK_HLSL: WaveMultiPrefixBitAnd
-// CHECK_HLSL: WaveMultiPrefixBitOr
-// CHECK_HLSL: WaveMultiPrefixBitXor
-
-
-[numthreads(1, 1, 1)]
-void computeMain(uint3 dTid : SV_DispatchThreadID)
-{
-    int scalarVal = dTid.x;
-    uint4 mask = WaveMatch(scalarVal);
-
-    int scalarSum = WaveMultiPrefixSum(scalarVal, mask);
-    int scalarProduct = WaveMultiPrefixProduct(scalarVal, mask);
-    int scalarBitAnd = WaveMultiPrefixBitAnd(scalarVal, mask);
-    int scalarBitOr = WaveMultiPrefixBitOr(scalarVal, mask);
-    int scalarBitXor = WaveMultiPrefixBitXor(scalarVal, mask);
-
-    int3 vectorVal = int3(dTid.x, dTid.y, dTid.z);
-    int3 vectorSum = WaveMultiPrefixSum(vectorVal, mask);
-    int3 vectorProduct = WaveMultiPrefixProduct(vectorVal, mask);
-    int3 vectorBitAnd = WaveMultiPrefixBitAnd(vectorVal, mask);
-    int3 vectorBitOr = WaveMultiPrefixBitOr(vectorVal, mask);
-    int3 vectorBitXor = WaveMultiPrefixBitXor(vectorVal, mask);
-
-    float floatScalarVal = float(dTid.x) + 0.5f; // Example floating-point scalar value
-    uint4 floatMask = WaveMatch(floatScalarVal); // Create a mask for matching lanes
-
-    float floatScalarSum = WaveMultiPrefixSum(floatScalarVal, floatMask);
-    float floatScalarProduct = WaveMultiPrefixProduct(floatScalarVal, floatMask);
-
-    float3 floatVectorVal = float3(dTid.x, dTid.y, dTid.z) + 0.5f; // Example floating-point vector value
-    float3 floatVectorSum = WaveMultiPrefixSum(floatVectorVal, floatMask);
-    float3 floatVectorProduct = WaveMultiPrefixProduct(floatVectorVal, floatMask);
-
-    OutputData output;
-    output.scalarSum = scalarSum;
-    output.scalarProduct = scalarProduct;
-    output.scalarBitAnd = scalarBitAnd;
-    output.scalarBitOr = scalarBitOr;
-    output.scalarBitXor = scalarBitXor;
-    output.vectorSum = vectorSum.x;
-    output.vectorProduct = vectorProduct.x;
-    output.vectorBitAnd = vectorBitAnd.x;
-    output.vectorBitOr = vectorBitOr.x;
-    output.vectorBitXor = vectorBitXor.x;
-    output.floatScalarSum = floatScalarSum;
-    output.floatScalarProduct = floatScalarProduct;
-    output.floatVectorSum = floatVectorSum.x;
-    output.floatVectorProduct = floatVectorProduct.x;
-
-    float3x3 floatMatrixVal = float3x3(
-        float(dTid.x) + 0.5f, float(dTid.y) + 0.5f, float(dTid.z) + 0.5f,
-        float(dTid.z) + 0.5f, float(dTid.x) + 0.5f, float(dTid.y) + 0.5f,
-        float(dTid.y) + 0.5f, float(dTid.z) + 0.5f, float(dTid.x) + 0.5f
-    );
-    float3x3 floatMatrixSum = WaveMultiPrefixSum(floatMatrixVal, floatMask);
-    float3x3 floatMatrixProduct = WaveMultiPrefixProduct(floatMatrixVal, floatMask);
-    output.floatMatrixSum = floatMatrixSum[0][0];
-    output.floatMatrixProduct = floatMatrixProduct[0][0];
-
-#if !defined(NO_INTEGER_MATRIX)
-    int3x3 matrixVal = int3x3(
-        dTid.x, dTid.y, dTid.z,
-        dTid.z, dTid.x, dTid.y,
-        dTid.y, dTid.z, dTid.x
-    );
-    int3x3 matrixSum = WaveMultiPrefixSum(matrixVal, mask);
-    int3x3 matrixProduct = WaveMultiPrefixProduct(matrixVal, mask);
-    int3x3 matrixBitAnd = WaveMultiPrefixBitAnd(matrixVal, mask);
-    int3x3 matrixBitOr = WaveMultiPrefixBitOr(matrixVal, mask);
-    int3x3 matrixBitXor = WaveMultiPrefixBitXor(matrixVal, mask);
-    output.matrixSum = matrixSum[0][0];
-    output.matrixProduct = matrixProduct[0][0];
-    output.matrixBitAnd = matrixBitAnd[0][0];
-    output.matrixBitOr = matrixBitOr[0][0];
-    output.matrixBitXor = matrixBitXor[0][0];
-#endif
-
-    outputBuffer[dTid.x] = output;
-}
-
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
new file mode 100644
index 000000000..c2a292c14
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang
@@ -0,0 +1,139 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedAnd subgroupPartitionedAndNV
+#define __partitionedOr subgroupPartitionedOrNV
+#define __partitionedXor subgroupPartitionedXorNV
+#else
+#define __partitionedAnd WaveMultiBitAnd
+#define __partitionedOr WaveMultiBitOr
+#define __partitionedXor WaveMultiBitXor
+#endif
+
+static uint gAndValue = 0; // input to the And checks; also used as their expected result
+static uint gOrValue = 0; // input to the Or checks
+static uint gOrResult = 0; // expected result of the Or checks
+static uint gXorValue = 0; // input to the Xor checks
+static uint gXorResult = 0; // expected result of the Xor checks
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask) // Scalar case: compares the partitioned And/Or/Xor of T-typed inputs with the expected values.
+{
+    let andValue = T(gAndValue); // inputs and expected results are the uint globals converted to T
+    let orValue = T(gOrValue);
+    let orResult = T(gOrResult);
+    let xorValue = T(gXorValue);
+    let xorResult = T(gXorResult);
+
+    return true
+        & (__partitionedAnd(andValue, mask) == andValue) // the And result is expected to equal the input itself
+        & (__partitionedOr(orValue, mask) == orResult)
+        & (__partitionedXor(xorValue, mask) == xorResult)
+        ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) { // Vector case: same comparisons as test1Bitwise, on vector<T, N> operands.
+    typealias GVec = vector<T, N>;
+
+    let andValue = GVec(T(gAndValue)); // each operand is a GVec constructed from the single converted global
+    let orValue = GVec(T(gOrValue));
+    let orResult = GVec(T(gOrResult));
+    let xorValue = GVec(T(gXorValue));
+    let xorResult = GVec(T(gXorResult));
+
+    return true
+        & all(__partitionedAnd(andValue, mask) == andValue) // all(): the comparison must hold for every component
+        & all(__partitionedOr(orValue, mask) == orResult)
+        & all(__partitionedXor(xorValue, mask) == xorResult)
+        ;
+}
+
+bool testBitwise(uint4 mask) // Combines the scalar and 2/3/4-component vector checks for every covered integer type.
+{
+    return true
+        & test1Bitwise<int>(mask)
+        & testVBitwise<int, 2>(mask)
+        & testVBitwise<int, 3>(mask)
+        & testVBitwise<int, 4>(mask)
+        & test1Bitwise<uint>(mask)
+        & testVBitwise<uint, 2>(mask)
+        & testVBitwise<uint, 3>(mask)
+        & testVBitwise<uint, 4>(mask)
+
+        // TODO: these are failing SPIRV validation and should be fixed.
+        // SPIRV's ops do not directly accept/return bool.
+        // & test1Bitwise<bool>(mask)
+        // & testVBitwise<bool, 2>(mask)
+        // & testVBitwise<bool, 3>(mask)
+        // & testVBitwise<bool, 4>(mask)
+
+#if !defined(CUDA)
+        & test1Bitwise<int8_t>(mask) // the 8/16/64-bit integer cases below are compiled only when CUDA is not defined
+        & testVBitwise<int8_t, 2>(mask)
+        & testVBitwise<int8_t, 3>(mask)
+        & testVBitwise<int8_t, 4>(mask)
+        & test1Bitwise<int16_t>(mask)
+        & testVBitwise<int16_t, 2>(mask)
+        & testVBitwise<int16_t, 3>(mask)
+        & testVBitwise<int16_t, 4>(mask)
+        & test1Bitwise<int64_t>(mask)
+        & testVBitwise<int64_t, 2>(mask)
+        & testVBitwise<int64_t, 3>(mask)
+        & testVBitwise<int64_t, 4>(mask)
+        & test1Bitwise<uint8_t>(mask)
+        & testVBitwise<uint8_t, 2>(mask)
+        & testVBitwise<uint8_t, 3>(mask)
+        & testVBitwise<uint8_t, 4>(mask)
+        & test1Bitwise<uint16_t>(mask)
+        & testVBitwise<uint16_t, 2>(mask)
+        & testVBitwise<uint16_t, 3>(mask)
+        & testVBitwise<uint16_t, 4>(mask)
+        & test1Bitwise<uint64_t>(mask)
+        & testVBitwise<uint64_t, 2>(mask)
+        & testVBitwise<uint64_t, 3>(mask)
+        & testVBitwise<uint64_t, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    let isSecondGroup = index >= 15; // index 15 and above belongs to the second group
+    let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); // bits 15..31 vs. bits 0..14
+
+    // One invocation in the second group differs from the others to exercise the Or and Xor operations.
+    let isOrSet = (index == 15);
+
+    gAndValue = isSecondGroup ? uint(1) : uint(0); // same value across a group, so the And result equals the input
+    gOrValue = isOrSet ? uint(1) : uint(0);
+    gOrResult = isSecondGroup ? uint(1) : uint(0); // only the second group contains a set value (index 15)
+
+    // Alternate 0s and 1s for xor.
+    gXorValue = (index % 2 == 0) ? uint(0) : uint(1);
+    if (isOrSet)
+    {
+        // This is in second group - disrupt the alternating sequence.
+        gXorValue = uint(0);
+    }
+    gXorResult = isSecondGroup ? uint(0) : uint(1); // first group: 7 ones (1,3,..,13); second: 8 ones (17,19,..,31)
+
+    bool result = true
+            & testBitwise(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
new file mode 100644
index 000000000..419ffecc5
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang
@@ -0,0 +1,127 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedMin subgroupPartitionedMinNV
+#define __partitionedMax subgroupPartitionedMaxNV
+#else
+#define __partitionedMin WaveMultiMin
+#define __partitionedMax WaveMultiMax
+#endif
+
+
+static uint gMinResult = 0; // expected result of the partitioned min
+static uint gMaxResult = 0; // expected result of the partitioned max
+static uint gMinMaxValue = 0; // input value passed to both the min and the max call
+
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask) // Scalar case: compares the partitioned min/max of a T-typed input with the expected values.
+{
+    let minResult = T(gMinResult); // expected results and the input are the uint globals converted to T
+    let maxResult = T(gMaxResult);
+    let minMaxValue = T(gMinMaxValue);
+
+    return true
+        & all(__partitionedMin(minMaxValue, mask) == minResult) // the same input feeds both the min and the max call
+        & all(__partitionedMax(minMaxValue, mask) == maxResult)
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) { // Vector case: same comparisons as test1MinMax, on vector<T, N> operands.
+    typealias GVec = vector<T, N>;
+
+    let minResult = GVec(T(gMinResult)); // each operand is a GVec constructed from the single converted global
+    let maxResult = GVec(T(gMaxResult));
+    let minMaxValue = GVec(T(gMinMaxValue));
+
+    return true
+        & all(__partitionedMin(minMaxValue, mask) == minResult) // all(): the comparison must hold for every component
+        & all(__partitionedMax(minMaxValue, mask) == maxResult)
+        ;
+}
+
+bool testMinMax(uint4 mask) // Combines the scalar and 2/3/4-component vector checks for every covered arithmetic type.
+{
+    return true
+        & test1MinMax<int>(mask)
+        & testVMinMax<int, 2>(mask)
+        & testVMinMax<int, 3>(mask)
+        & testVMinMax<int, 4>(mask)
+        & test1MinMax<uint>(mask)
+        & testVMinMax<uint, 2>(mask)
+        & testVMinMax<uint, 3>(mask)
+        & testVMinMax<uint, 4>(mask)
+        & test1MinMax<float>(mask)
+        & testVMinMax<float, 2>(mask)
+        & testVMinMax<float, 3>(mask)
+        & testVMinMax<float, 4>(mask)
+        & test1MinMax<double>(mask)
+        & testVMinMax<double, 2>(mask)
+        & testVMinMax<double, 3>(mask)
+        & testVMinMax<double, 4>(mask)
+
+#if !defined(CUDA)
+        & test1MinMax<int8_t>(mask) // the 8/16/64-bit integer and half cases below are compiled only when CUDA is not defined
+        & testVMinMax<int8_t, 2>(mask)
+        & testVMinMax<int8_t, 3>(mask)
+        & testVMinMax<int8_t, 4>(mask)
+        & test1MinMax<int16_t>(mask)
+        & testVMinMax<int16_t, 2>(mask)
+        & testVMinMax<int16_t, 3>(mask)
+        & testVMinMax<int16_t, 4>(mask)
+        & test1MinMax<int64_t>(mask)
+        & testVMinMax<int64_t, 2>(mask)
+        & testVMinMax<int64_t, 3>(mask)
+        & testVMinMax<int64_t, 4>(mask)
+        & test1MinMax<uint8_t>(mask)
+        & testVMinMax<uint8_t, 2>(mask)
+        & testVMinMax<uint8_t, 3>(mask)
+        & testVMinMax<uint8_t, 4>(mask)
+        & test1MinMax<uint16_t>(mask)
+        & testVMinMax<uint16_t, 2>(mask)
+        & testVMinMax<uint16_t, 3>(mask)
+        & testVMinMax<uint16_t, 4>(mask)
+        & test1MinMax<uint64_t>(mask)
+        & testVMinMax<uint64_t, 2>(mask)
+        & testVMinMax<uint64_t, 3>(mask)
+        & testVMinMax<uint64_t, 4>(mask)
+        & test1MinMax<half>(mask)
+        & testVMinMax<half, 2>(mask)
+        & testVMinMax<half, 3>(mask)
+        & testVMinMax<half, 4>(mask)
+#endif
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); // bits 15..31 vs. bits 0..14
+
+    // Set min value on one invocation on each partition/mask.
+    let isMinInvocation = (index == 0) || (index == 15);
+
+    gMinResult = isSecondGroup ? uint(2) : uint(0); // expected min: 0 in the first group, 2 in the second
+    gMaxResult = isSecondGroup ? uint(3) : uint(1); // expected max: 1 in the first group, 3 in the second
+    gMinMaxValue = isMinInvocation ? gMinResult : gMaxResult; // indices 0 and 15 contribute the min, all others the max
+
+    bool result = true
+            && testMinMax(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
new file mode 100644
index 000000000..bb1182e5e
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang
@@ -0,0 +1,163 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveAnd subgroupPartitionedInclusiveAndNV
+#define __partitionedInclusiveOr subgroupPartitionedInclusiveOrNV
+#define __partitionedInclusiveXor subgroupPartitionedInclusiveXorNV
+#define __partitionedExclusiveAnd subgroupPartitionedExclusiveAndNV
+#define __partitionedExclusiveOr subgroupPartitionedExclusiveOrNV
+#define __partitionedExclusiveXor subgroupPartitionedExclusiveXorNV
+#else // Map the same six names onto the WaveMultiPrefix* spellings.
+#define __partitionedInclusiveAnd WaveMultiPrefixInclusiveBitAnd
+#define __partitionedInclusiveOr WaveMultiPrefixInclusiveBitOr
+#define __partitionedInclusiveXor WaveMultiPrefixInclusiveBitXor
+#define __partitionedExclusiveAnd WaveMultiPrefixExclusiveBitAnd
+#define __partitionedExclusiveOr WaveMultiPrefixExclusiveBitOr
+#define __partitionedExclusiveXor WaveMultiPrefixExclusiveBitXor
+#endif // USE_GLSL_SYNTAX
+
+
+static uint gAndValue = 0; // Input to the prefix-AND checks; assigned in computeMain.
+static uint gAndResultExclusive = 0; // Expected exclusive prefix-AND result; assigned in computeMain.
+static uint gOrValue = 0; // Input to the prefix-OR checks; assigned in computeMain.
+static uint gOrResult = 0; // Not referenced anywhere else in this test file.
+static uint gXorValue = 0; // Input to the prefix-XOR checks; assigned in computeMain.
+static uint gXorResultInclusive = 0; // Expected inclusive prefix-XOR result; assigned in computeMain.
+static uint gXorResultExclusive = 0; // Expected exclusive prefix-XOR result; assigned in computeMain.
+
+__generic<T : __BuiltinLogicalType>
+bool test1Bitwise(uint4 mask) // Scalar-T check of the six partitioned prefix bitwise ops; true when every comparison holds.
+{
+    let andValue = T(gAndValue);
+    let orValue = T(gOrValue);
+    let xorValue = T(gXorValue);
+
+    return true
+        & (__partitionedInclusiveAnd(andValue, mask) == andValue) // inclusive prefix-AND must equal this lane's own input
+        & (__partitionedExclusiveAnd(andValue, mask) == T(gAndResultExclusive))
+        & (__partitionedInclusiveOr(orValue, mask) == orValue) // inclusive prefix-OR must equal this lane's own input
+        & (__partitionedExclusiveOr(orValue, mask) == T(0)) // exclusive prefix-OR is expected to be 0 on every lane
+        & (__partitionedInclusiveXor(xorValue, mask) == T(gXorResultInclusive))
+        & (__partitionedExclusiveXor(xorValue, mask) == T(gXorResultExclusive))
+        ;
+}
+
+__generic<T : __BuiltinLogicalType, let N : int>
+bool testVBitwise(uint4 mask) { // vector<T, N> counterpart of test1Bitwise; every component must match.
+    typealias GVec = vector<T, N>;
+
+    let andValue = GVec(T(gAndValue)); // each input is the scalar global splatted to all N components
+    let orValue = GVec(T(gOrValue));
+    let xorValue = GVec(T(gXorValue));
+
+    return true
+        & all(__partitionedInclusiveAnd(andValue, mask) == andValue) // inclusive prefix-AND must equal this lane's own input
+        & all(__partitionedExclusiveAnd(andValue, mask) == GVec(T(gAndResultExclusive)))
+        & all(__partitionedInclusiveOr(orValue, mask) == orValue) // inclusive prefix-OR must equal this lane's own input
+        & all(__partitionedExclusiveOr(orValue, mask) == GVec(T(0))) // exclusive prefix-OR is expected to be 0 on every lane
+        & all(__partitionedInclusiveXor(xorValue, mask) == GVec(T(gXorResultInclusive)))
+        & all(__partitionedExclusiveXor(xorValue, mask) == GVec(T(gXorResultExclusive)))
+        ;
+}
+
+bool testBitwise(uint4 mask) // Runs test1Bitwise/testVBitwise for every enabled element type and vector size 2..4.
+{
+    return true
+        & test1Bitwise<int>(mask)
+        & testVBitwise<int, 2>(mask)
+        & testVBitwise<int, 3>(mask)
+        & testVBitwise<int, 4>(mask)
+        & test1Bitwise<uint>(mask)
+        & testVBitwise<uint, 2>(mask)
+        & testVBitwise<uint, 3>(mask)
+        & testVBitwise<uint, 4>(mask)
+
+        // TODO: the bool instantiations below fail SPIR-V validation and should be fixed.
+        // SPIR-V's ops do not directly accept/return bool.
+        // & test1Bitwise<bool>(mask)
+        // & testVBitwise<bool, 2>(mask)
+        // & testVBitwise<bool, 3>(mask)
+        // & testVBitwise<bool, 4>(mask)
+
+#if defined(VK)
+        & test1Bitwise<int8_t>(mask)
+        & testVBitwise<int8_t, 2>(mask)
+        & testVBitwise<int8_t, 3>(mask)
+        & testVBitwise<int8_t, 4>(mask)
+        & test1Bitwise<uint8_t>(mask)
+        & testVBitwise<uint8_t, 2>(mask)
+        & testVBitwise<uint8_t, 3>(mask)
+        & testVBitwise<uint8_t, 4>(mask)
+#endif // VK - NOTE(review): no TEST line in this file passes -DVK; presumably defined by the harness - confirm.
+
+#if !defined(CUDA)
+        & test1Bitwise<int16_t>(mask)
+        & testVBitwise<int16_t, 2>(mask)
+        & testVBitwise<int16_t, 3>(mask)
+        & testVBitwise<int16_t, 4>(mask)
+        & test1Bitwise<int64_t>(mask)
+        & testVBitwise<int64_t, 2>(mask)
+        & testVBitwise<int64_t, 3>(mask)
+        & testVBitwise<int64_t, 4>(mask)
+        & test1Bitwise<uint16_t>(mask)
+        & testVBitwise<uint16_t, 2>(mask)
+        & testVBitwise<uint16_t, 3>(mask)
+        & testVBitwise<uint16_t, 4>(mask)
+        & test1Bitwise<uint64_t>(mask)
+        & testVBitwise<uint64_t, 2>(mask)
+        & testVBitwise<uint64_t, 3>(mask)
+        & testVBitwise<uint64_t, 4>(mask)
+#endif // !CUDA (the -cuda TEST line passes -DCUDA)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two partitions: lanes 0..14 (15 lanes) and lanes 15..31 (17 lanes).
+    let isSecondGroup = index >= 15;
+    let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+    let isLastInPartition = (index == 14) || (index == 31);
+    let isFirstInPartition = (index == 0) || (index == 15);
+
+    //
+    // Prefix and.
+    // - Every lane inputs 1, except the last lane of each partition, which inputs 0.
+    // - Inclusive results equal the lane's own input: 1 everywhere except 0 on the last lane of each partition.
+    // - Exclusive results are 1, except on the first lane of each partition, which gets the identity (~0).
+    gAndValue = isLastInPartition ? uint(0) : uint(1);
+    gAndResultExclusive = isFirstInPartition ? uint(~0) : uint(1); // NOTE(review): 32-bit all-ones; confirm T(...) still equals the identity for 64-bit T.
+
+    //
+    // Prefix or.
+    // - Every lane inputs 0, except the last lane of each partition, which inputs 1.
+    // - Inclusive results equal the lane's own input: 0 everywhere except 1 on the last lane of each partition.
+    // - Exclusive results are always 0.
+    gOrValue = isLastInPartition ? uint(1) : uint(0);
+
+    // Prefix xor.
+    // - First group: every input is 1. Inclusive results alternate 1, 0, 1, ... starting at 1; exclusive results alternate too but start at 0 (the opposite of the inclusive result).
+    // - Second group: every input is 0, so all results are 0.
+    gXorValue = isSecondGroup ? uint(0) : uint(1);
+    gXorResultInclusive = (isSecondGroup || (index % 2 != 0)) ? uint(0) : uint(1);
+    gXorResultExclusive = isSecondGroup ? uint(0) : (uint(1) - gXorResultInclusive);
+
+    bool result = true
+            & testBitwise(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
new file mode 100644
index 000000000..654fd6130
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMax subgroupPartitionedInclusiveMaxNV
+#define __partitionedExclusiveMax subgroupPartitionedExclusiveMaxNV
+#else // Map the same two names onto the WaveMultiPrefix* spellings.
+#define __partitionedInclusiveMax WaveMultiPrefixInclusiveMax
+#define __partitionedExclusiveMax WaveMultiPrefixExclusiveMax
+#endif // USE_GLSL_SYNTAX
+
+static bool isFirstInPartition = false; // Assigned in computeMain; true on lanes 0 and 15.
+static uint gSmaller = 0; // Smaller of the two input values; also the expected exclusive prefix-max result.
+static uint gLarger = 0; // Larger of the two input values; only read inside computeMain.
+static uint gMaxValue = 0; // This lane's input to the prefix-max ops; assigned in computeMain.
+
+__generic<T : __BuiltinArithmeticType>
+bool test1MinMax(uint4 mask) // Scalar-T check of the partitioned inclusive and exclusive prefix max.
+{
+    let smaller = T(gSmaller);
+    let maxValue = T(gMaxValue);
+
+    // The larger values are placed on the last lane of each partition, so exclusive variants never see them.
+    bool exclusiveRes = true
+                        & (__partitionedExclusiveMax(maxValue, mask) == smaller)
+                        ;
+    // Do not check the exclusive prefix on the first invocation of a partition: its value (the identity value) depends on
+    // the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & (__partitionedInclusiveMax(maxValue, mask) == maxValue) // inclusive prefix max must equal this lane's own input
+        & exclusiveRes
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMinMax(uint4 mask) { // vector<T, N> counterpart of test1MinMax; every component must match.
+    typealias GVec = vector<T, N>;
+
+    let smaller = GVec(T(gSmaller)); // scalar globals splatted to all N components
+    let maxValue = GVec(T(gMaxValue));
+
+    // The larger values are placed on the last lane of each partition, so exclusive variants never see them.
+    bool exclusiveRes = true
+                        & all(__partitionedExclusiveMax(maxValue, mask) == smaller)
+                        ;
+    // Do not check the exclusive prefix on the first invocation of a partition: its value (the identity value) depends on
+    // the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & all(__partitionedInclusiveMax(maxValue, mask) == maxValue) // inclusive prefix max must equal this lane's own input
+        & exclusiveRes
+        ;
+}
+
+bool testMinMax(uint4 mask) // Runs test1MinMax/testVMinMax for every listed element type and vector size 2..4.
+{
+    return true // No per-target guards here: every TEST line in this file targets -vk.
+        & test1MinMax<int>(mask)
+        & testVMinMax<int, 2>(mask)
+        & testVMinMax<int, 3>(mask)
+        & testVMinMax<int, 4>(mask)
+        & test1MinMax<uint>(mask)
+        & testVMinMax<uint, 2>(mask)
+        & testVMinMax<uint, 3>(mask)
+        & testVMinMax<uint, 4>(mask)
+        & test1MinMax<float>(mask)
+        & testVMinMax<float, 2>(mask)
+        & testVMinMax<float, 3>(mask)
+        & testVMinMax<float, 4>(mask)
+        & test1MinMax<double>(mask)
+        & testVMinMax<double, 2>(mask)
+        & testVMinMax<double, 3>(mask)
+        & testVMinMax<double, 4>(mask)
+        & test1MinMax<int8_t>(mask)
+        & testVMinMax<int8_t, 2>(mask)
+        & testVMinMax<int8_t, 3>(mask)
+        & testVMinMax<int8_t, 4>(mask)
+        & test1MinMax<int16_t>(mask)
+        & testVMinMax<int16_t, 2>(mask)
+        & testVMinMax<int16_t, 3>(mask)
+        & testVMinMax<int16_t, 4>(mask)
+        & test1MinMax<int64_t>(mask)
+        & testVMinMax<int64_t, 2>(mask)
+        & testVMinMax<int64_t, 3>(mask)
+        & testVMinMax<int64_t, 4>(mask)
+        & test1MinMax<uint8_t>(mask)
+        & testVMinMax<uint8_t, 2>(mask)
+        & testVMinMax<uint8_t, 3>(mask)
+        & testVMinMax<uint8_t, 4>(mask)
+        & test1MinMax<uint16_t>(mask)
+        & testVMinMax<uint16_t, 2>(mask)
+        & testVMinMax<uint16_t, 3>(mask)
+        & testVMinMax<uint16_t, 4>(mask)
+        & test1MinMax<uint64_t>(mask)
+        & testVMinMax<uint64_t, 2>(mask)
+        & testVMinMax<uint64_t, 3>(mask)
+        & testVMinMax<uint64_t, 4>(mask)
+        & test1MinMax<half>(mask)
+        & testVMinMax<half, 2>(mask)
+        & testVMinMax<half, 3>(mask)
+        & testVMinMax<half, 4>(mask)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); // bits 15..31 vs. bits 0..14
+
+    isFirstInPartition = (index == 0) || (index == 15);
+    let isLastInPartition = (index == 14) || (index == 31);
+
+    gSmaller = isSecondGroup ? 2 : 0;
+    gLarger = isSecondGroup ? 3 : 1;
+    gMaxValue = isLastInPartition ? gLarger : gSmaller; // only the last lane of each partition inputs the larger value
+
+    bool result = true
+            & testMinMax(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
new file mode 100644
index 000000000..68e1e9c05
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
@@ -0,0 +1,144 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveMin subgroupPartitionedInclusiveMinNV
+#define __partitionedExclusiveMin subgroupPartitionedExclusiveMinNV
+#else // Map the same two names onto the WaveMultiPrefix* spellings.
+#define __partitionedInclusiveMin WaveMultiPrefixInclusiveMin
+#define __partitionedExclusiveMin WaveMultiPrefixExclusiveMin
+#endif // USE_GLSL_SYNTAX
+
+static bool isFirstInPartition = false; // Assigned in computeMain; true on lanes 0 and 15.
+static uint gSmaller = 0; // Smaller of the two input values; only referenced inside computeMain.
+static uint gLarger = 0; // Larger of the two input values; also the expected exclusive prefix-min result.
+static uint gMaxValue = 0; // Despite the name, this is the lane's input to the prefix-min ops.
+
+__generic<T : __BuiltinArithmeticType>
+bool test1Min(uint4 mask) // Scalar-T check of the partitioned inclusive and exclusive prefix min.
+{
+    let larger = T(gLarger);
+    let minValue = T(gMaxValue); // gMaxValue carries the min-op input despite its name
+
+    // The exclusive result is compared against `larger`, so a smaller input may only sit on the last lane of a partition.
+    bool exclusiveRes = true
+                        & (__partitionedExclusiveMin(minValue, mask) == larger)
+                        ;
+    // Do not check the exclusive prefix on the first invocation of a partition: its value (the identity value) depends on
+    // the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & (__partitionedInclusiveMin(minValue, mask) == minValue) // inclusive prefix min must equal this lane's own input
+        & exclusiveRes
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVMin(uint4 mask) { // vector<T, N> counterpart of test1Min; every component must match.
+    typealias GVec = vector<T, N>;
+
+    let larger = GVec(T(gLarger)); // scalar globals splatted to all N components
+    let minValue = GVec(T(gMaxValue)); // gMaxValue carries the min-op input despite its name
+
+    // The exclusive result is compared against `larger`, so a smaller input may only sit on the last lane of a partition.
+    bool exclusiveRes = true
+                        & all(__partitionedExclusiveMin(minValue, mask) == larger)
+                        ;
+    // Do not check the exclusive prefix on the first invocation of a partition: its value (the identity value) depends on
+    // the builtin type `T`. It would be nice to have something like T::min or T::max.
+    if (isFirstInPartition)
+    {
+        exclusiveRes = true;
+    }
+
+    return true
+        & all(__partitionedInclusiveMin(minValue, mask) == minValue) // inclusive prefix min must equal this lane's own input
+        & exclusiveRes
+        ;
+}
+
+bool testMin(uint4 mask) // Runs test1Min/testVMin for every listed element type and vector size 2..4.
+{
+    return true // No per-target guards here: every TEST line in this file targets -vk.
+        & test1Min<int>(mask)
+        & testVMin<int, 2>(mask)
+        & testVMin<int, 3>(mask)
+        & testVMin<int, 4>(mask)
+        & test1Min<uint>(mask)
+        & testVMin<uint, 2>(mask)
+        & testVMin<uint, 3>(mask)
+        & testVMin<uint, 4>(mask)
+        & test1Min<float>(mask)
+        & testVMin<float, 2>(mask)
+        & testVMin<float, 3>(mask)
+        & testVMin<float, 4>(mask)
+        & test1Min<double>(mask)
+        & testVMin<double, 2>(mask)
+        & testVMin<double, 3>(mask)
+        & testVMin<double, 4>(mask)
+        & test1Min<int8_t>(mask)
+        & testVMin<int8_t, 2>(mask)
+        & testVMin<int8_t, 3>(mask)
+        & testVMin<int8_t, 4>(mask)
+        & test1Min<int16_t>(mask)
+        & testVMin<int16_t, 2>(mask)
+        & testVMin<int16_t, 3>(mask)
+        & testVMin<int16_t, 4>(mask)
+        & test1Min<int64_t>(mask)
+        & testVMin<int64_t, 2>(mask)
+        & testVMin<int64_t, 3>(mask)
+        & testVMin<int64_t, 4>(mask)
+        & test1Min<uint8_t>(mask)
+        & testVMin<uint8_t, 2>(mask)
+        & testVMin<uint8_t, 3>(mask)
+        & testVMin<uint8_t, 4>(mask)
+        & test1Min<uint16_t>(mask)
+        & testVMin<uint16_t, 2>(mask)
+        & testVMin<uint16_t, 3>(mask)
+        & testVMin<uint16_t, 4>(mask)
+        & test1Min<uint64_t>(mask)
+        & testVMin<uint64_t, 2>(mask)
+        & testVMin<uint64_t, 3>(mask)
+        & testVMin<uint64_t, 4>(mask)
+        & test1Min<half>(mask)
+        & testVMin<half, 2>(mask)
+        & testVMin<half, 3>(mask)
+        & testVMin<half, 4>(mask)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+[MaximallyReconverges]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let index = dispatchThreadID.x;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    let isSecondGroup = index >= 15;
+    uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0);
+
+    isFirstInPartition = (index == 0) || (index == 15);
+    let isLastInPartition = (index == 14) || (index == 31);
+
+    // The inputs must be assigned before testMin runs. Only the last lane of each partition holds the smaller value,
+    // so no exclusive prefix ever sees it. `gMaxValue` is the per-lane min-op input despite its name.
+    gSmaller = isSecondGroup ? 2 : 0;
+    gLarger = isSecondGroup ? 3 : 1;
+    gMaxValue = isLastInPartition ? gSmaller : gLarger;
+
+    bool result = true & testMin(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
new file mode 100644
index 000000000..5de34b20a
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
@@ -0,0 +1,75 @@
+//TEST_CATEGORY(wave, compute)
+//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
+//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
+
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-vk -compute -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(8, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint index = int(dispatchThreadID.x);
+
+    // Split into two groups: lanes 0..3 use mask 0b00001111, lanes 4..7 use mask 0b11110000.
+    uint4 mask = 0b00001111; // NOTE(review): scalar initializer; presumably splatted to all four components - confirm.
+    if (index >=  4)
+    {
+        mask = 0b11110000;
+    }
+
+    //
+    // WaveMultiPrefixSum.
+    // Results in hex: [0 1 3 7], [0 10 30 70]
+    //
+    uint sumValue = WaveMultiPrefixSum(1 << index, mask);
+    const uint sumBaseIndex = 0; // results land in outputBuffer[0..7]
+    outputBuffer[sumBaseIndex + index] = sumValue;
+
+    //
+    // WaveMultiPrefixProduct.
+    // Results in hex: [1 1 2 8], [1 10 200 8000]
+    //
+    uint productValue = WaveMultiPrefixProduct(1 << index, mask);
+    const uint productBaseIndex = 8; // results land in outputBuffer[8..15]
+    outputBuffer[productBaseIndex + index] = productValue;
+
+    //
+    // WaveMultiPrefixBitAnd.
+    // This prefix operation starts with all bits set.
+    // Results in hex: [FFFFFFFF 1 1 1], [FFFFFFFF F F F]
+    //
+    uint andBits = 0b1; // first group inputs 0b1, second group inputs 0b1111
+    if (index >= 4)
+    {
+        andBits = 0b1111;
+    }
+    uint andValue = WaveMultiPrefixBitAnd(andBits, mask);
+    const uint andBaseIndex = 16; // results land in outputBuffer[16..23]
+    outputBuffer[andBaseIndex + index] = andValue;
+
+    //
+    // WaveMultiPrefixBitOr.
+    // Results in hex: [0 1 3 7], [0 10 30 70]
+    //
+    uint orValue = WaveMultiPrefixBitOr(1 << index, mask);
+    const uint orBaseIndex = 24; // results land in outputBuffer[24..31]
+    outputBuffer[orBaseIndex + index] = orValue;
+
+    //
+    // WaveMultiPrefixBitXor.
+    // Results in hex: [0 1 3 7], [0 F 0 F]
+    //
+    uint xorBits = (1 << index); // first group inputs one distinct bit per lane, second group inputs 0b1111
+    if (index >= 4)
+    {
+        xorBits = 0b1111;
+    }
+    uint xorValue = WaveMultiPrefixBitXor(xorBits, mask);
+    const uint xorBaseIndex = 32; // results land in outputBuffer[32..39]
+    outputBuffer[xorBaseIndex + index] = xorValue;
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
new file mode 100644
index 000000000..c80baa5b1
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt
@@ -0,0 +1,40 @@
+0
+1
+3
+7
+0
+10
+30
+70
+1
+1
+2
+8
+1
+10
+200
+8000
+FFFFFFFF
+1
+1
+1
+FFFFFFFF
+F
+F
+F
+0
+1
+3
+7
+0
+10
+30
+70
+0
+1
+3
+7
+0
+F
+0
+F
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
new file mode 100644
index 000000000..bb641cab1
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang
@@ -0,0 +1,136 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedInclusiveSum subgroupPartitionedInclusiveAddNV
+#define __partitionedInclusiveProduct subgroupPartitionedInclusiveMulNV
+#define __partitionedExclusiveSum subgroupPartitionedExclusiveAddNV
+#define __partitionedExclusiveProduct subgroupPartitionedExclusiveMulNV
+#else // Map the same four names onto the WaveMultiPrefix* spellings.
+#define __partitionedInclusiveSum WaveMultiPrefixInclusiveSum
+#define __partitionedInclusiveProduct WaveMultiPrefixInclusiveProduct
+#define __partitionedExclusiveSum WaveMultiPrefixExclusiveSum
+#define __partitionedExclusiveProduct WaveMultiPrefixExclusiveProduct
+#endif // USE_GLSL_SYNTAX
+
+static uint partitionedIndex = 0; // Lane index counted from the first lane of its partition; assigned in computeMain.
+static uint gProductValue = 0; // This lane's input to the prefix-product ops; assigned in computeMain.
+
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask) // Scalar-T check of the partitioned inclusive/exclusive prefix sum and product.
+{
+    let productValue = T(gProductValue);
+
+    return true
+        & (__partitionedInclusiveSum(T(1), mask) == T(partitionedIndex + 1)) // summing 1s: inclusive count is index-in-partition + 1
+        & (__partitionedInclusiveProduct(productValue, mask) == productValue) // inclusive product must equal this lane's own input
+        & (__partitionedExclusiveSum(T(1), mask) == T(partitionedIndex)) // summing 1s: exclusive count is index-in-partition
+        & (__partitionedExclusiveProduct(productValue, mask) == T(1)) // exclusive product is expected to be 1 on every lane
+        ;
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask) { // vector<T, N> counterpart of test1SumProduct; every component must match.
+    typealias GVec = vector<T, N>;
+
+    let productValue = GVec(T(gProductValue)); // scalar global splatted to all N components
+
+    return true
+        & all(__partitionedInclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex + 1))) // inclusive count is index-in-partition + 1
+        & all(__partitionedInclusiveProduct(productValue, mask) == productValue) // inclusive product must equal this lane's own input
+        & all(__partitionedExclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex))) // exclusive count is index-in-partition
+        & all(__partitionedExclusiveProduct(productValue, mask) == GVec(T(1))) // exclusive product is expected to be 1 on every lane
+        ;
+}
+
+bool testSumProduct(uint4 mask) // Runs test1SumProduct/testVSumProduct for every enabled element type and vector size 2..4.
+{
+    return true
+        & test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask)
+
+#if defined(VK)
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+#endif // VK - NOTE(review): no TEST line in this file passes -DVK; presumably defined by the harness - confirm.
+
+#if !defined(CUDA)
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask)
+#endif // !CUDA (the -cuda TEST line passes -DCUDA)
+        ;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint index = dispatchThreadID.x;
+    partitionedIndex = index;
+
+    // Split into two groups, first group has 15 invocations/lanes and second group has 17.
+    uint4 mask = uint4(0x0007FFF, 0, 0, 0);
+    if (index >=  15)
+    {
+        // Second group: rebase the lane index so it counts from the first lane of the group.
+        mask = uint4(0xFFFF8000, 0, 0, 0);
+        partitionedIndex -= 15;
+    }
+
+    // The only zero factor sits on the last lane of each partition, so no exclusive product ever includes it.
+    let isLastInPartition = (index == 14) || (index == 31);
+    gProductValue = isLastInPartition ? uint(0) : uint(1);
+
+    bool result = true
+            & testSumProduct(mask)
+            ;
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[index] = uint(result);
+}
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
new file mode 100644
index 000000000..99698e497
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang
@@ -0,0 +1,146 @@
+//TEST:SIMPLE(filecheck=CHECK_SPIRV): -stage compute -entry computeMain -target spirv -DNO_INTEGER_MATRIX
+//TEST:SIMPLE(filecheck=CHECK_GLSL): -stage compute -entry computeMain -target glsl -DNO_INTEGER_MATRIX
+//TEST:SIMPLE(filecheck=CHECK_CUDA): -stage compute -entry computeMain -target cuda
+//TEST:SIMPLE(filecheck=CHECK_HLSL): -stage compute -entry computeMain -target hlsl
+
+//
+// Tests all variants and overloads of WaveMultiPrefix* arithmetic intrinsics.
+//
+
+struct OutputData // One record per thread; filled in and stored to outputBuffer by computeMain.
+{
+    int scalarSum;
+    int scalarProduct;
+    int scalarBitAnd;
+    int scalarBitOr;
+    int scalarBitXor;
+    int vectorSum; // the vector* fields keep only the .x component of the int3 result
+    int vectorProduct;
+    int vectorBitAnd;
+    int vectorBitOr;
+    int vectorBitXor;
+    int matrixSum; // the matrix* fields keep only element [0][0]; assigned only when NO_INTEGER_MATRIX is not defined
+    int matrixProduct;
+    int matrixBitAnd;
+    int matrixBitOr;
+    int matrixBitXor;
+    float floatScalarSum;
+    float floatScalarProduct;
+    float floatVectorSum; // .x component of the float3 result
+    float floatVectorProduct;
+    float floatMatrixSum; // element [0][0] of the float3x3 result
+    float floatMatrixProduct;
+};
+
+RWStructuredBuffer<OutputData> outputBuffer;
+
+// CHECK_SPIRV: OpCapability GroupNonUniformPartitionedNV
+// CHECK_SPIRV: OpExtension "SPV_NV_shader_subgroup_partitioned"
+// CHECK_SPIRV: OpGroupNonUniformIAdd{{.*}}PartitionedExclusiveScanNV
+// CHECK_SPIRV: OpGroupNonUniformIMul{{.*}}PartitionedExclusiveScanNV
+// CHECK_SPIRV: OpGroupNonUniformBitwiseAnd{{.*}}PartitionedExclusiveScanNV
+// CHECK_SPIRV: OpGroupNonUniformBitwiseOr{{.*}}PartitionedExclusiveScanNV
+// CHECK_SPIRV: OpGroupNonUniformBitwiseXor{{.*}}PartitionedExclusiveScanNV
+// CHECK_SPIRV: OpGroupNonUniformFAdd{{.*}}PartitionedExclusiveScanNV
+
+// CHECK_GLSL: GL_NV_shader_subgroup_partitioned
+// CHECK_GLSL: subgroupPartitionedExclusiveAddNV
+// CHECK_GLSL: subgroupPartitionedExclusiveMulNV
+// CHECK_GLSL: subgroupPartitionedExclusiveAndNV
+// CHECK_GLSL: subgroupPartitionedExclusiveOrNV
+// CHECK_GLSL: subgroupPartitionedExclusiveXorNV
+
+// CHECK_CUDA: _wavePrefixSum
+// CHECK_CUDA: _wavePrefixProduct
+// CHECK_CUDA: _wavePrefixAnd
+// CHECK_CUDA: _wavePrefixOr
+// CHECK_CUDA: _wavePrefixXor
+// CHECK_CUDA: _wavePrefixSumMultiple
+// CHECK_CUDA: _wavePrefixProductMultiple
+// CHECK_CUDA: _wavePrefixAndMultiple
+// CHECK_CUDA: _wavePrefixOrMultiple
+// CHECK_CUDA: _wavePrefixXorMultiple
+
+// CHECK_HLSL: WaveMultiPrefixSum
+// CHECK_HLSL: WaveMultiPrefixProduct
+// CHECK_HLSL: WaveMultiPrefixBitAnd
+// CHECK_HLSL: WaveMultiPrefixBitOr
+// CHECK_HLSL: WaveMultiPrefixBitXor
+
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dTid : SV_DispatchThreadID)
+{
+    int scalarVal = dTid.x;
+    uint4 mask = WaveMatch(scalarVal); // partition mask reused by all integer scalar/vector/matrix calls below
+
+    int scalarSum = WaveMultiPrefixSum(scalarVal, mask);
+    int scalarProduct = WaveMultiPrefixProduct(scalarVal, mask);
+    int scalarBitAnd = WaveMultiPrefixBitAnd(scalarVal, mask);
+    int scalarBitOr = WaveMultiPrefixBitOr(scalarVal, mask);
+    int scalarBitXor = WaveMultiPrefixBitXor(scalarVal, mask);
+
+    int3 vectorVal = int3(dTid.x, dTid.y, dTid.z);
+    int3 vectorSum = WaveMultiPrefixSum(vectorVal, mask);
+    int3 vectorProduct = WaveMultiPrefixProduct(vectorVal, mask);
+    int3 vectorBitAnd = WaveMultiPrefixBitAnd(vectorVal, mask);
+    int3 vectorBitOr = WaveMultiPrefixBitOr(vectorVal, mask);
+    int3 vectorBitXor = WaveMultiPrefixBitXor(vectorVal, mask);
+
+    float floatScalarVal = float(dTid.x) + 0.5f; // Example floating-point scalar value
+    uint4 floatMask = WaveMatch(floatScalarVal); // Partition mask reused by all floating-point calls below
+
+    float floatScalarSum = WaveMultiPrefixSum(floatScalarVal, floatMask);
+    float floatScalarProduct = WaveMultiPrefixProduct(floatScalarVal, floatMask);
+
+    float3 floatVectorVal = float3(dTid.x, dTid.y, dTid.z) + 0.5f; // Example floating-point vector value
+    float3 floatVectorSum = WaveMultiPrefixSum(floatVectorVal, floatMask);
+    float3 floatVectorProduct = WaveMultiPrefixProduct(floatVectorVal, floatMask);
+
+    OutputData output; // NOTE(review): the int matrix* fields are never assigned when NO_INTEGER_MATRIX is defined.
+    output.scalarSum = scalarSum;
+    output.scalarProduct = scalarProduct;
+    output.scalarBitAnd = scalarBitAnd;
+    output.scalarBitOr = scalarBitOr;
+    output.scalarBitXor = scalarBitXor;
+    output.vectorSum = vectorSum.x;
+    output.vectorProduct = vectorProduct.x;
+    output.vectorBitAnd = vectorBitAnd.x;
+    output.vectorBitOr = vectorBitOr.x;
+    output.vectorBitXor = vectorBitXor.x;
+    output.floatScalarSum = floatScalarSum;
+    output.floatScalarProduct = floatScalarProduct;
+    output.floatVectorSum = floatVectorSum.x;
+    output.floatVectorProduct = floatVectorProduct.x;
+
+    float3x3 floatMatrixVal = float3x3(
+        float(dTid.x) + 0.5f, float(dTid.y) + 0.5f, float(dTid.z) + 0.5f,
+        float(dTid.z) + 0.5f, float(dTid.x) + 0.5f, float(dTid.y) + 0.5f,
+        float(dTid.y) + 0.5f, float(dTid.z) + 0.5f, float(dTid.x) + 0.5f
+    );
+    float3x3 floatMatrixSum = WaveMultiPrefixSum(floatMatrixVal, floatMask);
+    float3x3 floatMatrixProduct = WaveMultiPrefixProduct(floatMatrixVal, floatMask);
+    output.floatMatrixSum = floatMatrixSum[0][0];
+    output.floatMatrixProduct = floatMatrixProduct[0][0];
+
+#if !defined(NO_INTEGER_MATRIX)
+    int3x3 matrixVal = int3x3(
+        dTid.x, dTid.y, dTid.z,
+        dTid.z, dTid.x, dTid.y,
+        dTid.y, dTid.z, dTid.x
+    );
+    int3x3 matrixSum = WaveMultiPrefixSum(matrixVal, mask);
+    int3x3 matrixProduct = WaveMultiPrefixProduct(matrixVal, mask);
+    int3x3 matrixBitAnd = WaveMultiPrefixBitAnd(matrixVal, mask);
+    int3x3 matrixBitOr = WaveMultiPrefixBitOr(matrixVal, mask);
+    int3x3 matrixBitXor = WaveMultiPrefixBitXor(matrixVal, mask);
+    output.matrixSum = matrixSum[0][0];
+    output.matrixProduct = matrixProduct[0][0];
+    output.matrixBitAnd = matrixBitAnd[0][0];
+    output.matrixBitOr = matrixBitOr[0][0];
+    output.matrixBitXor = matrixBitXor[0][0];
+#endif // !NO_INTEGER_MATRIX (the spirv and glsl TEST lines define it)
+
+    outputBuffer[dTid.x] = output;
+}
+
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
new file mode 100644
index 000000000..b40b014f4
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang
@@ -0,0 +1,114 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute  -shaderobj -xslang -DCUDA
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+// Pick the intrinsic spelling under test: the GLSL names when USE_GLSL_SYNTAX is defined, the WaveMulti* names otherwise.
+#if defined(USE_GLSL_SYNTAX)
+#define __partitionedSum subgroupPartitionedAddNV
+#define __partitionedProduct subgroupPartitionedMulNV
+#else
+#define __partitionedSum WaveMultiSum
+#define __partitionedProduct WaveMultiProduct
+#endif
+// Sum the checks expect from __partitionedSum; computeMain assigns it before calling testSumProduct.
+static uint gSumResult = 0;
+
+// Scalar check: adding T(1) across the partition must yield gSumResult, and multiplying T(1) must yield T(1).
+__generic<T : __BuiltinArithmeticType>
+bool test1SumProduct(uint4 mask)
+{
+    let one = T(1);
+    let expectedSum = T(gSumResult);
+    let sumOk = __partitionedSum(one, mask) == expectedSum;
+    let productOk = __partitionedProduct(one, mask) == one;
+    return true & sumOk & productOk;
+}
+
+// Vector check: every component of the partitioned sum of ones must equal gSumResult, and every
+// component of the partitioned product of ones must equal one.
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVSumProduct(uint4 mask)
+{
+    typealias Vec = vector<T, N>;
+    let ones = Vec(T(1));
+    let sumOk = all(__partitionedSum(ones, mask) == Vec(T(gSumResult)));
+    let productOk = all(__partitionedProduct(ones, mask) == ones);
+    return true & sumOk & productOk;
+}
+
+// Runs the scalar and 2/3/4-component vector checks for every element type (the #if block is skipped when CUDA is defined).
+bool testSumProduct(uint4 mask)
+{
+    bool allPassed = test1SumProduct<int>(mask)
+        & testVSumProduct<int, 2>(mask)
+        & testVSumProduct<int, 3>(mask)
+        & testVSumProduct<int, 4>(mask)
+        & test1SumProduct<uint>(mask)
+        & testVSumProduct<uint, 2>(mask)
+        & testVSumProduct<uint, 3>(mask)
+        & testVSumProduct<uint, 4>(mask)
+        & test1SumProduct<float>(mask)
+        & testVSumProduct<float, 2>(mask)
+        & testVSumProduct<float, 3>(mask)
+        & testVSumProduct<float, 4>(mask)
+        & test1SumProduct<double>(mask)
+        & testVSumProduct<double, 2>(mask)
+        & testVSumProduct<double, 3>(mask)
+        & testVSumProduct<double, 4>(mask);
+#if !defined(CUDA)
+    allPassed = allPassed
+        & test1SumProduct<int8_t>(mask)
+        & testVSumProduct<int8_t, 2>(mask)
+        & testVSumProduct<int8_t, 3>(mask)
+        & testVSumProduct<int8_t, 4>(mask)
+        & test1SumProduct<int16_t>(mask)
+        & testVSumProduct<int16_t, 2>(mask)
+        & testVSumProduct<int16_t, 3>(mask)
+        & testVSumProduct<int16_t, 4>(mask)
+        & test1SumProduct<int64_t>(mask)
+        & testVSumProduct<int64_t, 2>(mask)
+        & testVSumProduct<int64_t, 3>(mask)
+        & testVSumProduct<int64_t, 4>(mask)
+        & test1SumProduct<uint8_t>(mask)
+        & testVSumProduct<uint8_t, 2>(mask)
+        & testVSumProduct<uint8_t, 3>(mask)
+        & testVSumProduct<uint8_t, 4>(mask)
+        & test1SumProduct<uint16_t>(mask)
+        & testVSumProduct<uint16_t, 2>(mask)
+        & testVSumProduct<uint16_t, 3>(mask)
+        & testVSumProduct<uint16_t, 4>(mask)
+        & test1SumProduct<uint64_t>(mask)
+        & testVSumProduct<uint64_t, 2>(mask)
+        & testVSumProduct<uint64_t, 3>(mask)
+        & testVSumProduct<uint64_t, 4>(mask)
+        & test1SumProduct<half>(mask)
+        & testVSumProduct<half, 2>(mask)
+        & testVSumProduct<half, 3>(mask)
+        & testVSumProduct<half, 4>(mask);
+#endif
+    return allPassed;
+}
+
+[numthreads(32, 1, 1)]
+[shader("compute")]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    let invocation = dispatchThreadID.x;
+    // Invocations 0-14 use the low 15 mask bits; invocations 15-31 use the upper 17 mask bits.
+    // gSumResult holds the size of this invocation's group, i.e. the sum expected from adding ones.
+    let inUpperGroup = invocation >= 15;
+    let lowerMask = uint4(0x00007FFF, 0, 0, 0);
+    let upperMask = uint4(0xFFFF8000, 0, 0, 0);
+    uint4 mask = inUpperGroup ? upperMask : lowerMask;
+    gSumResult = inUpperGroup ? uint(17) : uint(15);
+
+    let passed = true & testSumProduct(mask);
+
+    // CHECK-COUNT-32: 1
+    outputBuffer[invocation] = uint(passed);
+}
diff --git a/tests/language-feature/capability/testing-framework-with-profiles.slang b/tests/language-feature/capability/testing-framework-with-profiles.slang
index 215ba887e..97ff32a9d 100644
--- a/tests/language-feature/capability/testing-framework-with-profiles.slang
+++ b/tests/language-feature/capability/testing-framework-with-profiles.slang
@@ -17,5 +17,5 @@ RWStructuredBuffer<uint> outputBuffer;
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
     // BUF: 1
-    outputBuffer[0] = WaveMaskSum(0xFF, 1);
+    outputBuffer[0] = WaveActiveSum(1);
 }
-- 
cgit v1.2.3