From 0476b57faad96bee61f59f27ddd48c6cb067cfa2 Mon Sep 17 00:00:00 2001 From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com> Date: Sun, 25 May 2025 12:58:08 -0400 Subject: Add full support for SPV_NV_shader_subgroup_partitioned (#7103) * Properly implement WaveMask* variants of WaveMultiPrefix* intrinsics * More partitioned intrinsics * More partitioned intrinsics and cleaned up non-prefixed WaveMask* implementations * Refactor HLSL WaveMultiPrefix* implementations * fix cap atoms * Clean up implementation * Add GLSL intrinsics and cleanup * Add tests * Fix affected capability test * Update and fix tests * Move expected.txt file * Refactor WaveMask* to call WaveMulti* * Refactor SPIRV/GLSL preamble code * Enable emit-via-glsl tests * remove wave_multi_prefix capability in favor of subgroup_partitioned * Update docs * Update cap atoms doc --- .../wave-mask/wave-active-product.slang | 4 +- tests/hlsl-intrinsic/wave-mask/wave-diverge.slang | 4 +- tests/hlsl-intrinsic/wave-mask/wave-matrix.slang | 4 +- .../wave-mask/wave-prefix-product.slang | 4 +- .../hlsl-intrinsic/wave-mask/wave-prefix-sum.slang | 4 +- tests/hlsl-intrinsic/wave-mask/wave-vector.slang | 4 +- .../wave-multi-prefix-scalar-functional.slang | 74 ---------- ...lti-prefix-scalar-functional.slang.expected.txt | 40 ----- tests/hlsl-intrinsic/wave-multi-prefix.slang | 146 ------------------ .../wave-multi/wave-multi-bitwise.slang | 139 ++++++++++++++++++ .../wave-multi/wave-multi-min-max.slang | 127 ++++++++++++++++ .../wave-multi/wave-multi-prefix-bitwise.slang | 163 +++++++++++++++++++++ .../wave-multi/wave-multi-prefix-max.slang | 144 ++++++++++++++++++ .../wave-multi/wave-multi-prefix-min.slang | 144 ++++++++++++++++++ .../wave-multi-prefix-scalar-functional.slang | 75 ++++++++++ ...lti-prefix-scalar-functional.slang.expected.txt | 40 +++++ .../wave-multi/wave-multi-prefix-sum-product.slang | 136 +++++++++++++++++ .../wave-multi/wave-multi-prefix.slang | 146 ++++++++++++++++++ 
.../wave-multi/wave-multi-sum-product.slang | 114 ++++++++++++++ .../testing-framework-with-profiles.slang | 2 +- 20 files changed, 1241 insertions(+), 273 deletions(-) delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt delete mode 100644 tests/hlsl-intrinsic/wave-multi-prefix.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang create mode 100644 tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang (limited to 'tests') diff --git a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang index 8a47c5733..da94ad794 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-active-product.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute 
-shaderobj @@ -26,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) const WaveMask mask2 = mask0 & ~mask1; outputBuffer[idx] = WaveMaskProduct(mask2, idx); -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang index 3dd33f150..3a1c26f8e 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-diverge.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -30,4 +30,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) value = WaveMaskMin(mask2, idx + 1); outputBuffer[idx] = value; -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang index f333a59fb..fb5573bd1 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-matrix.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //DISABLE_TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -37,4 +37,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) matrix r = r0 + matrix(r1) 
+ r6; outputBuffer[idx] = r[0][0] + r[0][1] + r[1][0] + r[1][1]; -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang index b12e9c1b3..e32524b1e 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-product.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -25,4 +25,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) outputBuffer[idx] = r0 + (r2 << 16); -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang index 51e9b7600..2e0fba746 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-prefix-sum.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -23,4 +23,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) int 
r2 = int(r1.x) + int(r1.y) - idx; outputBuffer[idx] = r0 + (r2 << 16); -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang index b1f44f4fb..7c326e0f3 100644 --- a/tests/hlsl-intrinsic/wave-mask/wave-vector.slang +++ b/tests/hlsl-intrinsic/wave-mask/wave-vector.slang @@ -1,7 +1,7 @@ //TEST_CATEGORY(wave-mask, compute) //DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj //DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_5 -shaderobj -render-feature hardware-device //TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device //TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj @@ -29,4 +29,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) int2 r = r0 + int2(r1) + r2 + r3 + r4; outputBuffer[idx] = r.x + r.y; -} \ No newline at end of file +} diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang deleted file mode 100644 index 69240198e..000000000 --- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang +++ /dev/null @@ -1,74 +0,0 @@ -//TEST_CATEGORY(wave, compute) -//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj - -//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj -//TEST:COMPARE_COMPUTE_EX:-vk -compute -shaderobj -//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj - -//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer -RWStructuredBuffer outputBuffer; - -[numthreads(8, 1, 1)] -void computeMain(uint3 
dispatchThreadID : SV_DispatchThreadID) -{ - uint index = int(dispatchThreadID.x); - - // Split into two groups. - uint4 mask = 0b00001111; - if (index >= 4) - { - mask = 0b11110000; - } - - // - // WaveMultiPrefixSum. - // Results in hex: [0 1 3 7], [0 10 30 70] - // - uint sumValue = WaveMultiPrefixSum(1 << index, mask); - const uint sumBaseIndex = 0; - outputBuffer[sumBaseIndex + index] = sumValue; - - // - // WaveMultiPrefixProduct. - // Results in hex: [1 1 2 8], [1 10 200 8000] - // - uint productValue = WaveMultiPrefixProduct(1 << index, mask); - const uint productBaseIndex = 8; - outputBuffer[productBaseIndex + index] = productValue; - - // - // WaveMultiPrefixBitAnd. - // This prefix operation starts with all bits set. - // Results in hex: [FFFFFFFF 1 1 1], [FFFFFFFF F F F] - // - uint andBits = 0b1; - if (index >= 4) - { - andBits = 0b1111; - } - uint andValue = WaveMultiPrefixBitAnd(andBits, mask); - const uint andBaseIndex = 16; - outputBuffer[andBaseIndex + index] = andValue; - - // - // WaveMultiPrefixBitOr. - // Results in hex: [0 1 3 7], [0 10 30 70] - // - uint orValue = WaveMultiPrefixBitOr(1 << index, mask); - const uint orBaseIndex = 24; - outputBuffer[orBaseIndex + index] = orValue; - - // - // WaveMultiPrefixBitXor. 
- // Results in hex: [0 1 3 7], [0 F 0 F] - // - uint xorBits = (1 << index); - if (index >= 4) - { - xorBits = 0b1111; - } - uint xorValue = WaveMultiPrefixBitXor(xorBits, mask); - const uint xorBaseIndex = 32; - outputBuffer[xorBaseIndex + index] = xorValue; -} diff --git a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt deleted file mode 100644 index c80baa5b1..000000000 --- a/tests/hlsl-intrinsic/wave-multi-prefix-scalar-functional.slang.expected.txt +++ /dev/null @@ -1,40 +0,0 @@ -0 -1 -3 -7 -0 -10 -30 -70 -1 -1 -2 -8 -1 -10 -200 -8000 -FFFFFFFF -1 -1 -1 -FFFFFFFF -F -F -F -0 -1 -3 -7 -0 -10 -30 -70 -0 -1 -3 -7 -0 -F -0 -F diff --git a/tests/hlsl-intrinsic/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi-prefix.slang deleted file mode 100644 index 99698e497..000000000 --- a/tests/hlsl-intrinsic/wave-multi-prefix.slang +++ /dev/null @@ -1,146 +0,0 @@ -//TEST:SIMPLE(filecheck=CHECK_SPIRV): -stage compute -entry computeMain -target spirv -DNO_INTEGER_MATRIX -//TEST:SIMPLE(filecheck=CHECK_GLSL): -stage compute -entry computeMain -target glsl -DNO_INTEGER_MATRIX -//TEST:SIMPLE(filecheck=CHECK_CUDA): -stage compute -entry computeMain -target cuda -//TEST:SIMPLE(filecheck=CHECK_HLSL): -stage compute -entry computeMain -target hlsl - -// -// Tests all variants and overloads of WaveMultiPrefix* arithmetic intrinsics. 
-// - -struct OutputData -{ - int scalarSum; - int scalarProduct; - int scalarBitAnd; - int scalarBitOr; - int scalarBitXor; - int vectorSum; - int vectorProduct; - int vectorBitAnd; - int vectorBitOr; - int vectorBitXor; - int matrixSum; - int matrixProduct; - int matrixBitAnd; - int matrixBitOr; - int matrixBitXor; - float floatScalarSum; - float floatScalarProduct; - float floatVectorSum; - float floatVectorProduct; - float floatMatrixSum; - float floatMatrixProduct; -}; - -RWStructuredBuffer outputBuffer; - -// CHECK_SPIRV: OpCapability GroupNonUniformPartitionedNV -// CHECK_SPIRV: OpExtension "SPV_NV_shader_subgroup_partitioned" -// CHECK_SPIRV: OpGroupNonUniformIAdd{{.*}}PartitionedExclusiveScanNV -// CHECK_SPIRV: OpGroupNonUniformIMul{{.*}}PartitionedExclusiveScanNV -// CHECK_SPIRV: OpGroupNonUniformBitwiseAnd{{.*}}PartitionedExclusiveScanNV -// CHECK_SPIRV: OpGroupNonUniformBitwiseOr{{.*}}PartitionedExclusiveScanNV -// CHECK_SPIRV: OpGroupNonUniformBitwiseXor{{.*}}PartitionedExclusiveScanNV -// CHECK_SPIRV: OpGroupNonUniformFAdd{{.*}}PartitionedExclusiveScanNV - -// CHECK_GLSL: GL_NV_shader_subgroup_partitioned -// CHECK_GLSL: subgroupPartitionedExclusiveAddNV -// CHECK_GLSL: subgroupPartitionedExclusiveMulNV -// CHECK_GLSL: subgroupPartitionedExclusiveAndNV -// CHECK_GLSL: subgroupPartitionedExclusiveOrNV -// CHECK_GLSL: subgroupPartitionedExclusiveXorNV - -// CHECK_CUDA: _wavePrefixSum -// CHECK_CUDA: _wavePrefixProduct -// CHECK_CUDA: _wavePrefixAnd -// CHECK_CUDA: _wavePrefixOr -// CHECK_CUDA: _wavePrefixXor -// CHECK_CUDA: _wavePrefixSumMultiple -// CHECK_CUDA: _wavePrefixProductMultiple -// CHECK_CUDA: _wavePrefixAndMultiple -// CHECK_CUDA: _wavePrefixOrMultiple -// CHECK_CUDA: _wavePrefixXorMultiple - -// CHECK_HLSL: WaveMultiPrefixSum -// CHECK_HLSL: WaveMultiPrefixProduct -// CHECK_HLSL: WaveMultiPrefixBitAnd -// CHECK_HLSL: WaveMultiPrefixBitOr -// CHECK_HLSL: WaveMultiPrefixBitXor - - -[numthreads(1, 1, 1)] -void computeMain(uint3 dTid : 
SV_DispatchThreadID) -{ - int scalarVal = dTid.x; - uint4 mask = WaveMatch(scalarVal); - - int scalarSum = WaveMultiPrefixSum(scalarVal, mask); - int scalarProduct = WaveMultiPrefixProduct(scalarVal, mask); - int scalarBitAnd = WaveMultiPrefixBitAnd(scalarVal, mask); - int scalarBitOr = WaveMultiPrefixBitOr(scalarVal, mask); - int scalarBitXor = WaveMultiPrefixBitXor(scalarVal, mask); - - int3 vectorVal = int3(dTid.x, dTid.y, dTid.z); - int3 vectorSum = WaveMultiPrefixSum(vectorVal, mask); - int3 vectorProduct = WaveMultiPrefixProduct(vectorVal, mask); - int3 vectorBitAnd = WaveMultiPrefixBitAnd(vectorVal, mask); - int3 vectorBitOr = WaveMultiPrefixBitOr(vectorVal, mask); - int3 vectorBitXor = WaveMultiPrefixBitXor(vectorVal, mask); - - float floatScalarVal = float(dTid.x) + 0.5f; // Example floating-point scalar value - uint4 floatMask = WaveMatch(floatScalarVal); // Create a mask for matching lanes - - float floatScalarSum = WaveMultiPrefixSum(floatScalarVal, floatMask); - float floatScalarProduct = WaveMultiPrefixProduct(floatScalarVal, floatMask); - - float3 floatVectorVal = float3(dTid.x, dTid.y, dTid.z) + 0.5f; // Example floating-point vector value - float3 floatVectorSum = WaveMultiPrefixSum(floatVectorVal, floatMask); - float3 floatVectorProduct = WaveMultiPrefixProduct(floatVectorVal, floatMask); - - OutputData output; - output.scalarSum = scalarSum; - output.scalarProduct = scalarProduct; - output.scalarBitAnd = scalarBitAnd; - output.scalarBitOr = scalarBitOr; - output.scalarBitXor = scalarBitXor; - output.vectorSum = vectorSum.x; - output.vectorProduct = vectorProduct.x; - output.vectorBitAnd = vectorBitAnd.x; - output.vectorBitOr = vectorBitOr.x; - output.vectorBitXor = vectorBitXor.x; - output.floatScalarSum = floatScalarSum; - output.floatScalarProduct = floatScalarProduct; - output.floatVectorSum = floatVectorSum.x; - output.floatVectorProduct = floatVectorProduct.x; - - float3x3 floatMatrixVal = float3x3( - float(dTid.x) + 0.5f, float(dTid.y) + 
0.5f, float(dTid.z) + 0.5f, - float(dTid.z) + 0.5f, float(dTid.x) + 0.5f, float(dTid.y) + 0.5f, - float(dTid.y) + 0.5f, float(dTid.z) + 0.5f, float(dTid.x) + 0.5f - ); - float3x3 floatMatrixSum = WaveMultiPrefixSum(floatMatrixVal, floatMask); - float3x3 floatMatrixProduct = WaveMultiPrefixProduct(floatMatrixVal, floatMask); - output.floatMatrixSum = floatMatrixSum[0][0]; - output.floatMatrixProduct = floatMatrixProduct[0][0]; - -#if !defined(NO_INTEGER_MATRIX) - int3x3 matrixVal = int3x3( - dTid.x, dTid.y, dTid.z, - dTid.z, dTid.x, dTid.y, - dTid.y, dTid.z, dTid.x - ); - int3x3 matrixSum = WaveMultiPrefixSum(matrixVal, mask); - int3x3 matrixProduct = WaveMultiPrefixProduct(matrixVal, mask); - int3x3 matrixBitAnd = WaveMultiPrefixBitAnd(matrixVal, mask); - int3x3 matrixBitOr = WaveMultiPrefixBitOr(matrixVal, mask); - int3x3 matrixBitXor = WaveMultiPrefixBitXor(matrixVal, mask); - output.matrixSum = matrixSum[0][0]; - output.matrixProduct = matrixProduct[0][0]; - output.matrixBitAnd = matrixBitAnd[0][0]; - output.matrixBitOr = matrixBitOr[0][0]; - output.matrixBitXor = matrixBitXor[0][0]; -#endif - - outputBuffer[dTid.x] = output; -} - diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang new file mode 100644 index 000000000..c2a292c14 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-bitwise.slang @@ -0,0 +1,139 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name 
outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedAnd subgroupPartitionedAndNV +#define __partitionedOr subgroupPartitionedOrNV +#define __partitionedXor subgroupPartitionedXorNV +#else +#define __partitionedAnd WaveMultiBitAnd +#define __partitionedOr WaveMultiBitOr +#define __partitionedXor WaveMultiBitXor +#endif + +static uint gAndValue = 0; +static uint gOrValue = 0; +static uint gOrResult = 0; +static uint gXorValue = 0; +static uint gXorResult = 0; + +__generic +bool test1Bitwise(uint4 mask) +{ + let andValue = T(gAndValue); + let orValue = T(gOrValue); + let orResult = T(gOrResult); + let xorValue = T(gXorValue); + let xorResult = T(gXorResult); + + return true + & (__partitionedAnd(andValue, mask) == andValue) + & (__partitionedOr(orValue, mask) == orResult) + & (__partitionedXor(xorValue, mask) == xorResult) + ; +} + +__generic +bool testVBitwise(uint4 mask) { + typealias GVec = vector; + + let andValue = GVec(T(gAndValue)); + let orValue = GVec(T(gOrValue)); + let orResult = GVec(T(gOrResult)); + let xorValue = GVec(T(gXorValue)); + let xorResult = GVec(T(gXorResult)); + + return true + & all(__partitionedAnd(andValue, mask) == andValue) + & all(__partitionedOr(orValue, mask) == orResult) + & all(__partitionedXor(xorValue, mask) == xorResult) + ; +} + +bool testBitwise(uint4 mask) +{ + return true + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + + // TODO: these are failing SPIRV validation and should be fixed. + // SPIRV's ops do not directly accept/return bool. 
+ // & test1Bitwise(mask) + // & testVBitwise(mask) + // & testVBitwise(mask) + // & testVBitwise(mask) + +#if !defined(CUDA) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + let isSecondGroup = index >= 15; + let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + // One invocation in second group is different from others to test or and xor operations. + let isOrSet = (index == 15); + + gAndValue = isSecondGroup ? uint(1) : uint(0); + gOrValue = isOrSet ? uint(1) : uint(0); + gOrResult = isSecondGroup ? uint(1) : uint(0); + + // Alternate 0s and 1s for xor. + gXorValue = (index % 2 == 0) ? uint(0) : uint(1); + if (isOrSet) + { + // This is in second group - disrupt the alternating sequence. + gXorValue = uint(0); + } + gXorResult = isSecondGroup ? 
uint(0) : uint(1); + + bool result = true + & testBitwise(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang new file mode 100644 index 000000000..419ffecc5 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-min-max.slang @@ -0,0 +1,127 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedMin subgroupPartitionedMinNV +#define __partitionedMax subgroupPartitionedMaxNV +#else +#define __partitionedMin WaveMultiMin +#define __partitionedMax WaveMultiMax +#endif + + +static uint gMinResult = 0; +static uint gMaxResult = 0; +static uint gMinMaxValue = 0; + +__generic +bool test1MinMax(uint4 mask) +{ + let minResult = T(gMinResult); + let maxResult = T(gMaxResult); + let minMaxValue = T(gMinMaxValue); + + return true + & all(__partitionedMin(minMaxValue, mask) == minResult) + & all(__partitionedMax(minMaxValue, mask) == maxResult) + ; +} + +__generic +bool testVMinMax(uint4 mask) { + typealias GVec = vector; + + let minResult = GVec(T(gMinResult)); + let maxResult = GVec(T(gMaxResult)); + let minMaxValue = GVec(T(gMinMaxValue)); + + return true + & all(__partitionedMin(minMaxValue, mask) == minResult) + & all(__partitionedMax(minMaxValue, mask) == maxResult) + ; +} + +bool testMinMax(uint4 mask) +{ + 
return true + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + +#if !defined(CUDA) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + // Set min value on one invocation on each partition/mask. + let isMinInvocation = (index == 0) || (index == 15); + + gMinResult = isSecondGroup ? uint(2) : uint(0); + gMaxResult = isSecondGroup ? uint(3) : uint(1); + gMinMaxValue = isMinInvocation ? 
gMinResult : gMaxResult; + + bool result = true + && testMinMax(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang new file mode 100644 index 000000000..bb1182e5e --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-bitwise.slang @@ -0,0 +1,163 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveAnd subgroupPartitionedInclusiveAndNV +#define __partitionedInclusiveOr subgroupPartitionedInclusiveOrNV +#define __partitionedInclusiveXor subgroupPartitionedInclusiveXorNV +#define __partitionedExclusiveAnd subgroupPartitionedExclusiveAndNV +#define __partitionedExclusiveOr subgroupPartitionedExclusiveOrNV +#define __partitionedExclusiveXor subgroupPartitionedExclusiveXorNV +#else +#define __partitionedInclusiveAnd WaveMultiPrefixInclusiveBitAnd +#define __partitionedInclusiveOr WaveMultiPrefixInclusiveBitOr +#define __partitionedInclusiveXor WaveMultiPrefixInclusiveBitXor +#define __partitionedExclusiveAnd WaveMultiPrefixExclusiveBitAnd +#define __partitionedExclusiveOr WaveMultiPrefixExclusiveBitOr +#define __partitionedExclusiveXor WaveMultiPrefixExclusiveBitXor 
+#endif + + +static uint gAndValue = 0; +static uint gAndResultExclusive = 0; +static uint gOrValue = 0; +static uint gOrResult = 0; +static uint gXorValue = 0; +static uint gXorResultInclusive = 0; +static uint gXorResultExclusive = 0; + +__generic +bool test1Bitwise(uint4 mask) +{ + let andValue = T(gAndValue); + let orValue = T(gOrValue); + let xorValue = T(gXorValue); + + return true + & (__partitionedInclusiveAnd(andValue, mask) == andValue) + & (__partitionedExclusiveAnd(andValue, mask) == T(gAndResultExclusive)) + & (__partitionedInclusiveOr(orValue, mask) == orValue) + & (__partitionedExclusiveOr(orValue, mask) == T(0)) + & (__partitionedInclusiveXor(xorValue, mask) == T(gXorResultInclusive)) + & (__partitionedExclusiveXor(xorValue, mask) == T(gXorResultExclusive)) + ; +} + +__generic +bool testVBitwise(uint4 mask) { + typealias GVec = vector; + + let andValue = GVec(T(gAndValue)); + let orValue = GVec(T(gOrValue)); + let xorValue = GVec(T(gXorValue)); + + return true + & all(__partitionedInclusiveAnd(andValue, mask) == andValue) + & all(__partitionedExclusiveAnd(andValue, mask) == GVec(T(gAndResultExclusive))) + & all(__partitionedInclusiveOr(orValue, mask) == orValue) + & all(__partitionedExclusiveOr(orValue, mask) == GVec(T(0))) + & all(__partitionedInclusiveXor(xorValue, mask) == GVec(T(gXorResultInclusive))) + & all(__partitionedExclusiveXor(xorValue, mask) == GVec(T(gXorResultExclusive))) + ; +} + +bool testBitwise(uint4 mask) +{ + return true + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + + // TODO: these are failing SPIRV validation and should be fixed. + // SPIRV's ops do not directly accept/return bool. 
+ // & test1Bitwise(mask) + // & testVBitwise(mask) + // & testVBitwise(mask) + // & testVBitwise(mask) + +#if defined(VK) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) +#endif + +#if !defined(CUDA) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & test1Bitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) + & testVBitwise(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + let isSecondGroup = index >= 15; + let mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + let isLastInvocation = (index == 31); + let isLastInPartition = (index == 14) || (index == 31); + let isFirstInPartition = (index == 0) || (index == 15); + + // + // Prefix and. + // - Both groups use 1 except for the last invocation in each partition where input is 0. + // - For inclusive ops, result is 1 except for last invocation in each partition. + // - For exclusive ops, first in partition is always results to ~0(identity). Otherwise exclusive ops result to 1. + gAndValue = isLastInPartition ? uint(0) : uint(1); + gAndResultExclusive = isFirstInPartition ? uint(~0) : uint(1); + + // + // Prefix or. + // - Both groups use 0 except for the last invocation in each partition where input is 1. + // - For inclusive ops, result is 0 except for last invocation in each partition. + // - For exclusive ops, result is always 0. + gOrValue = isLastInPartition ? uint(1) : uint(0); + + // Prefix xor. + // - First group input is always 1. Inclusive results alternate between 1 and 0, starting at 1. 
Exclusive result is also alternates but starts at 0 (opposite of inclusive result). + // - Second group is always 0. Results are all 0. + gXorValue = isSecondGroup ? uint(0) : uint(1); + gXorResultInclusive = (isSecondGroup || (index % 2 != 0)) ? uint(0) : uint(1); + gXorResultExclusive = isSecondGroup ? uint(0) : (uint(1) - gXorResultInclusive); + + bool result = true + & testBitwise(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang new file mode 100644 index 000000000..654fd6130 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang @@ -0,0 +1,144 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveMax subgroupPartitionedInclusiveMaxNV +#define __partitionedExclusiveMax subgroupPartitionedExclusiveMaxNV +#else +#define __partitionedInclusiveMax WaveMultiPrefixInclusiveMax +#define __partitionedExclusiveMax WaveMultiPrefixExclusiveMax +#endif + +static bool isFirstInPartition = false; +static uint gSmaller = 0; +static uint gLarger = 0; +static uint gMaxValue = 0; + +__generic +bool test1MinMax(uint4 mask) +{ + let smaller = T(gSmaller); + let maxValue = T(gMaxValue); + + // The larger values are set to be the last in the partition, exclusive variants will never get these values. 
+ bool exclusiveRes = true + & (__partitionedExclusiveMax(maxValue, mask) == smaller) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. + if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & (__partitionedInclusiveMax(maxValue, mask) == maxValue) + & exclusiveRes + ; +} + +__generic +bool testVMinMax(uint4 mask) { + typealias GVec = vector; + + let smaller = GVec(T(gSmaller)); + let maxValue = GVec(T(gMaxValue)); + + // The larger values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & all(__partitionedExclusiveMax(maxValue, mask) == smaller) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. 
+ if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & all(__partitionedInclusiveMax(maxValue, mask) == maxValue) + & exclusiveRes; + ; +} + +bool testMinMax(uint4 mask) +{ + return true + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & test1MinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + & testVMinMax(mask) + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +[MaximallyReconverges] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + isFirstInPartition = (index == 0) || (index == 15); + let isLastInPartition = (index == 14) || (index == 31); + + gSmaller = isSecondGroup ? 2 : 0; + gLarger = isSecondGroup ? 3 : 1; + gMaxValue = isLastInPartition ? 
gLarger : gSmaller; + + bool result = true + & testMinMax(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang new file mode 100644 index 000000000..68e1e9c05 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang @@ -0,0 +1,144 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveMin subgroupPartitionedInclusiveMinNV +#define __partitionedExclusiveMin subgroupPartitionedExclusiveMinNV +#else +#define __partitionedInclusiveMin WaveMultiPrefixInclusiveMin +#define __partitionedExclusiveMin WaveMultiPrefixExclusiveMin +#endif + +static bool isFirstInPartition = false; +static uint gSmaller = 0; +static uint gLarger = 0; +static uint gMaxValue = 0; + +__generic +bool test1Min(uint4 mask) +{ + let larger = T(gLarger); + let minValue = T(gMaxValue); + + // The smaller values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & (__partitionedExclusiveMin(minValue, mask) == larger) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. 
+ if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & (__partitionedInclusiveMin(minValue, mask) == minValue) + & exclusiveRes + ; +} + +__generic +bool testVMin(uint4 mask) { + typealias GVec = vector; + + let larger = GVec(T(gLarger)); + let minValue = GVec(T(gMaxValue)); + + // The smaller values are set to be the last in the partition, exclusive variants will never get these values. + bool exclusiveRes = true + & all(__partitionedExclusiveMin(minValue, mask) == larger) + ; + // Do not check exclusive prefix for the first invocation in partition as their values(identity values) depend on the builtin type `T`. It would be + // nice to have something like T::min or T::max. + if (isFirstInPartition) + { + exclusiveRes = true; + } + + return true + & all(__partitionedInclusiveMin(minValue, mask) == minValue) + & exclusiveRes + ; +} + +bool testMin(uint4 mask) +{ + return true + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + & test1Min(mask) + & testVMin(mask) + & testVMin(mask) + & testVMin(mask) + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +[MaximallyReconverges] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + let index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and
second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + + isFirstInPartition = (index == 0) || (index == 15); + let isLastInPartition = (index == 14) || (index == 31); + + gSmaller = isSecondGroup ? 2 : 0; // Inputs must be assigned before testMin() reads them. + gLarger = isSecondGroup ? 3 : 1; + gMaxValue = isLastInPartition ? gSmaller : gLarger; // Smaller value goes last so exclusive scans never include it. + + bool result = true + & testMin(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang new file mode 100644 index 000000000..5de34b20a --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang @@ -0,0 +1,75 @@ +//TEST_CATEGORY(wave, compute) +//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj +//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj + +//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj +//TEST:COMPARE_COMPUTE_EX:-vk -compute -shaderobj +//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +[numthreads(8, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint index = int(dispatchThreadID.x); + + // Split into two groups. + uint4 mask = 0b00001111; + if (index >= 4) + { + mask = 0b11110000; + } + + // + // WaveMultiPrefixSum. + // Results in hex: [0 1 3 7], [0 10 30 70] + // + uint sumValue = WaveMultiPrefixSum(1 << index, mask); + const uint sumBaseIndex = 0; + outputBuffer[sumBaseIndex + index] = sumValue; + + // + // WaveMultiPrefixProduct. 
+ // Results in hex: [1 1 2 8], [1 10 200 8000] + // + uint productValue = WaveMultiPrefixProduct(1 << index, mask); + const uint productBaseIndex = 8; + outputBuffer[productBaseIndex + index] = productValue; + + // + // WaveMultiPrefixBitAnd. + // This prefix operation starts with all bits set. + // Results in hex: [FFFFFFFF 1 1 1], [FFFFFFFF F F F] + // + uint andBits = 0b1; + if (index >= 4) + { + andBits = 0b1111; + } + uint andValue = WaveMultiPrefixBitAnd(andBits, mask); + const uint andBaseIndex = 16; + outputBuffer[andBaseIndex + index] = andValue; + + // + // WaveMultiPrefixBitOr. + // Results in hex: [0 1 3 7], [0 10 30 70] + // + uint orValue = WaveMultiPrefixBitOr(1 << index, mask); + const uint orBaseIndex = 24; + outputBuffer[orBaseIndex + index] = orValue; + + // + // WaveMultiPrefixBitXor. + // Results in hex: [0 1 3 7], [0 F 0 F] + // + uint xorBits = (1 << index); + if (index >= 4) + { + xorBits = 0b1111; + } + uint xorValue = WaveMultiPrefixBitXor(xorBits, mask); + const uint xorBaseIndex = 32; + outputBuffer[xorBaseIndex + index] = xorValue; +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt new file mode 100644 index 000000000..c80baa5b1 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang.expected.txt @@ -0,0 +1,40 @@ +0 +1 +3 +7 +0 +10 +30 +70 +1 +1 +2 +8 +1 +10 +200 +8000 +FFFFFFFF +1 +1 +1 +FFFFFFFF +F +F +F +0 +1 +3 +7 +0 +10 +30 +70 +0 +1 +3 +7 +0 +F +0 +F diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang new file mode 100644 index 000000000..bb641cab1 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-sum-product.slang @@ -0,0 +1,136 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj 
-emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedInclusiveSum subgroupPartitionedInclusiveAddNV +#define __partitionedInclusiveProduct subgroupPartitionedInclusiveMulNV +#define __partitionedExclusiveSum subgroupPartitionedExclusiveAddNV +#define __partitionedExclusiveProduct subgroupPartitionedExclusiveMulNV +#else +#define __partitionedInclusiveSum WaveMultiPrefixInclusiveSum +#define __partitionedInclusiveProduct WaveMultiPrefixInclusiveProduct +#define __partitionedExclusiveSum WaveMultiPrefixExclusiveSum +#define __partitionedExclusiveProduct WaveMultiPrefixExclusiveProduct +#endif + +static uint partitionedIndex = 0; +static uint gProductValue = 0; + +__generic +bool test1SumProduct(uint4 mask) +{ + let productValue = T(gProductValue); + + return true + & (__partitionedInclusiveSum(T(1), mask) == T(partitionedIndex + 1)) + & (__partitionedInclusiveProduct(productValue, mask) == productValue) + & (__partitionedExclusiveSum(T(1), mask) == T(partitionedIndex)) + & (__partitionedExclusiveProduct(productValue, mask) == T(1)) + ; +} + +__generic +bool testVSumProduct(uint4 mask) { + typealias GVec = vector; + + let productValue = GVec(T(gProductValue)); + + return true + & all(__partitionedInclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex + 1))) + & all(__partitionedInclusiveProduct(productValue, mask) == productValue) + & 
all(__partitionedExclusiveSum(GVec(T(1)), mask) == GVec(T(partitionedIndex))) + & all(__partitionedExclusiveProduct(productValue, mask) == GVec(T(1))) + ; +} + +bool testSumProduct(uint4 mask) +{ + return true + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + +#if defined(VK) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) +#endif + +#if !defined(CUDA) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint index = dispatchThreadID.x; + partitionedIndex = index; + bool isSecondGroup = false; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + uint4 mask = uint4(0x0007FFF, 0, 0, 0); + if (index >= 15) + { + isSecondGroup = true; + mask = uint4(0xFFFF8000, 0, 0, 0); + partitionedIndex -= 15; + } + + let isLastInPartition = (index == 14) || (index == 31); + gProductValue = isLastInPartition ? 
uint(0) : uint(1); + + bool result = true + & testSumProduct(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang new file mode 100644 index 000000000..99698e497 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix.slang @@ -0,0 +1,146 @@ +//TEST:SIMPLE(filecheck=CHECK_SPIRV): -stage compute -entry computeMain -target spirv -DNO_INTEGER_MATRIX +//TEST:SIMPLE(filecheck=CHECK_GLSL): -stage compute -entry computeMain -target glsl -DNO_INTEGER_MATRIX +//TEST:SIMPLE(filecheck=CHECK_CUDA): -stage compute -entry computeMain -target cuda +//TEST:SIMPLE(filecheck=CHECK_HLSL): -stage compute -entry computeMain -target hlsl + +// +// Tests all variants and overloads of WaveMultiPrefix* arithmetic intrinsics. +// + +struct OutputData +{ + int scalarSum; + int scalarProduct; + int scalarBitAnd; + int scalarBitOr; + int scalarBitXor; + int vectorSum; + int vectorProduct; + int vectorBitAnd; + int vectorBitOr; + int vectorBitXor; + int matrixSum; + int matrixProduct; + int matrixBitAnd; + int matrixBitOr; + int matrixBitXor; + float floatScalarSum; + float floatScalarProduct; + float floatVectorSum; + float floatVectorProduct; + float floatMatrixSum; + float floatMatrixProduct; +}; + +RWStructuredBuffer outputBuffer; + +// CHECK_SPIRV: OpCapability GroupNonUniformPartitionedNV +// CHECK_SPIRV: OpExtension "SPV_NV_shader_subgroup_partitioned" +// CHECK_SPIRV: OpGroupNonUniformIAdd{{.*}}PartitionedExclusiveScanNV +// CHECK_SPIRV: OpGroupNonUniformIMul{{.*}}PartitionedExclusiveScanNV +// CHECK_SPIRV: OpGroupNonUniformBitwiseAnd{{.*}}PartitionedExclusiveScanNV +// CHECK_SPIRV: OpGroupNonUniformBitwiseOr{{.*}}PartitionedExclusiveScanNV +// CHECK_SPIRV: OpGroupNonUniformBitwiseXor{{.*}}PartitionedExclusiveScanNV +// CHECK_SPIRV: OpGroupNonUniformFAdd{{.*}}PartitionedExclusiveScanNV + +// CHECK_GLSL: 
GL_NV_shader_subgroup_partitioned +// CHECK_GLSL: subgroupPartitionedExclusiveAddNV +// CHECK_GLSL: subgroupPartitionedExclusiveMulNV +// CHECK_GLSL: subgroupPartitionedExclusiveAndNV +// CHECK_GLSL: subgroupPartitionedExclusiveOrNV +// CHECK_GLSL: subgroupPartitionedExclusiveXorNV + +// CHECK_CUDA: _wavePrefixSum +// CHECK_CUDA: _wavePrefixProduct +// CHECK_CUDA: _wavePrefixAnd +// CHECK_CUDA: _wavePrefixOr +// CHECK_CUDA: _wavePrefixXor +// CHECK_CUDA: _wavePrefixSumMultiple +// CHECK_CUDA: _wavePrefixProductMultiple +// CHECK_CUDA: _wavePrefixAndMultiple +// CHECK_CUDA: _wavePrefixOrMultiple +// CHECK_CUDA: _wavePrefixXorMultiple + +// CHECK_HLSL: WaveMultiPrefixSum +// CHECK_HLSL: WaveMultiPrefixProduct +// CHECK_HLSL: WaveMultiPrefixBitAnd +// CHECK_HLSL: WaveMultiPrefixBitOr +// CHECK_HLSL: WaveMultiPrefixBitXor + + +[numthreads(1, 1, 1)] +void computeMain(uint3 dTid : SV_DispatchThreadID) +{ + int scalarVal = dTid.x; + uint4 mask = WaveMatch(scalarVal); + + int scalarSum = WaveMultiPrefixSum(scalarVal, mask); + int scalarProduct = WaveMultiPrefixProduct(scalarVal, mask); + int scalarBitAnd = WaveMultiPrefixBitAnd(scalarVal, mask); + int scalarBitOr = WaveMultiPrefixBitOr(scalarVal, mask); + int scalarBitXor = WaveMultiPrefixBitXor(scalarVal, mask); + + int3 vectorVal = int3(dTid.x, dTid.y, dTid.z); + int3 vectorSum = WaveMultiPrefixSum(vectorVal, mask); + int3 vectorProduct = WaveMultiPrefixProduct(vectorVal, mask); + int3 vectorBitAnd = WaveMultiPrefixBitAnd(vectorVal, mask); + int3 vectorBitOr = WaveMultiPrefixBitOr(vectorVal, mask); + int3 vectorBitXor = WaveMultiPrefixBitXor(vectorVal, mask); + + float floatScalarVal = float(dTid.x) + 0.5f; // Example floating-point scalar value + uint4 floatMask = WaveMatch(floatScalarVal); // Create a mask for matching lanes + + float floatScalarSum = WaveMultiPrefixSum(floatScalarVal, floatMask); + float floatScalarProduct = WaveMultiPrefixProduct(floatScalarVal, floatMask); + + float3 floatVectorVal = float3(dTid.x, 
dTid.y, dTid.z) + 0.5f; // Example floating-point vector value + float3 floatVectorSum = WaveMultiPrefixSum(floatVectorVal, floatMask); + float3 floatVectorProduct = WaveMultiPrefixProduct(floatVectorVal, floatMask); + + OutputData output; + output.scalarSum = scalarSum; + output.scalarProduct = scalarProduct; + output.scalarBitAnd = scalarBitAnd; + output.scalarBitOr = scalarBitOr; + output.scalarBitXor = scalarBitXor; + output.vectorSum = vectorSum.x; + output.vectorProduct = vectorProduct.x; + output.vectorBitAnd = vectorBitAnd.x; + output.vectorBitOr = vectorBitOr.x; + output.vectorBitXor = vectorBitXor.x; + output.floatScalarSum = floatScalarSum; + output.floatScalarProduct = floatScalarProduct; + output.floatVectorSum = floatVectorSum.x; + output.floatVectorProduct = floatVectorProduct.x; + + float3x3 floatMatrixVal = float3x3( + float(dTid.x) + 0.5f, float(dTid.y) + 0.5f, float(dTid.z) + 0.5f, + float(dTid.z) + 0.5f, float(dTid.x) + 0.5f, float(dTid.y) + 0.5f, + float(dTid.y) + 0.5f, float(dTid.z) + 0.5f, float(dTid.x) + 0.5f + ); + float3x3 floatMatrixSum = WaveMultiPrefixSum(floatMatrixVal, floatMask); + float3x3 floatMatrixProduct = WaveMultiPrefixProduct(floatMatrixVal, floatMask); + output.floatMatrixSum = floatMatrixSum[0][0]; + output.floatMatrixProduct = floatMatrixProduct[0][0]; + +#if !defined(NO_INTEGER_MATRIX) + int3x3 matrixVal = int3x3( + dTid.x, dTid.y, dTid.z, + dTid.z, dTid.x, dTid.y, + dTid.y, dTid.z, dTid.x + ); + int3x3 matrixSum = WaveMultiPrefixSum(matrixVal, mask); + int3x3 matrixProduct = WaveMultiPrefixProduct(matrixVal, mask); + int3x3 matrixBitAnd = WaveMultiPrefixBitAnd(matrixVal, mask); + int3x3 matrixBitOr = WaveMultiPrefixBitOr(matrixVal, mask); + int3x3 matrixBitXor = WaveMultiPrefixBitXor(matrixVal, mask); + output.matrixSum = matrixSum[0][0]; + output.matrixProduct = matrixProduct[0][0]; + output.matrixBitAnd = matrixBitAnd[0][0]; + output.matrixBitOr = matrixBitOr[0][0]; + output.matrixBitXor = matrixBitXor[0][0]; +#endif + 
+ outputBuffer[dTid.x] = output; +} + diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang new file mode 100644 index 000000000..b40b014f4 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-sum-product.slang @@ -0,0 +1,114 @@ +//TEST_CATEGORY(wave, compute) +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA + +//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +#if defined(USE_GLSL_SYNTAX) +#define __partitionedSum subgroupPartitionedAddNV +#define __partitionedProduct subgroupPartitionedMulNV +#else +#define __partitionedSum WaveMultiSum +#define __partitionedProduct WaveMultiProduct +#endif + +static uint gSumResult = 0; + +__generic +bool test1SumProduct(uint4 mask) +{ + let sumResult = T(gSumResult); + + return true + & (__partitionedSum(T(1), mask) == sumResult) + & (__partitionedProduct(T(1), mask) == T(1)) + ; +} + +__generic +bool testVSumProduct(uint4 mask) { + typealias GVec = vector; + + let sumResult = GVec(T(gSumResult)); + + return true + & all(__partitionedSum(GVec(T(1)), mask) == sumResult) + & all(__partitionedProduct(GVec(T(1)), mask) == GVec(T(1))) + ; +} + +bool testSumProduct(uint4 mask) +{ + return true + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & 
testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + +#if !defined(CUDA) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & test1SumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) + & testVSumProduct(mask) +#endif + ; +} + +[numthreads(32, 1, 1)] +[shader("compute")] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint index = dispatchThreadID.x; + + // Split into two groups, first group has 15 invocations/lanes and second group has 17. + let isSecondGroup = index >= 15; + uint4 mask = isSecondGroup ? uint4(0xFFFF8000, 0, 0, 0) : uint4(0x0007FFF, 0, 0, 0); + gSumResult = isSecondGroup ? 
uint(17) : uint(15); + + bool result = true + & testSumProduct(mask) + ; + + // CHECK-COUNT-32: 1 + outputBuffer[index] = uint(result); +} diff --git a/tests/language-feature/capability/testing-framework-with-profiles.slang b/tests/language-feature/capability/testing-framework-with-profiles.slang index 215ba887e..97ff32a9d 100644 --- a/tests/language-feature/capability/testing-framework-with-profiles.slang +++ b/tests/language-feature/capability/testing-framework-with-profiles.slang @@ -17,5 +17,5 @@ RWStructuredBuffer outputBuffer; void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) { // BUF: 1 - outputBuffer[0] = WaveMaskSum(0xFF, 1); + outputBuffer[0] = WaveActiveSum(1); } -- cgit v1.2.3