1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
//TEST_CATEGORY(wave, compute)
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -xslang -DCUDA
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
// Result buffer: computeMain writes one uint per invocation (1 when every
// min/max comparison passed for that invocation, 0 otherwise).
RWStructuredBuffer<uint> outputBuffer;
// Select which spelling of the partitioned min/max intrinsics is exercised.
// USE_GLSL_SYNTAX is defined by the run that also passes -allow-glsl; every
// other run uses the WaveMulti* names.
#if defined(USE_GLSL_SYNTAX)
#define __partitionedMin subgroupPartitionedMinNV
#define __partitionedMax subgroupPartitionedMaxNV
#else
#define __partitionedMin WaveMultiMin
#define __partitionedMax WaveMultiMax
#endif
// Values assigned by computeMain from the invocation's index before the
// helpers run, and read back (converted to the type under test) by
// test1MinMax and testVMinMax.
// Expected result of the partitioned min for this invocation's partition.
static uint gMinResult = 0;
// Expected result of the partitioned max for this invocation's partition.
static uint gMaxResult = 0;
// Value this invocation feeds into both the min and the max operation.
static uint gMinMaxValue = 0;
// Scalar variant: exercises the partitioned min and max intrinsics for a
// single value of element type T.
//
// The invocation's input and both expected results are taken from the file
// globals and converted to T. Returns true only when the min result and the
// max result both equal their expected values.
__generic<T : __BuiltinArithmeticType>
bool test1MinMax(uint4 mask)
{
    let laneValue = T(gMinMaxValue);
    let expectedMin = T(gMinResult);
    let expectedMax = T(gMaxResult);

    // Both intrinsics are evaluated unconditionally (no short-circuit) before
    // the two outcomes are combined.
    let minMatches = all(__partitionedMin(laneValue, mask) == expectedMin);
    let maxMatches = all(__partitionedMax(laneValue, mask) == expectedMax);
    return minMatches & maxMatches;
}
// Vector variant: exercises the partitioned min and max intrinsics for
// vector<T, N>.
//
// The invocation's input and both expected results are taken from the file
// globals, converted to T and broadcast to all N components. Returns true
// only when every component of the min result and of the max result equals
// its expected value.
__generic<T : __BuiltinArithmeticType, let N : int>
bool testVMinMax(uint4 mask)
{
    typealias Vec = vector<T, N>;

    let laneValue = Vec(T(gMinMaxValue));
    let expectedMin = Vec(T(gMinResult));
    let expectedMax = Vec(T(gMaxResult));

    // Both intrinsics are evaluated unconditionally (no short-circuit) before
    // the two outcomes are combined.
    let minMatches = all(__partitionedMin(laneValue, mask) == expectedMin);
    let maxMatches = all(__partitionedMax(laneValue, mask) == expectedMax);
    return minMatches & maxMatches;
}
// Runs the scalar test plus the 2-, 3- and 4-component vector tests for one
// element type T, in that order, and combines the four outcomes without
// short-circuiting.
__generic<T : __BuiltinArithmeticType>
bool testMinMaxAllWidths(uint4 mask)
{
    let scalarOk = test1MinMax<T>(mask);
    let vec2Ok = testVMinMax<T, 2>(mask);
    let vec3Ok = testVMinMax<T, 3>(mask);
    let vec4Ok = testVMinMax<T, 4>(mask);
    return scalarOk & vec2Ok & vec3Ok & vec4Ok;
}

// Runs the min/max tests for every element type covered by this file and
// returns true only when all of them pass. Every type is exercised even if
// an earlier one has already failed.
bool testMinMax(uint4 mask)
{
    bool ok = testMinMaxAllWidths<int>(mask);
    ok = ok & testMinMaxAllWidths<uint>(mask);
    ok = ok & testMinMaxAllWidths<float>(mask);
    ok = ok & testMinMaxAllWidths<double>(mask);
#if !defined(CUDA)
    // The 8-, 16- and 64-bit integer types and half are left out when CUDA
    // is defined.
    ok = ok & testMinMaxAllWidths<int8_t>(mask);
    ok = ok & testMinMaxAllWidths<int16_t>(mask);
    ok = ok & testMinMaxAllWidths<int64_t>(mask);
    ok = ok & testMinMaxAllWidths<uint8_t>(mask);
    ok = ok & testMinMaxAllWidths<uint16_t>(mask);
    ok = ok & testMinMaxAllWidths<uint64_t>(mask);
    ok = ok & testMinMaxAllWidths<half>(mask);
#endif
    return ok;
}
// Entry point: 32 invocations, each of which sets up its own input and
// expected results, runs every min/max test, and records pass/fail in
// outputBuffer at its own index.
[numthreads(32, 1, 1)]
[shader("compute")]
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    let laneIndex = dispatchThreadID.x;

    // Two partitions: indices 0..14 (15 invocations, mask bits 0-14) and
    // indices 15..31 (17 invocations, mask bits 15-31).
    let inUpperPartition = laneIndex >= 15;
    let lowerMask = uint4(0x0007FFF, 0, 0, 0);
    let upperMask = uint4(0xFFFF8000, 0, 0, 0);
    let mask = inUpperPartition ? upperMask : lowerMask;

    // Expected results are {min 0, max 1} for the lower partition and
    // {min 2, max 3} for the upper one.
    let expectedMin = inUpperPartition ? uint(2) : uint(0);
    let expectedMax = inUpperPartition ? uint(3) : uint(1);

    // Exactly one invocation per partition (its first index) contributes the
    // minimum; all the others contribute the maximum.
    let holdsMin = (laneIndex == 0) || (laneIndex == 15);

    gMinResult = expectedMin;
    gMaxResult = expectedMax;
    gMinMaxValue = holdsMin ? expectedMin : expectedMax;

    let passed = testMinMax(mask);

    // CHECK-COUNT-32: 1
    outputBuffer[laneIndex] = uint(passed);
}
|