blob: 2ca9c00603f1eddfb936486fe22609bd186d433e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -output-using-type
// Does not run on DX11 as SM 6.4 is required.
//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx11
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_4 -shaderobj -output-using-type
//TEST(compute):COMPARE_COMPUTE_EX:-metal -compute -shaderobj -output-using-type
//TEST(compute):COMPARE_COMPUTE_EX:-wgsl -compute -shaderobj -render-feature half -output-using-type
//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -g0 -output-using-type
//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type
//TEST_INPUT:ubuffer(data=[0 0 0], stride=4):out,name outputBuffer
// Result buffer: computeMain writes one int per intrinsic under test, in
// consecutive slots starting at index 0 (three values in total).
RWStructuredBuffer<int> outputBuffer;
// Compute entry point (one thread per group). Calls dot4add_u8packed,
// dot4add_i8packed and dot2add with fixed operands and stores each result,
// converted to int where needed, in the next slot of outputBuffer.
// Values written, in order: 25, -124, and int(dot2add result) (~80.11).
[numthreads(1, 1, 1)]
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
// Next free slot in outputBuffer; post-incremented after every store.
uint outputIndex = 0;
//
// dot4add_u8packed()
// Each uint holds four unsigned 8-bit lanes, listed below from the lowest
// byte to the highest (0x01020304 -> [4 3 2 1]).
// [4 3 2 1] dot [1 2 4 2] + 5
// (4 * 1) + (3 * 2) + (2 * 4) + (1 * 2) + 5 = 25
//
uint unsignedX = 0x01020304U;
uint unsignedY = 0x02040201U;
uint unsignedAcc = 5U;
uint unsignedResult = dot4add_u8packed(unsignedX, unsignedY, unsignedAcc);
outputBuffer[outputIndex++] = unsignedResult;
//
// dot4add_i8packed()
// Each operand holds four signed 8-bit lanes, listed below from the lowest
// byte to the highest; 0xFF, 0xFE and 0xFA are -1, -2 and -6 in two's complement.
// [6 2 3 -1] dot [-2 -6 2 6] - 100
// (6 * -2) + (2 * -6) + (3 * 2) + (-1 * 6) - 100 = -124
//
int signedX = 0xFF030206;
int signedY = 0x0602FAFE;
int signedAcc = -100;
int signedResult = dot4add_i8packed(signedX, signedY, signedAcc);
outputBuffer[outputIndex++] = signedResult;
//
// dot2add()
// [10.8 -3.3] dot [1.4 -20.3] - 2.0
// (10.8 * 1.4) + (-3.3 * -20.3) - 2.0 = 15.12 + 66.99 - 2.0 = 80.11
//
half2 half2X = half2(half(10.8), half(-3.3));
half2 half2Y = half2(half(1.4), half(-20.3));
// `half2Acc` evaluates to -2.0 for the thread whose dispatchThreadID.x is 0.
// The thread index is used so that `half2Acc` is not emitted as the literal `-2.0`, which
// may be treated as a double by DXC and cause it to fail to compile because no overload exists for `dot2add` that
// accepts double.
float half2Acc = float(dispatchThreadID.x + 1) * -2.0f;
float half2Result = dot2add(half2X, half2Y, half2Acc);
// The float result is converted to int before being stored, so the fractional
// part of ~80.11 is not kept in the output.
outputBuffer[outputIndex++] = int(half2Result);
}
|