tests/hlsl-intrinsic/wave-mask/wave.slang


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

//TEST_CATEGORY(wave-mask, compute)
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -shaderobj

// Result buffer checked by the test: 16 ints, all initialised to 0 by the directive below.
// computeMain stores each thread's prefix value at outputBuffer[dispatchThreadID.x].
//TEST_INPUT:ubuffer(data=[0 0 0 0  0 0 0 0  0 0 0 0  0 0 0 0], stride=4):out,name outputBuffer
RWStructuredBuffer<int> outputBuffer;

// Source values for the prefix sum: 16 ints supplied by the directive below.
// computeMain reads inputBuffer[dispatchThreadID.x] as each thread's element.
//TEST_INPUT:ubuffer(data=[3 10 2 -1  4 53 4 6  1 2 3 4  7 5 3 1], stride=4):name inputBuffer
RWStructuredBuffer<int> inputBuffer;

// Group-shared scratch array; its 32 entries match the numthreads(32, 1, 1) group size of
// computeMain. exclusivePrefixSum writes the calling thread's result to sharedMem[index] and
// computeMain reads that same slot back.
groupshared int sharedMem[32];

// Computes an exclusive prefix sum across the first `elementCount` lanes of the Wave with
// WaveMask shuffles: on every step a lane adds the running value held `i` lanes below it,
// and `i` doubles each step.
//
//   mask          - mask handed to WaveMaskBallot; every lane named in it is expected to make
//                   this call.
//   index         - slot of `sharedMem` written by the calling thread.
//   waveLaneId    - lane index of the calling thread within its Wave.
//   originalValue - the calling thread's input element.
//   elementCount  - number of leading lanes that take part in the sum.
//
// Returns the sum of `originalValue` over the participating lanes below the caller and stores
// the same number in sharedMem[index]. Threads that do not take part return 0 and leave 0 in
// their slot.
//
// NOTE(review): the ballot is built from `index` while participation is gated on `waveLaneId`;
// the two agree only while index == waveLaneId (one Wave per group) - confirm if the launch
// configuration ever changes.
int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalValue, int elementCount)
{
    WaveMask localMask = WaveMaskBallot(mask, index < elementCount);

    // Default result for threads that stay out of the scan.
    sharedMem[index] = 0;

    if(waveLaneId < elementCount)
    {
        int val = originalValue;

        // `i += i` doubles the stride: 1, 2, 4, ...
        for(int i = 1; i < elementCount; i += i)
        {
            // Every lane in localMask has to execute the shuffle, so it stays outside the
            // conditional add. A lane with no partner `i` lanes below reads its own lane
            // instead of a negative lane index; the value it gets is not used.
            const bool hasPartner = waveLaneId >= i;
            const int sourceLane = hasPartner ? waveLaneId - i : waveLaneId;
            const int partnerValue = WaveMaskShuffle(localMask, val, sourceLane);
            if(hasPartner)
            {
                val += partnerValue;
            }
        }

        // Make it an exclusive prefix sum
        val -= originalValue;

        // Write to shared memory
        sharedMem[index] = val;
        return val;
    }

    return 0;
}

// It matters how kernels with WaveMask intrinsics are launched(!).
// TODO(JS):
// If I launch with a numthreads amount that is not the size of the Wave on the device, then some
// lanes will not be executing at startup, and the kernel will have to know that is the case.
// This works currently though because the mask is only used
// on CUDA, and its Wave size is 32.
// Entry point: feeds each thread's element of inputBuffer through exclusivePrefixSum and copies
// the value that function left in sharedMem into outputBuffer.
[numthreads(32, 1, 1)]
void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    // Assumes all threads in the Wave are active at start.
    WaveMask waveMask = ~WaveMask(0);

    const int index = int(dispatchThreadID.x);
    const int waveLaneId = int(WaveGetLaneIndex());

    // Both test buffers are declared with 16 elements in their TEST_INPUT lines, while the
    // group runs 32 threads. Threads past the end must not index the buffers; they still call
    // exclusivePrefixSum so that every lane named in waveMask takes part in the wave operations.
    const int bufferElementCount = 16;
    const bool inBounds = index < bufferElementCount;

    int value = 0;
    if(inBounds)
    {
        value = inputBuffer[index];
    }

    // Only the first `elementCount` lanes contribute to the prefix sum.
    const int elementCount = 9;

    exclusivePrefixSum(waveMask, index, waveLaneId, value, elementCount);

    // We don't read from any other lane, so we don't actually need any sync
    //WaveMaskSharedSync(waveMask);

    // It returns the result, but we are going to read from shared memory, to check that aspect worked
    int prefixValue = sharedMem[index];

    if(inBounds)
    {
        outputBuffer[index] = prefixValue;
    }
}