diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-04-15 14:14:58 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-04-15 14:14:58 -0400 |
| commit | d5d32221daf950b2f923122a179e791572dd6cb6 (patch) | |
| tree | 0f4bd215c11abc98d0e1f9b3da920838e6e5862b /tests | |
| parent | fbac017938343724407ab036abd736c942b4e187 (diff) | |
First support for 'WaveMask' intrinsics (#1321)
* WIP tests to confirm divergence on CUDA.
* Added wave.slang test that uses masks.
Made all CUDA intrinsic impls take a mask explicitly.
Added initial WaveMaskXXX intrinsics.
* Added WaveMaskSharedSync.
* Improvements around WaveMaskSharedSync/WaveMaskSync
* Remove tabs.
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/hlsl-intrinsic/wave-mask/wave.slang | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave.slang b/tests/hlsl-intrinsic/wave-mask/wave.slang new file mode 100644 index 000000000..6b641906d --- /dev/null +++ b/tests/hlsl-intrinsic/wave-mask/wave.slang @@ -0,0 +1,64 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +//TEST_INPUT:ubuffer(data=[3 10 2 -1 4 53 4 6 1 2 3 4 7 5 3 1], stride=4):name inputBuffer +RWStructuredBuffer<int> inputBuffer; + +groupshared int sharedMem[32]; + +int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalValue, int elementCount) +{ + WaveMask localMask = WaveMaskBallot(mask, waveLaneId < elementCount); + + sharedMem[index] = 0; + + if(waveLaneId < elementCount) + { + int temp = 0; + int val = originalValue; + + for(int i = 1; i < elementCount; i += i) + { + int temp = WaveMaskShuffle(localMask, val, waveLaneId - i); + if(waveLaneId >= i) + { + val += temp; + } + } + + // Subtract this lane's own value to make it an exclusive prefix sum + val -= originalValue; + + // Write to shared memory + sharedMem[index] = val; + + // Synchronizes on the mask, and ensures a memory fence for the shared data write + WaveMaskSharedSync(localMask); + return val; + } + + return 0; +} + +[numthreads(32, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int index = int(dispatchThreadID.x); + const int waveLaneId = WaveGetLaneIndex(); + + const int value = inputBuffer[index]; + const int elementCount = 9; + + exclusivePrefixSum(WaveGetActiveMask(), index, waveLaneId, value, elementCount); + + // The function also returns the result, but we read it back from shared memory to check that the shared write worked + int 
prefixValue = sharedMem[index]; + + outputBuffer[index] = prefixValue; +}
\ No newline at end of file |
