First support for 'WaveMask' intrinsics (#1321)

* WIP tests to confirm divergence on CUDA. * Added wave.slang test that uses masks. Made all CUDA intrinsic impls take a mask explicitly. Added initial WaveMaskXXX intrinsics. * Added WaveMaskSharedSync. * Improvements around WaveMaskSharedSync/WaveMaskSync * Remove tabs.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-04-15 14:14:58 -0400
committer: GitHub <noreply@github.com> 2020-04-15 14:14:58 -0400
commit: d5d32221daf950b2f923122a179e791572dd6cb6 (patch)
tree: 0f4bd215c11abc98d0e1f9b3da920838e6e5862b /tests
parent: fbac017938343724407ab036abd736c942b4e187 (diff)
1 files changed, 64 insertions, 0 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave.slang b/tests/hlsl-intrinsic/wave-mask/wave.slang
new file mode 100644
index 000000000..6b641906d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-mask/wave.slang
@@ -0,0 +1,64 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0  0 0 0 0  0 0 0 0  0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer; // Written by computeMain, one element per thread index.
+// Values read by computeMain, one per thread index.
+//TEST_INPUT:ubuffer(data=[3 10 2 -1  4 53 4 6  1 2 3 4  7 5 3 1], stride=4):name inputBuffer
+RWStructuredBuffer<int> inputBuffer;
+// Scratch written by exclusivePrefixSum and read back by computeMain.
+groupshared int sharedMem[32];
+
+// Exclusive prefix sum of originalValue across the lanes with waveLaneId < elementCount.
+// Those lanes store the result in sharedMem[index] and return it; every other lane
+// leaves sharedMem[index] at 0 and returns 0. `mask` is forwarded to WaveMaskBallot.
+int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalValue, int elementCount)
+{
+    // localMask is the ballot result used by every masked intrinsic inside the branch below.
+    WaveMask localMask = WaveMaskBallot(mask, waveLaneId < elementCount);
+
+    sharedMem[index] = 0;
+
+    if(waveLaneId < elementCount)
+    {
+        int val = originalValue;
+        // The stride i doubles each pass (i += i); each pass adds the value held i lanes below.
+        for(int i = 1; i < elementCount; i += i)
+        {
+            // Shuffle in every lane of the branch; only accumulate when waveLaneId - i >= 0.
+            int temp = WaveMaskShuffle(localMask, val, waveLaneId - i);
+            if(waveLaneId >= i)
+            {
+                val += temp;
+            }
+        }
+        // Make it an exclusive prefix sum
+        val -= originalValue;
+        // Write to shared memory
+        sharedMem[index] = val;
+        // Synchronizes on the mask, and ensures memory fence for shared data write
+        WaveMaskSharedSync(localMask);
+        return val;
+    }
+    return 0;
+}
+
+// Entry point: runs the masked scan over the first 9 lanes and copies each thread's
+// sharedMem slot to outputBuffer.
+// NOTE(review): 32 threads index buffers declared with 16 elements - confirm intended.
+[numthreads(32, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    const int kElementCount = 9;
+    const int slot = int(dispatchThreadID.x);
+    const int lane = WaveGetLaneIndex();
+    const int inValue = inputBuffer[slot];
+
+    // The return value is ignored on purpose: the result is read back from shared
+    // memory instead, to check that the shared-memory write in the scan worked.
+    exclusivePrefixSum(WaveGetActiveMask(), slot, lane, inValue, kElementCount);
+    outputBuffer[slot] = sharedMem[slot];
+}
+\ No newline at end of file
author	jsmall-nvidia <jsmall@nvidia.com>	2020-04-15 14:14:58 -0400
committer	GitHub <noreply@github.com>	2020-04-15 14:14:58 -0400
commit	d5d32221daf950b2f923122a179e791572dd6cb6 (patch)
tree	0f4bd215c11abc98d0e1f9b3da920838e6e5862b /tests
parent	fbac017938343724407ab036abd736c942b4e187 (diff)