Feature/wave mask review (#1325)

* Fix issues in wave-mask/wave.slang tests. WaveGetActiveMask -> WaveGetConvergedMask. Update target-compatibility.md * First pass at wave-intrinsics.md documentation. Write up around WaveMaskSharedSync. * Added more of the Wave intrinsics as WaveMask intrinsics. Improvements to documentation around wave-intrinsics.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-04-20 13:03:18 -0400
committer: GitHub <noreply@github.com> 2020-04-20 13:03:18 -0400
commit: c4441d804aaa97bad7ff01bef505491d30bbc046 (patch)
tree: ac251ab76ccb8fd3a07a7dd61f22dd4fc7c2bd41 /tests
parent: acb1c39b4e29358cf496c07dc325e52f39be71f4 (diff)
1 files changed, 15 insertions, 6 deletions
diff --git a/tests/hlsl-intrinsic/wave-mask/wave.slang b/tests/hlsl-intrinsic/wave-mask/wave.slang
index 6b641906d..346940cb2 100644
--- a/tests/hlsl-intrinsic/wave-mask/wave.slang
+++ b/tests/hlsl-intrinsic/wave-mask/wave.slang
@@ -14,7 +14,7 @@ groupshared int sharedMem[32];
 
 int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalValue, int elementCount)
 {
-    WaveMask localMask = WaveMaskBallot(mask, waveLaneId < elementCount);
+    WaveMask localMask = WaveMaskBallot(mask, index < elementCount);
     
     sharedMem[index] = 0;
     
@@ -23,7 +23,7 @@ int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalVal
         int temp = 0;
         int val = originalValue;        
         
-        for(int  i = 1; i < elementCount; i += i)
+        for(int i = 1; i < elementCount; i += i)
         {
             int temp = WaveMaskShuffle(localMask, val, waveLaneId - i);            
             if(waveLaneId >= i)
@@ -37,25 +37,34 @@ int exclusivePrefixSum(WaveMask mask, int index, int waveLaneId, int originalVal
         
         // Write to shared memory
         sharedMem[index] = val;
-        
-        // Syncronizes on the mask, and ensures memory fence for shared data write
-        WaveMaskSharedSync(localMask); 
         return val;
     }    
     
     return 0;    
 }
 
+// It matters how kernels with WaveMask intrinsics are launched(!). 
+// TODO(JS):
+// If I launch with an numthreads amount that is not the size of the Wave on the device, then some 
+// lanes will not be executing at startup, and the kernel will have to know that is the case. 
+// This works currently though because the mask is only used
+// on CUDA, and it's Wave size is 32.
 [numthreads(32, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
+    // Assumes all threads in the Wave are active at start.
+    WaveMask waveMask = ~WaveMask(0);
+    
     int index = int(dispatchThreadID.x);
     const int waveLaneId = WaveGetLaneIndex();
     
     const int value = inputBuffer[index];
     const int elementCount = 9;
     
-    exclusivePrefixSum(WaveGetActiveMask(), index, waveLaneId, value, elementCount);
+    exclusivePrefixSum(waveMask, index, waveLaneId, value, elementCount);
+    
+    // We don't read from any other lane, so we don't actually need any sync
+    //WaveMaskSharedSync(waveMask); 
     
     // It returns the result, but we are going to read from shared memory, to check that aspect worked
     int prefixValue = sharedMem[index];
author	jsmall-nvidia <jsmall@nvidia.com>	2020-04-20 13:03:18 -0400
committer	GitHub <noreply@github.com>	2020-04-20 13:03:18 -0400
commit	c4441d804aaa97bad7ff01bef505491d30bbc046 (patch)
tree	ac251ab76ccb8fd3a07a7dd61f22dd4fc7c2bd41 /tests
parent	acb1c39b4e29358cf496c07dc325e52f39be71f4 (diff)