Additional Wave Intrinsic Support (#1252)

* Test for some wave intrinsics. More wave intrinsic support on CUDA. * Use shfl_xor_sync. * Improvements around wave intrinsics. Fix built in integer types belong to __BuiltinIntegerType. * Improvements and fixes around Wave intrinsics. * Added WaveIsFirstLane test. No longer use __wavemask_lt, as appears not available as an intrinsic. * Small fixes to CUDA prelude. * Add wave-active-product test. Handle the special case for arbitray sums. * Used macro to implement CUDA wave intrinsics.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-03-02 16:18:20 -0500
committer: GitHub <noreply@github.com> 2020-03-02 16:18:20 -0500
commit: 8899c149b05def1cce626ea649012c4c974861de (patch)
tree: 77e97c2997a653ba9262b32f55e9e3f37e166653
parent: b85ca6f86d46ee3c4d5784d0bd4ebc8509e2a9bd (diff)
12 files changed, 421 insertions, 69 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 1938e3dc1..1ca93d9d1 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -128,36 +128,6 @@ union Union64
     double d;
 };
 
-// ---------------------- Miscellaneous --------------------------------------
-
-// TODO(JS): It appears that cuda does not have a simple way to get a lane index. 
-// 
-// Another approach could be... 
-// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
-// If that is really true another way to do this, would be for code generator to add this function 
-// with the [numthreads] baked in. 
-// 
-// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
-#ifndef SLANG_USE_ASM_LANE_ID
- __forceinline__ __device__ uint32_t _getLaneId()
-{
-    // If the launch is (or I guess some multiple of the warp size) 
-    // we try this mechanism, which is apparently faster. 
-    return threadIdx.x & SLANG_CUDA_WARP_MASK;
-}
-#else
-__forceinline__ __device__ uint32_t _getLaneId()
-{
-    // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
-    // This mechanism is not the fastest way to do it, and that is why the other mechanism 
-    // is the default. But the other mechanism relies on a launch that makes the assumption 
-    // true.
-    unsigned ret; 
-    asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
-    return ret;
-}
-#endif
-
 // ----------------------------- F32 -----------------------------------------
 
 // Unary 
@@ -440,6 +410,192 @@ struct RWByteAddressBuffer
     size_t sizeInBytes; //< Must be multiple of 4 
 };
 
+
+// ---------------------- Wave --------------------------------------
+
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index. 
+// 
+// Another approach could be... 
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
+// If that is really true another way to do this, would be for code generator to add this function 
+// with the [numthreads] baked in. 
+// 
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+#ifndef SLANG_USE_ASM_LANE_ID
+ __forceinline__ __device__ uint32_t _getLaneId()
+{
+    // If the launch is (or I guess some multiple of the warp size) 
+    // we try this mechanism, which is apparently faster. 
+    return threadIdx.x & SLANG_CUDA_WARP_MASK;
+}
+#else
+__forceinline__ __device__ uint32_t _getLaneId()
+{
+    // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
+    // This mechanism is not the fastest way to do it, and that is why the other mechanism 
+    // is the default. But the other mechanism relies on a launch that makes the assumption 
+    // true.
+    unsigned ret; 
+    asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+    return ret;
+}
+#endif
+
+// Note! Note will return true if mask is 0, but thats okay, because there must be one
+// lane active to execute anything
+__inline__ __device__ bool _waveIsSingleLane(int mask)
+{
+    return (mask & (mask - 1)) == 0;
+}
+
+// Returns the power of 2 size of run of set bits. Returns 0 if not a suitable run.
+__inline__ __device__ int _waveCalcPow2Offset(int mask)
+{
+    // This should be the most common case, so fast path it
+    if (mask == SLANG_CUDA_WARP_MASK)
+    {
+        return SLANG_CUDA_WARP_SIZE;
+    }
+    // Is it a contiguous run of bits?
+    if ((mask & (mask + 1)) == 0)
+    {
+        // const int offsetSize = __ffs(mask + 1) - 1;
+        const int offset = 32 - __clz(mask);
+        // Is it a power of 2 size
+        if ((offset & (offset - 1)) == 0)
+        {
+            return offset;
+        }
+    }
+    return 0;
+}
+
+__inline__ __device__ bool _waveIsFirstLane()
+{
+    const int mask = __activemask();
+    // We special case bit 0, as that most warps are expected to be fully active. 
+    
+    // mask & -mask, isolates the lowest set bit.
+    //return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId()));
+    
+    // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. 
+    return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId());
+}
+
+// TODO(JS): NOTE! These functions only work across all lanes. 
+// Special handling will be needed if only some lanes are active.
+
+#define SLANG_CUDA_REDUCE_OP(INIT_VALUE, OP) \
+    const int offsetSize = _waveCalcPow2Offset(mask); \
+    if (offsetSize > 0) \
+    { \
+        for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \
+        { \
+            val = val OP __shfl_xor_sync( mask, val, offset); \
+        } \
+        return val;  \
+    } \
+    else if (_waveIsSingleLane(mask)) \
+    { \
+        return val; \
+    } \
+    else \
+    { \
+        int result = INIT_VALUE; \
+        int remaining = mask; \
+        while (remaining) \
+        { \
+            const int laneBit = remaining & -remaining; \
+            /* Get the sourceLane */ \
+            const int srcLane = __ffs(laneBit) - 1; \
+            /* Broadcast (can also broadcast to self) */ \
+            result = result OP __shfl_sync(mask, val, srcLane); \
+            remaining &= ~laneBit; \
+        } \
+        return result; \
+    }
+
+#define SLANG_CUDA_REDUCE_FUNC(INIT_VALUE, FUNC) \
+    const int offsetSize = _waveCalcPow2Offset(mask); \
+    if (offsetSize > 0) \
+    { \
+        for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) \
+        { \
+            val = FUNC(val, __shfl_xor_sync( mask, val, offset)); \
+        } \
+        return val;  \
+    } \
+    else if (_waveIsSingleLane(mask)) \
+    { \
+        return val; \
+    } \
+    else \
+    { \
+        int result = INIT_VALUE; \
+        int remaining = mask; \
+        while (remaining) \
+        { \
+            const int laneBit = remaining & -remaining; \
+            /* Get the sourceLane */ \
+            const int srcLane = __ffs(laneBit) - 1; \
+            /* Broadcast (can also broadcast to self) */ \
+            result = FUNC(result, __shfl_sync(mask, val, srcLane)); \
+            remaining &= ~laneBit; \
+        } \
+        return result; \
+    }    
+    
+
+__inline__ __device__ int _waveOr(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_OP(0, |)
+}
+
+__inline__ __device__ int _waveAnd(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_OP(~int(0), &)
+}
+
+__inline__ __device__ int _waveXor(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_OP(0, ^)
+}
+
+__inline__ __device__ int _waveProduct(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_OP(1, *)
+}
+
+__inline__ __device__ int _waveSum(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_OP(0, +)
+}
+
+__inline__ __device__ int _waveMin(int mask, int val) 
+{    
+    SLANG_CUDA_REDUCE_FUNC(val, I32_min)
+}
+
+__inline__ __device__ int _waveMax(int mask, int val) 
+{
+    SLANG_CUDA_REDUCE_FUNC(val, I32_max)
+}
+
+__inline__ __device__ bool _waveAllEqual(int mask, int val) 
+{
+    int pred;
+    __match_all_sync(mask, val, &pred);
+    return pred != 0;
+}
+
+__inline__ __device__ int _waveReadFirst(int val) 
+{
+    const int mask = __activemask();
+    const int lowestLaneId = __ffs(mask) - 1;
+    return __shfl_sync(mask, val, lowestLaneId);   
+}
+
+
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
 
diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp
index f68c4dc01..27d269125 100644
--- a/source/core/slang-nvrtc-compiler.cpp
+++ b/source/core/slang-nvrtc-compiler.cpp
@@ -307,6 +307,9 @@ SlangResult NVRTCDownstreamCompiler::compile(const CompileOptions& options, RefP
         // This is arguably too much - but nvrtc does not appear to have a mechanism to switch off individual warnings.
         // I tried the -Xcudafe mechanism but that does not appear to work for nvrtc
         cmdLine.addArg("-w");
+
+        //
+        cmdLine.addArg("-arch=compute_70");
     }
 
     nvrtcProgram program = nullptr;
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 70bc90392..6822d304b 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -132,9 +132,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
     case BaseType::Half:
     case BaseType::Float:
     case BaseType::Double:
-        sb << "\n    , __BuiltinFloatingPointType\n";
+        sb << "\n    ,  __BuiltinFloatingPointType\n";
         sb << "\n    ,  __BuiltinRealType\n";
-        ; // fall through to:
+        sb << "\n    ,  __BuiltinSignedArithmeticType\n";
+        sb << "\n    ,  __BuiltinArithmeticType\n";
+        sb << "\n    ,  __BuiltinType\n";
+        break;
     case BaseType::Int8:
     case BaseType::Int16:
     case BaseType::Int:
@@ -146,6 +149,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
     case BaseType::UInt:
     case BaseType::UInt64:
         sb << "\n    ,  __BuiltinArithmeticType\n";
+        sb << "\n    ,  __BuiltinIntegerType\n";
         ; // fall through to:
     case BaseType::Bool:
         sb << "\n    ,  __BuiltinType\n";
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index 4c8da2a9a..3ff1fd243 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -135,9 +135,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
     case BaseType::Half:
     case BaseType::Float:
     case BaseType::Double:
-        sb << "\n    , __BuiltinFloatingPointType\n";
+        sb << "\n    ,  __BuiltinFloatingPointType\n";
         sb << "\n    ,  __BuiltinRealType\n";
-        ; // fall through to:
+        sb << "\n    ,  __BuiltinSignedArithmeticType\n";
+        sb << "\n    ,  __BuiltinArithmeticType\n";
+        sb << "\n    ,  __BuiltinType\n";
+        break;
     case BaseType::Int8:
     case BaseType::Int16:
     case BaseType::Int:
@@ -149,6 +152,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
     case BaseType::UInt:
     case BaseType::UInt64:
         sb << "\n    ,  __BuiltinArithmeticType\n";
+        sb << "\n    ,  __BuiltinIntegerType\n";
         ; // fall through to:
     case BaseType::Bool:
         sb << "\n    ,  __BuiltinType\n";
@@ -195,7 +199,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
         // TODO: should this cover the full gamut of integer types?
     case BaseType::Int:
     case BaseType::UInt:
-SLANG_RAW("#line 195 \"core.meta.slang\"")
+SLANG_RAW("#line 199 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("        __generic<T:__EnumType>\n")
 SLANG_RAW("        __init(T value);\n")
@@ -211,7 +215,7 @@ SLANG_RAW("        __init(T value);\n")
 
 // Declare built-in pointer type
 // (eventually we can have the traditional syntax sugar for this)
-SLANG_RAW("#line 210 \"core.meta.slang\"")
+SLANG_RAW("#line 214 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T>\n")
@@ -273,7 +277,7 @@ sb << "    __init(T value);\n";
 sb << "    __init(vector<T,N> value);\n";
 
 sb << "};\n";
-SLANG_RAW("#line 256 \"core.meta.slang\"")
+SLANG_RAW("#line 260 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T = float, let R : int = 4, let C : int = 4>\n")
@@ -1509,7 +1513,7 @@ for (auto op : binaryOps)
         sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
     }
 }
-SLANG_RAW("#line 1491 \"core.meta.slang\"")
+SLANG_RAW("#line 1495 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c8aae9158..edb678ad6 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1395,35 +1395,51 @@ __generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);
 __generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);
 __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);
 
-__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)")
+T WaveActiveBitAnd(T expr);
 __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveOr(__activemask(), $0)")
+T WaveActiveBitOr(T expr);
 __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveXor(__activemask(), $0)")
+T WaveActiveBitXor(T expr);
 __generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveMax(__activemask(), $0)")
+T WaveActiveMax(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveMin(__activemask(), $0)")
+T WaveActiveMin(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)")
+T WaveActiveProduct(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveSum(__activemask(), $0)")
+T WaveActiveSum(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)")
+bool WaveActiveAllEqual(T value);
 __generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);
 
@@ -1438,7 +1454,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,
 
 __target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
 bool WaveActiveAllTrue(bool condition);
-__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)")
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
 bool WaveActiveAnyTrue(bool condition);
 
 __target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
@@ -1454,14 +1470,19 @@ __target_intrinsic(cuda, "_getLaneId()")
 uint WaveGetLaneIndex();
 
 // If there are no *active* lanes less than this one, we must be the lowest lane
-__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
+__target_intrinsic(cuda, "_waveIsFirstLane()")
 bool WaveIsFirstLane();
 
-__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
+// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)
+// that would mean different lanes having a different mask, and they all have to have the same mask.
+
+__generic<T : __BuiltinArithmeticType>
+T WavePrefixProduct(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr);
+__generic<T : __BuiltinArithmeticType>
+T WavePrefixSum(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);
 
@@ -1473,11 +1494,14 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr);
+__generic<T : __BuiltinArithmeticType>
+T WaveMultiPrefixBitXor(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);
 
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0)  & __lanemask_lt())")
 uint WavePrefixCountBits(bool value);
+
 uint WaveMultiPrefixCountBits(bool value, uint4 mask);
 
 __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);
@@ -1488,11 +1512,15 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, uint4 mask)
 __generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);
 
-__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "_waveReadFirst($0)")
+T WaveReadLaneFirst(T expr);
 __generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
 __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
 
-__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)")
+T WaveReadLaneAt(T value, int lane);
 __generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
 __generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
 
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 69349d9dc..16a3244ab 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1471,35 +1471,51 @@ SLANG_RAW("__generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);\
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveAnd(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitAnd(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveOr(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitOr(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveXor(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitXor(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveMax(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveMax(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveMin(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveMin(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveProduct(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveProduct(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveSum(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveSum(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveAllEqual(__activemask(), $0)\")\n")
+SLANG_RAW("bool WaveActiveAllEqual(T value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);\n")
 SLANG_RAW("\n")
@@ -1514,7 +1530,7 @@ SLANG_RAW("// seems to be appropriate.\n")
 SLANG_RAW("\n")
 SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
 SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
-SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n")
 SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
 SLANG_RAW("\n")
 SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
@@ -1530,14 +1546,19 @@ SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
 SLANG_RAW("uint WaveGetLaneIndex();\n")
 SLANG_RAW("\n")
 SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
-SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveIsFirstLane()\")\n")
 SLANG_RAW("bool WaveIsFirstLane();\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
+SLANG_RAW("// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)\n")
+SLANG_RAW("// that would mean different lanes having a different mask, and they all have to have the same mask.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WavePrefixProduct(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WavePrefixSum(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
@@ -1549,11 +1570,14 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr)
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WaveMultiPrefixBitXor(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0)  & __lanemask_lt())\")\n")
 SLANG_RAW("uint WavePrefixCountBits(bool value);\n")
+SLANG_RAW("\n")
 SLANG_RAW("uint WaveMultiPrefixCountBits(bool value, uint4 mask);\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);\n")
@@ -1564,11 +1588,15 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value,
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveReadFirst($0)\")\n")
+SLANG_RAW("T WaveReadLaneFirst(T expr);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);\n")
 SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)\")\n")
+SLANG_RAW("T WaveReadLaneAt(T value, int lane);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);\n")
 SLANG_RAW("\n")
@@ -1658,7 +1686,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1613 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
@@ -1682,6 +1710,8 @@ SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_BACK_FACING_TRIANGLES       = 0x1
 SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_FRONT_FACING_TRIANGLES      = 0x20;\n")
 SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_OPAQUE                      = 0x40;\n")
 SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_NON_OPAQUE                  = 0x80;\n")
+SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_TRIANGLES                   = 0x100;\n")
+SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES       = 0x200;\n")
 SLANG_RAW("\n")
 SLANG_RAW("// 10.1.2 - Ray Description Structure\n")
 SLANG_RAW("\n")
diff --git a/tests/hlsl-intrinsic/wave-active-product.slang b/tests/hlsl-intrinsic/wave-active-product.slang
new file mode 100644
index 000000000..cacc0a539
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-active-product.slang
@@ -0,0 +1,31 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(8, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    const int idx = int(dispatchThreadID.x);
+       
+#if 1
+    if (idx < 3)
+    {
+        // Diverge!!
+        outputBuffer[idx] = -1;
+        return;
+    }
+    outputBuffer[idx] = WaveActiveProduct(idx);
+#else
+
+    /// NOTE! Can't say I totally understand WaveActiveProduct. 
+    /// The following returns 0x240 on CUDA - which is what I'd expect
+    /// On DX12, it returns 0
+
+    outputBuffer[idx] = WaveActiveProduct((idx & 3) + 1);
+#endif
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt b/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt
new file mode 100644
index 000000000..dbe392009
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-active-product.slang.expected.txt
@@ -0,0 +1,16 @@
+FFFFFFFF
+FFFFFFFF
+FFFFFFFF
+9D8
+9D8
+9D8
+9D8
+9D8
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/wave-is-first-lane.slang b/tests/hlsl-intrinsic/wave-is-first-lane.slang
new file mode 100644
index 000000000..39a19336d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-is-first-lane.slang
@@ -0,0 +1,24 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(8, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    if (idx < 3)
+    {
+        // Diverge!!
+        outputBuffer[idx] = -1;
+        return;
+    }
+    
+    int value = 0;
+    outputBuffer[idx] = WaveIsFirstLane();
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt b/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt
new file mode 100644
index 000000000..43debbc9d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-is-first-lane.slang.expected.txt
@@ -0,0 +1,16 @@
+FFFFFFFF
+FFFFFFFF
+FFFFFFFF
+1
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/hlsl-intrinsic/wave.slang b/tests/hlsl-intrinsic/wave.slang
new file mode 100644
index 000000000..bc30da4ad
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave.slang
@@ -0,0 +1,36 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    value |= WaveActiveAllTrue(idx < 4 ) ? 1 : 0;
+    value |= WaveActiveAnyTrue(idx == 2) ? 2 : 0;
+    value |= WaveActiveAnyTrue(idx == -1) ? 4 : 0;
+    value |= WaveActiveAllTrue(idx == 3) ? 8 : 0;
+        
+    int sum = WaveActiveSum(idx);
+    value |= (sum << 4);
+    
+    // TODO(JS): 
+    // This result is unexpected. I expect 1 * 2 * 1 * 2 = 4. But we get 0 on DX (so disable for now). On CUDA I get 4.
+    // int product = WaveActiveProduct((idx & 1) + 1);
+    /// value |= (product << 8);    
+        
+    // TODO(JS): NOTE! This only works with uint, *NOT* int on HLSL/DXC. 
+    // We need to update the stdlib to reflect this.
+    uint xor = WaveActiveBitXor(uint(idx + 1));
+    value |= int(xor << 12);
+    
+    outputBuffer[idx] = value;
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave.slang.expected.txt b/tests/hlsl-intrinsic/wave.slang.expected.txt
new file mode 100644
index 000000000..a3dff7d2d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave.slang.expected.txt
@@ -0,0 +1,4 @@
+4063
+4063
+4063
+4063
author	jsmall-nvidia <jsmall@nvidia.com>	2020-03-02 16:18:20 -0500
committer	GitHub <noreply@github.com>	2020-03-02 16:18:20 -0500
commit	8899c149b05def1cce626ea649012c4c974861de (patch)
tree	77e97c2997a653ba9262b32f55e9e3f37e166653
parent	b85ca6f86d46ee3c4d5784d0bd4ebc8509e2a9bd (diff)