6 files changed, 154 insertions, 3 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 5f0dffd5c..c764afba1 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -750,6 +750,29 @@ __inline__ __device__ bool _waveAllEqual(T val)
 }
 
 template <typename T>
+__inline__ __device__ bool _waveAllEqualMultiple(T inVal) 
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    const size_t count = sizeof(T) / sizeof(ElemType);
+    
+    // __match_all_sync is a synchronises so can use __activemask()
+    const int mask = __activemask();
+    int pred;
+    
+    const ElemType* src = (const ElemType*)&inVal;
+    
+    for (size_t i = 0; i < count; ++i)
+    {
+        __match_all_sync(mask, src[i], &pred);
+        if (pred == 0)
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+template <typename T>
 __inline__ __device__ T _waveReadFirst(T val) 
 {
     const int mask = __activemask();
@@ -757,6 +780,48 @@ __inline__ __device__ T _waveReadFirst(T val)
     return __shfl_sync(mask, val, lowestLaneId);   
 }
 
+template <typename T>
+__inline__ __device__ T _waveReadFirstMultiple(T inVal) 
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    const size_t count = sizeof(T) / sizeof(ElemType);
+    
+    T outVal;
+    
+    const ElemType* src = (const ElemType*)&inVal;
+    ElemType* dst = (ElemType*)&outVal;
+    
+    const int mask = __activemask();
+    const int lowestLaneId = __ffs(mask) - 1;
+    
+    for (size_t i = 0; i < count; ++i)
+    {
+        dst[i] = __shfl_sync(mask, src[i], lowestLaneId);   
+    }
+    
+    return outVal;
+}
+
+template <typename T>
+__inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    const size_t count = sizeof(T) / sizeof(ElemType);
+    
+    T outVal;
+    
+    const ElemType* src = (const ElemType*)&inVal;
+    ElemType* dst = (ElemType*)&outVal;
+    
+    const int mask = __activemask();
+    
+    for (size_t i = 0; i < count; ++i)
+    {
+        dst[i] = __shfl_sync(mask, src[i], lane);   
+    }
+    
+    return outVal;
+}
 
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 39cea9ba3..4b717d540 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2441,10 +2441,10 @@ __glsl_extension(GL_KHR_shader_subgroup_vote)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAllEqual($0)")
 __target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
-vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
+bool WaveActiveAllEqual(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
-matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);
+bool WaveActiveAllEqual(matrix<T,N,M> value);
 
 __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
@@ -2585,21 +2585,27 @@ __generic<T : __BuiltinType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
+__target_intrinsic(cuda, "_waveReadFirstMultiple($0)")
 vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
 __generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveReadFirstMultiple($0)")
 matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
 
+// NOTE! On GLSL based targets the lane index *must* be a compile time expression!
+// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
 __generic<T : __BuiltinType>
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)")
+__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
 T WaveReadLaneAt(T value, int lane);
 __generic<T : __BuiltinType, let N : int>
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
 vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
 __generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
 matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
 
 
diff --git a/tests/hlsl-intrinsic/wave-equality.slang b/tests/hlsl-intrinsic/wave-equality.slang
new file mode 100644
index 000000000..d12d8cfbc
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-equality.slang
@@ -0,0 +1,31 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+    
+    value |= WaveActiveAllEqual(idx * 0 + 1)  ? 1 : 0;     // true
+    value |= WaveActiveAllEqual(idx & 2)      ? 2 : 0;     // false
+   
+    // Vector
+   
+    int2 v0 = int2(idx & 0xf0, (idx & 0xf00) + 1);      // (0, 1)
+    int2 v1 = int2(idx & 2, (idx & 2) + 1);
+    
+    value |= WaveActiveAllEqual(v0)           ? 0x10 : 0;     // true
+    value |= WaveActiveAllEqual(v1)           ? 0x20 : 0;     // false
+    
+    outputBuffer[idx] = value;
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-equality.slang.expected.txt b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt
new file mode 100644
index 000000000..2bf571888
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt
@@ -0,0 +1,4 @@
+11
+11
+11
+11
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang b/tests/hlsl-intrinsic/wave-lane-at.slang
new file mode 100644
index 000000000..ca05d985d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-lane-at.slang
@@ -0,0 +1,41 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+// Disabled because on GLSL targets the lane index *must* be a const expr - and in this test it is not.
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+    
+    value += WaveReadLaneAt(idx, (idx + 1) & 3);
+    
+    // vector
+    
+    {
+        float2 v = float2(idx + 1, idx + 2);
+        float2 readValue = WaveReadLaneAt(v, (idx - 1) & 3);
+        
+        value += int(readValue[0] + readValue[1]);
+    }
+        
+    // matrix
+    {
+        matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
+        
+        matrix<int, 2, 2> readValue = WaveReadLaneAt(v, (idx - 1) & 3);
+        
+        value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
+    }
+    
+    outputBuffer[idx] = value;
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
new file mode 100644
index 000000000..a327b0804
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
@@ -0,0 +1,4 @@
+19
+2
+B
+10