From 6f43b2698a99cc4f4bb4e905749fb87f24bf391b Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Fri, 27 Mar 2020 18:35:06 -0400
Subject: WaveBroadcastAt/WaveShuffle (#1299)

* Support for WaveReadLaneAt with dynamic (but uniform across Wave) on Vk by enabling VK1.4.
Fixed wave-lane-at.slang test to test with laneId that is uniform across the Wave.

* Added WaveShuffle intrinsic.
Test for WaveShuffle intrinsic.

* Added some documentation on WaveShuffle

* Fix that version required for subgroupBroadcast to be non constexpr is actually 1.5

* Added WaveBroadcastLaneAt
Documented WaveShuffle/BroadcastLaneAt/ReadLaneAt

* Update docs around WaveBroadcast/Read/Shuffle.
Use '_waveShuffle` as name in CUDA prelude to better describe it's more flexible behavior.
---
 docs/target-compatibility.md                       | 20 ++++++--
 prelude/slang-cuda-prelude.h                       |  2 +-
 source/slang/hlsl.meta.slang                       | 46 +++++++++++++-----
 .../hlsl-intrinsic/wave-broadcast-lane-at-vk.slang | 28 +++++++++++
 .../wave-broadcast-lane-at-vk.slang.expected.txt   |  4 ++
 tests/hlsl-intrinsic/wave-broadcast-lane-at.slang  | 41 ++++++++++++++++
 .../wave-broadcast-lane-at.slang.expected.txt      |  4 ++
 tests/hlsl-intrinsic/wave-lane-at-vk.slang         | 45 ------------------
 .../wave-lane-at-vk.slang.expected.txt             |  4 --
 tests/hlsl-intrinsic/wave-lane-at.slang            | 54 ----------------------
 .../hlsl-intrinsic/wave-lane-at.slang.expected.txt |  4 --
 tests/hlsl-intrinsic/wave-read-lane-at-vk.slang    | 44 ++++++++++++++++++
 .../wave-read-lane-at-vk.slang.expected.txt        |  4 ++
 tests/hlsl-intrinsic/wave-read-lane-at.slang       | 54 ++++++++++++++++++++++
 .../wave-read-lane-at.slang.expected.txt           |  4 ++
 tests/hlsl-intrinsic/wave-shuffle-vk.slang         |  1 -
 16 files changed, 233 insertions(+), 126 deletions(-)
 create mode 100644 tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
 create mode 100644 tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang.expected.txt
 create mode 100644 tests/hlsl-intrinsic/wave-broadcast-lane-at.slang
 create mode 100644 tests/hlsl-intrinsic/wave-broadcast-lane-at.slang.expected.txt
 delete mode 100644 tests/hlsl-intrinsic/wave-lane-at-vk.slang
 delete mode 100644 tests/hlsl-intrinsic/wave-lane-at-vk.slang.expected.txt
 delete mode 100644 tests/hlsl-intrinsic/wave-lane-at.slang
 delete mode 100644 tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
 create mode 100644 tests/hlsl-intrinsic/wave-read-lane-at-vk.slang
 create mode 100644 tests/hlsl-intrinsic/wave-read-lane-at-vk.slang.expected.txt
 create mode 100644 tests/hlsl-intrinsic/wave-read-lane-at.slang
 create mode 100644 tests/hlsl-intrinsic/wave-read-lane-at.slang.expected.txt

diff --git a/docs/target-compatibility.md b/docs/target-compatibility.md
index 6967f7454..ee5341733 100644
--- a/docs/target-compatibility.md
+++ b/docs/target-compatibility.md
@@ -20,7 +20,7 @@ Items with ^ means there is some discussion about support later in the document
 | SM6.0 Wave Intrinsics       |     No       |   Yes        |  Partial   |     Yes       |    No
 | SM6.0 Quad Intrinsics       |     No       |   Yes        |   No +     |     No        |    No
 | SM6.5 Wave Intrinsics       |     No       |   Yes ^      |   No +     |     Yes       |    No
-| WaveShuffle                 |     No       |   Limited ^  |   Yes +    |     Yes       |    No
+| WaveShuffle                 |     No       |   Limited ^  |   Yes      |     Yes       |    No
 | Tesselation                 |     Yes ^    |   Yes ^      |   No +     |     No        |    No
 | Graphics Pipeline           |     Yes      |   Yes        |   Yes      |     No        |    No
 | Ray Tracing DXR 1.0         |     No       |   Yes ^      |   Yes ^    |     No        |    No
@@ -59,13 +59,23 @@ SM6.5 Wave Intrinsics are supported, but requires a downstream DXC compiler that
 
 ## WaveShuffle
 
-WaveShuffle is an intrinsic added to the Slang stdlibrary to expose the glsl `subgroupShuffle` intrinsics and allow loosened requirements on laneId. 
+`WaveShuffle` and `WaveBroadcastLaneAt` are Slang specific intrinsic additions to expand the options available around `WaveReadLaneAt`. 
 
-`HLSL` uses `WaveReadLaneAt` and this requires the `laneId` must be 'dynamically uniform' across the wave. WaveShuffle has the same functionality but relaxes this restriction. 
+To be clear this means they will not compile directly on 'standard' HLSL compilers such as `dxc`, but Slang HLSL *output* (which will not contain these intrinsics) can (and typically is) compiled via dxc.
 
-`WaveReadLaneAt` most obviously maps to `subgroupBroadcast` in GLSL. This has the extra restriction the index must be compile time consts. With SPIR-V 1.5 it is allowed to be 'dynamically uniform', but doesn't work on current glslang.
+The difference between them can be summarized as follows
 
-NOTE! That using WaveShuffle to target `HLSL` will produce `WaveReadLaneAt` - that means strictly speaking the restriction *still applies*, and the correct behavior will only be seen on hardware that allows the loosed requirements of laneId, on hardware that does not result of `WaveShuffle` is the same as `WaveReadLaneId` which is undefined. 
+* WaveBroadcastLaneAt - laneId must be a compile time constant 
+* WaveReadLaneAt - laneId can be dynamic but *MUST* be the same value across the Wave ie 'dynamically uniform' across the Wave
+* WaveShuffle - laneId can be truly dynamic (NOTE! That it is not strictly truly available currently on all targets, specifically HLSL)
+
+Other than the different restrictions on laneId they act identically to WaveReadLaneAt.
+
+`WaveBroadcastLaneAt` and `WaveReadLaneAt` will work on all targets that support wave intrinsics, with the only current restriction being that on GLSL targets, only scalars and vectors are supported.
+
+`WaveShuffle` will always work on CUDA/Vulkan. 
+
+On HLSL based targets currently `WaveShuffle` will be converted into `WaveReadLaneAt`. Strictly speaking this means it *requires* the `laneId` to be `dynamically uniform` across the Wave. In practice some hardware supports the loosened usage, and others does not. In the future this may be fixed in Slang and/or HLSL to work across all hardware. For now if you use `WaveShuffle` on HLSL based targets it will be necessary to confirm that `WaveReadLaneAt` has the loosened behavior for all the hardware intended. If target hardware does not support the loosened restrictions it's behavior is undefined. 
 
 ## Tesselation
 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index dcc585b9c..b5d8b3788 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -851,7 +851,7 @@ __inline__ __device__ T _waveReadFirstMultiple(T inVal)
 }
 
 template <typename T>
-__inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane)
+__inline__ __device__ T _waveShuffleMultiple(T inVal, int lane)
 {
     typedef typename ElementTypeTrait<T>::Type ElemType;
     const size_t count = sizeof(T) / sizeof(ElemType);
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index e29e47581..e2e745773 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2720,25 +2720,47 @@ __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(cuda, "_waveReadFirstMultiple($0)")
 matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
 
-// NOTE! On GLSL based targets the lane index *must* be a compile time expression!
-// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
-// It is allowed to be 'dynamically uniform within the subgroup' if it's SPIR-V 1.5.
-// TODO(JS): For now we'll use 1.5, but aim for the future for the compiler to determine
-// if the line the is compile constant, and reduce requirement to 1.3
+// NOTE! WaveBroadcastLaneAt is *NOT* standard HLSL
+// It is provided as access to subgroupBroadcast which can only take a
+// constexpr laneId.
+// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
+// Versions SPIR-V greater than 1.4 loosen this restriction, and allow 'dynamic uniform' index
+// If that's the behavior required then client code should use WaveReadLaneAt which works this way.
 __generic<T : __BuiltinType>
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
-__spirv_version(1.5)
+__spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
 __target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
-T WaveReadLaneAt(T value, int lane);
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+T WaveBroadcastLaneAt(T value, constexpr int lane);
 __generic<T : __BuiltinType, let N : int>
-__spirv_version(1.5)
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
+__spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane);
+__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+matrix<T,N,M> WaveBroadcastLaneAt(matrix<T,N,M> value, constexpr int lane);
+
+// TODO(JS): If it can be determines that the `laneId` is constExpr, then subgroupBroadcast
+// could be used on GLSL. For now we just use subgroupShuffle
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_shuffle)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
+__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
+T WaveReadLaneAt(T value, int lane);
+__generic<T : __BuiltinType, let N : int>
+__spirv_version(1.3)
+__glsl_extension(GL_KHR_shader_subgroup_shuffle)
+__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
 vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
 __generic<T : __BuiltinType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
 matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
 
 // NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL
@@ -2755,11 +2777,11 @@ __generic<T : __BuiltinType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_shuffle)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupShuffle($0, $1)")
-__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
 __target_intrinsic(hlsl, "WaveReadLaneAt")
 vector<T,N> WaveShuffle(vector<T,N> value, int lane);
 __generic<T : __BuiltinType, let N : int, let M : int>
-__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1)")
 __target_intrinsic(hlsl, "WaveReadLaneAt")
 matrix<T,N,M> WaveShuffle(matrix<T,N,M> value, int lane);
 
diff --git a/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
new file mode 100644
index 000000000..3c746476a
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
@@ -0,0 +1,28 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+
+    value += WaveBroadcastLaneAt(idx, 1);
+    
+    // vector
+    
+    {
+        float2 v = float2(idx + 1, idx + 2);
+        float2 readValue = WaveBroadcastLaneAt(v, 4 & 3);
+        
+        value += int(readValue[0] + readValue[1]);
+    }            
+   
+    outputBuffer[idx] = value;
+}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang.expected.txt b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang.expected.txt
new file mode 100644
index 000000000..e785149d2
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang.expected.txt
@@ -0,0 +1,4 @@
+4
+4
+4
+4
diff --git a/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang b/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang
new file mode 100644
index 000000000..b6f5d3847
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang
@@ -0,0 +1,41 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+// Disabled on VK because glsl can't do WaveReadLaneAt on matrix. 
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+    
+    value += WaveBroadcastLaneAt(idx, 1);
+    
+    // vector
+    
+    {
+        float2 v = float2(idx + 1, idx + 2);
+        float2 readValue = WaveBroadcastLaneAt(v, 2);
+        
+        value += int(readValue[0] + readValue[1]);
+    }
+        
+    // matrix
+    {
+        matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
+        
+        matrix<int, 2, 2> readValue = WaveBroadcastLaneAt(v, 3);
+        
+        value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
+    }
+    
+    outputBuffer[idx] = value;
+}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang.expected.txt
new file mode 100644
index 000000000..5ce1f8639
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-broadcast-lane-at.slang.expected.txt
@@ -0,0 +1,4 @@
+17
+17
+17
+17
diff --git a/tests/hlsl-intrinsic/wave-lane-at-vk.slang b/tests/hlsl-intrinsic/wave-lane-at-vk.slang
deleted file mode 100644
index 0d52f781e..000000000
--- a/tests/hlsl-intrinsic/wave-lane-at-vk.slang
+++ /dev/null
@@ -1,45 +0,0 @@
-// This is similar to wave-lane-at.slang but tests more limited supported types for vk.
-// We have this 'simple' test, because we can't do matrix (or imat) operations on GLSL/Vk target
-
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
-// TODO(JS): Disabled for now, as requires upgraded glslang
-//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
-
-//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
-RWStructuredBuffer<int> outputBuffer;
-
-// `The input lane index must be uniform across the wave.`. 
-// The same restriction applies to glsl/SPIR-V 1.5
-// So we are going to use the input buffer to achieve this.
-
-//TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
-RWStructuredBuffer<int> inputBuffer;
-
-[numthreads(4, 1, 1)]
-void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
-{
-    int idx = int(dispatchThreadID.x);
-    
-    int value = 0;
-    
-    for (int i = 0; i < 4; ++i)
-    {
-        // Scalar
-        
-        // The landId is 'dynamic' but it also uniform across the wave (as required by spec)
-        const int laneId = inputBuffer[i];
-        
-        value += WaveReadLaneAt(idx, laneId);
-        
-        // vector
-        
-        {
-            float2 v = float2(idx + 1, idx + 2);
-            float2 readValue = WaveReadLaneAt(v, (laneId + 1) & 3);
-            
-            value += int(readValue[0] + readValue[1]);
-        }            
-    }
-   
-    outputBuffer[idx] = value;
-}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-lane-at-vk.slang.expected.txt b/tests/hlsl-intrinsic/wave-lane-at-vk.slang.expected.txt
deleted file mode 100644
index 4e98888c6..000000000
--- a/tests/hlsl-intrinsic/wave-lane-at-vk.slang.expected.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-1E
-1E
-1E
-1E
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang b/tests/hlsl-intrinsic/wave-lane-at.slang
deleted file mode 100644
index c3caaa4e8..000000000
--- a/tests/hlsl-intrinsic/wave-lane-at.slang
+++ /dev/null
@@ -1,54 +0,0 @@
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
-// Disabled on VK because glsl can't do WaveReadLaneAt on matrix. 
-//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
-//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
-
-//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
-RWStructuredBuffer<int> outputBuffer;
-
-// Note from HLSL: `The input lane index must be uniform across the wave.`. 
-// The same restriction applies to glsl/SPIR-V 1.5
-// So we are going to use the input buffer to achieve this.
-
-//TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
-RWStructuredBuffer<int> inputBuffer;
-
-[numthreads(4, 1, 1)]
-void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
-{
-    int idx = int(dispatchThreadID.x);
-    
-    int value = 0;
-    
-    for (int i = 0; i < 4; ++i)
-    {
-        // Scalar
-        
-        // The landId is 'dynamic' but it also uniform across the wave (as required by spec)
-        const int laneId = inputBuffer[i];
-        
-        value += WaveReadLaneAt(idx, laneId);
-        
-        // vector
-        
-        {
-            float2 v = float2(idx + 1, idx + 2);
-            float2 readValue = WaveReadLaneAt(v, (laneId + 1) & 3);
-            
-            value += int(readValue[0] + readValue[1]);
-        }
-            
-        // matrix
-        {
-            matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
-            
-            matrix<int, 2, 2> readValue = WaveReadLaneAt(v, (laneId - 1) & 3);
-            
-            value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
-        }
-    }
-    
-    outputBuffer[idx] = value;
-}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
deleted file mode 100644
index c6167dbae..000000000
--- a/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-36
-36
-36
-36
diff --git a/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang b/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang
new file mode 100644
index 000000000..3bd6b36b8
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang
@@ -0,0 +1,44 @@
+// This is similar to wave-lane-at.slang but tests more limited supported types for vk.
+// We have this 'simple' test, because we can't do matrix (or imat) operations on GLSL/Vk target
+
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+// `The input lane index must be uniform across the wave.`. 
+// The same restriction applies to glsl/SPIR-V 1.5
+// So we are going to use the input buffer to achieve this.
+
+//TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
+RWStructuredBuffer<int> inputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    for (int i = 0; i < 4; ++i)
+    {
+        // Scalar
+        
+        // The landId is 'dynamic' but it also uniform across the wave (as required by spec)
+        const int laneId = inputBuffer[i];
+        
+        value += WaveReadLaneAt(idx, laneId);
+        
+        // vector
+        
+        {
+            float2 v = float2(idx + 1, idx + 2);
+            float2 readValue = WaveReadLaneAt(v, (laneId + 1) & 3);
+            
+            value += int(readValue[0] + readValue[1]);
+        }            
+    }
+   
+    outputBuffer[idx] = value;
+}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang.expected.txt b/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang.expected.txt
new file mode 100644
index 000000000..4e98888c6
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-read-lane-at-vk.slang.expected.txt
@@ -0,0 +1,4 @@
+1E
+1E
+1E
+1E
diff --git a/tests/hlsl-intrinsic/wave-read-lane-at.slang b/tests/hlsl-intrinsic/wave-read-lane-at.slang
new file mode 100644
index 000000000..c3caaa4e8
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-read-lane-at.slang
@@ -0,0 +1,54 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+// Disabled on VK because glsl can't do WaveReadLaneAt on matrix. 
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+// Note from HLSL: `The input lane index must be uniform across the wave.`. 
+// The same restriction applies to glsl/SPIR-V 1.5
+// So we are going to use the input buffer to achieve this.
+
+//TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
+RWStructuredBuffer<int> inputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    for (int i = 0; i < 4; ++i)
+    {
+        // Scalar
+        
+        // The landId is 'dynamic' but it also uniform across the wave (as required by spec)
+        const int laneId = inputBuffer[i];
+        
+        value += WaveReadLaneAt(idx, laneId);
+        
+        // vector
+        
+        {
+            float2 v = float2(idx + 1, idx + 2);
+            float2 readValue = WaveReadLaneAt(v, (laneId + 1) & 3);
+            
+            value += int(readValue[0] + readValue[1]);
+        }
+            
+        // matrix
+        {
+            matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
+            
+            matrix<int, 2, 2> readValue = WaveReadLaneAt(v, (laneId - 1) & 3);
+            
+            value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
+        }
+    }
+    
+    outputBuffer[idx] = value;
+}
\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-read-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-read-lane-at.slang.expected.txt
new file mode 100644
index 000000000..c6167dbae
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-read-lane-at.slang.expected.txt
@@ -0,0 +1,4 @@
+36
+36
+36
+36
diff --git a/tests/hlsl-intrinsic/wave-shuffle-vk.slang b/tests/hlsl-intrinsic/wave-shuffle-vk.slang
index 01fb59155..75aa392ea 100644
--- a/tests/hlsl-intrinsic/wave-shuffle-vk.slang
+++ b/tests/hlsl-intrinsic/wave-shuffle-vk.slang
@@ -2,7 +2,6 @@
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
-
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
 
-- 
cgit v1.2.3