Adds WaveShuffle intrinsic (#1298)

* Support for WaveReadLaneAt with dynamic (but uniform across Wave) on Vk by enabling VK1.4. Fixed wave-lane-at.slang test to test with laneId that is uniform across the Wave. * Added WaveShuffle intrinsic. Test for WaveShuffle intrinsic. * Added some documentation on WaveShuffle * Fix that version required for subgroupBroadcast to be non constexpr is actually 1.5
author: jsmall-nvidia <jsmall@nvidia.com> 2020-03-27 16:16:27 -0400
committer: GitHub <noreply@github.com> 2020-03-27 16:16:27 -0400
commit: e267ce24e37b9b7f98921f75abc150c1463b1d6d (patch)
tree: 331660a83ae0b72116b79b5cc8bf7a9c06555db5
parent: 5b0b8436123aa2faa9b682ed45efe2bd7edbf01b (diff)
8 files changed, 123 insertions, 6 deletions
diff --git a/docs/target-compatibility.md b/docs/target-compatibility.md
index e8edeeaf5..6967f7454 100644
--- a/docs/target-compatibility.md
+++ b/docs/target-compatibility.md
@@ -20,6 +20,7 @@ Items with ^ means there is some discussion about support later in the document
 | SM6.0 Wave Intrinsics       |     No       |   Yes        |  Partial   |     Yes       |    No
 | SM6.0 Quad Intrinsics       |     No       |   Yes        |   No +     |     No        |    No
 | SM6.5 Wave Intrinsics       |     No       |   Yes ^      |   No +     |     Yes       |    No
+| WaveShuffle                 |     No       |   Limited ^  |   Yes +    |     Yes       |    No
 | Tesselation                 |     Yes ^    |   Yes ^      |   No +     |     No        |    No
 | Graphics Pipeline           |     Yes      |   Yes        |   Yes      |     No        |    No
 | Ray Tracing DXR 1.0         |     No       |   Yes ^      |   Yes ^    |     No        |    No
@@ -56,6 +57,16 @@ tex.GetDimensions is the GetDimensions method on 'texture' objects. This is not
 
 SM6.5 Wave Intrinsics are supported, but requires a downstream DXC compiler that supports SM6.5. As it stands the DXC shipping with windows does not. 
 
+## WaveShuffle
+
+WaveShuffle is an intrinsic added to the Slang standard library to expose the GLSL `subgroupShuffle` intrinsics and to allow loosened requirements on `laneId`.
+
+`HLSL` uses `WaveReadLaneAt`, which requires that `laneId` be 'dynamically uniform' across the wave. `WaveShuffle` has the same functionality but relaxes this restriction.
+
+`WaveReadLaneAt` most obviously maps to `subgroupBroadcast` in GLSL. This has the extra restriction that the index must be a compile-time constant. With SPIR-V 1.5 the index is allowed to be 'dynamically uniform', but this does not work with current glslang.
+
+NOTE! Using `WaveShuffle` to target `HLSL` will produce `WaveReadLaneAt`. Strictly speaking, that means the restriction *still applies*: the correct behavior will only be seen on hardware that allows the loosened requirements on `laneId`. On hardware that does not, the result of `WaveShuffle` is the same as that of `WaveReadLaneAt` with a non-uniform `laneId`, which is undefined.
+
 ## Tesselation
 
 Although tesselation stages should work on D3D11 and D3D12 they are not tested within our test framework, and may have problems. 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 739b8579d..e29e47581 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2722,17 +2722,18 @@ matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
 
 // NOTE! On GLSL based targets the lane index *must* be a compile time expression!
 // See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
-// It is allowed to be 'dynamically uniform within the subgroup' if it's SPIR-V 1.4.
-// TODO(JS): For now we'll use 1.4, but aim for the future for the compiler to determine
+// It is allowed to be 'dynamically uniform within the subgroup' if it's SPIR-V 1.5.
+// TODO(JS): For now we'll use 1.5, but aim for the future for the compiler to determine
 // if the line the is compile constant, and reduce requirement to 1.3
 __generic<T : __BuiltinType>
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
-__spirv_version(1.4)
+__spirv_version(1.5)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
 __target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
 T WaveReadLaneAt(T value, int lane);
 __generic<T : __BuiltinType, let N : int>
-__spirv_version(1.4)
+__spirv_version(1.5)
+__glsl_extension(GL_KHR_shader_subgroup_ballot)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
 __target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
 vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
@@ -2740,6 +2741,28 @@ __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
 matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
 
+// NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! It maps to WaveReadLaneAt on HLSL,
+// which means it will only work on hardware that allows arbitrary laneIds. That is not true
+// in general, because the HLSL standard requires the laneId to be 'dynamically uniform' across the Wave.
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_shuffle)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
+__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+T WaveShuffle(T value, int lane);
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_shuffle)
+__spirv_version(1.3)
+__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+vector<T,N> WaveShuffle(vector<T,N> value, int lane);
+__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
+__target_intrinsic(hlsl, "WaveReadLaneAt")
+matrix<T,N,M> WaveShuffle(matrix<T,N,M> value, int lane);
+
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))")
diff --git a/tests/hlsl-intrinsic/wave-lane-at-vk.slang b/tests/hlsl-intrinsic/wave-lane-at-vk.slang
index 137632f16..0d52f781e 100644
--- a/tests/hlsl-intrinsic/wave-lane-at-vk.slang
+++ b/tests/hlsl-intrinsic/wave-lane-at-vk.slang
@@ -9,7 +9,7 @@
 RWStructuredBuffer<int> outputBuffer;
 
 // `The input lane index must be uniform across the wave.`. 
-// The same restriction applies to glsl/SPIR-V 1.4
+// The same restriction applies to glsl/SPIR-V 1.5
 // So we are going to use the input buffer to achieve this.
 
 //TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang b/tests/hlsl-intrinsic/wave-lane-at.slang
index 8661dbc55..c3caaa4e8 100644
--- a/tests/hlsl-intrinsic/wave-lane-at.slang
+++ b/tests/hlsl-intrinsic/wave-lane-at.slang
@@ -9,7 +9,7 @@
 RWStructuredBuffer<int> outputBuffer;
 
 // Note from HLSL: `The input lane index must be uniform across the wave.`. 
-// The same restriction applies to glsl/SPIR-V 1.4
+// The same restriction applies to glsl/SPIR-V 1.5
 // So we are going to use the input buffer to achieve this.
 
 //TEST_INPUT:ubuffer(data=[1 2 3 0], stride=4):name inputBuffer
diff --git a/tests/hlsl-intrinsic/wave-shuffle-vk.slang b/tests/hlsl-intrinsic/wave-shuffle-vk.slang
new file mode 100644
index 000000000..01fb59155
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-shuffle-vk.slang
@@ -0,0 +1,33 @@
+// Disabled because the main test is wave-shuffle.slang; this file just tests VK
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+    
+    value += WaveShuffle(idx, (idx + 1) & 3);
+    
+    // vector
+    
+    {
+        float2 v = float2(idx + 1, idx + 2);
+        float2 readValue = WaveShuffle(v, (idx - 1) & 3);
+        
+        value += int(readValue[0] + readValue[1]);
+    }
+    
+    outputBuffer[idx] = value;
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-shuffle-vk.slang.expected.txt b/tests/hlsl-intrinsic/wave-shuffle-vk.slang.expected.txt
new file mode 100644
index 000000000..b20444fc5
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-shuffle-vk.slang.expected.txt
@@ -0,0 +1,4 @@
+A
+5
+8
+7
diff --git a/tests/hlsl-intrinsic/wave-shuffle.slang b/tests/hlsl-intrinsic/wave-shuffle.slang
new file mode 100644
index 000000000..093babcce
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-shuffle.slang
@@ -0,0 +1,42 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//Disabled on D3D, because in general WaveShuffle requires hardware that doesn't have the 'uniform laneId across Wave' restriction. 
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+// Disabled because vk doesn't currently support matrix types. See wave-shuffle-vk.slang
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    int value = 0;
+    
+    // Scalar
+    
+    value += WaveShuffle(idx, (idx + 1) & 3);
+    
+    // vector
+    
+    {
+        float2 v = float2(idx + 1, idx + 2);
+        float2 readValue = WaveShuffle(v, (idx - 1) & 3);
+        
+        value += int(readValue[0] + readValue[1]);
+    }
+        
+    // matrix
+    {
+        matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
+        
+        matrix<int, 2, 2> readValue = WaveShuffle(v, (idx - 1) & 3);
+        
+        value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
+    }
+    
+    outputBuffer[idx] = value;
+}
+\ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-shuffle.slang.expected.txt b/tests/hlsl-intrinsic/wave-shuffle.slang.expected.txt
new file mode 100644
index 000000000..a327b0804
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-shuffle.slang.expected.txt
@@ -0,0 +1,4 @@
+19
+2
+B
+10
author	jsmall-nvidia <jsmall@nvidia.com>	2020-03-27 16:16:27 -0400
committer	GitHub <noreply@github.com>	2020-03-27 16:16:27 -0400
commit	e267ce24e37b9b7f98921f75abc150c1463b1d6d (patch)
tree	331660a83ae0b72116b79b5cc8bf7a9c06555db5
parent	5b0b8436123aa2faa9b682ed45efe2bd7edbf01b (diff)