summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--prelude/slang-cuda-prelude.h65
-rw-r--r--source/slang/hlsl.meta.slang12
-rw-r--r--tests/hlsl-intrinsic/wave-equality.slang31
-rw-r--r--tests/hlsl-intrinsic/wave-equality.slang.expected.txt4
-rw-r--r--tests/hlsl-intrinsic/wave-lane-at.slang41
-rw-r--r--tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt4
6 files changed, 154 insertions, 3 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 5f0dffd5c..c764afba1 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -750,6 +750,29 @@ __inline__ __device__ bool _waveAllEqual(T val)
}
template <typename T>
+__inline__ __device__ bool _waveAllEqualMultiple(T inVal)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ const size_t count = sizeof(T) / sizeof(ElemType);
+
+ // __match_all_sync is a synchronises so can use __activemask()
+ const int mask = __activemask();
+ int pred;
+
+ const ElemType* src = (const ElemType*)&inVal;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ __match_all_sync(mask, src[i], &pred);
+ if (pred == 0)
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+template <typename T>
__inline__ __device__ T _waveReadFirst(T val)
{
const int mask = __activemask();
@@ -757,6 +780,48 @@ __inline__ __device__ T _waveReadFirst(T val)
return __shfl_sync(mask, val, lowestLaneId);
}
+template <typename T>
+__inline__ __device__ T _waveReadFirstMultiple(T inVal)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ const size_t count = sizeof(T) / sizeof(ElemType);
+
+ T outVal;
+
+ const ElemType* src = (const ElemType*)&inVal;
+ ElemType* dst = (ElemType*)&outVal;
+
+ const int mask = __activemask();
+ const int lowestLaneId = __ffs(mask) - 1;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ dst[i] = __shfl_sync(mask, src[i], lowestLaneId);
+ }
+
+ return outVal;
+}
+
+template <typename T>
+__inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ const size_t count = sizeof(T) / sizeof(ElemType);
+
+ T outVal;
+
+ const ElemType* src = (const ElemType*)&inVal;
+ ElemType* dst = (ElemType*)&outVal;
+
+ const int mask = __activemask();
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ dst[i] = __shfl_sync(mask, src[i], lane);
+ }
+
+ return outVal;
+}
/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 39cea9ba3..4b717d540 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2441,10 +2441,10 @@ __glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($0)")
__target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
-vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
+bool WaveActiveAllEqual(vector<T,N> value);
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
-matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);
+bool WaveActiveAllEqual(matrix<T,N,M> value);
__generic<T : __BuiltinType> uint4 WaveMatch(T value);
__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
@@ -2585,21 +2585,27 @@ __generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
+__target_intrinsic(cuda, "_waveReadFirstMultiple($0)")
vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveReadFirstMultiple($0)")
matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
+// NOTE! On GLSL based targets the lane index *must* be a compile time expression!
+// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)")
+__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)")
T WaveReadLaneAt(T value, int lane);
__generic<T : __BuiltinType, let N : int>
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
__generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)")
matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
diff --git a/tests/hlsl-intrinsic/wave-equality.slang b/tests/hlsl-intrinsic/wave-equality.slang
new file mode 100644
index 000000000..d12d8cfbc
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-equality.slang
@@ -0,0 +1,31 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ int idx = int(dispatchThreadID.x);
+
+ int value = 0;
+
+ // Scalar
+
+ value |= WaveActiveAllEqual(idx * 0 + 1) ? 1 : 0; // true
+ value |= WaveActiveAllEqual(idx & 2) ? 2 : 0; // false
+
+ // Vector
+
+ int2 v0 = int2(idx & 0xf0, (idx & 0xf00) + 1); // (0, 1)
+ int2 v1 = int2(idx & 2, (idx & 2) + 1);
+
+ value |= WaveActiveAllEqual(v0) ? 0x10 : 0; // true
+ value |= WaveActiveAllEqual(v1) ? 0x20 : 0; // false
+
+ outputBuffer[idx] = value;
+} \ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-equality.slang.expected.txt b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt
new file mode 100644
index 000000000..2bf571888
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt
@@ -0,0 +1,4 @@
+11
+11
+11
+11
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang b/tests/hlsl-intrinsic/wave-lane-at.slang
new file mode 100644
index 000000000..ca05d985d
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-lane-at.slang
@@ -0,0 +1,41 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
+// Disabled because on GLSL targets the lane index *must* be a const expr - and in this test it is not.
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ int idx = int(dispatchThreadID.x);
+
+ int value = 0;
+
+ // Scalar
+
+ value += WaveReadLaneAt(idx, (idx + 1) & 3);
+
+ // vector
+
+ {
+ float2 v = float2(idx + 1, idx + 2);
+ float2 readValue = WaveReadLaneAt(v, (idx - 1) & 3);
+
+ value += int(readValue[0] + readValue[1]);
+ }
+
+ // matrix
+ {
+ matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2);
+
+ matrix<int, 2, 2> readValue = WaveReadLaneAt(v, (idx - 1) & 3);
+
+ value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]);
+ }
+
+ outputBuffer[idx] = value;
+} \ No newline at end of file
diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
new file mode 100644
index 000000000..a327b0804
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt
@@ -0,0 +1,4 @@
+19
+2
+B
+10