diff options
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 65 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 12 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-equality.slang | 31 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-equality.slang.expected.txt | 4 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-lane-at.slang | 41 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt | 4 |
6 files changed, 154 insertions, 3 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 5f0dffd5c..c764afba1 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -750,6 +750,29 @@ __inline__ __device__ bool _waveAllEqual(T val) } template <typename T> +__inline__ __device__ bool _waveAllEqualMultiple(T inVal) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + const size_t count = sizeof(T) / sizeof(ElemType); + + // __match_all_sync is a synchronises so can use __activemask() + const int mask = __activemask(); + int pred; + + const ElemType* src = (const ElemType*)&inVal; + + for (size_t i = 0; i < count; ++i) + { + __match_all_sync(mask, src[i], &pred); + if (pred == 0) + { + return false; + } + } + return true; +} + +template <typename T> __inline__ __device__ T _waveReadFirst(T val) { const int mask = __activemask(); @@ -757,6 +780,48 @@ __inline__ __device__ T _waveReadFirst(T val) return __shfl_sync(mask, val, lowestLaneId); } +template <typename T> +__inline__ __device__ T _waveReadFirstMultiple(T inVal) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + const size_t count = sizeof(T) / sizeof(ElemType); + + T outVal; + + const ElemType* src = (const ElemType*)&inVal; + ElemType* dst = (ElemType*)&outVal; + + const int mask = __activemask(); + const int lowestLaneId = __ffs(mask) - 1; + + for (size_t i = 0; i < count; ++i) + { + dst[i] = __shfl_sync(mask, src[i], lowestLaneId); + } + + return outVal; +} + +template <typename T> +__inline__ __device__ T _waveReadLaneAtMultiple(T inVal, int lane) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + const size_t count = sizeof(T) / sizeof(ElemType); + + T outVal; + + const ElemType* src = (const ElemType*)&inVal; + ElemType* dst = (ElemType*)&outVal; + + const int mask = __activemask(); + + for (size_t i = 0; i < count; ++i) + { + dst[i] = __shfl_sync(mask, src[i], lane); + } + + return outVal; +} /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 39cea9ba3..4b717d540 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2441,10 +2441,10 @@ __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") __target_intrinsic(cuda, "_waveAllEqualMultiple($0)") -vector<bool,N> WaveActiveAllEqual(vector<T,N> value); +bool WaveActiveAllEqual(vector<T,N> value); __generic<T : __BuiltinType, let N : int, let M : int> __target_intrinsic(cuda, "_waveAllEqualMultiple($0)") -matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value); +bool WaveActiveAllEqual(matrix<T,N,M> value); __generic<T : __BuiltinType> uint4 WaveMatch(T value); __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value); @@ -2585,21 +2585,27 @@ __generic<T : __BuiltinType, let N : int> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcastFirst($0)") +__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") vector<T,N> WaveReadLaneFirst(vector<T,N> expr); __generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr); +// NOTE! On GLSL based targets the lane index *must* be a compile time expression! +// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt __generic<T : __BuiltinType> __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)") +__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") T WaveReadLaneAt(T value, int lane); __generic<T : __BuiltinType, let N : int> __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") +__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane); __generic<T : __BuiltinType, let N : int, let M : int> +__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane); diff --git a/tests/hlsl-intrinsic/wave-equality.slang b/tests/hlsl-intrinsic/wave-equality.slang new file mode 100644 index 000000000..d12d8cfbc --- /dev/null +++ b/tests/hlsl-intrinsic/wave-equality.slang @@ -0,0 +1,31 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int idx = int(dispatchThreadID.x); + + int value = 0; + + // Scalar + + value |= WaveActiveAllEqual(idx * 0 + 1) ? 1 : 0; // true + value |= WaveActiveAllEqual(idx & 2) ? 2 : 0; // false + + // Vector + + int2 v0 = int2(idx & 0xf0, (idx & 0xf00) + 1); // (0, 1) + int2 v1 = int2(idx & 2, (idx & 2) + 1); + + value |= WaveActiveAllEqual(v0) ? 0x10 : 0; // true + value |= WaveActiveAllEqual(v1) ? 0x20 : 0; // false + + outputBuffer[idx] = value; +}
\ No newline at end of file diff --git a/tests/hlsl-intrinsic/wave-equality.slang.expected.txt b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt new file mode 100644 index 000000000..2bf571888 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-equality.slang.expected.txt @@ -0,0 +1,4 @@ +11 +11 +11 +11 diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang b/tests/hlsl-intrinsic/wave-lane-at.slang new file mode 100644 index 000000000..ca05d985d --- /dev/null +++ b/tests/hlsl-intrinsic/wave-lane-at.slang @@ -0,0 +1,41 @@ +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 +// Disabled because on GLSL targets the lane index *must* be a const expr - and in this test it is not. +//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int idx = int(dispatchThreadID.x); + + int value = 0; + + // Scalar + + value += WaveReadLaneAt(idx, (idx + 1) & 3); + + // vector + + { + float2 v = float2(idx + 1, idx + 2); + float2 readValue = WaveReadLaneAt(v, (idx - 1) & 3); + + value += int(readValue[0] + readValue[1]); + } + + // matrix + { + matrix<int, 2, 2> v = matrix<int, 2, 2>(idx, idx - 1, idx * 3, idx - 2); + + matrix<int, 2, 2> readValue = WaveReadLaneAt(v, (idx - 1) & 3); + + value += int(readValue[0][0] + readValue[0][1] + readValue[1][0] + readValue[1][1]); + } + + outputBuffer[idx] = value; +}
\ No newline at end of file diff --git a/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt new file mode 100644 index 000000000..a327b0804 --- /dev/null +++ b/tests/hlsl-intrinsic/wave-lane-at.slang.expected.txt @@ -0,0 +1,4 @@ +19 +2 +B +10 |
