summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/user-guide/a3-02-reference-capability-atoms.md12
-rw-r--r--docs/wave-intrinsics.md14
-rw-r--r--source/slang/glsl.meta.slang70
-rw-r--r--source/slang/hlsl.meta.slang149
-rw-r--r--source/slang/slang-capabilities.capdef20
-rw-r--r--tests/diagnostics/wave-operations-types.slang14
-rw-r--r--tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang133
-rw-r--r--tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang134
8 files changed, 507 insertions, 39 deletions
diff --git a/docs/user-guide/a3-02-reference-capability-atoms.md b/docs/user-guide/a3-02-reference-capability-atoms.md
index 296614716..a9455761c 100644
--- a/docs/user-guide/a3-02-reference-capability-atoms.md
+++ b/docs/user-guide/a3-02-reference-capability-atoms.md
@@ -407,6 +407,9 @@ Extensions
`SPV_NV_shader_subgroup_partitioned`
> Represents the SPIR-V extension for shader subgroup partitioned.
+`SPV_KHR_subgroup_rotate`
+> Represents the SPIR-V extension that enables rotating values across invocations within a subgroup.
+
`SPV_NV_ray_tracing_motion_blur`
> Represents the SPIR-V extension for ray tracing motion blur.
@@ -501,6 +504,9 @@ Extensions
`spvGroupNonUniformPartitionedNV`
> Represents the SPIR-V capability for group non-uniform partitioned operations.
+`spvGroupNonUniformRotateKHR`
+> Represents the SPIR-V capability for group non-uniform rotate operations.
+
`spvRayTracingMotionBlurNV`
> Represents the SPIR-V capability for ray tracing motion blur.
@@ -699,6 +705,9 @@ Extensions
`GL_KHR_shader_subgroup_vote`
> Represents the GL_KHR_shader_subgroup_vote extension.
+`GL_KHR_shader_subgroup_rotate`
+> Represents the GL_KHR_shader_subgroup_rotate extension.
+
`GL_NV_compute_shader_derivatives`
> Represents the GL_NV_compute_shader_derivatives extension.
@@ -1132,6 +1141,9 @@ Compound Capabilities
`subgroup_partitioned`
> Capabilities required to use GLSL-style subgroup operations 'subgroup_partitioned'
+`subgroup_rotate`
+> Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate'
+
`atomic_glsl_hlsl_nvapi_cuda_metal_float1`
> (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
diff --git a/docs/wave-intrinsics.md b/docs/wave-intrinsics.md
index aa46f72a1..7f6fb7b77 100644
--- a/docs/wave-intrinsics.md
+++ b/docs/wave-intrinsics.md
@@ -236,6 +236,20 @@ void GroupMemoryBarrierWithWaveSync();
Synchronizes all lanes to the same GroupMemoryBarrierWithWaveSync in program flow. Orders group shared memory accesses such that accesses after the barrier can be seen by writes before.
+Wave Rotate Intrinsics
+======================
+
+These intrinsics are specific to Slang and were added to support the subgroup rotate functionality provided by SPIRV (through the `GroupNonUniformRotateKHR` capability), GLSL (through the `GL_KHR_shader_subgroup_rotate` extension), and Metal.
+
+```
+// Supported on SPIRV, GLSL, and Metal targets.
+T WaveRotate(T value, uint delta);
+
+// Supported on SPIRV and GLSL targets.
+T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize);
+```
+
Wave Mask Intrinsics
====================
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang
index bbf0c40dd..85c8b174c 100644
--- a/source/slang/glsl.meta.slang
+++ b/source/slang/glsl.meta.slang
@@ -6110,45 +6110,6 @@ public void traceRayMotionNV(
}
}
-__generic<T : __BuiltinType>
-[ForceInline]
-void typeRequireChecks_shader_subgroup_GLSL() {
- // the following is a seperate function call, since else the `__requireTargetExtension` and associated __intrinsic_asm is ignored if the calling function also calls an __intrinsic_asm
- __target_switch
- {
- case glsl:
- if (__type_equals<T, half>()
- || __type_equals<T, float16_t>()
- ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
- else if (__type_equals<T, uint8_t>()
- || __type_equals<T, int8_t>()
- ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
- else if (__type_equals<T, uint16_t>()
- || __type_equals<T, int16_t>()
- ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
- else if (__type_equals<T, uint64_t>()
- || __type_equals<T, int64_t>()
- ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
-
- __intrinsic_asm "";
- }
-}
-
-__generic<T : __BuiltinType>
-void shader_subgroup_preamble() {
- // checks needed for shader_subgroup functions; __requireTargetExtension does not work
- // (does not add the ext specified correctly to the compile output; using extended type
- // will result in error for using the type)
- __target_switch
- {
- case glsl:
- typeRequireChecks_shader_subgroup_GLSL<T>();
- default:
- return;
- }
-
-}
-
// GL_KHR_shader_subgroup_basic Built-in Variables
[require(cpp_cuda_glsl_hlsl_spirv_wgsl, subgroup_basic)]
@@ -8176,6 +8137,37 @@ public vector<T,N> subgroupQuadSwapDiagonal(vector<T,N> value)
return QuadReadAcrossDiagonal(value);
}
+// GL_KHR_shader_subgroup_rotate: GLSL-named entry points that forward to the Slang `Wave*Rotate` intrinsics.
+
+__generic<T : __BuiltinType>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public T subgroupRotate(T value, uint delta) // Scalar overload.
+{
+ return WaveRotate(value, delta); // Same arguments, passed through unchanged.
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public vector<T, N> subgroupRotate(vector<T, N> value, uint delta) // Vector overload of the GLSL-named rotate.
+{
+ return WaveRotate(value, delta); // Forwards to `WaveRotate` with the same arguments.
+}
+
+__generic<T : __BuiltinType>
+[require(glsl_spirv, subgroup_rotate)]
+public T subgroupClusteredRotate(T value, uint delta, constexpr uint clusterSize) // Scalar overload; `clusterSize` is declared constexpr.
+{
+ return WaveClusteredRotate(value, delta, clusterSize); // Forwards all three arguments unchanged.
+
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_spirv, subgroup_rotate)]
+public vector<T, N> subgroupClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) // Vector overload; `clusterSize` is declared constexpr.
+{
+ return WaveClusteredRotate(value, delta, clusterSize); // Forwards all three arguments unchanged.
+}
+
//// GLSL atomic
// The following type internally is a Shader Storage Buffer
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c8a2c8c58..03321bfaf 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -16368,6 +16368,155 @@ bool IsHelperLane()
}
}
+//@hidden:
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(glsl)]
+void __requireGLSLShaderSubgroupTypeExtension() // Requests the GL_EXT_shader_subgroup_extended_types_* extension matching T, if any branch below matches.
+{
+ // This lives in a separate function because the `__requireTargetExtension` and its associated `__intrinsic_asm` are ignored if the calling function also uses an `__intrinsic_asm`.
+ if (__type_equals<T, half>()
+ || __type_equals<T, float16_t>()
+ ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
+ else if (__type_equals<T, uint8_t>()
+ || __type_equals<T, int8_t>()
+ ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
+ else if (__type_equals<T, uint16_t>()
+ || __type_equals<T, int16_t>()
+ ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
+ else if (__type_equals<T, uint64_t>()
+ || __type_equals<T, int64_t>()
+ ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
+
+ __intrinsic_asm ""; // The `__intrinsic_asm` associated with the requirement above; its asm string is empty.
+}
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(metal)]
+void __checkMetalShaderSubgroupType() // Fails a static_assert when T is one of the types tested below.
+{
+ // These builtin types (8-bit and 64-bit integers, and bool) are not supported for Metal's `simd` operations.
+ if (__type_equals<T, uint8_t>()
+ || __type_equals<T, int8_t>()
+ || __type_equals<T, uint64_t>()
+ || __type_equals<T, int64_t>()
+ || __isBool<T>()
+ )
+ {
+ static_assert(false, "Unsupported type for subgroup operations in Metal. Valid types include scalars and vectors of uint/uint32_t, int/int32_t, uint16_t, int16_t, float, and half.");
+ }
+}
+
+__generic<T : __BuiltinType>
+void shader_subgroup_preamble() // Runs the per-target type check for T; e.g. called first by the Wave*Rotate intrinsics.
+{
+ // Checks needed by the shader_subgroup functions. `__requireTargetExtension` does not work
+ // (it does not add the specified extension to the compiled output, so using an extended type
+ // results in an error for using that type). NOTE(review): presumably why the helpers below exist.
+ __target_switch
+ {
+ case glsl:
+ __requireGLSLShaderSubgroupTypeExtension<T>();
+ case metal:
+ __checkMetalShaderSubgroupType<T>();
+ default:
+ return;
+ }
+}
+
+//@public:
+
+//
+// Wave Rotate intrinsics.
+// These are Slang-specific intrinsics that rotate values within a subgroup.
+//
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+T WaveRotate(T value, uint delta) // Scalar overload; maps to one rotate primitive per target.
+{
+ shader_subgroup_preamble<T>(); // Per-target type checks for T.
+ __target_switch
+ {
+ case glsl:
+ __intrinsic_asm "subgroupRotate";
+ case metal:
+ __intrinsic_asm "simd_shuffle_rotate_down";
+ case spirv: // No cluster-size operand here: only the scope, value and delta are passed.
+ return spirv_asm
+ {
+ OpExtension "SPV_KHR_subgroup_rotate";
+ OpCapability GroupNonUniformRotateKHR;
+ result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+ };
+ }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+vector<T, N> WaveRotate(vector<T, N> value, uint delta) // Vector overload; same per-target lowering as the scalar one.
+{
+ shader_subgroup_preamble<T>(); // Checks the element type T, not the vector type.
+ __target_switch
+ {
+ case glsl:
+ __intrinsic_asm "subgroupRotate";
+ case metal:
+ __intrinsic_asm "simd_shuffle_rotate_down";
+ case spirv: // Result type is the full vector<T,N>; no cluster-size operand is passed.
+ return spirv_asm
+ {
+ OpExtension "SPV_KHR_subgroup_rotate";
+ OpCapability GroupNonUniformRotateKHR;
+ result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+ };
+ }
+}
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize) // Scalar overload; no `metal` case, matching the glsl_spirv requirement.
+{
+ shader_subgroup_preamble<T>(); // Per-target type checks for T.
+ __target_switch
+ {
+ case glsl:
+ __intrinsic_asm "subgroupClusteredRotate";
+ case spirv: // Same opcode as WaveRotate, with $clusterSize appended as an extra operand.
+ return spirv_asm
+ {
+ OpExtension "SPV_KHR_subgroup_rotate";
+ OpCapability GroupNonUniformRotateKHR;
+ result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+ };
+ }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize) // Vector overload; no `metal` case, matching the glsl_spirv requirement.
+{
+ shader_subgroup_preamble<T>(); // Checks the element type T, not the vector type.
+ __target_switch
+ {
+ case glsl:
+ __intrinsic_asm "subgroupClusteredRotate";
+ case spirv: // Same opcode as WaveRotate, with $clusterSize appended as an extra operand.
+ return spirv_asm
+ {
+ OpExtension "SPV_KHR_subgroup_rotate";
+ OpCapability GroupNonUniformRotateKHR;
+ result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+ };
+ }
+}
+
//
// Quad Control intrinsics
//
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index b62de0f08..f4ae94978 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -513,6 +513,10 @@ def SPV_KHR_shader_clock : _spirv_1_0;
/// [EXT]
def SPV_NV_shader_subgroup_partitioned : _spirv_1_0;
+/// Represents the SPIR-V extension that enables rotating values across invocations within a subgroup.
+/// [EXT]
+def SPV_KHR_subgroup_rotate : _spirv_1_3;
+
/// Represents the SPIR-V extension for ray tracing motion blur.
/// [EXT]
def SPV_NV_ray_tracing_motion_blur : _spirv_1_0;
@@ -640,6 +644,10 @@ def spvGroupNonUniformVote : _spirv_1_3;
/// [EXT]
def spvGroupNonUniformPartitionedNV : _spirv_1_3 + SPV_NV_shader_subgroup_partitioned;
+/// Represents the SPIR-V capability for group non-uniform rotate operations.
+/// [EXT]
+def spvGroupNonUniformRotateKHR : _spirv_1_3 + SPV_KHR_subgroup_rotate;
+
/// Represents the SPIR-V capability for ray tracing motion blur.
/// [EXT]
def spvRayTracingMotionBlurNV : SPV_NV_ray_tracing_motion_blur;
@@ -777,6 +785,7 @@ def _GL_KHR_shader_subgroup_quad : _GLSL_140;
def _GL_KHR_shader_subgroup_shuffle : _GLSL_140;
def _GL_KHR_shader_subgroup_shuffle_relative : _GLSL_140;
def _GL_KHR_shader_subgroup_vote : _GLSL_140;
+def _GL_KHR_shader_subgroup_rotate : _GLSL_140;
def _GL_NV_compute_shader_derivatives : _GLSL_450;
def _GL_NV_fragment_shader_barycentric : _GL_EXT_fragment_shader_barycentric;
@@ -982,6 +991,10 @@ alias GL_KHR_shader_subgroup_shuffle_relative = _GL_KHR_shader_subgroup_shuffle_
/// [EXT]
alias GL_KHR_shader_subgroup_vote = _GL_KHR_shader_subgroup_vote | spvGroupNonUniformVote;
+/// Represents the GL_KHR_shader_subgroup_rotate extension.
+/// [EXT]
+alias GL_KHR_shader_subgroup_rotate = _GL_KHR_shader_subgroup_rotate | spvGroupNonUniformRotateKHR;
+
/// Represents the GL_NV_compute_shader_derivatives extension.
/// [EXT]
alias GL_NV_compute_shader_derivatives = _GL_NV_compute_shader_derivatives | SPV_KHR_compute_shader_derivatives | _sm_6_6;
@@ -2069,6 +2082,13 @@ alias subgroup_quad = GL_KHR_shader_subgroup_quad
/// [Compound]
alias subgroup_partitioned = GL_NV_shader_subgroup_partitioned + subgroup_ballot_activemask | _sm_6_5 | _cuda_sm_7_0;
+
+/// Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate'
+/// [Compound]
+alias subgroup_rotate = GL_KHR_shader_subgroup_rotate
+ | metal
+ ;
+
/// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
/// [Compound]
alias atomic_glsl_hlsl_nvapi_cuda_metal_float1 = atomic_glsl_float1 | hlsl_nvapi + _sm_4_0 | _cuda_sm_2_0 | metal;
diff --git a/tests/diagnostics/wave-operations-types.slang b/tests/diagnostics/wave-operations-types.slang
new file mode 100644
index 000000000..55a6a8e91
--- /dev/null
+++ b/tests/diagnostics/wave-operations-types.slang
@@ -0,0 +1,14 @@
+//DIAGNOSTIC_TEST:SIMPLE(filecheck=CHECK): -entry computeMain -stage compute -target metal
+
+RWStructuredBuffer<uint> out;
+
+[shader("compute")]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+ // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+ // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+ // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+ out[0] = WaveRotate(true, 1); // bool argument
+ out[1] = WaveRotate(uint8_t(dispatchID.x), 1); // 8-bit integer argument
+ out[2] = WaveRotate(uint64_t(dispatchID.x), 1); // 64-bit integer argument
+}
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
new file mode 100644
index 000000000..d52384c15
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
@@ -0,0 +1,133 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+#if defined(USE_GLSL_SYNTAX)
+#define __clusteredRotate subgroupClusteredRotate
+#else
+#define __clusteredRotate WaveClusteredRotate
+#endif
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#define SUBGROUP_SIZE 32
+#define DELTA 3
+#define CLUSTER_SIZE 8
+
+static uint threadIndex;
+static uint clusterIndex;
+static uint rotatedValue;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1ClusteredRotate() // Scalar check for element type T.
+{
+ return __clusteredRotate(T(threadIndex), DELTA, CLUSTER_SIZE) == T(rotatedValue); // Each invocation passes its own index and expects `rotatedValue` back.
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVRClusteredRotate() // Vector check for vector<T, N>, built from the same per-invocation values as the scalar test.
+{
+ typealias gvec = vector<T, N>;
+
+#if defined(USE_GLSL_SYNTAX)
+ return (__clusteredRotate(gvec(T(threadIndex)), DELTA, CLUSTER_SIZE) == gvec(T(rotatedValue)));
+#else
+ return (__clusteredRotate(gvec(T(threadIndex)), DELTA, CLUSTER_SIZE) == gvec(T(rotatedValue)))[0]; // Only component 0 of the comparison result is used.
+#endif
+}
+
+bool test1ClusteredRotateBool()
+{
+ // Even-indexed invocations pass `true`, odd-indexed ones pass `false`;
+ // the value expected back from the rotate is the opposite of the one passed in.
+ bool isEvenLane = (threadIndex % 2 == 0);
+ bool expected = !isEvenLane;
+ bool rotated = __clusteredRotate(isEvenLane, DELTA, CLUSTER_SIZE);
+ return rotated == expected;
+}
+
+__generic<let N : int>
+bool testVRClusteredRotateBool()
+{
+ // Vector form of the bool test: the input vector is built from this invocation's
+ // parity flag, the expected vector from its negation.
+ typealias BoolVec = vector<bool, N>;
+ bool isEvenLane = (threadIndex % 2 == 0);
+ BoolVec input = BoolVec(isEvenLane);
+ BoolVec expected = BoolVec(!isEvenLane);
+
+#if defined(USE_GLSL_SYNTAX)
+ return (__clusteredRotate(input, DELTA, CLUSTER_SIZE) == expected);
+#else
+ return (__clusteredRotate(input, DELTA, CLUSTER_SIZE) == expected)[0];
+#endif
+}
+
+bool testClusteredRotate() // ANDs the scalar and 2/3/4-component vector checks for every listed type.
+{
+ return true
+ & test1ClusteredRotate<float>()
+ & testVRClusteredRotate<float, 2>()
+ & testVRClusteredRotate<float, 3>()
+ & testVRClusteredRotate<float, 4>()
+ & test1ClusteredRotate<half>()
+ & testVRClusteredRotate<half, 2>()
+ & testVRClusteredRotate<half, 3>()
+ & testVRClusteredRotate<half, 4>()
+ & test1ClusteredRotate<uint>()
+ & testVRClusteredRotate<uint, 2>()
+ & testVRClusteredRotate<uint, 3>()
+ & testVRClusteredRotate<uint, 4>()
+ & test1ClusteredRotate<uint16_t>()
+ & testVRClusteredRotate<uint16_t, 2>()
+ & testVRClusteredRotate<uint16_t, 3>()
+ & testVRClusteredRotate<uint16_t, 4>()
+ & test1ClusteredRotate<int>()
+ & testVRClusteredRotate<int, 2>()
+ & testVRClusteredRotate<int, 3>()
+ & testVRClusteredRotate<int, 4>()
+ & test1ClusteredRotate<int16_t>()
+ & testVRClusteredRotate<int16_t, 2>()
+ & testVRClusteredRotate<int16_t, 3>()
+ & testVRClusteredRotate<int16_t, 4>()
+ & test1ClusteredRotate<uint8_t>()
+ & testVRClusteredRotate<uint8_t, 2>()
+ & testVRClusteredRotate<uint8_t, 3>()
+ & testVRClusteredRotate<uint8_t, 4>()
+ & test1ClusteredRotate<uint64_t>()
+ & testVRClusteredRotate<uint64_t, 2>()
+ & testVRClusteredRotate<uint64_t, 3>()
+ & testVRClusteredRotate<uint64_t, 4>()
+ & test1ClusteredRotate<int8_t>()
+ & testVRClusteredRotate<int8_t, 2>()
+ & testVRClusteredRotate<int8_t, 3>()
+ & testVRClusteredRotate<int8_t, 4>()
+ & test1ClusteredRotate<int64_t>()
+ & testVRClusteredRotate<int64_t, 2>()
+ & testVRClusteredRotate<int64_t, 3>()
+ & testVRClusteredRotate<int64_t, 4>()
+ & test1ClusteredRotateBool()
+ & testVRClusteredRotateBool<2>()
+ & testVRClusteredRotateBool<3>()
+ & testVRClusteredRotateBool<4>()
+ ;
+}
+
+[shader("compute")]
+[numthreads(SUBGROUP_SIZE, 1, 1)]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+ threadIndex = dispatchID.x;
+ clusterIndex = dispatchID.x % CLUSTER_SIZE; // NOTE(review): `clusterIndex` is not read anywhere else in this test file.
+
+ // Determine the expected value of the clustered rotate in the current invocation.
+ // The values passed in are global invocation ids, and we rotate them within a cluster of size `CLUSTER_SIZE`.
+ uint clusterStart = (threadIndex / CLUSTER_SIZE) * CLUSTER_SIZE;
+ rotatedValue = clusterStart + ((threadIndex - clusterStart + DELTA) % CLUSTER_SIZE);
+
+ bool result = true
+ & testClusteredRotate()
+ ;
+
+ // CHECK: 1
+ outputBuffer[0] = uint(result);
+}
+
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
new file mode 100644
index 000000000..4b815c265
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
@@ -0,0 +1,134 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL
+
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+
+#if defined(USE_GLSL_SYNTAX)
+#define __rotate subgroupRotate
+#else
+#define __rotate WaveRotate
+#endif
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#define SUBGROUP_SIZE 32
+#define DELTA 3
+
+static uint threadIndex;
+static uint rotatedValue;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1Rotate() // Scalar check for element type T.
+{
+ return __rotate(T(threadIndex), DELTA) == T(rotatedValue); // Each invocation passes its own index and expects `rotatedValue` back.
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVRotate() // Vector check for vector<T, N>, built from the same per-invocation values as the scalar test.
+{
+ typealias gvec = vector<T, N>;
+
+#if defined(USE_GLSL_SYNTAX)
+ return (__rotate(gvec(T(threadIndex)), DELTA) == gvec(T(rotatedValue)));
+#else
+ return (__rotate(gvec(T(threadIndex)), DELTA) == gvec(T(rotatedValue)))[0]; // Only component 0 of the comparison result is used.
+#endif
+}
+
+bool test1RotateBool()
+{
+ // Even-indexed invocations pass `true`, odd-indexed ones pass `false`;
+ // the value expected back from the rotate is the opposite of the one passed in.
+ bool isEvenLane = (threadIndex % 2 == 0);
+ bool expected = !isEvenLane;
+ bool rotated = __rotate(isEvenLane, DELTA);
+ return rotated == expected;
+}
+
+__generic<let N : int>
+bool testVRotateBool()
+{
+ // Vector form of the bool test: the input vector is built from this invocation's
+ // parity flag, the expected vector from its negation.
+ typealias BoolVec = vector<bool, N>;
+ bool isEvenLane = (threadIndex % 2 == 0);
+ BoolVec input = BoolVec(isEvenLane);
+ BoolVec expected = BoolVec(!isEvenLane);
+
+#if defined(USE_GLSL_SYNTAX)
+ return (__rotate(input, DELTA) == expected);
+#else
+ return (__rotate(input, DELTA) == expected)[0];
+#endif
+}
+
+bool testRotate() // ANDs the scalar and 2/3/4-component vector checks for every listed type.
+{
+ return true
+ & test1Rotate<float>()
+ & testVRotate<float, 2>()
+ & testVRotate<float, 3>()
+ & testVRotate<float, 4>()
+ & test1Rotate<half>()
+ & testVRotate<half, 2>()
+ & testVRotate<half, 3>()
+ & testVRotate<half, 4>()
+ & test1Rotate<uint>()
+ & testVRotate<uint, 2>()
+ & testVRotate<uint, 3>()
+ & testVRotate<uint, 4>()
+ & test1Rotate<uint16_t>()
+ & testVRotate<uint16_t, 2>()
+ & testVRotate<uint16_t, 3>()
+ & testVRotate<uint16_t, 4>()
+ & test1Rotate<int>()
+ & testVRotate<int, 2>()
+ & testVRotate<int, 3>()
+ & testVRotate<int, 4>()
+ & test1Rotate<int16_t>()
+ & testVRotate<int16_t, 2>()
+ & testVRotate<int16_t, 3>()
+ & testVRotate<int16_t, 4>()
+
+ // Subgroup rotate operations on the builtin types below (8-bit and 64-bit integers, bool) are not supported on Metal.
+#if !defined(METAL)
+ & test1Rotate<uint8_t>()
+ & testVRotate<uint8_t, 2>()
+ & testVRotate<uint8_t, 3>()
+ & testVRotate<uint8_t, 4>()
+ & test1Rotate<uint64_t>()
+ & testVRotate<uint64_t, 2>()
+ & testVRotate<uint64_t, 3>()
+ & testVRotate<uint64_t, 4>()
+ & test1Rotate<int8_t>()
+ & testVRotate<int8_t, 2>()
+ & testVRotate<int8_t, 3>()
+ & testVRotate<int8_t, 4>()
+ & test1Rotate<int64_t>()
+ & testVRotate<int64_t, 2>()
+ & testVRotate<int64_t, 3>()
+ & testVRotate<int64_t, 4>()
+ & test1RotateBool()
+ & testVRotateBool<2>()
+ & testVRotateBool<3>()
+ & testVRotateBool<4>()
+#endif
+ ;
+}
+
+[shader("compute")]
+[numthreads(SUBGROUP_SIZE, 1, 1)]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+ threadIndex = dispatchID.x;
+ rotatedValue = (threadIndex + DELTA) % SUBGROUP_SIZE; // Expected result: this invocation's index advanced by DELTA, wrapping at SUBGROUP_SIZE.
+
+ bool result = true
+ & testRotate()
+ ;
+
+ // CHECK: 1
+ outputBuffer[0] = uint(result);
+}
+