8 files changed, 507 insertions, 39 deletions
diff --git a/docs/user-guide/a3-02-reference-capability-atoms.md b/docs/user-guide/a3-02-reference-capability-atoms.md
index 296614716..a9455761c 100644
--- a/docs/user-guide/a3-02-reference-capability-atoms.md
+++ b/docs/user-guide/a3-02-reference-capability-atoms.md
@@ -407,6 +407,9 @@ Extensions
 `SPV_NV_shader_subgroup_partitioned`
 > Represents the SPIR-V extension for shader subgroup partitioned.
 
+`SPV_KHR_subgroup_rotate`
+> Represents the SPIR-V extension that enables rotating values across invocations within a subgroup.
+
 `SPV_NV_ray_tracing_motion_blur`
 > Represents the SPIR-V extension for ray tracing motion blur.
 
@@ -501,6 +504,9 @@ Extensions
 `spvGroupNonUniformPartitionedNV`
 > Represents the SPIR-V capability for group non-uniform partitioned operations.
 
+`spvGroupNonUniformRotateKHR`
+> Represents the SPIR-V capability for group non-uniform rotate operations.
+
 `spvRayTracingMotionBlurNV`
 > Represents the SPIR-V capability for ray tracing motion blur.
 
@@ -699,6 +705,9 @@ Extensions
 `GL_KHR_shader_subgroup_vote`
 > Represents the GL_KHR_shader_subgroup_vote extension.
 
+`GL_KHR_shader_subgroup_rotate`
+> Represents the GL_KHR_shader_subgroup_rotate extension.
+
 `GL_NV_compute_shader_derivatives`
 > Represents the GL_NV_compute_shader_derivatives extension.
 
@@ -1132,6 +1141,9 @@ Compound Capabilities
 `subgroup_partitioned`
 > Capabilities required to use GLSL-style subgroup operations 'subgroup_partitioned'
 
+`subgroup_rotate`
+> Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate'
+
 `atomic_glsl_hlsl_nvapi_cuda_metal_float1`
 > (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
 
diff --git a/docs/wave-intrinsics.md b/docs/wave-intrinsics.md
index aa46f72a1..7f6fb7b77 100644
--- a/docs/wave-intrinsics.md
+++ b/docs/wave-intrinsics.md
@@ -236,6 +236,20 @@ void GroupMemoryBarrierWithWaveSync();
 
 Synchronizes all lanes to the same GroupMemoryBarrierWithWaveSync in program flow. Orders group shared memory accesses such that accesses after the barrier can be seen by writes before.  
 
+Wave Rotate Intrinsics
+======================
+
+These intrinsics are specific to Slang and were added to support the subgroup rotate functionality provided by SPIRV (through the `GroupNonUniformRotateKHR` capability),
+GLSL (through the `GL_KHR_shader_subgroup_rotate` extension), and Metal.
+
+```
+// Supported on SPIRV, GLSL, and Metal targets.
+T WaveRotate(T value, uint delta);
+
+// Supported on SPIRV and GLSL targets.
+T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize);
+```
+
 Wave Mask Intrinsics
 ====================
 
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang
index bbf0c40dd..85c8b174c 100644
--- a/source/slang/glsl.meta.slang
+++ b/source/slang/glsl.meta.slang
@@ -6110,45 +6110,6 @@ public void traceRayMotionNV(
     }
 }
 
-__generic<T : __BuiltinType>
-[ForceInline]
-void typeRequireChecks_shader_subgroup_GLSL() {
-    // the following is a seperate function call, since else the `__requireTargetExtension` and associated __intrinsic_asm is ignored if the calling function also calls an __intrinsic_asm
-    __target_switch
-    {
-    case glsl:
-        if (__type_equals<T, half>()
-            || __type_equals<T, float16_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
-        else if (__type_equals<T, uint8_t>()
-            || __type_equals<T, int8_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
-        else if (__type_equals<T, uint16_t>()
-            || __type_equals<T, int16_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
-        else if (__type_equals<T, uint64_t>()
-            || __type_equals<T, int64_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
-
-        __intrinsic_asm "";
-    }
-}
-
-__generic<T : __BuiltinType>
-void shader_subgroup_preamble() {
-    // checks needed for shader_subgroup functions; __requireTargetExtension does not work
-    // (does not add the ext specified correctly to the compile output; using extended type
-    // will result in error for using the type)
-    __target_switch
-    {
-    case glsl:
-        typeRequireChecks_shader_subgroup_GLSL<T>();
-    default:
-        return;
-    }
-
-} 
-
 // GL_KHR_shader_subgroup_basic Built-in Variables
 
 [require(cpp_cuda_glsl_hlsl_spirv_wgsl, subgroup_basic)]
@@ -8176,6 +8137,37 @@ public vector<T,N> subgroupQuadSwapDiagonal(vector<T,N> value)
     return QuadReadAcrossDiagonal(value);
 }
 
+// GL_KHR_shader_subgroup_rotate
+// GLSL-style entry points; each one forwards its arguments to the matching Wave*Rotate intrinsic.
+
+__generic<T : __BuiltinType>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public T subgroupRotate(T value, uint delta)
+{
+    return WaveRotate(value, delta);
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public vector<T, N> subgroupRotate(vector<T, N> value, uint delta)
+{
+    return WaveRotate(value, delta);
+}
+
+__generic<T : __BuiltinType>
+[require(glsl_spirv, subgroup_rotate)]
+public T subgroupClusteredRotate(T value, uint delta, constexpr uint clusterSize)
+{
+    return WaveClusteredRotate(value, delta, clusterSize);
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_spirv, subgroup_rotate)]
+public vector<T, N> subgroupClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
+{
+    return WaveClusteredRotate(value, delta, clusterSize);
+}
+
 //// GLSL atomic
 
 // The following type internally is a Shader Storage Buffer 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c8a2c8c58..03321bfaf 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -16368,6 +16368,155 @@ bool IsHelperLane()
     }
 }
 
+//@hidden:
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(glsl)]
+void __requireGLSLShaderSubgroupTypeExtension()
+{
+    // This is a separate function call because the `__requireTargetExtension` and associated __intrinsic_asm are ignored if the calling function also calls an __intrinsic_asm
+    if (__type_equals<T, half>()
+        || __type_equals<T, float16_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
+    else if (__type_equals<T, uint8_t>()
+        || __type_equals<T, int8_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
+    else if (__type_equals<T, uint16_t>()
+        || __type_equals<T, int16_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
+    else if (__type_equals<T, uint64_t>()
+        || __type_equals<T, int64_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
+
+    __intrinsic_asm "";
+}
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(metal)]
+void __checkMetalShaderSubgroupType()
+{
+    // bool and the 8-bit/64-bit integer types are not supported for Metal's `simd` operations; fail at compile time.
+    if (__type_equals<T, uint8_t>()
+        || __type_equals<T, int8_t>()
+        || __type_equals<T, uint64_t>()
+        || __type_equals<T, int64_t>()
+        || __isBool<T>()
+        )
+    {
+        static_assert(false, "Unsupported type for subgroup operations in Metal. Valid types include scalars and vectors of uint/uint32_t, int/int32_t, uint16_t, int16_t, float, and half.");
+    }
+}
+
+__generic<T : __BuiltinType>
+void shader_subgroup_preamble()
+{
+    // Per-target checks needed for shader_subgroup functions. For glsl, calling
+    // `__requireTargetExtension` here does not work (the extension is not added correctly to the
+    // compile output, so using an extended type results in an error), hence the helper call.
+    __target_switch
+    {
+    case glsl:
+        __requireGLSLShaderSubgroupTypeExtension<T>();
+    case metal:
+        __checkMetalShaderSubgroupType<T>();
+    default:
+        return;
+    }
+}
+
+//@public:
+
+//
+// Wave Rotate intrinsics.
+// Slang-specific intrinsics to rotate values within a subgroup (WaveRotate: glsl/metal/spirv; WaveClusteredRotate: glsl/spirv).
+//
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+T WaveRotate(T value, uint delta)
+{
+    shader_subgroup_preamble<T>(); // per-target checks on T: glsl extended-type extension, metal type validation
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupRotate";
+    case metal:
+        __intrinsic_asm "simd_shuffle_rotate_down";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+        };
+    }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+vector<T, N> WaveRotate(vector<T, N> value, uint delta)
+{
+    shader_subgroup_preamble<T>(); // checks are keyed on the element type T, not on vector<T, N>
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupRotate";
+    case metal:
+        __intrinsic_asm "simd_shuffle_rotate_down";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+        };
+    }
+}
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
+{
+    shader_subgroup_preamble<T>(); // per-target checks on T
+    __target_switch // no metal case: this overload is only declared for glsl and spirv
+    {
+    case glsl:
+        __intrinsic_asm "subgroupClusteredRotate";
+    case spirv: // same opcode as WaveRotate, with clusterSize passed as an extra trailing operand
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+        };
+    }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
+{
+    shader_subgroup_preamble<T>(); // checks are keyed on the element type T, not on vector<T, N>
+    __target_switch // no metal case: this overload is only declared for glsl and spirv
+    {
+    case glsl:
+        __intrinsic_asm "subgroupClusteredRotate";
+    case spirv: // same opcode as WaveRotate, with clusterSize passed as an extra trailing operand
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+        };
+    }
+}
+
 //
 // Quad Control intrinsics
 //
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index b62de0f08..f4ae94978 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -513,6 +513,10 @@ def SPV_KHR_shader_clock : _spirv_1_0;
 /// [EXT]
 def SPV_NV_shader_subgroup_partitioned : _spirv_1_0;
 
+/// Represents the SPIR-V extension that enables rotating values across invocations within a subgroup.
+/// [EXT]
+def SPV_KHR_subgroup_rotate : _spirv_1_3;
+
 /// Represents the SPIR-V extension for ray tracing motion blur.
 /// [EXT]
 def SPV_NV_ray_tracing_motion_blur : _spirv_1_0;
@@ -640,6 +644,10 @@ def spvGroupNonUniformVote : _spirv_1_3;
 /// [EXT]
 def spvGroupNonUniformPartitionedNV : _spirv_1_3 + SPV_NV_shader_subgroup_partitioned;
 
+/// Represents the SPIR-V capability for group non-uniform rotate operations.
+/// [EXT]
+def spvGroupNonUniformRotateKHR : SPV_KHR_subgroup_rotate;
+
 /// Represents the SPIR-V capability for ray tracing motion blur.
 /// [EXT]
 def spvRayTracingMotionBlurNV : SPV_NV_ray_tracing_motion_blur;
@@ -777,6 +785,7 @@ def _GL_KHR_shader_subgroup_quad : _GLSL_140;
 def _GL_KHR_shader_subgroup_shuffle : _GLSL_140;
 def _GL_KHR_shader_subgroup_shuffle_relative : _GLSL_140;
 def _GL_KHR_shader_subgroup_vote : _GLSL_140;
+def _GL_KHR_shader_subgroup_rotate : _GLSL_140;
 
 def _GL_NV_compute_shader_derivatives : _GLSL_450;
 def _GL_NV_fragment_shader_barycentric : _GL_EXT_fragment_shader_barycentric;
@@ -982,6 +991,10 @@ alias GL_KHR_shader_subgroup_shuffle_relative = _GL_KHR_shader_subgroup_shuffle_
 /// [EXT]
 alias GL_KHR_shader_subgroup_vote = _GL_KHR_shader_subgroup_vote | spvGroupNonUniformVote;
 
+/// Represents the GL_KHR_shader_subgroup_rotate extension.
+/// [EXT]
+alias GL_KHR_shader_subgroup_rotate = _GL_KHR_shader_subgroup_rotate | spvGroupNonUniformRotateKHR;
+
 /// Represents the GL_NV_compute_shader_derivatives extension.
 /// [EXT]
 alias GL_NV_compute_shader_derivatives = _GL_NV_compute_shader_derivatives | SPV_KHR_compute_shader_derivatives | _sm_6_6;
@@ -2069,6 +2082,13 @@ alias subgroup_quad = GL_KHR_shader_subgroup_quad
 /// [Compound]
 alias subgroup_partitioned = GL_NV_shader_subgroup_partitioned + subgroup_ballot_activemask | _sm_6_5 | _cuda_sm_7_0;
 
+
+/// Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate'
+/// [Compound]
+alias subgroup_rotate = GL_KHR_shader_subgroup_rotate 
+                       | metal
+                       ;
+
 /// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
 /// [Compound]
 alias atomic_glsl_hlsl_nvapi_cuda_metal_float1 = atomic_glsl_float1 | hlsl_nvapi + _sm_4_0 | _cuda_sm_2_0 | metal;
diff --git a/tests/diagnostics/wave-operations-types.slang b/tests/diagnostics/wave-operations-types.slang
new file mode 100644
index 000000000..55a6a8e91
--- /dev/null
+++ b/tests/diagnostics/wave-operations-types.slang
@@ -0,0 +1,14 @@
+//DIAGNOSTIC_TEST:SIMPLE(filecheck=CHECK): -entry computeMain -stage compute -target metal
+
+RWStructuredBuffer<uint> out;
+
+[shader("compute")]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+    // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+    // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+    // CHECK: Unsupported type for subgroup operations in Metal. Valid types include
+    out[0] = WaveRotate(true, 1); // bool operand: expected to be rejected on Metal
+    out[1] = WaveRotate(uint8_t(dispatchID.x), 1); // 8-bit integer operand: expected to be rejected on Metal
+    out[2] = WaveRotate(uint64_t(dispatchID.x), 1); // 64-bit integer operand: expected to be rejected on Metal
+}
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
new file mode 100644
index 000000000..d52384c15
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
@@ -0,0 +1,133 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+#if defined(USE_GLSL_SYNTAX)
+#define __clusteredRotate subgroupClusteredRotate
+#else
+#define __clusteredRotate WaveClusteredRotate
+#endif
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#define SUBGROUP_SIZE 32
+#define DELTA 3
+#define CLUSTER_SIZE 8
+
+static uint threadIndex;
+static uint clusterIndex;
+static uint rotatedValue;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1ClusteredRotate()
+{
+    return __clusteredRotate(T(threadIndex), DELTA, CLUSTER_SIZE) == T(rotatedValue); // scalar T: each lane passes its index and compares against the rotatedValue set in computeMain
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVRClusteredRotate()
+{
+    typealias gvec = vector<T, N>; // input and expectation are broadcast to all N components; the non-GLSL path inspects only component [0] of the comparison
+
+#if defined(USE_GLSL_SYNTAX)
+    return (__clusteredRotate(gvec(T(threadIndex)), DELTA, CLUSTER_SIZE) == gvec(T(rotatedValue)));
+#else
+    return (__clusteredRotate(gvec(T(threadIndex)), DELTA, CLUSTER_SIZE) == gvec(T(rotatedValue)))[0];
+#endif
+}
+
+bool test1ClusteredRotateBool()
+{
+    bool isEvenLane = (threadIndex % 2 == 0); // even lanes contribute true, odd lanes false
+    bool expected = !isEvenLane; // DELTA is odd and CLUSTER_SIZE is even, so the source lane has the opposite parity
+    return __clusteredRotate(isEvenLane, DELTA, CLUSTER_SIZE) == expected;
+}
+
+__generic<let N : int>
+bool testVRClusteredRotateBool()
+{
+    typealias bvec = vector<bool, N>;
+    bool isEvenLane = (threadIndex % 2 == 0); // even lanes contribute true, odd lanes false
+    bool expected = !isEvenLane; // DELTA is odd and CLUSTER_SIZE is even, so the source lane has the opposite parity
+
+#if defined(USE_GLSL_SYNTAX)
+    return (__clusteredRotate(bvec(isEvenLane), DELTA, CLUSTER_SIZE) == bvec(expected));
+#else
+    return (__clusteredRotate(bvec(isEvenLane), DELTA, CLUSTER_SIZE) == bvec(expected))[0];
+#endif
+}
+
+bool testClusteredRotate()
+{
+    return true // AND of one scalar and three vector (N = 2, 3, 4) checks per element type
+        & test1ClusteredRotate<float>()
+        & testVRClusteredRotate<float, 2>()
+        & testVRClusteredRotate<float, 3>()
+        & testVRClusteredRotate<float, 4>()
+        & test1ClusteredRotate<half>()
+        & testVRClusteredRotate<half, 2>()
+        & testVRClusteredRotate<half, 3>()
+        & testVRClusteredRotate<half, 4>()
+        & test1ClusteredRotate<uint>()
+        & testVRClusteredRotate<uint, 2>()
+        & testVRClusteredRotate<uint, 3>()
+        & testVRClusteredRotate<uint, 4>()
+        & test1ClusteredRotate<uint16_t>()
+        & testVRClusteredRotate<uint16_t, 2>()
+        & testVRClusteredRotate<uint16_t, 3>()
+        & testVRClusteredRotate<uint16_t, 4>()
+        & test1ClusteredRotate<int>()
+        & testVRClusteredRotate<int, 2>()
+        & testVRClusteredRotate<int, 3>()
+        & testVRClusteredRotate<int, 4>()
+        & test1ClusteredRotate<int16_t>()
+        & testVRClusteredRotate<int16_t, 2>()
+        & testVRClusteredRotate<int16_t, 3>()
+        & testVRClusteredRotate<int16_t, 4>()
+        & test1ClusteredRotate<uint8_t>()
+        & testVRClusteredRotate<uint8_t, 2>()
+        & testVRClusteredRotate<uint8_t, 3>()
+        & testVRClusteredRotate<uint8_t, 4>()
+        & test1ClusteredRotate<uint64_t>()
+        & testVRClusteredRotate<uint64_t, 2>()
+        & testVRClusteredRotate<uint64_t, 3>()
+        & testVRClusteredRotate<uint64_t, 4>()
+        & test1ClusteredRotate<int8_t>()
+        & testVRClusteredRotate<int8_t, 2>()
+        & testVRClusteredRotate<int8_t, 3>()
+        & testVRClusteredRotate<int8_t, 4>()
+        & test1ClusteredRotate<int64_t>()
+        & testVRClusteredRotate<int64_t, 2>()
+        & testVRClusteredRotate<int64_t, 3>()
+        & testVRClusteredRotate<int64_t, 4>()
+        & test1ClusteredRotateBool()
+        & testVRClusteredRotateBool<2>()
+        & testVRClusteredRotateBool<3>()
+        & testVRClusteredRotateBool<4>()
+        ;
+}
+
+[shader("compute")]
+[numthreads(SUBGROUP_SIZE, 1, 1)]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+    threadIndex = dispatchID.x;
+    clusterIndex = dispatchID.x % CLUSTER_SIZE; // position of this lane inside its cluster
+
+    // Determine expected value of clustered rotate in current invocation.
+    // The values passed in are global invocation ids, and we rotate them within a cluster of size `CLUSTER_SIZE`.
+    uint clusterStart = (threadIndex / CLUSTER_SIZE) * CLUSTER_SIZE;
+    rotatedValue = clusterStart + ((threadIndex - clusterStart + DELTA) % CLUSTER_SIZE);
+
+    bool result = true
+            & testClusteredRotate()
+            ;
+
+    // CHECK: 1
+    outputBuffer[0] = uint(result);
+}
+
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
new file mode 100644
index 000000000..4b815c265
--- /dev/null
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
@@ -0,0 +1,134 @@
+//TEST_CATEGORY(wave, compute)
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL
+
+
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL -xslang -DUSE_GLSL_SYNTAX -allow-glsl
+
+
+#if defined(USE_GLSL_SYNTAX)
+#define __rotate subgroupRotate
+#else
+#define __rotate WaveRotate
+#endif
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+#define SUBGROUP_SIZE 32
+#define DELTA 3
+
+static uint threadIndex;
+static uint rotatedValue;
+
+__generic<T : __BuiltinArithmeticType>
+bool test1Rotate()
+{
+    return __rotate(T(threadIndex), DELTA) == T(rotatedValue); // scalar T: each lane passes its index and compares against the rotatedValue set in computeMain
+}
+
+__generic<T : __BuiltinArithmeticType, let N : int>
+bool testVRotate()
+{
+    typealias gvec = vector<T, N>; // input and expectation are broadcast to all N components; the non-GLSL path inspects only component [0] of the comparison
+
+#if defined(USE_GLSL_SYNTAX)
+    return (__rotate(gvec(T(threadIndex)), DELTA) == gvec(T(rotatedValue)));
+#else
+    return (__rotate(gvec(T(threadIndex)), DELTA) == gvec(T(rotatedValue)))[0];
+#endif
+}
+
+bool test1RotateBool()
+{
+    bool isEvenLane = (threadIndex % 2 == 0); // even lanes contribute true, odd lanes false
+    bool expected = !isEvenLane; // DELTA is odd and SUBGROUP_SIZE is even, so the source lane has the opposite parity
+    return __rotate(isEvenLane, DELTA) == expected;
+}
+
+__generic<let N : int>
+bool testVRotateBool()
+{
+    typealias bvec = vector<bool, N>;
+    bool isEvenLane = (threadIndex % 2 == 0); // even lanes contribute true, odd lanes false
+    bool expected = !isEvenLane; // DELTA is odd and SUBGROUP_SIZE is even, so the source lane has the opposite parity
+
+#if defined(USE_GLSL_SYNTAX)
+    return (__rotate(bvec(isEvenLane), DELTA) == bvec(expected));
+#else
+    return (__rotate(bvec(isEvenLane), DELTA) == bvec(expected))[0];
+#endif
+}
+
+bool testRotate()
+{
+    return true
+        & test1Rotate<float>()
+        & testVRotate<float, 2>()
+        & testVRotate<float, 3>()
+        & testVRotate<float, 4>()
+        & test1Rotate<half>()
+        & testVRotate<half, 2>()
+        & testVRotate<half, 3>()
+        & testVRotate<half, 4>()
+        & test1Rotate<uint>()
+        & testVRotate<uint, 2>()
+        & testVRotate<uint, 3>()
+        & testVRotate<uint, 4>()
+        & test1Rotate<uint16_t>()
+        & testVRotate<uint16_t, 2>()
+        & testVRotate<uint16_t, 3>()
+        & testVRotate<uint16_t, 4>()
+        & test1Rotate<int>()
+        & testVRotate<int, 2>()
+        & testVRotate<int, 3>()
+        & testVRotate<int, 4>()
+        & test1Rotate<int16_t>()
+        & testVRotate<int16_t, 2>()
+        & testVRotate<int16_t, 3>()
+        & testVRotate<int16_t, 4>()
+
+        // 8-bit integers, 64-bit integers and bool are rejected for subgroup rotate on Metal, so these cases are skipped there.
+#if !defined(METAL)
+        & test1Rotate<uint8_t>()
+        & testVRotate<uint8_t, 2>()
+        & testVRotate<uint8_t, 3>()
+        & testVRotate<uint8_t, 4>()
+        & test1Rotate<uint64_t>()
+        & testVRotate<uint64_t, 2>()
+        & testVRotate<uint64_t, 3>()
+        & testVRotate<uint64_t, 4>()
+        & test1Rotate<int8_t>()
+        & testVRotate<int8_t, 2>()
+        & testVRotate<int8_t, 3>()
+        & testVRotate<int8_t, 4>()
+        & test1Rotate<int64_t>()
+        & testVRotate<int64_t, 2>()
+        & testVRotate<int64_t, 3>()
+        & testVRotate<int64_t, 4>()
+        & test1RotateBool()
+        & testVRotateBool<2>()
+        & testVRotateBool<3>()
+        & testVRotateBool<4>()
+#endif
+        ;
+}
+
+[shader("compute")]
+[numthreads(SUBGROUP_SIZE, 1, 1)]
+void computeMain(uint3 dispatchID : SV_DispatchThreadID)
+{
+    threadIndex = dispatchID.x;
+    rotatedValue = (threadIndex + DELTA) % SUBGROUP_SIZE; // expected source lane; NOTE(review): assumes the subgroup has exactly SUBGROUP_SIZE lanes - confirm on targets with wider subgroups
+
+    bool result = true
+            & testRotate()
+            ;
+
+    // CHECK: 1
+    outputBuffer[0] = uint(result);
+}
+