Implement shader subgroup rotate intrinsics (#6878)

* Initial implementation for SPIRV, GLSL and Metal
* test add bool test
* Fix and improve subgroup rotate tests
* Add proper GLSL extensions and proper Metal type checking
* Clean up tests and add diagnostics test for subgroup type for Metal
* Update wave-intrinsics docs
author: Darren Wihandi <65404740+fairywreath@users.noreply.github.com> 2025-04-22 14:04:56 -0600
committer: GitHub <noreply@github.com> 2025-04-22 20:04:56 +0000
commit: ed5940a629ae05e9571bfe355d22f0728347dcb4 (patch)
tree: 90a36c6543f0ee3748b80112a478897b027dddab /source
parent: d5220b327632a8aeeb9a89494bb37bd82fec30cb (diff)
3 files changed, 200 insertions, 39 deletions
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang
index bbf0c40dd..85c8b174c 100644
--- a/source/slang/glsl.meta.slang
+++ b/source/slang/glsl.meta.slang
@@ -6110,45 +6110,6 @@ public void traceRayMotionNV(
     }
 }
 
-__generic<T : __BuiltinType>
-[ForceInline]
-void typeRequireChecks_shader_subgroup_GLSL() {
-    // the following is a seperate function call, since else the `__requireTargetExtension` and associated __intrinsic_asm is ignored if the calling function also calls an __intrinsic_asm
-    __target_switch
-    {
-    case glsl:
-        if (__type_equals<T, half>()
-            || __type_equals<T, float16_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
-        else if (__type_equals<T, uint8_t>()
-            || __type_equals<T, int8_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
-        else if (__type_equals<T, uint16_t>()
-            || __type_equals<T, int16_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
-        else if (__type_equals<T, uint64_t>()
-            || __type_equals<T, int64_t>()
-            ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
-
-        __intrinsic_asm "";
-    }
-}
-
-__generic<T : __BuiltinType>
-void shader_subgroup_preamble() {
-    // checks needed for shader_subgroup functions; __requireTargetExtension does not work
-    // (does not add the ext specified correctly to the compile output; using extended type
-    // will result in error for using the type)
-    __target_switch
-    {
-    case glsl:
-        typeRequireChecks_shader_subgroup_GLSL<T>();
-    default:
-        return;
-    }
-
-} 
-
 // GL_KHR_shader_subgroup_basic Built-in Variables
 
 [require(cpp_cuda_glsl_hlsl_spirv_wgsl, subgroup_basic)]
@@ -8176,6 +8137,37 @@ public vector<T,N> subgroupQuadSwapDiagonal(vector<T,N> value)
     return QuadReadAcrossDiagonal(value);
 }
 
+// GL_KHR_shader_subgroup_rotate
+
+__generic<T : __BuiltinType>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public T subgroupRotate(T value, uint delta)
+{
+    return WaveRotate(value, delta);
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_metal_spirv, subgroup_rotate)]
+public vector<T, N> subgroupRotate(vector<T, N> value, uint delta)
+{
+    return WaveRotate(value, delta);
+}
+
+__generic<T : __BuiltinType>
+[require(glsl_spirv, subgroup_rotate)]
+public T subgroupClusteredRotate(T value, uint delta, constexpr uint clusterSize)
+{
+    return WaveClusteredRotate(value, delta, clusterSize);
+
+}
+
+__generic<T : __BuiltinType, let N : int>
+[require(glsl_spirv, subgroup_rotate)]
+public vector<T, N> subgroupClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
+{
+    return WaveClusteredRotate(value, delta, clusterSize);
+}
+
 //// GLSL atomic
 
 // The following type internally is a Shader Storage Buffer 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c8a2c8c58..03321bfaf 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -16368,6 +16368,155 @@ bool IsHelperLane()
     }
 }
 
+//@hidden:
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(glsl)]
+void __requireGLSLShaderSubgroupTypeExtension()
+{
+    // This is a separate function because `__requireTargetExtension` and its associated __intrinsic_asm are ignored if the calling function also calls an __intrinsic_asm.
+    if (__type_equals<T, half>()
+        || __type_equals<T, float16_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_float16");
+    else if (__type_equals<T, uint8_t>()
+        || __type_equals<T, int8_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int8");
+    else if (__type_equals<T, uint16_t>()
+        || __type_equals<T, int16_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int16");
+    else if (__type_equals<T, uint64_t>()
+        || __type_equals<T, int64_t>()
+        ) __requireTargetExtension("GL_EXT_shader_subgroup_extended_types_int64");
+
+    __intrinsic_asm "";
+}
+
+__generic<T : __BuiltinType>
+[ForceInline]
+[require(metal)]
+void __checkMetalShaderSubgroupType()
+{
+    // These builtin types are not supported for Metal's `simd` operations.
+    if (__type_equals<T, uint8_t>()
+        || __type_equals<T, int8_t>()
+        || __type_equals<T, uint64_t>()
+        || __type_equals<T, int64_t>()
+        || __isBool<T>()
+        )
+    {
+        static_assert(false, "Unsupported type for subgroup operations in Metal. Valid types include scalars and vectors of uint/uint32_t, int/int32_t, uint16_t, int16_t, float, and half.");
+    }
+}
+
+__generic<T : __BuiltinType>
+void shader_subgroup_preamble()
+{
+    // Checks needed by the shader_subgroup functions. Calling __requireTargetExtension
+    // directly does not work: the specified extension is not added correctly to the compiled
+    // output, so using an extended type results in an error for using that type.
+    __target_switch
+    {
+    case glsl:
+        __requireGLSLShaderSubgroupTypeExtension<T>();
+    case metal:
+        __checkMetalShaderSubgroupType<T>();
+    default:
+        return;
+    }
+}
+
+//@public:
+
+//
+// Wave Rotate intrinsics.
+// These are Slang specific intrinsics to rotate values within a subgroup.
+//
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+T WaveRotate(T value, uint delta)
+{
+    shader_subgroup_preamble<T>();
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupRotate";
+    case metal:
+        __intrinsic_asm "simd_shuffle_rotate_down";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+        };
+    }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_metal_spirv, subgroup_rotate)]
+vector<T, N> WaveRotate(vector<T, N> value, uint delta)
+{
+    shader_subgroup_preamble<T>();
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupRotate";
+    case metal:
+        __intrinsic_asm "simd_shuffle_rotate_down";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta;
+        };
+    }
+}
+
+__generic<T : __BuiltinType>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
+{
+    shader_subgroup_preamble<T>();
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupClusteredRotate";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$T = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+        };
+    }
+}
+
+__generic<T : __BuiltinType, let N : int>
+__glsl_extension(GL_KHR_shader_subgroup_rotate)
+[require(glsl_spirv, subgroup_rotate)]
+vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
+{
+    shader_subgroup_preamble<T>();
+    __target_switch
+    {
+    case glsl:
+        __intrinsic_asm "subgroupClusteredRotate";
+    case spirv:
+        return spirv_asm
+        {
+            OpExtension "SPV_KHR_subgroup_rotate";
+            OpCapability GroupNonUniformRotateKHR;
+            result:$$vector<T,N> = OpGroupNonUniformRotateKHR Subgroup $value $delta $clusterSize;
+        };
+    }
+}
+
 //
 // Quad Control intrinsics
 //
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index b62de0f08..f4ae94978 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -513,6 +513,10 @@ def SPV_KHR_shader_clock : _spirv_1_0;
 /// [EXT]
 def SPV_NV_shader_subgroup_partitioned : _spirv_1_0;
 
+/// Represents the SPIR-V extension that enables rotating values across invocations within a subgroup.
+/// [EXT]
+def SPV_KHR_subgroup_rotate : _spirv_1_3;
+
 /// Represents the SPIR-V extension for ray tracing motion blur.
 /// [EXT]
 def SPV_NV_ray_tracing_motion_blur : _spirv_1_0;
@@ -640,6 +644,10 @@ def spvGroupNonUniformVote : _spirv_1_3;
 /// [EXT]
 def spvGroupNonUniformPartitionedNV : _spirv_1_3 + SPV_NV_shader_subgroup_partitioned;
 
+/// Represents the SPIR-V capability for group non-uniform rotate operations.
+/// [EXT]
+def spvGroupNonUniformRotateKHR : _spirv_1_3;
+
 /// Represents the SPIR-V capability for ray tracing motion blur.
 /// [EXT]
 def spvRayTracingMotionBlurNV : SPV_NV_ray_tracing_motion_blur;
@@ -777,6 +785,7 @@ def _GL_KHR_shader_subgroup_quad : _GLSL_140;
 def _GL_KHR_shader_subgroup_shuffle : _GLSL_140;
 def _GL_KHR_shader_subgroup_shuffle_relative : _GLSL_140;
 def _GL_KHR_shader_subgroup_vote : _GLSL_140;
+def _GL_KHR_shader_subgroup_rotate : _GLSL_140;
 
 def _GL_NV_compute_shader_derivatives : _GLSL_450;
 def _GL_NV_fragment_shader_barycentric : _GL_EXT_fragment_shader_barycentric;
@@ -982,6 +991,10 @@ alias GL_KHR_shader_subgroup_shuffle_relative = _GL_KHR_shader_subgroup_shuffle_
 /// [EXT]
 alias GL_KHR_shader_subgroup_vote = _GL_KHR_shader_subgroup_vote | spvGroupNonUniformVote;
 
+/// Represents the GL_KHR_shader_subgroup_rotate extension.
+/// [EXT]
+alias GL_KHR_shader_subgroup_rotate = _GL_KHR_shader_subgroup_rotate | spvGroupNonUniformRotateKHR;
+
 /// Represents the GL_NV_compute_shader_derivatives extension.
 /// [EXT]
 alias GL_NV_compute_shader_derivatives = _GL_NV_compute_shader_derivatives | SPV_KHR_compute_shader_derivatives | _sm_6_6;
@@ -2069,6 +2082,13 @@ alias subgroup_quad = GL_KHR_shader_subgroup_quad
 /// [Compound]
 alias subgroup_partitioned = GL_NV_shader_subgroup_partitioned + subgroup_ballot_activemask | _sm_6_5 | _cuda_sm_7_0;
 
+
+/// Capabilities required to use GLSL-style subgroup rotate operations 'subgroup_rotate'
+/// [Compound]
+alias subgroup_rotate = GL_KHR_shader_subgroup_rotate 
+                       | metal
+                       ;
+
 /// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
 /// [Compound]
 alias atomic_glsl_hlsl_nvapi_cuda_metal_float1 = atomic_glsl_float1 | hlsl_nvapi + _sm_4_0 | _cuda_sm_2_0 | metal;
author	Darren Wihandi <65404740+fairywreath@users.noreply.github.com>	2025-04-22 14:04:56 -0600
committer	GitHub <noreply@github.com>	2025-04-22 20:04:56 +0000
commit	ed5940a629ae05e9571bfe355d22f0728347dcb4 (patch)
tree	90a36c6543f0ee3748b80112a478897b027dddab /source
parent	d5220b327632a8aeeb9a89494bb37bd82fec30cb (diff)