Support the new CoopVec builtins (#7108)

**NOTE:** This is a breaking change for users of the POC variant of DXC. To keep compatibility, those users must add `-capability hlsl_coopvec_poc` to their command line. This PR adds a new capability, "hlsl_coopvec_poc". When it is used, the HLSL for CoopVec is emitted for the POC variant of DXC. When it is not used, the HLSL for CoopVec is emitted for the DXC release that officially supports cooperative vectors.
author: Jay Kwak <82421531+jkwak-work@users.noreply.github.com> 2025-05-15 02:57:47 +0000
committer: GitHub <noreply@github.com> 2025-05-14 19:57:47 -0700
commit: b4d3d3017640581c21b52a12413d3f074ab1c5c1 (patch)
tree: 2a3fc8350a590e7f342df328b7d9c4469ac40298 /source
parent: 2275e18fc052239fe67f3fda68252ad92bb83ca9 (diff)
6 files changed, 480 insertions, 103 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 34423d4f3..2382d4a9a 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -23068,6 +23068,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [ForceInline]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     __init()
     {
         this = CoopVec<T, N>(T(0));
@@ -23075,6 +23076,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [ForceInline]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     __init(T t)
     {
         this.fill(t);
@@ -23082,6 +23084,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [ForceInline]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     __init<U : __BuiltinArithmeticType>(CoopVec<U, N> other)
     {
         this.copyFrom(other);
@@ -23097,6 +23100,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [OverloadRank(-10)]
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     __init(int i)
     {
         this = CoopVec<T, N>(T(i));
@@ -23115,14 +23120,18 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// Copy values from another CoopVec instance into this one. The source CoopVec can have a different element type,
     /// in which case appropriate type conversion will be performed.
     /// @param other The source CoopVec to copy from.
-    [require(hlsl)] 
     [mutating] 
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void copyFrom<U : __BuiltinArithmeticType>(CoopVec<U,N> other)
     { 
         __target_switch 
         { 
-        case hlsl: __intrinsic_asm ".CopyFrom";
+        case hlsl:
+            __intrinsic_asm "$0 = $1";
+        case hlsl_coopvec_poc:
+            __intrinsic_asm ".CopyFrom";
         default:
             if (__isFloat<T>() && __isInt<U>())
                 this = __int_to_float_cast<T>(other);
@@ -23137,9 +23146,9 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     /// Fill all elements of this CoopVec with the specified value.
     /// @param t The value to fill all elements with.
-    [require(cooperative_vector)] 
     [mutating] 
-    [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void fill(T t)
     { 
         __target_switch
@@ -23151,10 +23160,15 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 result:$$CoopVec<T, N> = OpCompositeConstructReplicateEXT $t;
             };
         case hlsl:
-            case hlsl: __intrinsic_asm ".Fill"; 
+            for(int i = 0; i < N; ++i)
+                this[i] = t;
+            return;
+        case hlsl_coopvec_poc:
+            __intrinsic_asm ".Fill";
         default:
             for(int i = 0; i < N; ++i)
                 this[i] = t;
+            return;
         }
     }
 
@@ -23165,8 +23179,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// Store all elements of this CoopVec into a buffer at a specified offset.
     /// @param buffer The destination buffer to store the values into.
     /// @param byteOffset16ByteAligned The byte offset from the start of the buffer where the data will be stored. Must be 16-byte aligned.
-    [ForceInline]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void store(RWByteAddressBuffer buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -23178,17 +23192,19 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 // TODO: Should this be a byte offset
                 OpCooperativeVectorStoreNV $ptr $byteOffset16ByteAligned $this None;
             };
-#ifdef NOT_SUPPORTED_YET
         case hlsl:
-            this.__Store(buffer, byteOffset16ByteAligned);
-#endif
+                __intrinsic_asm "$1.Store< vector<$[0], $[1]> >($2, $0)", T, N;
+        case hlsl_coopvec_poc:
+            for(int i = 0; i < N; ++i)
+                buffer.StoreByteOffset(byteOffset16ByteAligned + __elemToByteOffset<T>(i), this[i]);
+            return;
         default:
             for(int i = 0; i < N; ++i)
                 buffer.StoreByteOffset(byteOffset16ByteAligned + __elemToByteOffset<T>(i), this[i]);
+            return;
         }
     }
 
-    [ForceInline]
     [require(cooperative_vector)]
     void store(RWStructuredBuffer<T> buffer, int32_t byteOffset16ByteAligned = 0)
     {
@@ -23201,10 +23217,6 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 // TODO: Should this be a byte offset
                 OpCooperativeVectorStoreNV $ptr $byteOffset16ByteAligned $this None;
             };
-#ifdef NOT_SUPPORTED_YET
-        case hlsl:
-            this.__Store(buffer, byteOffset16ByteAligned);
-#endif
         default:
             for(int i = 0; i < N; ++i)
                 buffer[i + __byteToElemOffset<T>(byteOffset16ByteAligned)] = this[i];
@@ -23213,19 +23225,23 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [ForceInline]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)] // second requirement set: the POC-DXC capability introduced by this change
     void store<let M : int>(__ref groupshared T[M] data, int32_t byteOffset16ByteAligned = 0)
     {
+        static_assert(N <= M, "The destination vector size is smaller than the input."); // compile-time check that the M-element array can hold all N elements; the offset is not part of this check
         __target_switch
         {
         case spirv:
             spirv_asm{
                 OpCooperativeVectorStoreNV &data $byteOffset16ByteAligned $this None;
             };
-        case hlsl:
+        case hlsl_coopvec_poc: // only the POC capability uses __Store now; this switch no longer has a plain `hlsl` case
             this.__Store(data, __byteToElemOffset<T>(byteOffset16ByteAligned));
+            return; // explicit exit after the POC store
         default:
             for(int i = 0; i < N; ++i)
                 data[i + __byteToElemOffset<T>(byteOffset16ByteAligned)] = this[i];
+            return; // explicit exit after the element-wise copy
         }
     }
 
@@ -23236,6 +23252,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [require(spirv, cooperative_vector)]
     void storeAny<U, let M : int>(__ref groupshared U[M] data, int32_t byteOffset16ByteAligned = 0)
     {
+        static_assert(N <= M, "The destination vector size is smaller than the input.");
         __target_switch
         {
         case spirv:
@@ -23262,9 +23279,9 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// @param buffer The source buffer to load data from.
     /// @param byteOffset16ByteAligned The byte offset from the start of the buffer. Must be 16-byte aligned.
     /// @return A new cooperative vector containing the loaded values.
-    [ForceInline]
     [__NoSideEffect]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     static CoopVec<T, N> load(ByteAddressBuffer buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -23276,6 +23293,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV $ptr $byteOffset16ByteAligned None;
             };
         case hlsl:
+        case hlsl_coopvec_poc:
             CoopVec<T, N> ret;
             ret.__Load(buffer, byteOffset16ByteAligned);
             return ret;
@@ -23288,9 +23306,9 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
         return CoopVec<T, N>();
     }
 
-    [ForceInline]
     [__NoSideEffect]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     static CoopVec<T, N> load(RWByteAddressBuffer buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -23302,6 +23320,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV $ptr $byteOffset16ByteAligned None;
             };
         case hlsl:
+        case hlsl_coopvec_poc:
             CoopVec<T, N> ret;
             ret.__Load(buffer, byteOffset16ByteAligned);
             return ret;
@@ -23314,9 +23333,9 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
         return CoopVec<T, N>();
     }
 
-    [ForceInline]
     [__NoSideEffect]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     static CoopVec<T, N> load(StructuredBuffer<T> buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -23327,12 +23346,6 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             {
                 result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV $ptr $byteOffset16ByteAligned None;
             };
-#ifdef NOT_SUPPORTED_YET
-        case hlsl:
-            CoopVec<T, N> ret;
-            ret.__Load(buffer, byteOffset16ByteAligned);
-            return ret;
-#endif
         default:
             var vec = CoopVec<T, N>();
             for(int i = 0; i < N; ++i)
@@ -23342,7 +23355,6 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
         return CoopVec<T, N>();
     }
 
-    [ForceInline]
     [__NoSideEffect]
     [require(spirv, cooperative_vector)]
     static CoopVec<T, N> load(RWStructuredBuffer<T> buffer, int32_t byteOffset16ByteAligned = 0)
@@ -23355,12 +23367,6 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             {
                 result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV $ptr $byteOffset16ByteAligned None;
             };
-#ifdef NOT_SUPPORTED_YET
-        case hlsl:
-            CoopVec<T, N> ret;
-            ret.__Load(buffer, byteOffset16ByteAligned);
-            return ret;
-#endif
         default:
             var vec = CoopVec<T, N>();
             for(int i = 0; i < N; ++i)
@@ -23373,8 +23379,10 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [__NoSideEffect]
     [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     static CoopVec<T, N> load<let M : int>(__constref groupshared const T[M] data, int32_t byteOffset16ByteAligned = 0)
     {
+        static_assert(N <= M, "The destination vector size is smaller than the input.");
         __target_switch
         {
         case spirv:
@@ -23382,6 +23390,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV &data $byteOffset16ByteAligned None
             };
         case hlsl:
+        case hlsl_coopvec_poc:
             CoopVec<T, N> ret;
             ret.__Load(data, __byteToElemOffset<T>(byteOffset16ByteAligned));
             return ret;
@@ -23403,6 +23412,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [require(spirv, cooperative_vector)]
     static CoopVec<T, N> loadAny<U : __BuiltinArithmeticType, let M : int>(__constref groupshared const U[M] data, int32_t byteOffset16ByteAligned = 0)
     {
+        static_assert(N <= M, "The destination vector size is smaller than the input.");
         __target_switch
         {
         case spirv:
@@ -23452,22 +23462,28 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
         [ForceInline]
         [__NoSideEffect]
         [nonmutating]
+        [require(cooperative_vector)] // element read: available under the cooperative_vector capability...
+        [require(hlsl_coopvec_poc)] // ...and under the POC-DXC capability
         get
         {
             __target_switch
             {
-            case hlsl: __intrinsic_asm ".ReadFromIndex";
+            case hlsl_coopvec_poc: // the member-style read is now emitted only for the POC capability
+                __intrinsic_asm ".ReadFromIndex";
             default: return __indexRead(index);
             }
         }
 
         [ForceInline]
         [mutating]
+        [require(cooperative_vector)] // element write: available under the cooperative_vector capability...
+        [require(hlsl_coopvec_poc)] // ...and under the POC-DXC capability
         set
         {
             __target_switch
             {
-            case hlsl: __intrinsic_asm ".WriteToIndex";
+            case hlsl_coopvec_poc: // the member-style write is now emitted only for the POC capability
+                __intrinsic_asm ".WriteToIndex";
             default: __indexRef(index) = newValue;
             }
         }
@@ -23483,6 +23499,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// Creates a new cooperative vector with all elements initialized to the specified scalar value.
     /// @param t The scalar value to replicate across all elements.
     /// @return A new cooperative vector where each element equals the input value.
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     static CoopVec<T, N> replicate(T t)
     {
         CoopVec<T, N> ret;
@@ -23561,12 +23579,17 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     This __pureAdd(This other);
 
     [mutating]
+    [ForceInline] // __mutAdd: mutating helper that adds `other` into this vector; the emitted HLSL depends on the case below
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutAdd(This other)
     {
         __target_switch 
         { 
-        case hlsl: __intrinsic_asm ".Add"; 
+        case hlsl: // non-POC hlsl: emit a compound-assignment expression
+            __intrinsic_asm "$0 += $1"; // presumably $0 is `this` and $1 is `other` — confirm against the intrinsic-asm placeholder rules
+        case hlsl_coopvec_poc: // POC variant of DXC: keep the previous member-style intrinsic
+            __intrinsic_asm ".Add";
         } 
     }
 
@@ -23575,11 +23598,15 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// @return A new cooperative vector containing the sum of the two vectors.
     // TODO: Why is this ForceInline necessary for hlsl, dxc bug?
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This add(This other)
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "$0 + $1"; 
+        case hlsl_coopvec_poc:
             This ret = this;
             ret.__mutAdd(other);
             return ret;
@@ -23592,18 +23619,29 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutSub(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Subtract"; } }
+    { // __mutSub: mutating helper that subtracts `other` from this vector
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm "$0 -= $1"; // non-POC hlsl: compound assignment; presumably $0 is `this`, $1 is `other`
+        case hlsl_coopvec_poc: __intrinsic_asm ".Subtract"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     /// Performs component-wise subtraction with another cooperative vector.
     /// @param other The cooperative vector to subtract from this vector.
     /// @return A new cooperative vector containing the difference of the two vectors.
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This sub(This other)
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "$0 - $1";
+        case hlsl_coopvec_poc:
             This ret = this;
             ret.__mutSub(other);
             return ret;
@@ -23616,18 +23654,29 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutMul(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Multiply"; } }
+    { // __mutMul: mutating helper that multiplies this vector by `other`
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm "$0 *= $1"; // non-POC hlsl: compound assignment; presumably $0 is `this`, $1 is `other`
+        case hlsl_coopvec_poc: __intrinsic_asm ".Multiply"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     /// Performs component-wise multiplication with another cooperative vector.
     /// @param other The cooperative vector to multiply with this vector.
     /// @return A new cooperative vector containing the product of the two vectors.
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This mul(This other)
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "$0 * $1";
+        case hlsl_coopvec_poc:
             This ret = this;
             ret.__mutMul(other);
             return ret;
@@ -23640,18 +23689,29 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutDiv(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Divide"; } }
+    { // __mutDiv: mutating helper that divides this vector by `other`
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm "$0 /= $1"; // non-POC hlsl: compound assignment; presumably $0 is `this`, $1 is `other`
+        case hlsl_coopvec_poc: __intrinsic_asm ".Divide"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     /// Performs component-wise division with another cooperative vector.
     /// @param other The cooperative vector to divide this vector by.
     /// @return A new cooperative vector containing the quotient of the two vectors.
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This div(This other)
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "$0 / $1";
+        case hlsl_coopvec_poc:
             This ret = this;
             ret.__mutDiv(other);
             return ret;
@@ -23661,18 +23721,29 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutMod(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Mod"; } }
+    { // __mutMod: mutating helper that replaces this vector with its remainder by `other`
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm "$0 %= $1"; // non-POC hlsl: compound remainder-assignment, using the same $0/$1 placeholders as the sibling __mut* helpers
+        case hlsl_coopvec_poc: __intrinsic_asm ".Mod"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     /// Performs component-wise remainder operation between two cooperative vectors.
     /// @param other The cooperative vector to compute the remainder with.
     /// @return A new cooperative vector containing the remainder of the division between corresponding components.
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This mod(This other)
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "$0 % $1";
+        case hlsl_coopvec_poc:
             This ret = this;
             ret.__mutMod(other);
             return ret;
@@ -23690,11 +23761,15 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     /// Returns a new cooperative vector where each component has its sign negated.
     /// @return A new cooperative vector containing the negated values.
     //[ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     This neg()
     {
         __target_switch
         {
         case hlsl:
+            __intrinsic_asm "-$0";
+        case hlsl_coopvec_poc:
             This ret = this;
             for(int i = 0; i < N; ++i)
                 ret[i] = -this[i];
@@ -23705,54 +23780,130 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutScalarMul(T t)
-    { __target_switch { case hlsl: __intrinsic_asm ".ScalarMultiply"; } }
+    { // __mutScalarMul: mutating helper that scales this vector by the scalar `t`
+        __target_switch
+        {
+        case hlsl: __intrinsic_asm "$0 *= $1"; // non-POC hlsl: compound assignment; presumably $0 is `this`, $1 is `t`
+        case hlsl_coopvec_poc: __intrinsic_asm ".ScalarMultiply"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutMin(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Min"; } }
+    { // __mutMin: mutating min helper; only the POC case emits code
+        __target_switch
+        {
+        case hlsl: static_assert(false, "Not supported"); // non-POC hlsl has no in-place form here: selecting this case is a compile-time error
+        case hlsl_coopvec_poc: __intrinsic_asm ".Min"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutMax(This other)
-    { __target_switch { case hlsl: __intrinsic_asm ".Max"; } }
+    { // __mutMax: mutating max helper; only the POC case emits code
+        __target_switch
+        {
+        case hlsl: static_assert(false, "Not supported"); // non-POC hlsl has no in-place form here: selecting this case is a compile-time error
+        case hlsl_coopvec_poc: __intrinsic_asm ".Max"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     [mutating]
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __mutClamp(This minVal, This maxVal)
-    { __target_switch { case hlsl: __intrinsic_asm ".Clamp"; } }
+    { // __mutClamp: mutating clamp helper; only the POC case emits code
+        __target_switch
+        {
+        case hlsl: static_assert(false, "Not supported"); // non-POC hlsl has no in-place form here: selecting this case is a compile-time error
+        case hlsl_coopvec_poc: __intrinsic_asm ".Clamp"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     //
     // Internal utilities for loading and storing
     //
 
     [mutating]
+    [ForceInline] // __Load (ByteAddressBuffer): fills this vector from `buffer` at `byteOffset`
     [require(hlsl, byteaddressbuffer)]
+    [require(hlsl_coopvec_poc, byteaddressbuffer)] // same buffer requirement under the POC-DXC capability
     void __Load(const ByteAddressBuffer buffer, uint byteOffset, uint alignment = 0)
-    { __target_switch { case hlsl: __intrinsic_asm ".Load"; } }
+    {
+        __target_switch
+        {
+        case hlsl: // non-POC hlsl: assign the result of a templated buffer Load
+            __intrinsic_asm "$0 = $1.Load< vector<$[0], $[1]> >($2)", T, N; // only $0..$2 are referenced, so `alignment` is unused here; presumably $[0]/$[1] expand to the trailing T and N — confirm
+        case hlsl_coopvec_poc: // POC variant of DXC: previous member-style intrinsic
+            __intrinsic_asm ".Load";
+        }
+    }
 
     [mutating]
+    [ForceInline] // __Load (RWByteAddressBuffer): fills this vector from `buffer` at `byteOffset`
     [require(hlsl, byteaddressbuffer_rw)]
+    [require(hlsl_coopvec_poc, byteaddressbuffer_rw)] // same buffer requirement under the POC-DXC capability
     void __Load(const RWByteAddressBuffer buffer, uint byteOffset, uint alignment = 0)
-    { __target_switch { case hlsl: __intrinsic_asm ".Load"; } }
+    {
+        __target_switch
+        {
+        case hlsl: // non-POC hlsl: assign the result of a templated buffer Load
+            __intrinsic_asm "$0 = $1.Load< vector<$[0], $[1]> >($2)", T, N; // only $0..$2 are referenced, so `alignment` is unused here; presumably $[0]/$[1] expand to the trailing T and N — confirm
+        case hlsl_coopvec_poc: // POC variant of DXC: previous member-style intrinsic
+            __intrinsic_asm ".Load";
+        }
+    }
 
     __generic<let M : int>
     [mutating]
     // Careful, this takes the offset in elements
+    [ForceInline] // __Load (groupshared): fills this vector from `buffer`, starting at element `elemOffset`
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     void __Load(__constref groupshared T buffer[M], uint elemOffset)
-    { __target_switch { case hlsl: __intrinsic_asm ".Load"; } }
+    {
+        static_assert(N <= M, "The given groupshared array is smaller than the given CoopVec"); // compile-time size check; `elemOffset` is not part of this check
+        __target_switch
+        {
+        case hlsl: // non-POC hlsl: element-by-element copy instead of an intrinsic
+            [ForceUnroll]
+            for(int i = 0; i < N; ++i)
+                this[i] = buffer[i + elemOffset]; // reads N consecutive elements beginning at elemOffset
+            return;
+        case hlsl_coopvec_poc: // POC variant of DXC: previous member-style intrinsic
+            __intrinsic_asm ".Load";
+        }
+    }
 
     [require(hlsl, byteaddressbuffer_rw)]
+    [require(hlsl_coopvec_poc, byteaddressbuffer_rw)] // same buffer requirement under the POC-DXC capability
     void __Store(RWByteAddressBuffer buffer, uint byteOffset, uint alignment = 0)
-    { __target_switch { case hlsl: __intrinsic_asm ".Store"; } }
+    { // __Store (RWByteAddressBuffer): internal store helper; only the POC case emits code
+        __target_switch
+        {
+        case hlsl: static_assert(false, "Not supported"); // non-POC hlsl: selecting this case is a compile-time error
+        case hlsl_coopvec_poc: __intrinsic_asm ".Store"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
     __generic<let M : int>
     [require(hlsl)]
+    [require(hlsl_coopvec_poc)] // also permitted under the POC-DXC capability
     // Careful, this takes the offset in elements
     void __Store(__ref groupshared T buffer[M], uint elemOffset)
-    { __target_switch { case hlsl: __intrinsic_asm ".Store"; } }
+    { // __Store (groupshared): internal store helper; only the POC case emits code
+        __target_switch
+        {
+        case hlsl: static_assert(false, "Not supported"); // non-POC hlsl: selecting this case is a compile-time error
+        case hlsl_coopvec_poc: __intrinsic_asm ".Store"; // POC variant of DXC: previous member-style intrinsic
+        }
+    }
 
 ${{{{
 static const struct {
@@ -23766,7 +23917,9 @@ static const struct {
 for(auto buffer : kByteAddressBufferCases) {
 }}}}
     [mutating]
+    [ForceInline]
     [require(hlsl, byteaddressbuffer_rw)]
+    [require(hlsl_coopvec_poc, byteaddressbuffer_rw)]
     void __mutMatMul<U : __BuiltinArithmeticType, let K : int>(
         CoopVec<U, K> input, uint inputInterpretationHLSL,
         $(buffer.type) matrix, uint matrixOffset, uint matrixInterpretationHLSL,
@@ -23774,12 +23927,30 @@ for(auto buffer : kByteAddressBufferCases) {
     {
         __target_switch
         {
-        case hlsl: __intrinsic_asm ".MatMul";
+        case hlsl:
+            if (T is __BuiltinSignedArithmeticType)
+            {
+                if (U is __BuiltinSignedArithmeticType)
+                    __intrinsic_asm "__builtin_MatVecMul($0, false,  $1, false, $2,  $3, $4, $5,  $6, $7, $8, $9, $10)";
+                else
+                    __intrinsic_asm "__builtin_MatVecMul($0, false,  $1,  true, $2,  $3, $4, $5,  $6, $7, $8, $9, $10)";
+            }
+            else
+            {
+                if (U is __BuiltinSignedArithmeticType)
+                    __intrinsic_asm "__builtin_MatVecMul($0, true,  $1, false, $2,  $3, $4, $5,  $6, $7, $8, $9, $10)";
+                else
+                    __intrinsic_asm "__builtin_MatVecMul($0, true,  $1,  true, $2,  $3, $4, $5,  $6, $7, $8, $9, $10)";
+            }
+        case hlsl_coopvec_poc:
+            __intrinsic_asm ".MatMul";
         }
     }
 
     [mutating]
+    [ForceInline]
     [require(hlsl, byteaddressbuffer_rw)]
+    [require(hlsl_coopvec_poc, byteaddressbuffer_rw)]
     void __mutMatMulAdd<U : __BuiltinArithmeticType, let K : int>(
         CoopVec<U, K> input, uint inputInterpretationHLSL,
         $(buffer.type) matrix, uint matrixOffset, uint matrixInterpretationHLSL,
@@ -23788,7 +23959,23 @@ for(auto buffer : kByteAddressBufferCases) {
     {
         __target_switch
         {
-        case hlsl: __intrinsic_asm ".MatMulAdd";
+        case hlsl:
+            if (T is __BuiltinSignedArithmeticType)
+            {
+                if (U is __BuiltinSignedArithmeticType)
+                    __intrinsic_asm "__builtin_MatVecMulAdd($0, false,  $1, false, $2,  $3, $4, $5,  $9, $10, $11, $12, $13,  $6, $7, $8)";
+                else
+                    __intrinsic_asm "__builtin_MatVecMulAdd($0, false,  $1,  true, $2,  $3, $4, $5,  $9, $10, $11, $12, $13,  $6, $7, $8)";
+            }
+            else
+            {
+                if (U is __BuiltinSignedArithmeticType)
+                    __intrinsic_asm "__builtin_MatVecMulAdd($0, true,  $1, false, $2,  $3, $4, $5,  $9, $10, $11, $12, $13,  $6, $7, $8)";
+                else
+                    __intrinsic_asm "__builtin_MatVecMulAdd($0, true,  $1,  true, $2,  $3, $4, $5,  $9, $10, $11, $12, $13,  $6, $7, $8)";
+            }
+        case hlsl_coopvec_poc:
+            __intrinsic_asm ".MatMulAdd";
         }
     }
 
@@ -23806,7 +23993,8 @@ for(auto buffer : kByteAddressBufferCases) {
     /// can be packed into each element of the input vector. The k parameter specifies the actual number of
     /// values to use from the packed input.
     [mutating]
-    [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void matMulAccumPacked<U : __BuiltinArithmeticType, let PackedK : int>(
         CoopVec<U, PackedK> input,
         constexpr CoopVecComponentType inputInterpretation,
@@ -23870,6 +24058,8 @@ for(auto buffer : kByteAddressBufferCases) {
     /// @param matrixStride The stride in bytes between rows/columns of the matrix.
     [mutating]
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void matMulAccum<U : __BuiltinArithmeticType, let K : int>(
         CoopVec<U, K> input,
         constexpr CoopVecComponentType inputInterpretation,
@@ -23897,7 +24087,8 @@ for(auto buffer : kByteAddressBufferCases) {
     }
 
     [mutating]
-    [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void matMulAddAccumPacked<U : __BuiltinArithmeticType, let PackedK : int>(
         CoopVec<U, PackedK> input,
         constexpr CoopVecComponentType inputInterpretation,
@@ -23976,6 +24167,8 @@ for(auto buffer : kByteAddressBufferCases) {
     /// while matMulAddAccumPacked allows k to be specified independently for packed interpretations.
     [mutating]
     [ForceInline]
+    [require(cooperative_vector)]
+    [require(hlsl_coopvec_poc)]
     void matMulAddAccum<U : __BuiltinArithmeticType, let K : int>(
         CoopVec<U, K> input,
         constexpr CoopVecComponentType inputInterpretation,
@@ -24008,6 +24201,24 @@ for(auto buffer : kByteAddressBufferCases) {
         );
     }
 
+    [ForceInline] // __OuterProductAccumulate: thin wrapper that forwards to the HLSL builtin named in the asm string below
+    [require(hlsl, byteaddressbuffer_rw)] // no hlsl_coopvec_poc requirement is declared for this helper
+    void __OuterProductAccumulate<let K : int>(
+        CoopVec<T, K> b, // second vector operand; may have a different length (K) than this vector (N)
+        $(buffer.type) matrix, // buffer type is substituted from the enclosing kByteAddressBufferCases loop
+        int32_t matrixOffset,
+        uint matrixStride,
+        uint memoryLayout,
+        uint matrixInterpretation,
+    )
+    {
+        __target_switch
+        {
+        case hlsl: // the only case: no other target is handled here
+            __intrinsic_asm "__builtin_OuterProductAccumulate($0, $1, $2, $3, $6, $5, $4)"; // last three operands are emitted in reverse order; presumably interpretation, layout, stride — confirm against the builtin's signature
+        }
+    }
+
 
 ${{{{
 }
@@ -24040,6 +24251,7 @@ CoopVec<T,N> __float_to_int_cast(CoopVec<U,N> val);
 __generic<T : __BuiltinArithmeticType, let N : int>
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> operator *(CoopVec<T, N> lhs, const T rhs)
 {
     __target_switch
@@ -24061,6 +24273,8 @@ CoopVec<T, N> operator *(CoopVec<T, N> lhs, const T rhs)
             return lhs;
         }
     case hlsl:
+        __intrinsic_asm "$0 * $1";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = lhs;
         ret.__mutScalarMul(rhs);
         return ret;
@@ -24076,6 +24290,7 @@ CoopVec<T, N> operator *(CoopVec<T, N> lhs, const T rhs)
 __generic<T : __BuiltinArithmeticType, let N : int>
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> operator *(const T lhs, CoopVec<T, N> rhs)
 {
     return rhs * lhs;
@@ -24083,6 +24298,7 @@ CoopVec<T, N> operator *(const T lhs, CoopVec<T, N> rhs)
 
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -24093,6 +24309,8 @@ CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
             result:$$CoopVec<T, N> = OpExtInst glsl450 FMin $x $y;
         };
     case hlsl:
+        __intrinsic_asm "min($0, $1)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutMin(y);
         return ret;
@@ -24107,6 +24325,7 @@ CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
 
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> max<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -24117,6 +24336,8 @@ CoopVec<T, N> max<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
             result:$$CoopVec<T, N> = OpExtInst glsl450 FMax $x $y;
         };
     case hlsl:
+        __intrinsic_asm "max($0, $1)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutMax(y);
         return ret;
@@ -24130,6 +24351,7 @@ CoopVec<T, N> max<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
 
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> clamp<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> minVal, CoopVec<T, N> maxVal)
 {
     __target_switch
@@ -24140,6 +24362,8 @@ CoopVec<T, N> clamp<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x
             result:$$CoopVec<T, N> = OpExtInst glsl450 FClamp $x $minVal $maxVal;
         };
     case hlsl:
+        __intrinsic_asm "clamp($0, $1, $2)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutClamp(minVal, maxVal);
         return ret;
@@ -24153,6 +24377,7 @@ CoopVec<T, N> clamp<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x
 
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> min<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -24173,6 +24398,8 @@ CoopVec<T, N> min<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVe
             };
         }
     case hlsl:
+        __intrinsic_asm "min($0, $1)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutMin(y);
         return ret;
@@ -24187,6 +24414,7 @@ CoopVec<T, N> min<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVe
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> max<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -24207,6 +24435,8 @@ CoopVec<T, N> max<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVe
             };
         }
     case hlsl:
+        __intrinsic_asm "max($0, $1)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutMax(y);
         return ret;
@@ -24220,6 +24450,7 @@ CoopVec<T, N> max<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVe
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> clamp<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> minVal, CoopVec<T, N> maxVal)
 {
     __target_switch
@@ -24240,6 +24471,8 @@ CoopVec<T, N> clamp<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, Coop
             };
         }
     case hlsl:
+        __intrinsic_asm "clamp($0, $1, $2)";
+    case hlsl_coopvec_poc:
         CoopVec<T, N> ret = x;
         ret.__mutClamp(minVal, maxVal);
         return ret;
@@ -24253,10 +24486,18 @@ CoopVec<T, N> clamp<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, Coop
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> step<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> edge, CoopVec<T, N> x)
 {
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "step($0, $1)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = step(edge[i], x[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24272,10 +24513,18 @@ CoopVec<T, N> step<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> ed
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> exp<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 {
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "exp($0)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = exp(x[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24291,10 +24540,18 @@ CoopVec<T, N> exp<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> log<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 {
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "log($0)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = log(x[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24310,10 +24567,18 @@ CoopVec<T, N> log<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> tanh<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 {
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "tanh($0)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = tanh(x[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24327,14 +24592,20 @@ CoopVec<T, N> tanh<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
     }
 }
 
-// TODO: Why does this fail when inlined on HLSL,
-// We generate some really weird code...
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> atan<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> yOverX)
 {
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "atan($0)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = atan(yOverX[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24350,6 +24621,7 @@ CoopVec<T, N> atan<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> yO
 
 // [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, N> fma<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> a, CoopVec<T, N> b, CoopVec<T, N> c)
 {
     // TODO: Investigate, why does this fail if it's not inlined
@@ -24357,6 +24629,13 @@ CoopVec<T, N> fma<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> a,
     // dxc generated substantially different code
     __target_switch
     {
+    case hlsl:
+        __intrinsic_asm "mad($0, $1, $2)";
+    case hlsl_coopvec_poc:
+        CoopVec<T, N> ret;
+        for(int i = 0; i < N; ++i)
+            ret[i] = mad(a[i], b[i], c[i]);
+        return ret;
     case spirv:
         return spirv_asm
         {
@@ -24612,36 +24891,77 @@ uint32_t __getSpvCoopVecComponentType(CoopVecComponentType componentType)
 [ForceInline]
 uint32_t __getHLSLCoopVecComponentType(CoopVecComponentType componentType)
 {
-    switch (componentType)
+    __target_switch // the integer returned for a given componentType differs per HLSL target variant
     {
-    case CoopVecComponentType::Float16:
-        return 0;
-    case CoopVecComponentType::Float32:
-        return 1;
-    case CoopVecComponentType::UnsignedInt8:
-        return 2;
-    case CoopVecComponentType::UnsignedInt16:
-        return 3;
-    case CoopVecComponentType::UnsignedInt32:
-        return 4;
-    case CoopVecComponentType::SignedInt8:
-        return 5;
-    case CoopVecComponentType::SignedInt16:
-        return 6;
-    case CoopVecComponentType::SignedInt32:
-        return 7;
-    case CoopVecComponentType::SignedInt8Packed:
-        return 8;
-    case CoopVecComponentType::UnsignedInt8Packed:
-        return 9;
-    case CoopVecComponentType::FloatE4M3:
-        return 10;
-    case CoopVecComponentType::FloatE5M2:
-        return 11;
-    default:
-        static_assert(false, "unsupported componentType value");
+    case hlsl: // numbering for plain HLSL output; NOTE(review): presumably DXC's official ComponentType enum - confirm against DXC
+        switch (componentType)
+        {
+        case CoopVecComponentType::SignedInt16:
+            return 2;
+        case CoopVecComponentType::UnsignedInt16:
+            return 3;
+        case CoopVecComponentType::SignedInt32:
+            return 4;
+        case CoopVecComponentType::UnsignedInt32:
+            return 5;
+        case CoopVecComponentType::SignedInt64:
+            return 6;
+        case CoopVecComponentType::UnsignedInt64:
+            return 7;
+        case CoopVecComponentType::Float16:
+            return 8;
+        case CoopVecComponentType::Float32:
+            return 9;
+        case CoopVecComponentType::Float64:
+            return 10;
+        case CoopVecComponentType::SignedInt8Packed: // note the jump from 10 to 17; values 11-16 are not produced here
+            return 17;
+        case CoopVecComponentType::UnsignedInt8Packed:
+            return 18;
+        case CoopVecComponentType::UnsignedInt8:
+            return 19;
+        case CoopVecComponentType::SignedInt8:
+            return 20;
+        case CoopVecComponentType::FloatE4M3:
+            return 21;
+        case CoopVecComponentType::FloatE5M2:
+            return 22;
+        default:
+            static_assert(false, "unsupported componentType value");
+        }
+        return 0; // ComponentType::Invalid
+    case hlsl_coopvec_poc: // legacy 0..11 numbering, kept for the hlsl_coopvec_poc capability
+        switch (componentType)
+        {
+        case CoopVecComponentType::Float16:
+            return 0;
+        case CoopVecComponentType::Float32:
+            return 1;
+        case CoopVecComponentType::UnsignedInt8:
+            return 2;
+        case CoopVecComponentType::UnsignedInt16:
+            return 3;
+        case CoopVecComponentType::UnsignedInt32:
+            return 4;
+        case CoopVecComponentType::SignedInt8:
+            return 5;
+        case CoopVecComponentType::SignedInt16:
+            return 6;
+        case CoopVecComponentType::SignedInt32:
+            return 7;
+        case CoopVecComponentType::SignedInt8Packed:
+            return 8;
+        case CoopVecComponentType::UnsignedInt8Packed:
+            return 9;
+        case CoopVecComponentType::FloatE4M3:
+            return 10;
+        case CoopVecComponentType::FloatE5M2:
+            return 11;
+        default:
+            static_assert(false, "unsupported componentType value");
+        }
+        return 32; // fallback after the switch; NOTE(review): the meaning of 32 is not shown in this file section
     }
-    return 32;
 }
 
 [ForceInline]
@@ -24711,6 +25031,7 @@ for(auto buffer : kByteAddressBufferCases_) {
 // need it
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 __generic<T : __BuiltinArithmeticType, let M : int, let PackedK : int, U : __BuiltinArithmeticType>
 CoopVec<T, M> coopVecMatMulPacked(
     CoopVec<U, PackedK> input,
@@ -24854,6 +25175,7 @@ CoopVec<T, M> coopVecMatMulPacked(
 /// @return A new cooperative vector containing the result of the matrix multiplication.
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 __generic<T : __BuiltinArithmeticType, let M : int, let K : int, U : __BuiltinArithmeticType>
 CoopVec<T, M> coopVecMatMul(
     CoopVec<U, K> input,
@@ -24900,6 +25222,7 @@ CoopVec<T, M> coopVecMatMul(
 /// values to use from the packed input.
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 CoopVec<T, M> coopVecMatMulAddPacked<T : __BuiltinArithmeticType, let M : int, let PackedK : int, U : __BuiltinArithmeticType>(
     CoopVec<U, PackedK> input,
     constexpr CoopVecComponentType inputInterpretation,
@@ -25037,6 +25360,7 @@ CoopVec<T, M> coopVecMatMulAddPacked<T : __BuiltinArithmeticType, let M : int, l
 /// @return A new cooperative vector containing the result of the matrix multiplication plus bias.
 [ForceInline]
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 __generic<T : __BuiltinArithmeticType, let M : int, let K : int, U : __BuiltinArithmeticType>
 CoopVec<T, M> coopVecMatMulAdd(
     CoopVec<U, K> input,
@@ -25087,6 +25411,7 @@ if(buffer.isRW)
 /// @param memoryLayout Specifies the memory layout of the matrix (row-major or column-major).
 /// @param matrixInterpretation Specifies how to interpret the values in the matrix.
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let N : int>(
     CoopVec<T, M> a,
     CoopVec<T, N> b,
@@ -25100,6 +25425,10 @@ void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let
     __target_switch
     {
     case hlsl:
+        uint matrixInterpretationHLSL = __getHLSLCoopVecComponentType(matrixInterpretation);
+        uint memoryLayoutHLSL = __getHLSLCoopVecMatrixLayout(memoryLayout);
+        return a.__OuterProductAccumulate(b, matrix, matrixOffset, matrixStride, memoryLayoutHLSL, matrixInterpretationHLSL);
+    case hlsl_coopvec_poc:
         __intrinsic_asm "$0.OuterProductAccumulate($1, $2, $3, $4, $5, $6)";
     case spirv:
         let matrixInterpretationSpirv : int = __getSpvCoopVecComponentType(matrixInterpretation);
@@ -25172,6 +25501,7 @@ void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let
 /// @param buffer The buffer to accumulate the sum into.
 /// @param offset Byte offset into the buffer.
 [require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
 void coopVecReduceSumAccumulate<T : __BuiltinArithmeticType, let N : int>(
     CoopVec<T, N> v,
     $(buffer.type) buffer,
@@ -25181,6 +25511,8 @@ void coopVecReduceSumAccumulate<T : __BuiltinArithmeticType, let N : int>(
     __target_switch
     {
     case hlsl:
+        __intrinsic_asm "__builtin_VectorAccumulate($0, $1, $2)";
+    case hlsl_coopvec_poc:
         __intrinsic_asm "$0.ReduceSumAccumulate($1, $2)";
     case spirv:
         let bufferPtr = buffer.GetBufferPointer();
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index 9d1a11f07..f509835aa 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -224,6 +224,10 @@ def hlsl_nvapi : hlsl;
 /// [Version]
 def hlsl_2018 : _sm_5_1;
 
+/// Represents compatibility support for the deprecated POC DXC
+/// [Version]
+def hlsl_coopvec_poc : _sm_6_8;
+
 /// Represents capabilities required for DXIL Library compilation.
 /// [Version]
 alias dxil_lib = _sm_6_3;
@@ -1118,7 +1122,7 @@ alias bufferreference_int64 = bufferreference + GL_EXT_shader_explicit_arithmeti
 /// Note that cpp and cuda are supported via a fallback non-cooperative implementation
 /// No HLSL shader model bound yet
 /// [Compound]
-alias cooperative_vector = _sm_6_8 | cpp | _cuda_sm_9_0 | spvCooperativeVectorNV;
+alias cooperative_vector = _sm_6_9 | cpp | _cuda_sm_9_0 | spvCooperativeVectorNV;
 /// Capabilities needed to train cooperative vectors
 /// [Compound]
 alias cooperative_vector_training = spvCooperativeVectorTrainingNV;
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 56668d092..141a843f2 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -109,6 +109,9 @@ CLikeSourceEmitter::CLikeSourceEmitter(const Desc& desc)
     m_codeGenContext = desc.codeGenContext;
     m_entryPointStage = desc.entryPointStage;
     m_effectiveProfile = desc.effectiveProfile;
+
+    auto targetCaps = getTargetReq()->getTargetCaps();
+    isCoopvecPoc = targetCaps.implies(CapabilityAtom::hlsl_coopvec_poc);
 }
 
 SlangResult CLikeSourceEmitter::init()
@@ -3118,18 +3121,32 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst)
         case kIROp_MakeCoopVector:
             {
                 emitType(coopVecType, getName(inst));
-                m_writer->emit(";\n");
+                m_writer->emit(isCoopvecPoc ? ";\n" : " = { ");
 
                 auto elemCount = as<IRIntLit>(coopVecType->getOperand(1));
                 IRIntegerValue elemCountValue = elemCount->getValue();
-                for (IRIntegerValue i = 0; i < elemCountValue; ++i)
+                if (isCoopvecPoc)
                 {
-                    m_writer->emit(getName(inst));
-                    m_writer->emit(".WriteToIndex(");
-                    m_writer->emit(i);
-                    m_writer->emit(", ");
+                    for (IRIntegerValue i = 0; i < elemCountValue; ++i)
+                    {
+                        m_writer->emit(getName(inst));
+                        m_writer->emit(".WriteToIndex(");
+                        m_writer->emit(i);
+                        m_writer->emit(", ");
+                        emitDereferenceOperand(inst->getOperand(i), getInfo(EmitOp::General));
+                        m_writer->emit(");\n");
+                    }
+                }
+                else
+                {
+                    IRIntegerValue i = 0;
+                    for (; i < elemCountValue - 1; ++i)
+                    {
+                        emitDereferenceOperand(inst->getOperand(i), getInfo(EmitOp::General));
+                        m_writer->emit(", ");
+                    }
                     emitDereferenceOperand(inst->getOperand(i), getInfo(EmitOp::General));
-                    m_writer->emit(");\n");
+                    m_writer->emit("};\n");
                 }
                 return;
             }
@@ -3138,7 +3155,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst)
             m_writer->emit(";\n");
 
             m_writer->emit(getName(inst));
-            m_writer->emit(".CopyFrom(");
+            m_writer->emit(isCoopvecPoc ? ".CopyFrom(" : " = (");
             emitCallExpr((IRCall*)inst, getInfo(EmitOp::General));
             m_writer->emit(");\n");
             return;
@@ -3147,7 +3164,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst)
             m_writer->emit(";\n");
 
             m_writer->emit(getName(inst));
-            m_writer->emit(".CopyFrom(");
+            m_writer->emit(isCoopvecPoc ? ".CopyFrom(" : " = (");
             emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General));
             m_writer->emit(");\n");
             return;
@@ -3409,10 +3426,21 @@ void CLikeSourceEmitter::_emitStoreImpl(IRStore* store)
     auto dstPtr = store->getPtr();
     if (isPointerOfType(dstPtr->getDataType(), kIROp_CoopVectorType))
     {
-        emitDereferenceOperand(dstPtr, getInfo(EmitOp::General));
-        m_writer->emit(".CopyFrom(");
-        emitDereferenceOperand(srcVal, getInfo(EmitOp::General));
-        m_writer->emit(");\n");
+        if (isCoopvecPoc)
+        {
+            emitDereferenceOperand(dstPtr, getInfo(EmitOp::General));
+            m_writer->emit(".CopyFrom(");
+            emitDereferenceOperand(srcVal, getInfo(EmitOp::General));
+            m_writer->emit(");\n");
+        }
+        else
+        {
+            auto prec = getInfo(EmitOp::Assign);
+            emitDereferenceOperand(dstPtr, leftSide(getInfo(EmitOp::General), prec));
+            m_writer->emit(" = ");
+            emitOperand(srcVal, rightSide(prec, getInfo(EmitOp::General)));
+            m_writer->emit(";\n");
+        }
     }
     else
     {
@@ -4705,7 +4733,7 @@ void CLikeSourceEmitter::emitVar(IRVar* varDecl)
             {
                 m_writer->emit(";\n");
                 m_writer->emit(getName(varDecl));
-                m_writer->emit(".CopyFrom(");
+                m_writer->emit(isCoopvecPoc ? ".CopyFrom(" : " = (");
                 emitDereferenceOperand(store->getVal()->getOperand(0), getInfo(EmitOp::General));
                 m_writer->emit(")");
             }
@@ -4713,7 +4741,7 @@ void CLikeSourceEmitter::emitVar(IRVar* varDecl)
             {
                 m_writer->emit(";\n");
                 m_writer->emit(getName(varDecl));
-                m_writer->emit(".CopyFrom(");
+                m_writer->emit(isCoopvecPoc ? ".CopyFrom(" : " = (");
                 emitCallExpr((IRCall*)store->getVal(), getInfo(EmitOp::General));
                 m_writer->emit(")");
             }
@@ -4726,13 +4754,14 @@ void CLikeSourceEmitter::emitVar(IRVar* varDecl)
                 {
                     m_writer->emit(";\n");
                     m_writer->emit(getName(varDecl));
-                    m_writer->emit(".WriteToIndex(");
+                    m_writer->emit(isCoopvecPoc ? ".WriteToIndex(" : "[");
                     m_writer->emit(i);
-                    m_writer->emit(", ");
+                    m_writer->emit(isCoopvecPoc ? ", " : "] = ");
                     emitDereferenceOperand(
                         store->getVal()->getOperand(i),
                         getInfo(EmitOp::General));
-                    m_writer->emit(")");
+                    if (isCoopvecPoc)
+                        m_writer->emit(")");
                 }
             }
             else
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index 28dcdcc4d..f38158c0a 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -738,6 +738,9 @@ protected:
     OrderedHashSet<IRStringLit*> m_requiredPreludes;
 
     Dictionary<const char*, IRStringLit*> m_builtinPreludes;
+
+    // Indicates whether we are emitting for the DXC cooperative vector POC.
+    bool isCoopvecPoc = false;
 };
 
 } // namespace Slang
diff --git a/source/slang/slang-emit-hlsl.cpp b/source/slang/slang-emit-hlsl.cpp
index 3813cf9cb..ba167676a 100644
--- a/source/slang/slang-emit-hlsl.cpp
+++ b/source/slang/slang-emit-hlsl.cpp
@@ -1153,7 +1153,7 @@ void HLSLSourceEmitter::emitVectorTypeNameImpl(IRType* elementType, IRIntegerVal
     // although we should not expect to run into types that don't
     // have a sugared form.
     //
-    m_writer->emit("vector<");
+    m_writer->emit(isCoopvecPoc ? "CoopVector<" : "vector<");
     emitType(elementType);
     m_writer->emit(",");
     m_writer->emit(elementCount);
@@ -1446,7 +1446,7 @@ void HLSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
     case kIROp_CoopVectorType:
         {
             auto coopVecType = (IRCoopVectorType*)type;
-            m_writer->emit("CoopVector<");
+            m_writer->emit(isCoopvecPoc ? "CoopVector<" : "vector<");
             emitType(coopVecType->getElementType());
             m_writer->emit(",");
             m_writer->emit(getIntVal(coopVecType->getElementCount()));
diff --git a/source/slang/slang.cpp b/source/slang/slang.cpp
index 67d13c34b..c7f6c920c 100644
--- a/source/slang/slang.cpp
+++ b/source/slang/slang.cpp
@@ -2347,7 +2347,16 @@ CapabilitySet TargetRequest::getTargetCaps()
 
     for (auto atomVal : optionSet.getArray(CompilerOptionName::Capability))
     {
-        auto toAdd = CapabilitySet((CapabilityName)atomVal.intValue);
+        CapabilitySet toAdd;
+        switch (atomVal.kind)
+        {
+        case CompilerOptionValueKind::Int:
+            toAdd = CapabilitySet(CapabilityName(atomVal.intValue));
+            break;
+        case CompilerOptionValueKind::String:
+            toAdd = CapabilitySet(findCapabilityName(atomVal.stringValue.getUnownedSlice()));
+            break;
+        }
 
         if (isGLSLTarget)
             targetCap.addSpirvVersionFromOtherAsGlslSpirvVersion(toAdd);
author	Jay Kwak <82421531+jkwak-work@users.noreply.github.com>	2025-05-15 02:57:47 +0000
committer	GitHub <noreply@github.com>	2025-05-14 19:57:47 -0700
commit	b4d3d3017640581c21b52a12413d3f074ab1c5c1 (patch)
tree	2a3fc8350a590e7f342df328b7d9c4469ac40298 /source
parent	2275e18fc052239fe67f3fda68252ad92bb83ca9 (diff)