10 files changed, 295 insertions, 7 deletions
diff --git a/docs/command-line-slangc-reference.md b/docs/command-line-slangc-reference.md
index b3cd2576a..7d18799ba 100644
--- a/docs/command-line-slangc-reference.md
+++ b/docs/command-line-slangc-reference.md
@@ -1112,6 +1112,7 @@ A capability describes an optional feature that a target may or may not support.
 * `hlsl_nvapi` 
 * `hlsl_2018` 
 * `hlsl_coopvec_poc` 
+* `optix_coopvec` 
 * `vertex` 
 * `fragment` 
 * `compute` 
diff --git a/docs/user-guide/a3-02-reference-capability-atoms.md b/docs/user-guide/a3-02-reference-capability-atoms.md
index 56809055b..9740806ea 100644
--- a/docs/user-guide/a3-02-reference-capability-atoms.md
+++ b/docs/user-guide/a3-02-reference-capability-atoms.md
@@ -153,10 +153,10 @@ Versions
 > Represents HLSL NVAPI support.
 
 `hlsl_2018`
-> Represet HLSL compatibility support.
+> Represents HLSL compatibility support.
 
 `hlsl_coopvec_poc`
-> Represet compatibility support for the deprecated POC DXC
+> Represents compatibility support for the deprecated DXC cooperative vector POC.
 
 `dxil_lib`
 > Represents capabilities required for DXIL Library compilation.
@@ -1322,6 +1322,9 @@ Other
 ----------------------
 *Capabilities which may be deprecated*
 
+`optix_coopvec`
+> Represents capabilities required for OptiX cooperative vector support.
+
 `SPIRV_1_0`
 > Use `spirv_1_0` instead
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index e00108e96..8b0bade6e 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -24382,6 +24382,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     __init<U : __BuiltinArithmeticType>(CoopVec<U, N> other)
     {
         this.copyFrom(other);
@@ -24421,6 +24422,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     void copyFrom<U : __BuiltinArithmeticType>(CoopVec<U,N> other)
     { 
         __target_switch 
@@ -24429,6 +24431,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             __intrinsic_asm "$0 = $1";
         case hlsl_coopvec_poc:
             __intrinsic_asm ".CopyFrom";
+        case optix_coopvec:
+            __intrinsic_asm "optixCoopVecCvt<$TR>(*($0));";
         default:
             if (__isFloat<T>() && __isInt<U>())
                 this = __int_to_float_cast<T>(other);
@@ -24438,7 +24442,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
                 this = __real_cast<T>(other);
             else if (__isInt<T>() && __isInt<U>())
                 this = __int_cast<T>(other);
-        } 
+        }
     }
 
     /// Fill all elements of this CoopVec with the specified value.
@@ -24591,6 +24595,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [__NoSideEffect]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     static CoopVec<T, N> load(ByteAddressBuffer buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -24606,6 +24611,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             CoopVec<T, N> ret;
             ret.__Load(buffer, byteOffset16ByteAligned);
             return ret;
+        case optix_coopvec:
+            __intrinsic_asm "optixCoopVecLoad<$TR>((CUdeviceptr)(&($0)));";
         default:
             var vec = CoopVec<T, N>();
             for(int i = 0; i < N; ++i)
@@ -24618,6 +24625,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [__NoSideEffect]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     static CoopVec<T, N> load(RWByteAddressBuffer buffer, int32_t byteOffset16ByteAligned = 0)
     {
         __target_switch
@@ -24633,6 +24641,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             CoopVec<T, N> ret;
             ret.__Load(buffer, byteOffset16ByteAligned);
             return ret;
+        case optix_coopvec:
+            __intrinsic_asm "optixCoopVecLoad<$TR>((CUdeviceptr)(&($0)));";
         default:
             var vec = CoopVec<T, N>();
             for(int i = 0; i < N; ++i)
@@ -24702,6 +24712,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [__NoSideEffect]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     static CoopVec<T, N> load<let M : int>(__constref groupshared const T[M] data, int32_t byteOffset16ByteAligned = 0)
     {
         static_assert(N <= M, "The destination vector size is smaller than the input.");
@@ -24716,6 +24727,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             CoopVec<T, N> ret;
             ret.__Load(data, __byteToElemOffset<T>(byteOffset16ByteAligned));
             return ret;
+        case optix_coopvec:
+            __intrinsic_asm "optixCoopVecLoad<$TR>((CUdeviceptr)(&($0)));";
         default:
             CoopVec<T,N> result;
             for(int i = 0; i < N; ++i)
@@ -24922,6 +24935,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     This add(This other)
     {
         __target_switch
@@ -24932,6 +24946,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             This ret = this;
             ret.__mutAdd(other);
             return ret;
+        case optix_coopvec: 
+            __intrinsic_asm "optixCoopVecAdd($0, $1)";
         default: return __pureAdd(other);
         }
     }
@@ -24957,6 +24973,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     This sub(This other)
     {
         __target_switch
@@ -24967,6 +24984,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             This ret = this;
             ret.__mutSub(other);
             return ret;
+        case optix_coopvec: 
+            __intrinsic_asm "optixCoopVecSub($0, $1)";
         default: return __pureSub(other);
         }
     }
@@ -24992,6 +25011,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
     [ForceInline]
     [require(cooperative_vector)]
     [require(hlsl_coopvec_poc)]
+    [require(optix_coopvec)]
     This mul(This other)
     {
         __target_switch
@@ -25002,6 +25022,8 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
             This ret = this;
             ret.__mutMul(other);
             return ret;
+        case optix_coopvec:
+            __intrinsic_asm "optixCoopVecMul($0, $1)";
         default: return __pureMul(other);
         }
     }
@@ -25621,6 +25643,7 @@ CoopVec<T, N> operator *(const T lhs, CoopVec<T, N> rhs)
 [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -25636,6 +25659,8 @@ CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
         CoopVec<T, N> ret = x;
         ret.__mutMin(y);
         return ret;
+    case optix_coopvec:
+        __intrinsic_asm "optixCoopVecMin($0, $1)";
     default:
         CoopVec<T, N> ret;
         for(int i = 0; i < N; ++i)
@@ -25648,6 +25673,7 @@ CoopVec<T, N> min<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
 [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 CoopVec<T, N> max<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x, CoopVec<T, N> y)
 {
     __target_switch
@@ -25663,6 +25689,8 @@ CoopVec<T, N> max<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x,
         CoopVec<T, N> ret = x;
         ret.__mutMax(y);
         return ret;
+    case optix_coopvec:
+        __intrinsic_asm "optixCoopVecMax($0, $1)";
     default:
         CoopVec<T, N> ret;
         for(int i = 0; i < N; ++i)
@@ -25809,6 +25837,7 @@ CoopVec<T, N> clamp<T : __BuiltinIntegerType, let N : int>(CoopVec<T, N> x, Coop
 // [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 CoopVec<T, N> step<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> edge, CoopVec<T, N> x)
 {
     __target_switch
@@ -25825,6 +25854,8 @@ CoopVec<T, N> step<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> ed
         {
             result:$$CoopVec<T, N> = OpExtInst glsl450 Step $edge $x;
         };
+    case optix_coopvec:
+        __intrinsic_asm "optixCoopVecStep($0, $1)";
     default:
         CoopVec<T, N> ret;
         for(int i = 0; i < N; ++i)
@@ -25890,6 +25921,43 @@ CoopVec<T, N> log<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 // [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
+CoopVec<T, N> log2<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x) // Element-wise base-2 logarithm of `x`.
+{
+    __target_switch
+    {
+    case optix_coopvec: // Emit the OptiX intrinsic call directly.
+        __intrinsic_asm "optixCoopVecLog2($0)";
+    default: // Any other target: apply scalar log2 to each of the N elements.
+        CoopVec<T, N> res;
+        for(int lane = 0; lane < N; ++lane)
+            res[lane] = log2(x[lane]);
+        return res;
+    }
+}
+
+// [ForceInline]
+[require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
+CoopVec<T, N> exp2<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x) // Element-wise 2 raised to `x`.
+{
+    __target_switch
+    {
+    case optix_coopvec: // Emit the OptiX intrinsic call directly.
+        __intrinsic_asm "optixCoopVecExp2($0)";
+    default: // Any other target: apply scalar exp2 to each of the N elements.
+        CoopVec<T, N> res;
+        for(int lane = 0; lane < N; ++lane)
+            res[lane] = exp2(x[lane]);
+        return res;
+    }
+}
+
+// [ForceInline]
+[require(cooperative_vector)]
+[require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 CoopVec<T, N> tanh<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
 {
     __target_switch
@@ -25906,6 +25974,8 @@ CoopVec<T, N> tanh<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> x)
         {
             result:$$CoopVec<T, N> = OpExtInst glsl450 Tanh $x;
         };
+    case optix_coopvec: 
+        __intrinsic_asm "optixCoopVecTanh($0)";
     default:
         CoopVec<T, N> ret;
         for(int i = 0; i < N; ++i)
@@ -25944,6 +26014,7 @@ CoopVec<T, N> atan<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> yO
 // [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 CoopVec<T, N> fma<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> a, CoopVec<T, N> b, CoopVec<T, N> c)
 {
     // TODO: Investigate, why does this fail if it's not inlined
@@ -25963,6 +26034,8 @@ CoopVec<T, N> fma<T : __BuiltinFloatingPointType, let N : int>(CoopVec<T, N> a,
         {
             result:$$CoopVec<T, N> = OpExtInst glsl450 Fma $a $b $c;
         };
+    case optix_coopvec: 
+        __intrinsic_asm "optixCoopVecFFMA($0, $1, $2)";
     default:
         CoopVec<T, N> ret;
         for(int i = 0; i < N; ++i)
@@ -26695,6 +26768,7 @@ CoopVec<T, M> coopVecMatMulAddPacked<T : __BuiltinArithmeticType, let M : int, l
 [ForceInline]
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 __generic<T : __BuiltinArithmeticType, let M : int, let K : int, U : __BuiltinArithmeticType>
 CoopVec<T, M> coopVecMatMulAdd(
     CoopVec<U, K> input,
@@ -26746,6 +26820,7 @@ if(buffer.isRW)
 /// @param matrixInterpretation Specifies how to interpret the values in the matrix.
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let N : int>(
     CoopVec<T, M> a,
     CoopVec<T, N> b,
@@ -26773,6 +26848,8 @@ void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let
             OpCapability CooperativeVectorTrainingNV;
             OpCooperativeVectorOuterProductAccumulateNV $matrixPtr $matrixOffset $a $b $memoryLayoutSpirv $matrixInterpretationSpirv $matrixStride;
         };
+    case optix_coopvec:
+        __intrinsic_asm "optixCoopVecOuterProductAccumulate($0, $1, (CUdeviceptr)(&$2), $3, $4)";
     default:
         for (int i = 0; i < M; ++i)
         {
@@ -26836,6 +26913,7 @@ void coopVecOuterProductAccumulate<T : __BuiltinArithmeticType, let M : int, let
 /// @param offset Byte offset into the buffer.
 [require(cooperative_vector)]
 [require(hlsl_coopvec_poc)]
+[require(optix_coopvec)]
 void coopVecReduceSumAccumulate<T : __BuiltinArithmeticType, let N : int>(
     CoopVec<T, N> v,
     $(buffer.type) buffer,
@@ -26855,6 +26933,8 @@ void coopVecReduceSumAccumulate<T : __BuiltinArithmeticType, let N : int>(
             OpCapability CooperativeVectorTrainingNV;
             OpCooperativeVectorReduceSumAccumulateNV $bufferPtr $offset $v;
         };
+    case optix_coopvec:
+        __intrinsic_asm "optixCoopVecReduceSumAccumulate($0, (CUdeviceptr)(&$1), $2)";
     default:
         for (int i = 0; i < N; ++i)
         {
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index 7616cc201..343f89687 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -220,11 +220,11 @@ def _sm_6_9 : _sm_6_8;
 def hlsl_nvapi : hlsl;
 
 
-/// Represet HLSL compatibility support.
+/// Represents HLSL compatibility support.
 /// [Version]
 def hlsl_2018 : _sm_5_1;
 
-/// Represet compatibility support for the deprecated POC DXC
+/// Represents compatibility support for the deprecated DXC cooperative vector POC.
 /// [Version]
 def hlsl_coopvec_poc : _sm_6_8;
 
@@ -244,6 +244,8 @@ def _cuda_sm_6_0 : _cuda_sm_5_0;
 def _cuda_sm_7_0 : _cuda_sm_6_0;
 def _cuda_sm_8_0 : _cuda_sm_7_0;
 def _cuda_sm_9_0 : _cuda_sm_8_0;
+/// Represents capabilities required for OptiX cooperative vector support.
+def optix_coopvec : _cuda_sm_9_0;
 
 /// All code-gen targets
 /// [Compound]
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 0092d159a..3fbf47bfa 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -112,6 +112,7 @@ CLikeSourceEmitter::CLikeSourceEmitter(const Desc& desc)
 
     auto targetCaps = getTargetReq()->getTargetCaps();
     isCoopvecPoc = targetCaps.implies(CapabilityAtom::hlsl_coopvec_poc);
+    isOptixCoopVec = targetCaps.implies(CapabilityAtom::optix_coopvec);
 }
 
 SlangResult CLikeSourceEmitter::init()
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index 78793f655..1e9deaa0d 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -744,6 +744,9 @@ protected:
 
     // Indicates if we are emiting for DXC cooperative vector POC.
     bool isCoopvecPoc = false;
+
+    // Indicates if we are emitting for OptiX cooperative vector.
+    bool isOptixCoopVec = false;
 };
 
 } // namespace Slang
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 6f97a11da..8e95cebfb 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -1152,8 +1152,16 @@ void CPPSourceEmitter::_emitType(IRType* type, DeclaratorInfo* declarator)
             auto arrayType = static_cast<IRArrayType*>(type);
             auto elementType = arrayType->getElementType();
             int elementCount = int(getIntVal(arrayType->getElementCount()));
-
-            m_writer->emit("FixedArray<");
+            auto nameHint = arrayType->findDecoration<IRNameHintDecoration>();
+            bool isCoopVec = nameHint && (nameHint->getName() == UnownedStringSlice("CoopVec"));
+            if (isCoopVec && isOptixCoopVec)
+            {
+                m_writer->emit("OptixCoopVec<");
+            }
+            else
+            {
+                m_writer->emit("FixedArray<");
+            }
             _emitType(elementType, nullptr);
             m_writer->emit(", ");
             m_writer->emit(elementCount);
diff --git a/tests/cooperative-vector/exp2.slang b/tests/cooperative-vector/exp2.slang
new file mode 100644
index 000000000..ddff55453
--- /dev/null
+++ b/tests/cooperative-vector/exp2.slang
@@ -0,0 +1,27 @@
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -render-feature cooperative-vector -output-using-type -emit-spirv-directly
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-dx12 -render-feature cooperative-vector -dx12-experimental -use-dxil -output-using-type -profile cs_6_8 -Xslang... -Xdxc -Vd -X. -capability hlsl_coopvec_poc
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -output-using-type
+
+// CHECK: type: float
+// CHECK-NEXT: 2.000000
+// CHECK-NEXT: 4.000000
+// CHECK-NEXT: 8.000000
+// CHECK-NEXT: 16.000000
+
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input
+ByteAddressBuffer input;
+
+[numthreads(1, 1, 1)]
+void computeMain()
+{
+    // Pull the four test inputs into a cooperative vector.
+    CoopVec<float, 4> exponents = coopVecLoad<4, float>(input);
+    // Raise 2 to each element, then copy the results out one by one.
+    CoopVec<float, 4> powers = exp2(exponents);
+    for(int lane = 0; lane < powers.getCount(); ++lane)
+        outputBuffer[lane] = powers[lane];
+}
diff --git a/tests/cooperative-vector/log2.slang b/tests/cooperative-vector/log2.slang
new file mode 100644
index 000000000..bacdf8fde
--- /dev/null
+++ b/tests/cooperative-vector/log2.slang
@@ -0,0 +1,26 @@
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -render-feature cooperative-vector -output-using-type -emit-spirv-directly
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-dx12 -render-feature cooperative-vector -dx12-experimental -use-dxil -output-using-type -profile cs_6_8 -Xslang... -Xdxc -Vd -X. -capability hlsl_coopvec_poc
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -output-using-type
+
+// CHECK: type: float
+// CHECK-NEXT: 0.000000
+// CHECK-NEXT: 1.000000
+// CHECK-NEXT: 1.584962
+// CHECK-NEXT: 2.000000
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input
+ByteAddressBuffer input;
+
+[numthreads(1, 1, 1)]
+void computeMain()
+{
+    // Pull the four test inputs into a cooperative vector.
+    CoopVec<float, 4> operands = coopVecLoad<4, float>(input);
+    // Take the base-2 logarithm of each element, then copy the results out one by one.
+    CoopVec<float, 4> logs = log2(operands);
+    for(int lane = 0; lane < logs.getCount(); ++lane)
+        outputBuffer[lane] = logs[lane];
+}
diff --git a/tests/cuda/optix-coopvec.slang b/tests/cuda/optix-coopvec.slang
new file mode 100644
index 000000000..58e83ebb9
--- /dev/null
+++ b/tests/cuda/optix-coopvec.slang
@@ -0,0 +1,137 @@
+//TEST:SIMPLE(filecheck=CHECK): -target cuda -capability optix_coopvec
+
+// CHECK: optixCoopVecLoad
+// CHECK: OptixCoopVec
+// CHECK: optixCoopVecTanh
+// CHECK: optixCoopVecAdd
+// CHECK: optixCoopVecCvt
+// CHECK: optixCoopVecFFMA
+// CHECK: optixCoopVecMax
+// CHECK: optixCoopVecMin
+// CHECK: optixCoopVecMul
+// CHECK: optixCoopVecOuterProductAccumulate
+// CHECK: optixCoopVecReduceSumAccumulate
+// CHECK: optixCoopVecStep
+// CHECK: optixCoopVecSub
+// CHECK: optixCoopVecLog2
+// CHECK: optixCoopVecExp2
+
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input1
+ByteAddressBuffer input1;
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input2
+ByteAddressBuffer input2;
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input3
+ByteAddressBuffer input3;
+
+//TEST_INPUT: set inputBuffer = ubuffer(data=[1 2 3 4 5 6 7 8 9 10 11 12], stride=4);
+uniform int32_t* inputBuffer;
+
+//TEST_INPUT:ubuffer(data=[67305985 134678021 202050057 269422093], stride=4),name=matrix
+// Bytes of the four words above (little-endian, four per word): [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+ByteAddressBuffer matrix;
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4),name=outputMat
+RWByteAddressBuffer outputMat;
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4),name=outputMat2
+RWByteAddressBuffer outputMat2;
+
+//TEST_INPUT:ubuffer(data=[5 6 7 8], stride=4),name=bias
+ByteAddressBuffer bias;
+
+struct RayPayload // Payload type taken as the `inout` parameter of closestHitShader.
+{
+    float4 color;
+    float2x4 lssData;
+    bool isSphere;
+    bool isLss;
+}; // NOTE(review): closestHitShader in this file never reads or writes any of these fields.
+
+
+// Closest-hit entry point exercising the CoopVec operations named by the CHECK lines in this file.
+[shader("closesthit")]
+void closestHitShader(inout RayPayload payload, in BuiltInTriangleIntersectionAttributes attr)
+{
+    CoopVec<float, 4> vec1 = coopVecLoad<4, float>(input1);
+    CoopVec<float, 4> vec2 = coopVecLoad<4, float>(input2);
+    CoopVec<float, 4> vec3 = coopVecLoad<4, float>(input3);
+
+    CoopVec<float, 4> resultTan = tanh(vec1);
+
+    let resultAdd = vec1 + vec2;
+
+    CoopVec<float, 4> resultCopy = coopVecLoad<4, float>(input1);
+    resultCopy.copyFrom<float>(vec2);
+
+    CoopVec<float, 4> resultFMA = fma(vec1, vec2, vec3);
+    // NOTE(review): no CHECK line in this file covers the coopVecMatMulAdd call below.
+    CoopVec<float, 4> vec = coopVecLoad<4, float>(input1);
+    let resultMul = coopVecMatMulAdd<float, 4, 4>(
+        vec,
+        CoopVecComponentType::Float32,
+        matrix,
+        0,
+        CoopVecComponentType::Float32,
+        bias,
+        0,
+        CoopVecComponentType::SignedInt32,
+        CoopVecMatrixLayout::RowMajor,
+        false,
+        4
+    );
+
+    CoopVec<float, 4> resultMax = max(vec1, vec2);
+    CoopVec<float, 4> resultMin = min(vec1, vec2);
+
+    CoopVec<float, 4> resultVecMul = vec1 * vec2;
+    // Write 1.0f at byte 0 of each accumulation target before accumulating into it.
+    outputMat.Store<float>(0, float(1));
+    coopVecOuterProductAccumulate(
+        vec1,
+        vec2,
+        outputMat,
+        0,
+        32,
+        CoopVecMatrixLayout::RowMajor,
+        CoopVecComponentType::Float32
+    );
+
+    outputMat2.Store<float>(0, float(1));
+    coopVecReduceSumAccumulate(
+        vec1,
+        outputMat2,
+        0
+    );
+
+    CoopVec<float, 4> resultStep = step(vec1, vec2);
+
+    CoopVec<float, 4> resultSub = vec1 - vec2;
+
+    CoopVec<float, 4> resultLog2 = log2(vec1);
+
+    CoopVec<float, 4> resultExp2 = exp2(vec1);
+    // Sum every result per element (presumably so none is optimized away); byte-address loads take byte offsets, hence i * 4.
+    for(int i = 0; i < resultTan.getCount(); ++i)
+    {
+        outputBuffer[i] = resultTan[i]  +
+                          resultAdd[i]  +
+                          resultCopy[i] +
+                          resultFMA[i]  +
+                          resultMul[i]  +
+                          resultMax[i]  +
+                          resultMin[i]  +
+                          resultVecMul[i] +
+                          outputMat.Load<float>(i * 4)  +
+                          outputMat2.Load<float>(i * 4) +
+                          resultStep[i] +
+                          resultSub[i]  +
+                          resultLog2[i] +
+                          resultExp2[i];
+    }
+}