Add optix support for coopvec (#7286)

* WiP: Add coopvec support for Optix * format code * fix minor issues * Fix review comments --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
author: Mukund Keshava <mkeshava@nvidia.com> 2025-06-10 10:18:24 +0530
committer: GitHub <noreply@github.com> 2025-06-10 04:48:24 +0000
commit: d70da65a90ccd73439895a43b3958c0ea1441f35 (patch)
tree: e6f0c1cd8413e3e213a29bf233b5fc3a3fdf2eaf /tests/cuda
parent: ab6b5f28d332f201fd96b7e05070116684d02899 (diff)
1 files changed, 137 insertions, 0 deletions
diff --git a/tests/cuda/optix-coopvec.slang b/tests/cuda/optix-coopvec.slang
new file mode 100644
index 000000000..58e83ebb9
--- /dev/null
+++ b/tests/cuda/optix-coopvec.slang
@@ -0,0 +1,137 @@
+//TEST:SIMPLE(filecheck=CHECK): -target cuda -capability optix_coopvec
+
+// CHECK: optixCoopVecLoad
+// CHECK: OptixCoopVec
+// CHECK: optixCoopVecTanh
+// CHECK: optixCoopVecAdd
+// CHECK: optixCoopVecCvt
+// CHECK: optixCoopVecFFMA
+// CHECK: optixCoopVecMax
+// CHECK: optixCoopVecMin
+// CHECK: optixCoopVecMul
+// CHECK: optixCoopVecOuterProductAccumulate
+// CHECK: optixCoopVecReduceSumAccumulate
+// CHECK: optixCoopVecStep
+// CHECK: optixCoopVecSub
+// CHECK: optixCoopVecLog2
+// CHECK: optixCoopVecExp2
+
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer; // written per element at the end of closestHitShader
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input1
+ByteAddressBuffer input1; // loaded into vec1, resultCopy and vec in closestHitShader
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input2
+ByteAddressBuffer input2; // loaded into vec2 in closestHitShader
+
+//TEST_INPUT:ubuffer(data=[1.0 2.0 3.0 4.0], stride=4),name=input3
+ByteAddressBuffer input3; // loaded into vec3 in closestHitShader
+
+//TEST_INPUT: set inputBuffer = ubuffer(data=[1 2 3 4 5 6 7 8 9 10 11 12], stride=4);
+uniform int32_t* inputBuffer; // not referenced by closestHitShader in this file
+
+//TEST_INPUT:ubuffer(data=[67305985 134678021 202050057 269422093], stride=4),name=matrix
+// Byte view of the four 32-bit words above, lowest byte first: [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+ByteAddressBuffer matrix; // matrix operand of coopVecMatMulAdd in closestHitShader
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4),name=outputMat
+RWByteAddressBuffer outputMat; // destination passed to coopVecOuterProductAccumulate
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4),name=outputMat2
+RWByteAddressBuffer outputMat2; // destination passed to coopVecReduceSumAccumulate
+
+//TEST_INPUT:ubuffer(data=[5 6 7 8], stride=4),name=bias
+ByteAddressBuffer bias; // bias operand of coopVecMatMulAdd in closestHitShader
+
+struct RayPayload // payload type of closestHitShader; the shader body in this file never touches its fields
+{
+    float4 color;
+    float2x4 lssData; // NOTE(review): "lss" presumably means linear swept spheres — confirm
+    bool isSphere;
+    bool isLss;
+};
+// Closest-hit entry point exercising CoopVec load, tanh, add, copyFrom, fma, matmul-add, max, min,
+// mul, outer-product/reduce-sum accumulate, step, sub, log2 and exp2; `payload`/`attr` are unused.
+[numthreads(1, 1, 1)]
+[shader("closesthit")]
+void closestHitShader(inout RayPayload payload, in BuiltInTriangleIntersectionAttributes attr)
+{
+    CoopVec<float, 4> vec1 = coopVecLoad<4, float>(input1);
+    CoopVec<float, 4> vec2 = coopVecLoad<4, float>(input2);
+    CoopVec<float, 4> vec3 = coopVecLoad<4, float>(input3);
+
+    CoopVec<float, 4> resultTan = tanh(vec1);
+
+    let resultAdd = vec1 + vec2;
+
+    CoopVec<float, 4> resultCopy = coopVecLoad<4, float>(input1);
+    resultCopy.copyFrom<float>(vec2);
+
+    CoopVec<float, 4> resultFMA = fma(vec1, vec2, vec3);
+    // NOTE(review): `matrix` is documented as packed bytes 1..16 but Float32 is passed below — confirm.
+    CoopVec<float, 4> vec = coopVecLoad<4, float>(input1);
+    let resultMul = coopVecMatMulAdd<float, 4, 4>(
+        vec,
+        CoopVecComponentType::Float32,
+        matrix,
+        0,
+        CoopVecComponentType::Float32,
+        bias,
+        0,
+        CoopVecComponentType::SignedInt32,
+        CoopVecMatrixLayout::RowMajor,
+        false,
+        4
+    );
+
+    CoopVec<float, 4> resultMax = max(vec1, vec2);
+    CoopVec<float, 4> resultMin = min(vec1, vec2);
+
+    CoopVec<float, 4> resultVecMul = vec1 * vec2;
+    // Write 1.0f at byte offset 0 of outputMat before the outer-product accumulate call.
+    outputMat.Store<float>(0, float(1));
+    coopVecOuterProductAccumulate(
+        vec1,
+        vec2,
+        outputMat,
+        0,
+        32,
+        CoopVecMatrixLayout::RowMajor,
+        CoopVecComponentType::Float32
+    );
+    // Likewise write 1.0f at byte offset 0 of outputMat2 before the reduce-sum accumulate call.
+    outputMat2.Store<float>(0, float(1));
+    coopVecReduceSumAccumulate(
+        vec1,
+        outputMat2,
+        0
+    );
+
+    CoopVec<float, 4> resultStep = step(vec1, vec2);
+
+    CoopVec<float, 4> resultSub = vec1 - vec2;
+
+    CoopVec<float, 4> resultLog2 = log2(vec1);
+
+    CoopVec<float, 4> resultExp2 = exp2(vec1);
+    // Byte-address buffers are indexed in bytes, so float element i is read at offset i * 4.
+    for(int i = 0; i < resultTan.getCount(); ++i)
+    {
+        outputBuffer[i] = resultTan[i]  +
+                          resultAdd[i]  +
+                          resultCopy[i] +
+                          resultFMA[i]  +
+                          resultMul[i]  +
+                          resultMax[i]  +
+                          resultMin[i]  +
+                          resultVecMul[i] +
+                          outputMat.Load<float>(i * 4)  +
+                          outputMat2.Load<float>(i * 4) +
+                          resultStep[i] +
+                          resultSub[i]  +
+                          resultLog2[i] +
+                          resultExp2[i];
+    }
+}
author	Mukund Keshava <mkeshava@nvidia.com>	2025-06-10 10:18:24 +0530
committer	GitHub <noreply@github.com>	2025-06-10 04:48:24 +0000
commit	d70da65a90ccd73439895a43b3958c0ea1441f35 (patch)
tree	e6f0c1cd8413e3e213a29bf233b5fc3a3fdf2eaf /tests/cuda
parent	ab6b5f28d332f201fd96b7e05070116684d02899 (diff)