Preliminary CUDA half maths (#1827)

* #include an absolute path didn't work - because paths were taken to always be relative. * Split out StringEscapeUtil. * Added StringEscapeUtil. * Fix typo in unix quoting type. * Small comment improvements. * Try to fix linux linking issue. * Fix typo. * Attempt to fix linux link issue. * Update VS proj even though nothing really changed. * Fix another typo issue. * Fix for windows issue. Fixed bug. * Make separate Utils for escaping. * Fix typo. * Split out into StringEscapeHandler. * Windows shell does handle removing quotes (so remove code to remove them). * Handle unescaping if not initiating using the shell. * Slight improvement around shell like decoding. * Simplify command extraction. * Add shared-library category type. * Fix bug in command extraction. * Typo in transcendental category. * Enable unit-test on in smoke test category. * Make parsing failing output as a failing test. * Fixes for transcendental tests. Disable tests that do not work. * Changed category parsing. * Removed the TestResult parameter from _gatherTestsForFile. Made testsList only output. * Remove testing if all tests were disabled. * Make args of CommandLine always unescaped. * Add category. * Don't need escaping on unix/linux. * Remove some no longer used functions. * Add requireSMVersion to CUDAExtensionTracker. * half-calc.slang now works for CUDA. * bit-cast-16-bit works on CUDA. * WIP handling of CUDA vector<half> types. * Half swizzle CUDA. * Half vector test. * Fix swizzle half bug. * Fix compilation issue with narrowing to Index. Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
author: jsmall-nvidia <jsmall@nvidia.com> 2021-04-30 16:51:25 -0400
committer: GitHub <noreply@github.com> 2021-04-30 13:51:25 -0700
commit: 1a4a51301d084dd1c8c5906eb810eb6caf6f3963 (patch)
tree: 3eac138d918853f88bb8e2b5f14ed36a57e54d7a
parent: c45f368ae404798db67a601749c6e0047fba75ef (diff)
8 files changed, 274 insertions, 49 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index c6de56641..61702824c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -5,7 +5,7 @@
 // are passed down. 
 
 #ifdef SLANG_CUDA_ENABLE_HALF
-#include <cuda_fp16.h>
+#   include <cuda_fp16.h>
 #endif
 
 #ifdef SLANG_CUDA_ENABLE_OPTIX
@@ -65,8 +65,50 @@ struct __half4 { __half2 xy; __half2 zw; };
 
 // Mechanism to make half vectors
 SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { __half3 o; o.xy = __halves2half2(x, y); o.z = z; return o; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { __half4 o; o.xy = __halves2half2(x, y); o.zw = __halves2half2(z, w); return o; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; }
+
+// Half3 maths ops
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3 &lh, const __half3 &rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &lh, const __half3 &rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3 &lh, const __half3 &rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3 &lh, const __half3 &rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; }
+
+#if 0
+// We need to return the vector<bool> type
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half3 &lh, const __half3 &rh) { return __hbeq2(lh.xy, rh.xy) && __heq(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half3 &lh, const __half3 &rh) { return __hbneu2(lh.xy, rh.xy) && __hneu(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half3 &lh, const __half3 &rh) { return __hbgt2(lh.xy, rh.xy) && __hgt(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half3 &lh, const __half3 &rh) { return __hblt2(lh.xy, rh.xy) && __hlt(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half3 &lh, const __half3 &rh) { return __hbge2(lh.xy, rh.xy) && __hge(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half3 &lh, const __half3 &rh) { return __hble2(lh.xy, rh.xy) && __hle(lh.z, rh.z); }
+#endif
+
+// Half4 maths ops
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4 &lh, const __half4 &rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &lh, const __half4 &rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4 &lh, const __half4 &rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4 &lh, const __half4 &rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; }
+
+#if 0
+// We need to return vector<bool> type
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half4 &lh, const __half4 &rh) { return __hbeq2(lh.xy, rh.xy) && __hbeq2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half4 &lh, const __half4 &rh) { return __hbneu2(lh.xy, rh.xy) && __hbneu2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half4 &lh, const __half4 &rh) { return __hbgt2(lh.xy, rh.xy) && __hbgt2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half4 &lh, const __half4 &rh) { return __hblt2(lh.xy, rh.xy) && __hblt2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half4 &lh, const __half4 &rh) { return __hbge2(lh.xy, rh.xy) && __hbge2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half4 &lh, const __half4 &rh) { return __hble2(lh.xy, rh.xy) && __hble2(lh.zw, rh.zw); }
+#endif
 
 // Use the round nearest as the default - it is the only one defined
 SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); }
diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp
index 1d416634a..736250219 100755
--- a/source/slang/slang-compiler.cpp
+++ b/source/slang/slang-compiler.cpp
@@ -1413,6 +1413,8 @@ SlangResult dissassembleDXILUsingDXC(
             // Look for the version
             if (auto cudaTracker = as<CUDAExtensionTracker>(source.extensionTracker))
             {
+                cudaTracker->finalize();
+
                 if (cudaTracker->m_smVersion.isSet())
                 {
                     DownstreamCompiler::CapabilityVersion version;
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index a259ea933..5f7eada68 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -10,6 +10,21 @@
 
 namespace Slang {
 
+
+
+void CUDAExtensionTracker::finalize()
+{
+    if (isBaseTypeRequired(BaseType::Half))
+    {
+        // The cuda_fp16.hpp header indicates the need is for version 5.3, but when this is tried
+        // NVRTC says it cannot load builtins.
+        // The lowest version that this does work for is 6.0, so that's what we use here.
+
+        // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
+        requireSMVersion(SemanticVersion(6, 0));
+    }
+}
+
 static bool _isSingleNameBasicType(IROp op)
 {
     switch (op)
@@ -152,17 +167,74 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
     return Super::calcScalarFuncName(op, type, outBuilder);
 }
 
-SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
 {
-    SLANG_UNUSED(target);
+    typedef HLSLIntrinsic::Op Op;
+
+    if (auto vecType = as <IRVectorType>(specOp->returnType))
+    {
+        if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+        {
+            if (baseType->getBaseType() == BaseType::Half)
+            {
+                switch (specOp->op)
+                {
+                    case Op::Init:
+                    case Op::Add:
+                    case Op::Mul:
+                    case Op::Div:
+
+                    case Op::Neg:
+
+                    case Op::ConstructFromScalar:
+
+                    case Op::Leq:
+                    case Op::Less:
+                    case Op::Greater:
+                    case Op::Geq:
+                    case Op::Neq:
+                    case Op::Eql:
+                    {
+                        return;
+                    }
+                }
+            }
+        }
+    }
 
-    if (target == CodeGenTarget::CSource)
+    switch (specOp->op)
     {
-        return Super::calcTypeName(type, target, out);
+        case Op::Init:
+        {
+            // Special case handling
+            auto returnType = specOp->returnType;
+
+            if (auto vecType = as <IRVectorType>(returnType))
+            {
+                if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+                {
+                    if (baseType->getBaseType() == BaseType::Half)
+                    {
+                        // Defined already in cuda-prelude.h
+                        return;
+                    }
+                }
+            }
+
+            break;
+        }
+        default: break;
     }
 
-    // We allow C source, because if we need a name 
-    SLANG_ASSERT(target == CodeGenTarget::CUDASource);
+    Super::emitSpecializedOperationDefinition(specOp);
+}
+
+SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+{
+    SLANG_UNUSED(target);
+
+    // The names CUDA produces are all compatible with 'C' (ie they aren't templated types)
+    SLANG_ASSERT(target == CodeGenTarget::CUDASource || target == CodeGenTarget::CSource);
 
     switch (type->getOp())
     {
@@ -180,30 +252,6 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target,
             out << prefix << vecCount;
             return SLANG_OK;
         }
-
-#if 0
-        case kIROp_MatrixType:
-        {
-            auto matType = static_cast<IRMatrixType*>(type);
-
-            auto elementType = matType->getElementType();
-            const auto rowCount = int(getIntVal(matType->getRowCount()));
-            const auto colCount = int(getIntVal(matType->getColumnCount()));
-
-            out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
-            return SLANG_OK;
-        }
-        case kIROp_UnsizedArrayType:
-        {
-            auto arrayType = static_cast<IRUnsizedArrayType*>(type);
-            auto elementType = arrayType->getElementType();
-
-            out << "Array<";
-            SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out));
-            out << ">";
-            return SLANG_OK;
-        }
-#endif
         default:
         {
             if (isNominalOp(type->getOp()))
@@ -519,10 +567,102 @@ void CUDASourceEmitter::_emitInitializerList(IRType* elementType, IRUse* operand
     m_writer->emit("\n}");
 }
 
+void CUDASourceEmitter::_emitGetHalfVectorElement(IRInst* base, Index index, Index vecSize, const EmitOpInfo& inOuterPrec)
+{
+    SLANG_ASSERT(index < vecSize);
+
+    EmitOpInfo outerPrec = inOuterPrec;
+    
+    auto prec = getInfo(EmitOp::Postfix);
+    const bool needClose = maybeEmitParens(outerPrec, prec);
+
+    emitOperand(base, leftSide(outerPrec, prec));
+
+    m_writer->emit(".");
+
+    switch (vecSize)
+    {
+        default: 
+        {
+            char const* kComponents[] = { "x", "y", "z", "w" };
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+        case 3:
+        {
+            char const* kComponents[] = { "xy.x", "xy.y", "z"};
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+        case 4:
+        {
+            char const* kComponents[] = { "xy.x", "xy.y", "zw.x", "zw.y" };
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+    }
+
+     maybeCloseParens(needClose);
+}
+
 bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
 {
     switch(inst->getOp())
     {
+        case kIROp_swizzle:
+        {
+            // We need to special case for half types.
+            auto swizzleInst = static_cast<IRSwizzle*>(inst);
+
+            IRInst* baseInst = swizzleInst->getBase();
+            IRType* baseType = baseInst->getDataType();
+
+            // If we are swizzling from a built in type, 
+            if (as<IRBasicType>(baseType))
+            {
+                // Just use the default behavior
+            }
+            else if (auto vecType = as<IRVectorType>(baseType))
+            {
+                if (auto basicType = as<IRBasicType>(vecType->getElementType()))
+                {
+                    if (basicType->getBaseType() == BaseType::Half)
+                    {
+                        const Index vecElementCount = Index(getIntVal(vecType->getElementCount()));
+
+                        const Index elementCount = Index(swizzleInst->getElementCount());
+                        if (elementCount == 1)
+                        {
+                            const Index index = Index(getIntVal(swizzleInst->getElementIndex(0)));
+                            _emitGetHalfVectorElement(baseInst, index, vecElementCount, inOuterPrec);
+                        }
+                        else
+                        {
+                            auto outerPrec = getInfo(EmitOp::General);
+
+                            m_writer->emit("make___half");
+                            m_writer->emitInt64(elementCount);
+                            m_writer->emit("(");
+
+                            for (Index i = 0; i < elementCount; ++i)
+                            {
+                                if (i)
+                                {
+                                    m_writer->emit(", ");
+                                }
+
+                                const Index index = Index(getIntVal(swizzleInst->getElementIndex(i)));
+                                _emitGetHalfVectorElement(baseInst, index, vecElementCount, outerPrec);
+                            }
+
+                            m_writer->emit(")");
+                        }
+                        return true;
+                    }
+                }
+            }
+            break;
+        }
         case kIROp_Construct:
         {
             // Simple constructor call
@@ -558,7 +698,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
         }
         case kIROp_WaveMaskBallot:
         {
-            _requireCUDASMVersion(SemanticVersion(7, 0));
+             m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
 
             m_writer->emit("__ballot_sync(");
             emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -569,7 +709,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
         }
         case kIROp_WaveMaskMatch:
         {
-            _requireCUDASMVersion(SemanticVersion(7, 0));
+             m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
 
             m_writer->emit("__match_any_sync(");
             emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -584,14 +724,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
     return Super::tryEmitInstExprImpl(inst, inOuterPrec);
 }
 
-void CUDASourceEmitter::_requireCUDASMVersion(SemanticVersion const& version)
-{
-    if (version > m_extensionTracker->m_smVersion)
-    {
-        m_extensionTracker->m_smVersion = version;
-    }
-}
-
 void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
 {
     // Does this function declare any requirements on CUDA capabilities
@@ -603,7 +735,7 @@ void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
         {
             SemanticVersion version;
             version.setFromInteger(SemanticVersion::IntegerType(smDecoration->getCUDASMVersion()));
-            _requireCUDASMVersion(version);
+            m_extensionTracker->requireSMVersion(version);
         }
     }
 }
diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h
index a5d227c6b..b73948525 100644
--- a/source/slang/slang-emit-cuda.h
+++ b/source/slang/slang-emit-cuda.h
@@ -18,7 +18,14 @@ public:
     void requireBaseType(BaseType baseType) { m_baseTypeFlags |= _getFlag(baseType); }
     bool isBaseTypeRequired(BaseType baseType) { return (m_baseTypeFlags & _getFlag(baseType)) != 0; }
 
+        /// Ensure that the generated code is compiled for at least CUDA SM `version`
+    void requireSMVersion(const SemanticVersion& smVersion) { m_smVersion = (smVersion > m_smVersion) ? smVersion : m_smVersion; }
+
+        /// Should be called before reading out values. 
+    void finalize();
+
 protected:
+
     static BaseTypeFlags _getFlag(BaseType baseType) { return BaseTypeFlags(1) << int(baseType); }
 
     BaseTypeFlags m_baseTypeFlags = 0; 
@@ -86,15 +93,16 @@ protected:
     // CPPSourceEmitter overrides 
     virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) SLANG_OVERRIDE;
     virtual SlangResult calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder) SLANG_OVERRIDE;
-    
+
+    virtual void emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp) SLANG_OVERRIDE;
+
     SlangResult _calcCUDATextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName);
 
     void _emitInitializerList(IRType* elementType, IRUse* operands, Index operandCount);
     void _emitInitializerListValue(IRType* elementType, IRInst* value);
 
-        /// Ensure that the generated code is compiled for at least CUDA SM `version`
-    void _requireCUDASMVersion(SemanticVersion const& version);
-
+    void _emitGetHalfVectorElement(IRInst* baseInst, Index index, Index vecSize, const EmitOpInfo& inOuterPrec);
+    
     RefPtr<CUDAExtensionTracker> m_extensionTracker;
 };
 
diff --git a/tests/compute/half-calc.slang b/tests/compute/half-calc.slang
index 57efebe53..e0dd01315 100644
--- a/tests/compute/half-calc.slang
+++ b/tests/compute/half-calc.slang
@@ -1,6 +1,6 @@
 //DISABLE_TEST(compute):COMPARE_COMPUTE:-dx12 -compute -use-dxil -profile cs_6_2 -render-features half -shaderobj
 //TEST(compute):COMPARE_COMPUTE:-vk -compute -profile cs_6_2 -render-features half -shaderobj
-
+//TEST(compute):COMPARE_COMPUTE:-cuda -compute -render-features half -shaderobj
 
 // Test for doing a calculation using half
 
diff --git a/tests/compute/half-vector-calc.slang b/tests/compute/half-vector-calc.slang
new file mode 100644
index 000000000..5594c38fd
--- /dev/null
+++ b/tests/compute/half-vector-calc.slang
@@ -0,0 +1,35 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE:-dx12 -compute -output-using-type -use-dxil -profile cs_6_2 -render-features half -shaderobj
+//TEST(compute):COMPARE_COMPUTE:-vk -compute -output-using-type -profile cs_6_2 -render-features half -shaderobj
+//TEST(compute):COMPARE_COMPUTE:-cuda -compute -output-using-type -render-features half -shaderobj
+
+// Test for doing a calculation using half
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint tid = dispatchThreadID.x;
+    int x = tid.x;
+    
+    half2 v3 = half2(float(x));
+    
+    half2 v0 = half2(x * 2.0f, x * 0.5f);
+    half3 v1 = half3(x * 2.0f, x * 0.5f, x - 1.0f);
+    half4 v2 = half4(x + 1, x - 1, x + 2 , x - 2);
+    
+    v1 += v0.yxy;
+    v1 += v2.wzy;
+    v2 += v0.xyxy;
+    
+    v0 = v0 + v0 * v0;
+    v1 = v1 + v1 * v1;
+    v2 = v2 + v2 * v2;
+    
+    half o2 = v2.x + v2.y + v2.z + v2.w;
+    half o1 = v1.x + v1.y + v1.z;
+    half o0 = v0.x + v0.y;
+        
+    outputBuffer[tid] = o0 + o1 + o2 + v3.y;
+}
diff --git a/tests/compute/half-vector-calc.slang.expected.txt b/tests/compute/half-vector-calc.slang.expected.txt
new file mode 100644
index 000000000..64beb1dd1
--- /dev/null
+++ b/tests/compute/half-vector-calc.slang.expected.txt
@@ -0,0 +1,5 @@
+type: float
+20.000000
+98.500000
+292.000000
+600.500000
diff --git a/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang b/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
index 0241ff9cd..28f8973f6 100644
--- a/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
+++ b/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
@@ -2,6 +2,7 @@
 
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_2 -shaderobj
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
 //TEST_INPUT:ubuffer(data=[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], stride=4):name inputBuffer
 RWStructuredBuffer<int> inputBuffer;
author	jsmall-nvidia <jsmall@nvidia.com>	2021-04-30 16:51:25 -0400
committer	GitHub <noreply@github.com>	2021-04-30 13:51:25 -0700
commit	1a4a51301d084dd1c8c5906eb810eb6caf6f3963 (patch)
tree	3eac138d918853f88bb8e2b5f14ed36a57e54d7a
parent	c45f368ae404798db67a601749c6e0047fba75ef (diff)