summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2021-04-30 16:51:25 -0400
committerGitHub <noreply@github.com>2021-04-30 13:51:25 -0700
commit1a4a51301d084dd1c8c5906eb810eb6caf6f3963 (patch)
tree3eac138d918853f88bb8e2b5f14ed36a57e54d7a
parentc45f368ae404798db67a601749c6e0047fba75ef (diff)
Preliminary CUDA half maths (#1827)
* #include an absolute path didn't work - because paths were taken to always be relative. * Split out StringEscapeUtil. * Added StringEscapeUtil. * Fix typo in unix quoting type. * Small comment improvements. * Try to fix linux linking issue. * Fix typo. * Attempt to fix linux link issue. * Update VS proj even though nothing really changed. * Fix another typo issue. * Fix for windows issue. Fixed bug. * Make separate Utils for escaping. * Fix typo. * Split out into StringEscapeHandler. * Windows shell does handle removing quotes (so remove code to remove them). * Handle unescaping if not initiating using the shell. * Slight improvement around shell like decoding. * Simplify command extraction. * Add shared-library category type. * Fix bug in command extraction. * Typo in transcendental category. * Enable unit-test on in smoke test category. * Make parsing failing output as a failing test. * Fixes for transcendental tests. Disable tests that do not work. * Changed category parsing. * Removed the TestResult parameter from _gatherTestsForFile. Made testsList only output. * Remove testing if all tests were disabled. * Make args of CommandLine always unescaped. * Add category. * Don't need escaping on unix/linux. * Remove some no longer used functions. * Add requireSMVersion to CUDAExtensionTracker. * half-calc.slang now works for CUDA. * bit-cast-16-bit works on CUDA. * WIP handling of CUDA vector<half> types. * Half swizzle CUDA. * Half vector test. * Fix swizzle half bug. * Fix compilation issue with narrowing to Index. Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
-rw-r--r--prelude/slang-cuda-prelude.h48
-rwxr-xr-xsource/slang/slang-compiler.cpp2
-rw-r--r--source/slang/slang-emit-cuda.cpp214
-rw-r--r--source/slang/slang-emit-cuda.h16
-rw-r--r--tests/compute/half-calc.slang2
-rw-r--r--tests/compute/half-vector-calc.slang35
-rw-r--r--tests/compute/half-vector-calc.slang.expected.txt5
-rw-r--r--tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang1
8 files changed, 274 insertions, 49 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index c6de56641..61702824c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -5,7 +5,7 @@
// are passed down.
#ifdef SLANG_CUDA_ENABLE_HALF
-#include <cuda_fp16.h>
+# include <cuda_fp16.h>
#endif
#ifdef SLANG_CUDA_ENABLE_OPTIX
@@ -65,8 +65,50 @@ struct __half4 { __half2 xy; __half2 zw; };
// Mechanism to make half vectors
SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { __half3 o; o.xy = __halves2half2(x, y); o.z = z; return o; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { __half4 o; o.xy = __halves2half2(x, y); o.zw = __halves2half2(z, w); return o; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; }
+
+// Half3 maths ops
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3 &lh, const __half3 &rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &lh, const __half3 &rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3 &lh, const __half3 &rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3 &lh, const __half3 &rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; }
+
+#if 0
+// We need to return the vector<bool> type
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half3 &lh, const __half3 &rh) { return __hbeq2(lh.xy, rh.xy) && __heq(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half3 &lh, const __half3 &rh) { return __hbneu2(lh.xy, rh.xy) && __hneu(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half3 &lh, const __half3 &rh) { return __hbgt2(lh.xy, rh.xy) && __hgt(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half3 &lh, const __half3 &rh) { return __hblt2(lh.xy, rh.xy) && __hlt(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half3 &lh, const __half3 &rh) { return __hbge2(lh.xy, rh.xy) && __hge(lh.z, rh.z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half3 &lh, const __half3 &rh) { return __hble2(lh.xy, rh.xy) && __hle(lh.z, rh.z); }
+#endif
+
+// Half4 maths ops
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4 &lh, const __half4 &rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &lh, const __half4 &rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4 &lh, const __half4 &rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4 &lh, const __half4 &rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; }
+
+#if 0
+// We need to return vector<bool> type
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half4 &lh, const __half4 &rh) { return __hbeq2(lh.xy, rh.xy) && __hbeq2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half4 &lh, const __half4 &rh) { return __hbneu2(lh.xy, rh.xy) && __hbneu2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half4 &lh, const __half4 &rh) { return __hbgt2(lh.xy, rh.xy) && __hbgt2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half4 &lh, const __half4 &rh) { return __hblt2(lh.xy, rh.xy) && __hblt2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half4 &lh, const __half4 &rh) { return __hbge2(lh.xy, rh.xy) && __hbge2(lh.zw, rh.zw); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half4 &lh, const __half4 &rh) { return __hble2(lh.xy, rh.xy) && __hble2(lh.zw, rh.zw); }
+#endif
// Use the round nearest as the default - it is the only one defined
SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); }
diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp
index 1d416634a..736250219 100755
--- a/source/slang/slang-compiler.cpp
+++ b/source/slang/slang-compiler.cpp
@@ -1413,6 +1413,8 @@ SlangResult dissassembleDXILUsingDXC(
// Look for the version
if (auto cudaTracker = as<CUDAExtensionTracker>(source.extensionTracker))
{
+ cudaTracker->finalize();
+
if (cudaTracker->m_smVersion.isSet())
{
DownstreamCompiler::CapabilityVersion version;
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index a259ea933..5f7eada68 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -10,6 +10,21 @@
namespace Slang {
+
+
+void CUDAExtensionTracker::finalize()
+{
+ if (isBaseTypeRequired(BaseType::Half))
+ {
+ // The cuda_fp16.hpp header indicates the need is for version 5.3, but when this is tried
+ // NVRTC says it cannot load builtins.
+ // The lowest version that this does work for is 6.0, so that's what we use here.
+
+ // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
+ requireSMVersion(SemanticVersion(6, 0));
+ }
+}
+
static bool _isSingleNameBasicType(IROp op)
{
switch (op)
@@ -152,17 +167,74 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
return Super::calcScalarFuncName(op, type, outBuilder);
}
-SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
{
- SLANG_UNUSED(target);
+ typedef HLSLIntrinsic::Op Op;
+
+ if (auto vecType = as <IRVectorType>(specOp->returnType))
+ {
+ if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+ {
+ if (baseType->getBaseType() == BaseType::Half)
+ {
+ switch (specOp->op)
+ {
+ case Op::Init:
+ case Op::Add:
+ case Op::Mul:
+ case Op::Div:
+
+ case Op::Neg:
+
+ case Op::ConstructFromScalar:
+
+ case Op::Leq:
+ case Op::Less:
+ case Op::Greater:
+ case Op::Geq:
+ case Op::Neq:
+ case Op::Eql:
+ {
+ return;
+ }
+ }
+ }
+ }
+ }
- if (target == CodeGenTarget::CSource)
+ switch (specOp->op)
{
- return Super::calcTypeName(type, target, out);
+ case Op::Init:
+ {
+ // Special case handling
+ auto returnType = specOp->returnType;
+
+ if (auto vecType = as <IRVectorType>(returnType))
+ {
+ if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+ {
+ if (baseType->getBaseType() == BaseType::Half)
+ {
+ // Defined already in cuda-prelude.h
+ return;
+ }
+ }
+ }
+
+ break;
+ }
+ default: break;
}
- // We allow C source, because if we need a name
- SLANG_ASSERT(target == CodeGenTarget::CUDASource);
+ Super::emitSpecializedOperationDefinition(specOp);
+}
+
+SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+{
+ SLANG_UNUSED(target);
+
+ // The names CUDA produces are all compatible with 'C' (ie they aren't templated types)
+ SLANG_ASSERT(target == CodeGenTarget::CUDASource || target == CodeGenTarget::CSource);
switch (type->getOp())
{
@@ -180,30 +252,6 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target,
out << prefix << vecCount;
return SLANG_OK;
}
-
-#if 0
- case kIROp_MatrixType:
- {
- auto matType = static_cast<IRMatrixType*>(type);
-
- auto elementType = matType->getElementType();
- const auto rowCount = int(getIntVal(matType->getRowCount()));
- const auto colCount = int(getIntVal(matType->getColumnCount()));
-
- out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
- return SLANG_OK;
- }
- case kIROp_UnsizedArrayType:
- {
- auto arrayType = static_cast<IRUnsizedArrayType*>(type);
- auto elementType = arrayType->getElementType();
-
- out << "Array<";
- SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out));
- out << ">";
- return SLANG_OK;
- }
-#endif
default:
{
if (isNominalOp(type->getOp()))
@@ -519,10 +567,102 @@ void CUDASourceEmitter::_emitInitializerList(IRType* elementType, IRUse* operand
m_writer->emit("\n}");
}
+void CUDASourceEmitter::_emitGetHalfVectorElement(IRInst* base, Index index, Index vecSize, const EmitOpInfo& inOuterPrec)
+{
+ SLANG_ASSERT(index < vecSize);
+
+ EmitOpInfo outerPrec = inOuterPrec;
+
+ auto prec = getInfo(EmitOp::Postfix);
+ const bool needClose = maybeEmitParens(outerPrec, prec);
+
+ emitOperand(base, leftSide(outerPrec, prec));
+
+ m_writer->emit(".");
+
+ switch (vecSize)
+ {
+ default:
+ {
+ char const* kComponents[] = { "x", "y", "z", "w" };
+ m_writer->emit(kComponents[index]);
+ break;
+ }
+ case 3:
+ {
+ char const* kComponents[] = { "xy.x", "xy.y", "z"};
+ m_writer->emit(kComponents[index]);
+ break;
+ }
+ case 4:
+ {
+ char const* kComponents[] = { "xy.x", "xy.y", "zw.x", "zw.y" };
+ m_writer->emit(kComponents[index]);
+ break;
+ }
+ }
+
+ maybeCloseParens(needClose);
+}
+
bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
{
switch(inst->getOp())
{
+ case kIROp_swizzle:
+ {
+ // We need to special case for half types.
+ auto swizzleInst = static_cast<IRSwizzle*>(inst);
+
+ IRInst* baseInst = swizzleInst->getBase();
+ IRType* baseType = baseInst->getDataType();
+
+ // If we are swizzling from a built in type,
+ if (as<IRBasicType>(baseType))
+ {
+ // Just use the default behavior
+ }
+ else if (auto vecType = as<IRVectorType>(baseType))
+ {
+ if (auto basicType = as<IRBasicType>(vecType->getElementType()))
+ {
+ if (basicType->getBaseType() == BaseType::Half)
+ {
+ const Index vecElementCount = Index(getIntVal(vecType->getElementCount()));
+
+ const Index elementCount = Index(swizzleInst->getElementCount());
+ if (elementCount == 1)
+ {
+ const Index index = Index(getIntVal(swizzleInst->getElementIndex(0)));
+ _emitGetHalfVectorElement(baseInst, index, vecElementCount, inOuterPrec);
+ }
+ else
+ {
+ auto outerPrec = getInfo(EmitOp::General);
+
+ m_writer->emit("make___half");
+ m_writer->emitInt64(elementCount);
+ m_writer->emit("(");
+
+ for (Index i = 0; i < elementCount; ++i)
+ {
+ if (i)
+ {
+ m_writer->emit(", ");
+ }
+
+ const Index index = Index(getIntVal(swizzleInst->getElementIndex(i)));
+ _emitGetHalfVectorElement(baseInst, index, vecElementCount, outerPrec);
+ }
+
+ m_writer->emit(")");
+ }
+ return true;
+ }
+ }
+ }
+ break;
+ }
case kIROp_Construct:
{
// Simple constructor call
@@ -558,7 +698,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
}
case kIROp_WaveMaskBallot:
{
- _requireCUDASMVersion(SemanticVersion(7, 0));
+ m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
m_writer->emit("__ballot_sync(");
emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -569,7 +709,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
}
case kIROp_WaveMaskMatch:
{
- _requireCUDASMVersion(SemanticVersion(7, 0));
+ m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
m_writer->emit("__match_any_sync(");
emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -584,14 +724,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
return Super::tryEmitInstExprImpl(inst, inOuterPrec);
}
-void CUDASourceEmitter::_requireCUDASMVersion(SemanticVersion const& version)
-{
- if (version > m_extensionTracker->m_smVersion)
- {
- m_extensionTracker->m_smVersion = version;
- }
-}
-
void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
{
// Does this function declare any requirements on CUDA capabilities
@@ -603,7 +735,7 @@ void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
{
SemanticVersion version;
version.setFromInteger(SemanticVersion::IntegerType(smDecoration->getCUDASMVersion()));
- _requireCUDASMVersion(version);
+ m_extensionTracker->requireSMVersion(version);
}
}
}
diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h
index a5d227c6b..b73948525 100644
--- a/source/slang/slang-emit-cuda.h
+++ b/source/slang/slang-emit-cuda.h
@@ -18,7 +18,14 @@ public:
void requireBaseType(BaseType baseType) { m_baseTypeFlags |= _getFlag(baseType); }
bool isBaseTypeRequired(BaseType baseType) { return (m_baseTypeFlags & _getFlag(baseType)) != 0; }
+ /// Ensure that the generated code is compiled for at least CUDA SM `version`
+ void requireSMVersion(const SemanticVersion& smVersion) { m_smVersion = (smVersion > m_smVersion) ? smVersion : m_smVersion; }
+
+ /// Should be called before reading out values.
+ void finalize();
+
protected:
+
static BaseTypeFlags _getFlag(BaseType baseType) { return BaseTypeFlags(1) << int(baseType); }
BaseTypeFlags m_baseTypeFlags = 0;
@@ -86,15 +93,16 @@ protected:
// CPPSourceEmitter overrides
virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) SLANG_OVERRIDE;
virtual SlangResult calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder) SLANG_OVERRIDE;
-
+
+ virtual void emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp) SLANG_OVERRIDE;
+
SlangResult _calcCUDATextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName);
void _emitInitializerList(IRType* elementType, IRUse* operands, Index operandCount);
void _emitInitializerListValue(IRType* elementType, IRInst* value);
- /// Ensure that the generated code is compiled for at least CUDA SM `version`
- void _requireCUDASMVersion(SemanticVersion const& version);
-
+ void _emitGetHalfVectorElement(IRInst* baseInst, Index index, Index vecSize, const EmitOpInfo& inOuterPrec);
+
RefPtr<CUDAExtensionTracker> m_extensionTracker;
};
diff --git a/tests/compute/half-calc.slang b/tests/compute/half-calc.slang
index 57efebe53..e0dd01315 100644
--- a/tests/compute/half-calc.slang
+++ b/tests/compute/half-calc.slang
@@ -1,6 +1,6 @@
//DISABLE_TEST(compute):COMPARE_COMPUTE:-dx12 -compute -use-dxil -profile cs_6_2 -render-features half -shaderobj
//TEST(compute):COMPARE_COMPUTE:-vk -compute -profile cs_6_2 -render-features half -shaderobj
-
+//TEST(compute):COMPARE_COMPUTE:-cuda -compute -render-features half -shaderobj
// Test for doing a calculation using half
diff --git a/tests/compute/half-vector-calc.slang b/tests/compute/half-vector-calc.slang
new file mode 100644
index 000000000..5594c38fd
--- /dev/null
+++ b/tests/compute/half-vector-calc.slang
@@ -0,0 +1,35 @@
+//DISABLE_TEST(compute):COMPARE_COMPUTE:-dx12 -compute -output-using-type -use-dxil -profile cs_6_2 -render-features half -shaderobj
+//TEST(compute):COMPARE_COMPUTE:-vk -compute -output-using-type -profile cs_6_2 -render-features half -shaderobj
+//TEST(compute):COMPARE_COMPUTE:-cuda -compute -output-using-type -render-features half -shaderobj
+
+// Test for doing a calculation using half
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint tid = dispatchThreadID.x;
+ int x = tid.x;
+
+ half2 v3 = half2(float(x));
+
+ half2 v0 = half2(x * 2.0f, x * 0.5f);
+ half3 v1 = half3(x * 2.0f, x * 0.5f, x - 1.0f);
+ half4 v2 = half4(x + 1, x - 1, x + 2 , x - 2);
+
+ v1 += v0.yxy;
+ v1 += v2.wzy;
+ v2 += v0.xyxy;
+
+ v0 = v0 + v0 * v0;
+ v1 = v1 + v1 * v1;
+ v2 = v2 + v2 * v2;
+
+ half o2 = v2.x + v2.y + v2.z + v2.w;
+ half o1 = v1.x + v1.y + v1.z;
+ half o0 = v0.x + v0.y;
+
+ outputBuffer[tid] = o0 + o1 + o2 + v3.y;
+}
diff --git a/tests/compute/half-vector-calc.slang.expected.txt b/tests/compute/half-vector-calc.slang.expected.txt
new file mode 100644
index 000000000..64beb1dd1
--- /dev/null
+++ b/tests/compute/half-vector-calc.slang.expected.txt
@@ -0,0 +1,5 @@
+type: float
+20.000000
+98.500000
+292.000000
+600.500000
diff --git a/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang b/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
index 0241ff9cd..28f8973f6 100644
--- a/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
+++ b/tests/hlsl-intrinsic/bit-cast/bit-cast-16-bit.slang
@@ -2,6 +2,7 @@
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_2 -shaderobj
//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
//TEST_INPUT:ubuffer(data=[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], stride=4):name inputBuffer
RWStructuredBuffer<int> inputBuffer;