Preliminary CUDA half maths (#1827)

* #include an absolute path didn't work - because paths were taken to always be relative. * Split out StringEscapeUtil. * Added StringEscapeUtil. * Fix typo in unix quoting type. * Small comment improvements. * Try to fix linux linking issue. * Fix typo. * Attempt to fix linux link issue. * Update VS proj even though nothing really changed. * Fix another typo issue. * Fix for windows issue. Fixed bug. * Make separate Utils for escaping. * Fix typo. * Split out into StringEscapeHandler. * Windows shell does handle removing quotes (so remove code to remove them). * Handle unescaping if not initiating using the shell. * Slight improvement around shell like decoding. * Simplify command extraction. * Add shared-library category type. * Fix bug in command extraction. * Typo in transcendental category. * Enable unit-test on in smoke test category. * Make parsing failing output as a failing test. * Fixes for transcendental tests. Disable tests that do not work. * Changed category parsing. * Removed the TestResult parameter from _gatherTestsForFile. Made testsList only output. * Remove testing if all tests were disabled. * Make args of CommandLine always unescaped. * Add category. * Don't need escaping on unix/linux. * Remove some no longer used functions. * Add requireSMVersion to CUDAExtensionTracker. * half-calc.slang now works for CUDA. * bit-cast-16-bit works on CUDA. * WIP handling of CUDA vector<half> types. * Half swizzle CUDA. * Half vector test. * Fix swizzle half bug. * Fix compilation issue with narrowing to Index. Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
author: jsmall-nvidia <jsmall@nvidia.com> 2021-04-30 16:51:25 -0400
committer: GitHub <noreply@github.com> 2021-04-30 13:51:25 -0700
commit: 1a4a51301d084dd1c8c5906eb810eb6caf6f3963 (patch)
tree: 3eac138d918853f88bb8e2b5f14ed36a57e54d7a /source
parent: c45f368ae404798db67a601749c6e0047fba75ef (diff)
3 files changed, 187 insertions, 45 deletions
diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp
index 1d416634a..736250219 100755
--- a/source/slang/slang-compiler.cpp
+++ b/source/slang/slang-compiler.cpp
@@ -1413,6 +1413,8 @@ SlangResult dissassembleDXILUsingDXC(
             // Look for the version
             if (auto cudaTracker = as<CUDAExtensionTracker>(source.extensionTracker))
             {
+                cudaTracker->finalize();
+
                 if (cudaTracker->m_smVersion.isSet())
                 {
                     DownstreamCompiler::CapabilityVersion version;
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index a259ea933..5f7eada68 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -10,6 +10,21 @@
 
 namespace Slang {
 
+
+
+void CUDAExtensionTracker::finalize()
+{
+    if (isBaseTypeRequired(BaseType::Half))
+    {
+        // The cuda_fp16.hpp header indicates the need is for version 5.3, but when this is tried
+        // NVRTC says it cannot load builtins.
+        // The lowest version that this does work for is 6.0, so that's what we use here.
+
+        // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
+        requireSMVersion(SemanticVersion(6, 0));
+    }
+}
+
 static bool _isSingleNameBasicType(IROp op)
 {
     switch (op)
@@ -152,17 +167,74 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
     return Super::calcScalarFuncName(op, type, outBuilder);
 }
 
-SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
 {
-    SLANG_UNUSED(target);
+    typedef HLSLIntrinsic::Op Op;
+
+    if (auto vecType = as <IRVectorType>(specOp->returnType))
+    {
+        if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+        {
+            if (baseType->getBaseType() == BaseType::Half)
+            {
+                switch (specOp->op)
+                {
+                    case Op::Init:
+                    case Op::Add:
+                    case Op::Mul:
+                    case Op::Div:
+
+                    case Op::Neg:
+
+                    case Op::ConstructFromScalar:
+
+                    case Op::Leq:
+                    case Op::Less:
+                    case Op::Greater:
+                    case Op::Geq:
+                    case Op::Neq:
+                    case Op::Eql:
+                    {
+                        return;
+                    }
+                }
+            }
+        }
+    }
 
-    if (target == CodeGenTarget::CSource)
+    switch (specOp->op)
     {
-        return Super::calcTypeName(type, target, out);
+        case Op::Init:
+        {
+            // Special case handling
+            auto returnType = specOp->returnType;
+
+            if (auto vecType = as <IRVectorType>(returnType))
+            {
+                if (auto baseType = as<IRBasicType>(vecType->getElementType()))
+                {
+                    if (baseType->getBaseType() == BaseType::Half)
+                    {
+                        // Defined already in cuda-prelude.h
+                        return;
+                    }
+                }
+            }
+
+            break;
+        }
+        default: break;
     }
 
-    // We allow C source, because if we need a name 
-    SLANG_ASSERT(target == CodeGenTarget::CUDASource);
+    Super::emitSpecializedOperationDefinition(specOp);
+}
+
+SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
+{
+    SLANG_UNUSED(target);
+
+    // The names CUDA produces are all compatible with 'C' (ie they aren't templated types)
+    SLANG_ASSERT(target == CodeGenTarget::CUDASource || target == CodeGenTarget::CSource);
 
     switch (type->getOp())
     {
@@ -180,30 +252,6 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target,
             out << prefix << vecCount;
             return SLANG_OK;
         }
-
-#if 0
-        case kIROp_MatrixType:
-        {
-            auto matType = static_cast<IRMatrixType*>(type);
-
-            auto elementType = matType->getElementType();
-            const auto rowCount = int(getIntVal(matType->getRowCount()));
-            const auto colCount = int(getIntVal(matType->getColumnCount()));
-
-            out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
-            return SLANG_OK;
-        }
-        case kIROp_UnsizedArrayType:
-        {
-            auto arrayType = static_cast<IRUnsizedArrayType*>(type);
-            auto elementType = arrayType->getElementType();
-
-            out << "Array<";
-            SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out));
-            out << ">";
-            return SLANG_OK;
-        }
-#endif
         default:
         {
             if (isNominalOp(type->getOp()))
@@ -519,10 +567,102 @@ void CUDASourceEmitter::_emitInitializerList(IRType* elementType, IRUse* operand
     m_writer->emit("\n}");
 }
 
+void CUDASourceEmitter::_emitGetHalfVectorElement(IRInst* base, Index index, Index vecSize, const EmitOpInfo& inOuterPrec)
+{
+    SLANG_ASSERT(index < vecSize);
+
+    EmitOpInfo outerPrec = inOuterPrec;
+    
+    auto prec = getInfo(EmitOp::Postfix);
+    const bool needClose = maybeEmitParens(outerPrec, prec);
+
+    emitOperand(base, leftSide(outerPrec, prec));
+
+    m_writer->emit(".");
+
+    switch (vecSize)
+    {
+        default: 
+        {
+            char const* kComponents[] = { "x", "y", "z", "w" };
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+        case 3:
+        {
+            char const* kComponents[] = { "xy.x", "xy.y", "z"};
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+        case 4:
+        {
+            char const* kComponents[] = { "xy.x", "xy.y", "zw.x", "zw.y" };
+            m_writer->emit(kComponents[index]);
+            break;
+        }
+    }
+
+     maybeCloseParens(needClose);
+}
+
 bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
 {
     switch(inst->getOp())
     {
+        case kIROp_swizzle:
+        {
+            // We need to special case for half types.
+            auto swizzleInst = static_cast<IRSwizzle*>(inst);
+
+            IRInst* baseInst = swizzleInst->getBase();
+            IRType* baseType = baseInst->getDataType();
+
+            // If we are swizzling from a built in type, 
+            if (as<IRBasicType>(baseType))
+            {
+                // Just use the default behavior
+            }
+            else if (auto vecType = as<IRVectorType>(baseType))
+            {
+                if (auto basicType = as<IRBasicType>(vecType->getElementType()))
+                {
+                    if (basicType->getBaseType() == BaseType::Half)
+                    {
+                        const Index vecElementCount = Index(getIntVal(vecType->getElementCount()));
+
+                        const Index elementCount = Index(swizzleInst->getElementCount());
+                        if (elementCount == 1)
+                        {
+                            const Index index = Index(getIntVal(swizzleInst->getElementIndex(0)));
+                            _emitGetHalfVectorElement(baseInst, index, vecElementCount, inOuterPrec);
+                        }
+                        else
+                        {
+                            auto outerPrec = getInfo(EmitOp::General);
+
+                            m_writer->emit("make___half");
+                            m_writer->emitInt64(elementCount);
+                            m_writer->emit("(");
+
+                            for (Index i = 0; i < elementCount; ++i)
+                            {
+                                if (i)
+                                {
+                                    m_writer->emit(", ");
+                                }
+
+                                const Index index = Index(getIntVal(swizzleInst->getElementIndex(i)));
+                                _emitGetHalfVectorElement(baseInst, index, vecElementCount, outerPrec);
+                            }
+
+                            m_writer->emit(")");
+                        }
+                        return true;
+                    }
+                }
+            }
+            break;
+        }
         case kIROp_Construct:
         {
             // Simple constructor call
@@ -558,7 +698,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
         }
         case kIROp_WaveMaskBallot:
         {
-            _requireCUDASMVersion(SemanticVersion(7, 0));
+             m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
 
             m_writer->emit("__ballot_sync(");
             emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -569,7 +709,7 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
         }
         case kIROp_WaveMaskMatch:
         {
-            _requireCUDASMVersion(SemanticVersion(7, 0));
+             m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));
 
             m_writer->emit("__match_any_sync(");
             emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
@@ -584,14 +724,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
     return Super::tryEmitInstExprImpl(inst, inOuterPrec);
 }
 
-void CUDASourceEmitter::_requireCUDASMVersion(SemanticVersion const& version)
-{
-    if (version > m_extensionTracker->m_smVersion)
-    {
-        m_extensionTracker->m_smVersion = version;
-    }
-}
-
 void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
 {
     // Does this function declare any requirements on CUDA capabilities
@@ -603,7 +735,7 @@ void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
         {
             SemanticVersion version;
             version.setFromInteger(SemanticVersion::IntegerType(smDecoration->getCUDASMVersion()));
-            _requireCUDASMVersion(version);
+            m_extensionTracker->requireSMVersion(version);
         }
     }
 }
diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h
index a5d227c6b..b73948525 100644
--- a/source/slang/slang-emit-cuda.h
+++ b/source/slang/slang-emit-cuda.h
@@ -18,7 +18,14 @@ public:
     void requireBaseType(BaseType baseType) { m_baseTypeFlags |= _getFlag(baseType); }
     bool isBaseTypeRequired(BaseType baseType) { return (m_baseTypeFlags & _getFlag(baseType)) != 0; }
 
+        /// Ensure that the generated code is compiled for at least CUDA SM `version`
+    void requireSMVersion(const SemanticVersion& smVersion) { m_smVersion = (smVersion > m_smVersion) ? smVersion : m_smVersion; }
+
+        /// Should be called before reading out values. 
+    void finalize();
+
 protected:
+
     static BaseTypeFlags _getFlag(BaseType baseType) { return BaseTypeFlags(1) << int(baseType); }
 
     BaseTypeFlags m_baseTypeFlags = 0; 
@@ -86,15 +93,16 @@ protected:
     // CPPSourceEmitter overrides 
     virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) SLANG_OVERRIDE;
     virtual SlangResult calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder) SLANG_OVERRIDE;
-    
+
+    virtual void emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp) SLANG_OVERRIDE;
+
     SlangResult _calcCUDATextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName);
 
     void _emitInitializerList(IRType* elementType, IRUse* operands, Index operandCount);
     void _emitInitializerListValue(IRType* elementType, IRInst* value);
 
-        /// Ensure that the generated code is compiled for at least CUDA SM `version`
-    void _requireCUDASMVersion(SemanticVersion const& version);
-
+    void _emitGetHalfVectorElement(IRInst* baseInst, Index index, Index vecSize, const EmitOpInfo& inOuterPrec);
+    
     RefPtr<CUDAExtensionTracker> m_extensionTracker;
 };
author	jsmall-nvidia <jsmall@nvidia.com>	2021-04-30 16:51:25 -0400
committer	GitHub <noreply@github.com>	2021-04-30 13:51:25 -0700
commit	1a4a51301d084dd1c8c5906eb810eb6caf6f3963 (patch)
tree	3eac138d918853f88bb8e2b5f14ed36a57e54d7a /source
parent	c45f368ae404798db67a601749c6e0047fba75ef (diff)