WIP: CPU like CUDA binding (#1164)

* CUDA generated first test compiles. * WIP on enabling CUDA in render-test. * Detect CUDA_PATH environmental variable to build build cuda support into render-test. Added WIP cuda-compute-util.cpp/h Added CUDA as a renderer type. * Fix libraries needed for cuda in premake. * Added -enable-cuda premake option. Defaults to false. * Creates CUDA device, loads PTX and finds entry point. * Fix some erroneous cruft from slang-cuda-prelude.h * Made CUDA use C++ like ABI for generated code. Fix small bug in C++ output semantics.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-01-10 15:00:13 -0500
committer: GitHub <noreply@github.com> 2020-01-10 15:00:13 -0500
commit: ef41dfc605f7868c0ccc7dde05982232b7d49589 (patch)
tree: a2abe79250c234c65f968976db2578341ec77437 /source/slang/slang-emit-cuda.cpp
parent: f2a123d727316d8203820a332da1348f78ad9ad6 (diff)
1 files changed, 126 insertions, 80 deletions
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 980e94a29..12807e9e2 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -168,33 +168,7 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target,
             const auto rowCount = int(GetIntVal(matType->getRowCount()));
             const auto colCount = int(GetIntVal(matType->getColumnCount()));
 
-            if (target == CodeGenTarget::CPPSource)
-            {
-                out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
-            }
-            else
-            {
-                out << "Mat";
-                const UnownedStringSlice postFix = _getCTypeVecPostFix(_getCType(elementType->op));
-                out << postFix;
-                if (postFix.size() > 1)
-                {
-                    out << "_";
-                }
-                out << rowCount;
-                out << colCount;
-            }
-            return SLANG_OK;
-        }
-        case kIROp_ArrayType:
-        {
-            auto arrayType = static_cast<IRArrayType*>(type);
-            auto elementType = arrayType->getElementType();
-            int elementCount = int(GetIntVal(arrayType->getElementCount()));
-
-            out << "FixedArray<";
-            SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out));
-            out << ", " << elementCount << ">";
+            out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
             return SLANG_OK;
         }
         case kIROp_UnsizedArrayType:
@@ -250,37 +224,11 @@ void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniform
 
 void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor)
 {
-    auto profile = m_effectiveProfile;
-    auto stage = entryPointDecor->getProfile().GetStage();
-
-    switch (stage)
-    {
-        case Stage::Compute:
-        {
-            Int sizeAlongAxis[kThreadGroupAxisCount];
-            getComputeThreadGroupSize(irFunc, sizeAlongAxis);
-
-#if 1
-            m_writer->emit("// [numthreads(");
-            for (int ii = 0; ii < kThreadGroupAxisCount; ++ii)
-            {
-                if (ii != 0) m_writer->emit(", ");
-                m_writer->emit(sizeAlongAxis[ii]);
-            }
-            m_writer->emit(")]\n");
-#endif
-
-            m_writer->emit("extern \"C\" __global__ ");
-            break;
-        }
-        
-        // TODO: There are other stages that will need this kind of handling.
-        default:
-            break;
-    }
+    SLANG_UNUSED(irFunc);
+    SLANG_UNUSED(entryPointDecor);
 }
 
-void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const&  outerPrec)
+void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec)
 {
     if (shouldFoldInstIntoUseSites(inst))
     {
@@ -326,7 +274,8 @@ void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const&  outerPr
         }
         default: break;
     }
-    m_writer->emit(getName(inst));
+
+    Super::emitOperandImpl(inst, outerPrec);
 }
 
 bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
@@ -398,19 +347,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
     return false;
 }
 
-bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType)
-{
-    SLANG_UNUSED(varDecl);
-    SLANG_UNUSED(varType);
-
-    // (__device__/__constant__/__shared__/__managed__)
-
-    m_writer->emit("__device__ ");
-
-    // Use the default impl otherwise
-    return false;
-}
-
 void CUDASourceEmitter::emitLayoutDirectivesImpl(TargetRequest* targetReq)
 {
     SLANG_UNUSED(targetReq);
@@ -462,15 +398,8 @@ void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func)
 
 void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func)
 {
-    if (IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>())
-    {
-        // If its an entry point, we let the entry point attribute control the output
-    }
-    else
-    {
-        // If it's not an entry point mark as device
-        m_writer->emit("__device__ ");    
-    }
+    // Mark as run on device. Don't need to worry about entry point, as that is output separtely to call the __device_ implementation
+    m_writer->emit("__device__ ");    
 
     CLikeSourceEmitter::emitSimpleFuncImpl(func);
 }
@@ -547,8 +476,125 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module)
     List<EmitAction> actions;
 
     computeEmitActions(module, actions);
-    executeEmitActions(actions);
 
+
+    _emitForwardDeclarations(actions);
+
+    IRGlobalParam* entryPointGlobalParams = nullptr;
+
+    // Output the global parameters in a 'UniformState' structure
+    {
+        m_writer->emit("struct UniformState\n{\n");
+        m_writer->indent();
+
+        // We need these to be prefixed by __device__
+        _emitUniformStateMembers(actions, &entryPointGlobalParams);
+
+        m_writer->dedent();
+        m_writer->emit("\n};\n\n");
+    }
+
+    // Output the 'Context' which will be used for execution
+    {
+        m_writer->emit("struct Context\n{\n");
+        m_writer->indent();
+
+        m_writer->emit("UniformState* uniformState;\n");
+
+        if (entryPointGlobalParams)
+        {
+            emitGlobalInst(entryPointGlobalParams);
+        }
+
+        // Output all the thread locals 
+        for (auto action : actions)
+        {
+            if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op))
+            {
+                emitGlobalInst(action.inst);
+            }
+        }
+
+        // Finally output the functions as methods on the context
+        for (auto action : actions)
+        {
+            if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst))
+            {
+                emitGlobalInst(action.inst);
+            }
+        }
+
+        m_writer->dedent();
+        m_writer->emit("};\n\n");
+    }
+
+    // Finally we need to output dll entry points
+
+    for (auto action : actions)
+    {
+        if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst))
+        {
+            IRFunc* func = as<IRFunc>(action.inst);
+
+            IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>();
+
+            if (entryPointDecor && entryPointDecor->getProfile().GetStage() == Stage::Compute)
+            {
+                Int sizeAlongAxis[kThreadGroupAxisCount];
+                getComputeThreadGroupSize(func, sizeAlongAxis);
+
+                // 
+                m_writer->emit("// [numthreads(");
+                for (int ii = 0; ii < kThreadGroupAxisCount; ++ii)
+                {
+                    if (ii != 0) m_writer->emit(", ");
+                    m_writer->emit(sizeAlongAxis[ii]);
+                }
+                m_writer->emit(")]\n");
+
+                String funcName = getName(func);
+
+                m_writer->emit("extern \"C\" __global__  ");
+               
+                auto resultType = func->getResultType();
+
+                // Emit the actual function
+                emitEntryPointAttributes(func, entryPointDecor);
+                emitType(resultType, funcName);
+
+                m_writer->emit("(UniformEntryPointParams* params, UniformState* uniformState)");
+                emitSemantics(func);
+                m_writer->emit("\n{\n");
+                m_writer->indent();
+
+                // Initialize when constructing so that globals are zeroed
+                m_writer->emit("Context context = {};\n");
+                m_writer->emit("context.uniformState = uniformState;\n");
+
+                if (entryPointGlobalParams)
+                {
+                    auto varDecl = entryPointGlobalParams;
+                    auto rawType = varDecl->getDataType();
+
+                    auto varType = rawType;
+
+                    m_writer->emit("context.");
+                    m_writer->emit(getName(varDecl));
+                    m_writer->emit(" =  (");
+                    emitType(varType);
+                    m_writer->emit("*)params; \n");
+                }
+
+                m_writer->emit("context.");
+                m_writer->emit(funcName);
+                m_writer->emit("();\n");
+
+                m_writer->dedent();
+                m_writer->emit("}\n");
+            }
+        }
+    }
+    
 }
author	jsmall-nvidia <jsmall@nvidia.com>	2020-01-10 15:00:13 -0500
committer	GitHub <noreply@github.com>	2020-01-10 15:00:13 -0500
commit	ef41dfc605f7868c0ccc7dde05982232b7d49589 (patch)
tree	a2abe79250c234c65f968976db2578341ec77437 /source/slang/slang-emit-cuda.cpp
parent	f2a123d727316d8203820a332da1348f78ad9ad6 (diff)