diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-01-10 15:00:13 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-01-10 15:00:13 -0500 |
| commit | ef41dfc605f7868c0ccc7dde05982232b7d49589 (patch) | |
| tree | a2abe79250c234c65f968976db2578341ec77437 /source/slang/slang-emit-cuda.cpp | |
| parent | f2a123d727316d8203820a332da1348f78ad9ad6 (diff) | |
WIP: CPU like CUDA binding (#1164)
* CUDA generated first test compiles.
* WIP on enabling CUDA in render-test.
* Detect CUDA_PATH environmental variable to build build cuda support into render-test.
Added WIP cuda-compute-util.cpp/h
Added CUDA as a renderer type.
* Fix libraries needed for cuda in premake.
* Added -enable-cuda premake option. Defaults to false.
* Creates CUDA device, loads PTX and finds entry point.
* Fix some erroneous cruft from slang-cuda-prelude.h
* Made CUDA use C++ like ABI for generated code.
Fix small bug in C++ output semantics.
Diffstat (limited to 'source/slang/slang-emit-cuda.cpp')
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 206 |
1 files changed, 126 insertions, 80 deletions
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 980e94a29..12807e9e2 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -168,33 +168,7 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, const auto rowCount = int(GetIntVal(matType->getRowCount())); const auto colCount = int(GetIntVal(matType->getColumnCount())); - if (target == CodeGenTarget::CPPSource) - { - out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; - } - else - { - out << "Mat"; - const UnownedStringSlice postFix = _getCTypeVecPostFix(_getCType(elementType->op)); - out << postFix; - if (postFix.size() > 1) - { - out << "_"; - } - out << rowCount; - out << colCount; - } - return SLANG_OK; - } - case kIROp_ArrayType: - { - auto arrayType = static_cast<IRArrayType*>(type); - auto elementType = arrayType->getElementType(); - int elementCount = int(GetIntVal(arrayType->getElementCount())); - - out << "FixedArray<"; - SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out)); - out << ", " << elementCount << ">"; + out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; return SLANG_OK; } case kIROp_UnsizedArrayType: @@ -250,37 +224,11 @@ void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniform void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor) { - auto profile = m_effectiveProfile; - auto stage = entryPointDecor->getProfile().GetStage(); - - switch (stage) - { - case Stage::Compute: - { - Int sizeAlongAxis[kThreadGroupAxisCount]; - getComputeThreadGroupSize(irFunc, sizeAlongAxis); - -#if 1 - m_writer->emit("// [numthreads("); - for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) - { - if (ii != 0) m_writer->emit(", "); - m_writer->emit(sizeAlongAxis[ii]); - } - m_writer->emit(")]\n"); -#endif - - m_writer->emit("extern \"C\" __global__ "); - break; - } - - // TODO: There are other stages that will need this kind of handling. - default: - break; - } + SLANG_UNUSED(irFunc); + SLANG_UNUSED(entryPointDecor); } -void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) +void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) { if (shouldFoldInstIntoUseSites(inst)) { @@ -326,7 +274,8 @@ void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPr } default: break; } - m_writer->emit(getName(inst)); + + Super::emitOperandImpl(inst, outerPrec); } bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) @@ -398,19 +347,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu return false; } -bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) -{ - SLANG_UNUSED(varDecl); - SLANG_UNUSED(varType); - - // (__device__/__constant__/__shared__/__managed__) - - m_writer->emit("__device__ "); - - // Use the default impl otherwise - return false; -} - void CUDASourceEmitter::emitLayoutDirectivesImpl(TargetRequest* targetReq) { SLANG_UNUSED(targetReq); @@ -462,15 +398,8 @@ void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func) void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func) { - if (IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>()) - { - // If its an entry point, we let the entry point attribute control the output - } - else - { - // If it's not an entry point mark as device - m_writer->emit("__device__ "); - } + // Mark as run on device. Don't need to worry about entry point, as that is output separtely to call the __device_ implementation + m_writer->emit("__device__ "); CLikeSourceEmitter::emitSimpleFuncImpl(func); } @@ -547,8 +476,125 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) List<EmitAction> actions; computeEmitActions(module, actions); - executeEmitActions(actions); + + _emitForwardDeclarations(actions); + + IRGlobalParam* entryPointGlobalParams = nullptr; + + // Output the global parameters in a 'UniformState' structure + { + m_writer->emit("struct UniformState\n{\n"); + m_writer->indent(); + + // We need these to be prefixed by __device__ + _emitUniformStateMembers(actions, &entryPointGlobalParams); + + m_writer->dedent(); + m_writer->emit("\n};\n\n"); + } + + // Output the 'Context' which will be used for execution + { + m_writer->emit("struct Context\n{\n"); + m_writer->indent(); + + m_writer->emit("UniformState* uniformState;\n"); + + if (entryPointGlobalParams) + { + emitGlobalInst(entryPointGlobalParams); + } + + // Output all the thread locals + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op)) + { + emitGlobalInst(action.inst); + } + } + + // Finally output the functions as methods on the context + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst)) + { + emitGlobalInst(action.inst); + } + } + + m_writer->dedent(); + m_writer->emit("};\n\n"); + } + + // Finally we need to output dll entry points + + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst)) + { + IRFunc* func = as<IRFunc>(action.inst); + + IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>(); + + if (entryPointDecor && entryPointDecor->getProfile().GetStage() == Stage::Compute) + { + Int sizeAlongAxis[kThreadGroupAxisCount]; + getComputeThreadGroupSize(func, sizeAlongAxis); + + // + m_writer->emit("// [numthreads("); + for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) + { + if (ii != 0) m_writer->emit(", "); + m_writer->emit(sizeAlongAxis[ii]); + } + m_writer->emit(")]\n"); + + String funcName = getName(func); + + m_writer->emit("extern \"C\" __global__ "); + + auto resultType = func->getResultType(); + + // Emit the actual function + emitEntryPointAttributes(func, entryPointDecor); + emitType(resultType, funcName); + + m_writer->emit("(UniformEntryPointParams* params, UniformState* uniformState)"); + emitSemantics(func); + m_writer->emit("\n{\n"); + m_writer->indent(); + + // Initialize when constructing so that globals are zeroed + m_writer->emit("Context context = {};\n"); + m_writer->emit("context.uniformState = uniformState;\n"); + + if (entryPointGlobalParams) + { + auto varDecl = entryPointGlobalParams; + auto rawType = varDecl->getDataType(); + + auto varType = rawType; + + m_writer->emit("context."); + m_writer->emit(getName(varDecl)); + m_writer->emit(" = ("); + emitType(varType); + m_writer->emit("*)params; \n"); + } + + m_writer->emit("context."); + m_writer->emit(funcName); + m_writer->emit("();\n"); + + m_writer->dedent(); + m_writer->emit("}\n"); + } + } + } + } |
