From ef41dfc605f7868c0ccc7dde05982232b7d49589 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Fri, 10 Jan 2020 15:00:13 -0500 Subject: WIP: CPU like CUDA binding (#1164) * CUDA generated first test compiles. * WIP on enabling CUDA in render-test. * Detect CUDA_PATH environmental variable to build build cuda support into render-test. Added WIP cuda-compute-util.cpp/h Added CUDA as a renderer type. * Fix libraries needed for cuda in premake. * Added -enable-cuda premake option. Defaults to false. * Creates CUDA device, loads PTX and finds entry point. * Fix some erroneous cruft from slang-cuda-prelude.h * Made CUDA use C++ like ABI for generated code. Fix small bug in C++ output semantics. --- source/slang/slang-emit-cpp.cpp | 147 +++++++++++++++------------- source/slang/slang-emit-cpp.h | 19 ++++ source/slang/slang-emit-cuda.cpp | 206 ++++++++++++++++++++++++--------------- source/slang/slang-emit-cuda.h | 2 +- 4 files changed, 224 insertions(+), 150 deletions(-) (limited to 'source') diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp index f28c79a86..848ebd6e0 100644 --- a/source/slang/slang-emit-cpp.cpp +++ b/source/slang/slang-emit-cpp.cpp @@ -2085,7 +2085,7 @@ void CPPSourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPre } } -static bool _isVariable(IROp op) +/* static */bool CPPSourceEmitter::_isVariable(IROp op) { switch (op) { @@ -2104,18 +2104,6 @@ static bool _isFunction(IROp op) return op == kIROp_Func; } -struct GlobalParamInfo -{ - typedef GlobalParamInfo ThisType; - bool operator<(const ThisType& rhs) const { return offset < rhs.offset; } - bool operator==(const ThisType& rhs) const { return offset == rhs.offset; } - bool operator!=(const ThisType& rhs) const { return !(*this == rhs); } - - IRInst* inst; - UInt offset; - UInt size; -}; - void CPPSourceEmitter::_emitEntryPointDefinitionStart(IRFunc* func, IRGlobalParam* entryPointGlobalParams, const String& funcName, const UnownedStringSlice& varyingTypeName) { auto resultType = func->getResultType(); @@ -2129,8 +2117,9 @@ void CPPSourceEmitter::_emitEntryPointDefinitionStart(IRFunc* func, IRGlobalPara m_writer->emit("("); m_writer->emit(varyingTypeName); - m_writer->emit("* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)\n{\n"); + m_writer->emit("* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)"); emitSemantics(func); + m_writer->emit("\n{\n"); m_writer->indent(); // Initialize when constructing so that globals are zeroed @@ -2324,16 +2313,8 @@ void CPPSourceEmitter::_emitInitAxisValues(const Int sizeAlongAxis[kThreadGroupA m_writer->emit("};\n"); } -void CPPSourceEmitter::emitModuleImpl(IRModule* module) +void CPPSourceEmitter::_emitForwardDeclarations(const List& actions) { - // Setup all built in types used in the module - m_typeSet.addAllBuiltinTypes(module); - // If any matrix types are used, then we need appropriate vector types too. - m_typeSet.addVectorForMatrixTypes(); - - List actions; - computeEmitActions(module, actions); - // Emit forward declarations. Don't emit variables that need to be grouped or function definitions (which will ref those types) for (auto action : actions) { @@ -2355,67 +2336,95 @@ void CPPSourceEmitter::emitModuleImpl(IRModule* module) break; } } +} - IRGlobalParam* entryPointGlobalParams = nullptr; +void CPPSourceEmitter::_calcGlobalParams(const List& actions, List& outParams, IRGlobalParam** outEntryPointGlobalParams) +{ + outParams.clear(); + *outEntryPointGlobalParams = nullptr; - // Output the global parameters in a 'UniformState' structure + IRGlobalParam* entryPointGlobalParams = nullptr; + for (auto action : actions) { - m_writer->emit("struct UniformState\n{\n"); - m_writer->indent(); - - List params; - - for (auto action : actions) + if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalParam) { - if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalParam) - { - auto inst = action.inst; + auto inst = action.inst; - if (inst->findDecorationImpl(kIROp_EntryPointParamDecoration)) - { - // Should only be one instruction marked this way - SLANG_ASSERT(entryPointGlobalParams == nullptr); - entryPointGlobalParams = as(inst); - continue; - } + if (inst->findDecorationImpl(kIROp_EntryPointParamDecoration)) + { + // Should only be one instruction marked this way + SLANG_ASSERT(entryPointGlobalParams == nullptr); + entryPointGlobalParams = as(inst); + continue; + } - IRVarLayout* varLayout = CLikeSourceEmitter::getVarLayout(action.inst); - SLANG_ASSERT(varLayout); + IRVarLayout* varLayout = CLikeSourceEmitter::getVarLayout(action.inst); + SLANG_ASSERT(varLayout); - IRVarOffsetAttr* offsetAttr = varLayout->findOffsetAttr(LayoutResourceKind::Uniform); - IRTypeLayout* typeLayout = varLayout->getTypeLayout(); - IRTypeSizeAttr* sizeAttr = typeLayout->findSizeAttr(LayoutResourceKind::Uniform); + IRVarOffsetAttr* offsetAttr = varLayout->findOffsetAttr(LayoutResourceKind::Uniform); + IRTypeLayout* typeLayout = varLayout->getTypeLayout(); + IRTypeSizeAttr* sizeAttr = typeLayout->findSizeAttr(LayoutResourceKind::Uniform); - GlobalParamInfo paramInfo; - paramInfo.inst = action.inst; - // Index is the byte offset for uniform - paramInfo.offset = offsetAttr ? offsetAttr->getOffset() : 0; - paramInfo.size = sizeAttr ? sizeAttr->getFiniteSize() : 0; + GlobalParamInfo paramInfo; + paramInfo.inst = action.inst; + // Index is the byte offset for uniform + paramInfo.offset = offsetAttr ? offsetAttr->getOffset() : 0; + paramInfo.size = sizeAttr ? sizeAttr->getFiniteSize() : 0; - params.add(paramInfo); - } + outParams.add(paramInfo); } + } - // We want to sort by layout offset, and insert suitable padding - params.sort(); + // We want to sort by layout offset, and insert suitable padding + outParams.sort(); - int padIndex = 0; - size_t offset = 0; - for (const auto& paramInfo : params) - { - if (offset < paramInfo.offset) - { - // We want to output some padding - StringBuilder builder; - builder << "uint8_t _pad" << (padIndex++) << "[" << (paramInfo.offset - offset) << "];\n"; - } + *outEntryPointGlobalParams = entryPointGlobalParams; +} + +void CPPSourceEmitter::_emitUniformStateMembers(const List& actions, IRGlobalParam** outEntryPointGlobalParams) +{ + List params; + _calcGlobalParams(actions, params, outEntryPointGlobalParams); - emitGlobalInst(paramInfo.inst); - // Set offset after this - offset = paramInfo.offset + paramInfo.size; + int padIndex = 0; + size_t offset = 0; + for (const auto& paramInfo : params) + { + if (offset < paramInfo.offset) + { + // We want to output some padding + StringBuilder builder; + builder << "uint8_t _pad" << (padIndex++) << "[" << (paramInfo.offset - offset) << "];\n"; } - m_writer->emit("\n"); + emitGlobalInst(paramInfo.inst); + // Set offset after this + offset = paramInfo.offset + paramInfo.size; + } + m_writer->emit("\n"); +} + +void CPPSourceEmitter::emitModuleImpl(IRModule* module) +{ + // Setup all built in types used in the module + m_typeSet.addAllBuiltinTypes(module); + // If any matrix types are used, then we need appropriate vector types too. + m_typeSet.addVectorForMatrixTypes(); + + List actions; + computeEmitActions(module, actions); + + _emitForwardDeclarations(actions); + + IRGlobalParam* entryPointGlobalParams = nullptr; + + // Output the global parameters in a 'UniformState' structure + { + m_writer->emit("struct UniformState\n{\n"); + m_writer->indent(); + + _emitUniformStateMembers(actions, &entryPointGlobalParams); + m_writer->dedent(); m_writer->emit("\n};\n\n"); } diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h index d71983c92..df1dec380 100644 --- a/source/slang/slang-emit-cpp.h +++ b/source/slang/slang-emit-cpp.h @@ -37,6 +37,18 @@ public: int colCount; }; + struct GlobalParamInfo + { + typedef GlobalParamInfo ThisType; + bool operator<(const ThisType& rhs) const { return offset < rhs.offset; } + bool operator==(const ThisType& rhs) const { return offset == rhs.offset; } + bool operator!=(const ThisType& rhs) const { return !(*this == rhs); } + + IRInst* inst; + UInt offset; + UInt size; + }; + virtual void useType(IRType* type); virtual void emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec); virtual void emitTypeDefinition(IRType* type); @@ -69,12 +81,17 @@ protected: // Replaceable for classes derived from CPPSourceEmitter virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out); + void emitIntrinsicCallExpr( IRCall* inst, IRTargetIntrinsicDecoration* targetIntrinsic, EmitOpInfo const& inOuterPrec); + void _emitForwardDeclarations(const List& actions); + void _calcGlobalParams(const List& actions, List& outParams, IRGlobalParam** outEntryPointGlobalParams); + void _emitUniformStateMembers(const List& actions, IRGlobalParam** outEntryPointGlobalParams); + void _emitVecMatMulDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp); void _emitAryDefinition(const HLSLIntrinsic* specOp); @@ -119,6 +136,8 @@ protected: HLSLIntrinsic* _addIntrinsic(HLSLIntrinsic::Op op, IRType* returnType, IRType*const* argTypes, Index argTypeCount); + static bool _isVariable(IROp op); + Dictionary m_typeNameMap; Dictionary m_intrinsicNameMap; diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 980e94a29..12807e9e2 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -168,33 +168,7 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, const auto rowCount = int(GetIntVal(matType->getRowCount())); const auto colCount = int(GetIntVal(matType->getColumnCount())); - if (target == CodeGenTarget::CPPSource) - { - out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; - } - else - { - out << "Mat"; - const UnownedStringSlice postFix = _getCTypeVecPostFix(_getCType(elementType->op)); - out << postFix; - if (postFix.size() > 1) - { - out << "_"; - } - out << rowCount; - out << colCount; - } - return SLANG_OK; - } - case kIROp_ArrayType: - { - auto arrayType = static_cast(type); - auto elementType = arrayType->getElementType(); - int elementCount = int(GetIntVal(arrayType->getElementCount())); - - out << "FixedArray<"; - SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out)); - out << ", " << elementCount << ">"; + out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; return SLANG_OK; } case kIROp_UnsizedArrayType: @@ -250,37 +224,11 @@ void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniform void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor) { - auto profile = m_effectiveProfile; - auto stage = entryPointDecor->getProfile().GetStage(); - - switch (stage) - { - case Stage::Compute: - { - Int sizeAlongAxis[kThreadGroupAxisCount]; - getComputeThreadGroupSize(irFunc, sizeAlongAxis); - -#if 1 - m_writer->emit("// [numthreads("); - for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) - { - if (ii != 0) m_writer->emit(", "); - m_writer->emit(sizeAlongAxis[ii]); - } - m_writer->emit(")]\n"); -#endif - - m_writer->emit("extern \"C\" __global__ "); - break; - } - - // TODO: There are other stages that will need this kind of handling. - default: - break; - } + SLANG_UNUSED(irFunc); + SLANG_UNUSED(entryPointDecor); } -void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) +void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) { if (shouldFoldInstIntoUseSites(inst)) { @@ -326,7 +274,8 @@ void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPr } default: break; } - m_writer->emit(getName(inst)); + + Super::emitOperandImpl(inst, outerPrec); } bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) @@ -398,19 +347,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu return false; } -bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) -{ - SLANG_UNUSED(varDecl); - SLANG_UNUSED(varType); - - // (__device__/__constant__/__shared__/__managed__) - - m_writer->emit("__device__ "); - - // Use the default impl otherwise - return false; -} - void CUDASourceEmitter::emitLayoutDirectivesImpl(TargetRequest* targetReq) { SLANG_UNUSED(targetReq); @@ -462,15 +398,8 @@ void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func) void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func) { - if (IREntryPointDecoration* entryPointDecor = func->findDecoration()) - { - // If its an entry point, we let the entry point attribute control the output - } - else - { - // If it's not an entry point mark as device - m_writer->emit("__device__ "); - } + // Mark as run on device. Don't need to worry about entry point, as that is output separtely to call the __device_ implementation + m_writer->emit("__device__ "); CLikeSourceEmitter::emitSimpleFuncImpl(func); } @@ -547,8 +476,125 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) List actions; computeEmitActions(module, actions); - executeEmitActions(actions); + + _emitForwardDeclarations(actions); + + IRGlobalParam* entryPointGlobalParams = nullptr; + + // Output the global parameters in a 'UniformState' structure + { + m_writer->emit("struct UniformState\n{\n"); + m_writer->indent(); + + // We need these to be prefixed by __device__ + _emitUniformStateMembers(actions, &entryPointGlobalParams); + + m_writer->dedent(); + m_writer->emit("\n};\n\n"); + } + + // Output the 'Context' which will be used for execution + { + m_writer->emit("struct Context\n{\n"); + m_writer->indent(); + + m_writer->emit("UniformState* uniformState;\n"); + + if (entryPointGlobalParams) + { + emitGlobalInst(entryPointGlobalParams); + } + + // Output all the thread locals + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op)) + { + emitGlobalInst(action.inst); + } + } + + // Finally output the functions as methods on the context + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as(action.inst)) + { + emitGlobalInst(action.inst); + } + } + + m_writer->dedent(); + m_writer->emit("};\n\n"); + } + + // Finally we need to output dll entry points + + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as(action.inst)) + { + IRFunc* func = as(action.inst); + + IREntryPointDecoration* entryPointDecor = func->findDecoration(); + + if (entryPointDecor && entryPointDecor->getProfile().GetStage() == Stage::Compute) + { + Int sizeAlongAxis[kThreadGroupAxisCount]; + getComputeThreadGroupSize(func, sizeAlongAxis); + + // + m_writer->emit("// [numthreads("); + for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) + { + if (ii != 0) m_writer->emit(", "); + m_writer->emit(sizeAlongAxis[ii]); + } + m_writer->emit(")]\n"); + + String funcName = getName(func); + + m_writer->emit("extern \"C\" __global__ "); + + auto resultType = func->getResultType(); + + // Emit the actual function + emitEntryPointAttributes(func, entryPointDecor); + emitType(resultType, funcName); + + m_writer->emit("(UniformEntryPointParams* params, UniformState* uniformState)"); + emitSemantics(func); + m_writer->emit("\n{\n"); + m_writer->indent(); + + // Initialize when constructing so that globals are zeroed + m_writer->emit("Context context = {};\n"); + m_writer->emit("context.uniformState = uniformState;\n"); + + if (entryPointGlobalParams) + { + auto varDecl = entryPointGlobalParams; + auto rawType = varDecl->getDataType(); + + auto varType = rawType; + + m_writer->emit("context."); + m_writer->emit(getName(varDecl)); + m_writer->emit(" = ("); + emitType(varType); + m_writer->emit("*)params; \n"); + } + + m_writer->emit("context."); + m_writer->emit(funcName); + m_writer->emit("();\n"); + + m_writer->dedent(); + m_writer->emit("}\n"); + } + } + } + } diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h index 1c69c975f..c3c88e156 100644 --- a/source/slang/slang-emit-cuda.h +++ b/source/slang/slang-emit-cuda.h @@ -47,7 +47,7 @@ protected: virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE; virtual void emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) SLANG_OVERRIDE; - virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; + //virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE; virtual void emitPreprocessorDirectivesImpl() SLANG_OVERRIDE; -- cgit v1.2.3