diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-01-10 15:00:13 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-01-10 15:00:13 -0500 |
| commit | ef41dfc605f7868c0ccc7dde05982232b7d49589 (patch) | |
| tree | a2abe79250c234c65f968976db2578341ec77437 | |
| parent | f2a123d727316d8203820a332da1348f78ad9ad6 (diff) | |
WIP: CPU like CUDA binding (#1164)
* CUDA generated first test compiles.
* WIP on enabling CUDA in render-test.
* Detect CUDA_PATH environmental variable to build build cuda support into render-test.
Added WIP cuda-compute-util.cpp/h
Added CUDA as a renderer type.
* Fix libraries needed for cuda in premake.
* Added -enable-cuda premake option. Defaults to false.
* Creates CUDA device, loads PTX and finds entry point.
* Fix some erroneous cruft from slang-cuda-prelude.h
* Made CUDA use C++ like ABI for generated code.
Fix small bug in C++ output semantics.
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 8 | ||||
| -rw-r--r-- | source/slang/slang-emit-cpp.cpp | 147 | ||||
| -rw-r--r-- | source/slang/slang-emit-cpp.h | 19 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 206 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.h | 2 |
5 files changed, 231 insertions, 151 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 4d4681baf..6e20d55c0 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -9,4 +9,10 @@ struct FixedArray __device__ T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; } T m_data[SIZE]; -};
\ No newline at end of file +}; + +/* Type that defines the uniform entry point params. The actual content of this type is dependent on the entry point parameters, and can be +found via reflection or defined such that it matches the shader appropriately. +*/ +struct UniformEntryPointParams; +struct UniformState;
\ No newline at end of file diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp index f28c79a86..848ebd6e0 100644 --- a/source/slang/slang-emit-cpp.cpp +++ b/source/slang/slang-emit-cpp.cpp @@ -2085,7 +2085,7 @@ void CPPSourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPre } } -static bool _isVariable(IROp op) +/* static */bool CPPSourceEmitter::_isVariable(IROp op) { switch (op) { @@ -2104,18 +2104,6 @@ static bool _isFunction(IROp op) return op == kIROp_Func; } -struct GlobalParamInfo -{ - typedef GlobalParamInfo ThisType; - bool operator<(const ThisType& rhs) const { return offset < rhs.offset; } - bool operator==(const ThisType& rhs) const { return offset == rhs.offset; } - bool operator!=(const ThisType& rhs) const { return !(*this == rhs); } - - IRInst* inst; - UInt offset; - UInt size; -}; - void CPPSourceEmitter::_emitEntryPointDefinitionStart(IRFunc* func, IRGlobalParam* entryPointGlobalParams, const String& funcName, const UnownedStringSlice& varyingTypeName) { auto resultType = func->getResultType(); @@ -2129,8 +2117,9 @@ void CPPSourceEmitter::_emitEntryPointDefinitionStart(IRFunc* func, IRGlobalPara m_writer->emit("("); m_writer->emit(varyingTypeName); - m_writer->emit("* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)\n{\n"); + m_writer->emit("* varyingInput, UniformEntryPointParams* params, UniformState* uniformState)"); emitSemantics(func); + m_writer->emit("\n{\n"); m_writer->indent(); // Initialize when constructing so that globals are zeroed @@ -2324,16 +2313,8 @@ void CPPSourceEmitter::_emitInitAxisValues(const Int sizeAlongAxis[kThreadGroupA m_writer->emit("};\n"); } -void CPPSourceEmitter::emitModuleImpl(IRModule* module) +void CPPSourceEmitter::_emitForwardDeclarations(const List<EmitAction>& actions) { - // Setup all built in types used in the module - m_typeSet.addAllBuiltinTypes(module); - // If any matrix types are used, then we need appropriate vector types too. - m_typeSet.addVectorForMatrixTypes(); - - List<EmitAction> actions; - computeEmitActions(module, actions); - // Emit forward declarations. Don't emit variables that need to be grouped or function definitions (which will ref those types) for (auto action : actions) { @@ -2355,67 +2336,95 @@ void CPPSourceEmitter::emitModuleImpl(IRModule* module) break; } } +} - IRGlobalParam* entryPointGlobalParams = nullptr; +void CPPSourceEmitter::_calcGlobalParams(const List<EmitAction>& actions, List<GlobalParamInfo>& outParams, IRGlobalParam** outEntryPointGlobalParams) +{ + outParams.clear(); + *outEntryPointGlobalParams = nullptr; - // Output the global parameters in a 'UniformState' structure + IRGlobalParam* entryPointGlobalParams = nullptr; + for (auto action : actions) { - m_writer->emit("struct UniformState\n{\n"); - m_writer->indent(); - - List<GlobalParamInfo> params; - - for (auto action : actions) + if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalParam) { - if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalParam) - { - auto inst = action.inst; + auto inst = action.inst; - if (inst->findDecorationImpl(kIROp_EntryPointParamDecoration)) - { - // Should only be one instruction marked this way - SLANG_ASSERT(entryPointGlobalParams == nullptr); - entryPointGlobalParams = as<IRGlobalParam>(inst); - continue; - } + if (inst->findDecorationImpl(kIROp_EntryPointParamDecoration)) + { + // Should only be one instruction marked this way + SLANG_ASSERT(entryPointGlobalParams == nullptr); + entryPointGlobalParams = as<IRGlobalParam>(inst); + continue; + } - IRVarLayout* varLayout = CLikeSourceEmitter::getVarLayout(action.inst); - SLANG_ASSERT(varLayout); + IRVarLayout* varLayout = CLikeSourceEmitter::getVarLayout(action.inst); + SLANG_ASSERT(varLayout); - IRVarOffsetAttr* offsetAttr = varLayout->findOffsetAttr(LayoutResourceKind::Uniform); - IRTypeLayout* typeLayout = varLayout->getTypeLayout(); - IRTypeSizeAttr* sizeAttr = typeLayout->findSizeAttr(LayoutResourceKind::Uniform); + IRVarOffsetAttr* offsetAttr = varLayout->findOffsetAttr(LayoutResourceKind::Uniform); + IRTypeLayout* typeLayout = varLayout->getTypeLayout(); + IRTypeSizeAttr* sizeAttr = typeLayout->findSizeAttr(LayoutResourceKind::Uniform); - GlobalParamInfo paramInfo; - paramInfo.inst = action.inst; - // Index is the byte offset for uniform - paramInfo.offset = offsetAttr ? offsetAttr->getOffset() : 0; - paramInfo.size = sizeAttr ? sizeAttr->getFiniteSize() : 0; + GlobalParamInfo paramInfo; + paramInfo.inst = action.inst; + // Index is the byte offset for uniform + paramInfo.offset = offsetAttr ? offsetAttr->getOffset() : 0; + paramInfo.size = sizeAttr ? sizeAttr->getFiniteSize() : 0; - params.add(paramInfo); - } + outParams.add(paramInfo); } + } - // We want to sort by layout offset, and insert suitable padding - params.sort(); + // We want to sort by layout offset, and insert suitable padding + outParams.sort(); - int padIndex = 0; - size_t offset = 0; - for (const auto& paramInfo : params) - { - if (offset < paramInfo.offset) - { - // We want to output some padding - StringBuilder builder; - builder << "uint8_t _pad" << (padIndex++) << "[" << (paramInfo.offset - offset) << "];\n"; - } + *outEntryPointGlobalParams = entryPointGlobalParams; +} + +void CPPSourceEmitter::_emitUniformStateMembers(const List<EmitAction>& actions, IRGlobalParam** outEntryPointGlobalParams) +{ + List<GlobalParamInfo> params; + _calcGlobalParams(actions, params, outEntryPointGlobalParams); - emitGlobalInst(paramInfo.inst); - // Set offset after this - offset = paramInfo.offset + paramInfo.size; + int padIndex = 0; + size_t offset = 0; + for (const auto& paramInfo : params) + { + if (offset < paramInfo.offset) + { + // We want to output some padding + StringBuilder builder; + builder << "uint8_t _pad" << (padIndex++) << "[" << (paramInfo.offset - offset) << "];\n"; } - m_writer->emit("\n"); + emitGlobalInst(paramInfo.inst); + // Set offset after this + offset = paramInfo.offset + paramInfo.size; + } + m_writer->emit("\n"); +} + +void CPPSourceEmitter::emitModuleImpl(IRModule* module) +{ + // Setup all built in types used in the module + m_typeSet.addAllBuiltinTypes(module); + // If any matrix types are used, then we need appropriate vector types too. + m_typeSet.addVectorForMatrixTypes(); + + List<EmitAction> actions; + computeEmitActions(module, actions); + + _emitForwardDeclarations(actions); + + IRGlobalParam* entryPointGlobalParams = nullptr; + + // Output the global parameters in a 'UniformState' structure + { + m_writer->emit("struct UniformState\n{\n"); + m_writer->indent(); + + _emitUniformStateMembers(actions, &entryPointGlobalParams); + m_writer->dedent(); m_writer->emit("\n};\n\n"); } diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h index d71983c92..df1dec380 100644 --- a/source/slang/slang-emit-cpp.h +++ b/source/slang/slang-emit-cpp.h @@ -37,6 +37,18 @@ public: int colCount; }; + struct GlobalParamInfo + { + typedef GlobalParamInfo ThisType; + bool operator<(const ThisType& rhs) const { return offset < rhs.offset; } + bool operator==(const ThisType& rhs) const { return offset == rhs.offset; } + bool operator!=(const ThisType& rhs) const { return !(*this == rhs); } + + IRInst* inst; + UInt offset; + UInt size; + }; + virtual void useType(IRType* type); virtual void emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec); virtual void emitTypeDefinition(IRType* type); @@ -69,12 +81,17 @@ protected: // Replaceable for classes derived from CPPSourceEmitter virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out); + void emitIntrinsicCallExpr( IRCall* inst, IRTargetIntrinsicDecoration* targetIntrinsic, EmitOpInfo const& inOuterPrec); + void _emitForwardDeclarations(const List<EmitAction>& actions); + void _calcGlobalParams(const List<EmitAction>& actions, List<GlobalParamInfo>& outParams, IRGlobalParam** outEntryPointGlobalParams); + void _emitUniformStateMembers(const List<EmitAction>& actions, IRGlobalParam** outEntryPointGlobalParams); + void _emitVecMatMulDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp); void _emitAryDefinition(const HLSLIntrinsic* specOp); @@ -119,6 +136,8 @@ protected: HLSLIntrinsic* _addIntrinsic(HLSLIntrinsic::Op op, IRType* returnType, IRType*const* argTypes, Index argTypeCount); + static bool _isVariable(IROp op); + Dictionary<IRType*, StringSlicePool::Handle> m_typeNameMap; Dictionary<const HLSLIntrinsic*, StringSlicePool::Handle> m_intrinsicNameMap; diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 980e94a29..12807e9e2 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -168,33 +168,7 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, const auto rowCount = int(GetIntVal(matType->getRowCount())); const auto colCount = int(GetIntVal(matType->getColumnCount())); - if (target == CodeGenTarget::CPPSource) - { - out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; - } - else - { - out << "Mat"; - const UnownedStringSlice postFix = _getCTypeVecPostFix(_getCType(elementType->op)); - out << postFix; - if (postFix.size() > 1) - { - out << "_"; - } - out << rowCount; - out << colCount; - } - return SLANG_OK; - } - case kIROp_ArrayType: - { - auto arrayType = static_cast<IRArrayType*>(type); - auto elementType = arrayType->getElementType(); - int elementCount = int(GetIntVal(arrayType->getElementCount())); - - out << "FixedArray<"; - SLANG_RETURN_ON_FAIL(_calcTypeName(elementType, target, out)); - out << ", " << elementCount << ">"; + out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">"; return SLANG_OK; } case kIROp_UnsizedArrayType: @@ -250,37 +224,11 @@ void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniform void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor) { - auto profile = m_effectiveProfile; - auto stage = entryPointDecor->getProfile().GetStage(); - - switch (stage) - { - case Stage::Compute: - { - Int sizeAlongAxis[kThreadGroupAxisCount]; - getComputeThreadGroupSize(irFunc, sizeAlongAxis); - -#if 1 - m_writer->emit("// [numthreads("); - for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) - { - if (ii != 0) m_writer->emit(", "); - m_writer->emit(sizeAlongAxis[ii]); - } - m_writer->emit(")]\n"); -#endif - - m_writer->emit("extern \"C\" __global__ "); - break; - } - - // TODO: There are other stages that will need this kind of handling. - default: - break; - } + SLANG_UNUSED(irFunc); + SLANG_UNUSED(entryPointDecor); } -void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) +void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) { if (shouldFoldInstIntoUseSites(inst)) { @@ -326,7 +274,8 @@ void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPr } default: break; } - m_writer->emit(getName(inst)); + + Super::emitOperandImpl(inst, outerPrec); } bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) @@ -398,19 +347,6 @@ bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu return false; } -bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) -{ - SLANG_UNUSED(varDecl); - SLANG_UNUSED(varType); - - // (__device__/__constant__/__shared__/__managed__) - - m_writer->emit("__device__ "); - - // Use the default impl otherwise - return false; -} - void CUDASourceEmitter::emitLayoutDirectivesImpl(TargetRequest* targetReq) { SLANG_UNUSED(targetReq); @@ -462,15 +398,8 @@ void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func) void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func) { - if (IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>()) - { - // If its an entry point, we let the entry point attribute control the output - } - else - { - // If it's not an entry point mark as device - m_writer->emit("__device__ "); - } + // Mark as run on device. Don't need to worry about entry point, as that is output separtely to call the __device_ implementation + m_writer->emit("__device__ "); CLikeSourceEmitter::emitSimpleFuncImpl(func); } @@ -547,8 +476,125 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) List<EmitAction> actions; computeEmitActions(module, actions); - executeEmitActions(actions); + + _emitForwardDeclarations(actions); + + IRGlobalParam* entryPointGlobalParams = nullptr; + + // Output the global parameters in a 'UniformState' structure + { + m_writer->emit("struct UniformState\n{\n"); + m_writer->indent(); + + // We need these to be prefixed by __device__ + _emitUniformStateMembers(actions, &entryPointGlobalParams); + + m_writer->dedent(); + m_writer->emit("\n};\n\n"); + } + + // Output the 'Context' which will be used for execution + { + m_writer->emit("struct Context\n{\n"); + m_writer->indent(); + + m_writer->emit("UniformState* uniformState;\n"); + + if (entryPointGlobalParams) + { + emitGlobalInst(entryPointGlobalParams); + } + + // Output all the thread locals + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op)) + { + emitGlobalInst(action.inst); + } + } + + // Finally output the functions as methods on the context + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst)) + { + emitGlobalInst(action.inst); + } + } + + m_writer->dedent(); + m_writer->emit("};\n\n"); + } + + // Finally we need to output dll entry points + + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst)) + { + IRFunc* func = as<IRFunc>(action.inst); + + IREntryPointDecoration* entryPointDecor = func->findDecoration<IREntryPointDecoration>(); + + if (entryPointDecor && entryPointDecor->getProfile().GetStage() == Stage::Compute) + { + Int sizeAlongAxis[kThreadGroupAxisCount]; + getComputeThreadGroupSize(func, sizeAlongAxis); + + // + m_writer->emit("// [numthreads("); + for (int ii = 0; ii < kThreadGroupAxisCount; ++ii) + { + if (ii != 0) m_writer->emit(", "); + m_writer->emit(sizeAlongAxis[ii]); + } + m_writer->emit(")]\n"); + + String funcName = getName(func); + + m_writer->emit("extern \"C\" __global__ "); + + auto resultType = func->getResultType(); + + // Emit the actual function + emitEntryPointAttributes(func, entryPointDecor); + emitType(resultType, funcName); + + m_writer->emit("(UniformEntryPointParams* params, UniformState* uniformState)"); + emitSemantics(func); + m_writer->emit("\n{\n"); + m_writer->indent(); + + // Initialize when constructing so that globals are zeroed + m_writer->emit("Context context = {};\n"); + m_writer->emit("context.uniformState = uniformState;\n"); + + if (entryPointGlobalParams) + { + auto varDecl = entryPointGlobalParams; + auto rawType = varDecl->getDataType(); + + auto varType = rawType; + + m_writer->emit("context."); + m_writer->emit(getName(varDecl)); + m_writer->emit(" = ("); + emitType(varType); + m_writer->emit("*)params; \n"); + } + + m_writer->emit("context."); + m_writer->emit(funcName); + m_writer->emit("();\n"); + + m_writer->dedent(); + m_writer->emit("}\n"); + } + } + } + } diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h index 1c69c975f..c3c88e156 100644 --- a/source/slang/slang-emit-cuda.h +++ b/source/slang/slang-emit-cuda.h @@ -47,7 +47,7 @@ protected: virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE; virtual void emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPrec) SLANG_OVERRIDE; - virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; + //virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE; virtual void emitPreprocessorDirectivesImpl() SLANG_OVERRIDE; |
