diff options
| -rw-r--r-- | source/slang/slang-emit-c-like.cpp | 7 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.h | 2 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 127 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.h | 3 | ||||
| -rw-r--r-- | source/slang/slang-emit.cpp | 20 | ||||
| -rw-r--r-- | source/slang/slang-ir-explicit-global-context.cpp | 50 | ||||
| -rw-r--r-- | source/slang/slang.vcxproj | 2 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.cpp | 95 |
8 files changed, 223 insertions, 83 deletions
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index ce3dc8957..4c6c89ef5 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -660,6 +660,11 @@ String CLikeSourceEmitter::scrubName(const String& name) return sb.ProduceString(); } +String CLikeSourceEmitter::generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor) +{ + return entryPointDecor->getName()->getStringSlice(); +} + String CLikeSourceEmitter::generateName(IRInst* inst) { // If the instruction names something @@ -686,7 +691,7 @@ String CLikeSourceEmitter::generateName(IRInst* inst) return "main"; } - return entryPointDecor->getName()->getStringSlice(); + return generateEntryPointNameImpl(entryPointDecor); } // If we have a name hint on the instruction, then we will try to use that diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h index d813a819e..b89d5d1c4 100644 --- a/source/slang/slang-emit-c-like.h +++ b/source/slang/slang-emit-c-like.h @@ -156,6 +156,8 @@ public: String scrubName(const String& name); String generateName(IRInst* inst); + virtual String generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor); + String getName(IRInst* inst); void emitDeclarator(IRDeclaratorInfo* declarator); diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 6f24d5b74..acd913865 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -239,7 +239,15 @@ void CUDASourceEmitter::emitLayoutSemanticsImpl(IRInst* inst, char const* unifor void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniformParameterGroupType* type) { - Super::emitParameterGroupImpl(varDecl, type); + auto elementType = type->getElementType(); + + m_writer->emit("extern \"C\" __constant__ "); + emitType(elementType, "SLANG_globalParams"); + m_writer->emit(";\n"); + + m_writer->emit("#define "); + m_writer->emit(getName(varDecl)); + m_writer->emit(" (&SLANG_globalParams)\n"); } void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor) @@ -260,6 +268,59 @@ void CUDASourceEmitter::emitFunctionPreambleImpl(IRInst* inst) } } +String CUDASourceEmitter::generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor) +{ + // We have an entry-point function in the IR module, which we + // will want to emit as a `__global__` function in the generated + // CUDA C++. + // + // The most common case will be a compute kernel, in which case + // we will emit the function more or less as-is, including + // usingits original name as the name of the global symbol. + // + String funcName = Super::generateEntryPointNameImpl(entryPointDecor); + String globalSymbolName = funcName; + + // We also suport emitting ray tracing kernels for use with + // OptiX, and in that case the name of the global symbol + // must be prefixed to indicate to the OptiX runtime what + // stage it is to be compiled for. + // + auto stage = entryPointDecor->getProfile().getStage(); + switch( stage ) + { + default: + break; + +#define CASE(STAGE, PREFIX) \ + case Stage::STAGE: globalSymbolName = #PREFIX + funcName; break + + // Optix 7 Guide, Section 6.1 (Program input) + // + // > The input PTX should include one or more NVIDIA OptiX programs. + // > The type of program affects how the program can be used during + // > the execution of the pipeline. These program types are specified + // by prefixing the program’s name with the following: + // + // > Program type Function name prefix + CASE( RayGeneration, __raygen__); + CASE( Intersection, __intersection__); + CASE( AnyHit, __anyhit__); + CASE( ClosestHit, __closesthit__); + CASE( Miss, __miss__); + CASE( Callable, __direct_callable__); + // + // There are two stages (or "program types") supported by OptiX + // that Slang currently cannot target: + // + // CASE(ContinuationCallable, __continuation_callable__); + // CASE(Exception, __exception__); + // +#undef CASE + } + + return globalSymbolName; +} void CUDASourceEmitter::emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec) { @@ -642,6 +703,24 @@ void CUDASourceEmitter::emitPreprocessorDirectivesImpl() } } +bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) +{ + // A global shader parameter in the IR for CUDA output will + // either be the unique constant buffer that wraps all the + // global-scope parameters in the original code (which is + // handled as a special-case before this routine would be + // called), or it is one of the system-defined varying inputs + // like `threadIdx`. We won't need to emit anything in the + // output code for the latter case, so we need to emit + // nothing here and return `true` so that the base class + // uses our logic instead of the default. + // + SLANG_UNUSED(varDecl); + SLANG_UNUSED(varType); + return true; +} + + void CUDASourceEmitter::emitModuleImpl(IRModule* module) { // Setup all built in types used in the module @@ -660,51 +739,7 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) // TODO(JS): We may need to generate types (for example for matrices) - // TODO(JS): We need to determine which functions we need to inline - - // The IR will usually come in an order that respects - // dependencies between global declarations, but this - // isn't guaranteed, so we need to be careful about - // the order in which we emit things. - - List<EmitAction> actions; - - computeEmitActions(module, actions); - - - _emitForwardDeclarations(actions); - - // Output group shared variables - - { - for (auto action : actions) - { - if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && as<IRGroupSharedRate>(action.inst->getRate())) - { - emitGlobalInst(action.inst); - } - } - } - - { - // Output all the thread locals - for (auto action : actions) - { - if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && !as<IRGroupSharedRate>(action.inst->getRate())) - { - emitGlobalInst(action.inst); - } - } - - // Finally output the functions as methods on the context - for (auto action : actions) - { - if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst)) - { - emitGlobalInst(action.inst); - } - } - } + CLikeSourceEmitter::emitModuleImpl(module); } diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h index ae78a5e57..156d5fab1 100644 --- a/source/slang/slang-emit-cuda.h +++ b/source/slang/slang-emit-cuda.h @@ -57,12 +57,13 @@ protected: virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE; virtual void emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE; virtual void emitFunctionPreambleImpl(IRInst* inst) SLANG_OVERRIDE; + virtual String generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor) SLANG_OVERRIDE; virtual void emitLoopControlDecorationImpl(IRLoopControlDecoration* decl) SLANG_OVERRIDE; virtual void handleCallExprDecorationsImpl(IRInst* funcValue) SLANG_OVERRIDE; - //virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; + virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE; virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE; virtual void emitPreprocessorDirectivesImpl() SLANG_OVERRIDE; diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 7d8a4074e..af01e4b0d 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -242,20 +242,19 @@ Result linkAndOptimizeIR( CollectEntryPointUniformParamsOptions passOptions; switch( target ) { - default: + case CodeGenTarget::CUDASource: break; case CodeGenTarget::CPPSource: - case CodeGenTarget::CUDASource: passOptions.alwaysCreateCollectedParam = true; + default: + collectEntryPointUniformParams(irModule, passOptions); + #if 0 + dumpIRIfEnabled(compileRequest, irModule, "ENTRY POINT UNIFORMS COLLECTED"); + #endif + validateIRModuleIfEnabled(compileRequest, irModule); break; } - - collectEntryPointUniformParams(irModule, passOptions); - #if 0 - dumpIRIfEnabled(compileRequest, irModule, "ENTRY POINT UNIFORMS COLLECTED"); - #endif - validateIRModuleIfEnabled(compileRequest, irModule); } switch( target ) @@ -637,7 +636,10 @@ Result linkAndOptimizeIR( case CodeGenTarget::CUDASource: moveGlobalVarInitializationToEntryPoints(irModule); introduceExplicitGlobalContext(irModule, target); - convertEntryPointPtrParamsToRawPtrs(irModule); + if(target == CodeGenTarget::CPPSource) + { + convertEntryPointPtrParamsToRawPtrs(irModule); + } #if 0 dumpIRIfEnabled(compileRequest, irModule, "EXPLICIT GLOBAL CONTEXT INTRODUCED"); #endif diff --git a/source/slang/slang-ir-explicit-global-context.cpp b/source/slang/slang-ir-explicit-global-context.cpp index 8f11bce2c..32efd51e8 100644 --- a/source/slang/slang-ir-explicit-global-context.cpp +++ b/source/slang/slang-ir-explicit-global-context.cpp @@ -31,17 +31,6 @@ struct IntroduceExplicitGlobalContextPass IRBuilder builder(&sharedBuilder); - // The global context will be represneted by a `struct` - // type with a name hint of `KernelContext`. - // - m_contextStructType = builder.createStructType(); - builder.addNameHintDecoration(m_contextStructType, UnownedTerminatedStringSlice("KernelContext")); - - // The context will usually be passed around by pointer, - // so we get and cache that pointer type up front. - // - m_contextStructPtrType = builder.getPtrType(m_contextStructType); - // The transformation we will perform will need to affect // global variables, global shader parameters, and entry-point // function (at the very least), and we start with an explicit @@ -107,7 +96,13 @@ struct IntroduceExplicitGlobalContextPass // Note: If we ever changed out mind about the representation // and wanted to support multiple global parameters, we could // easily generalize this code to work with a list. - // + + // For CUDA output, we want to leave the global uniform + // parameter where it is, because it will translate to + // a global `__constant__` variable. + if(m_target == CodeGenTarget::CUDASource) + continue; + SLANG_ASSERT(!m_globalUniformsParam); m_globalUniformsParam = globalParam; } @@ -132,9 +127,36 @@ struct IntroduceExplicitGlobalContextPass } } + // If there are no global-scope entities that require processing, + // then we can completely skip the work of this pass for CUDA. + // + // Note: We cannot skip the rest of the pass for CPU, because + // it is responsible for introducing the explicit entry-point + // parameter that is used for passing in the global param(s). + // + if( m_target == CodeGenTarget::CUDASource ) + { + if( !m_globalUniformsParam && (m_globalVars.getCount() == 0) ) + { + return; + } + } + // Now that we've capture all the relevant global entities from the IR, // we can being to transform them in an appropriate order. // + // The global context will be represneted by a `struct` + // type with a name hint of `KernelContext`. + // + m_contextStructType = builder.createStructType(); + builder.addNameHintDecoration(m_contextStructType, UnownedTerminatedStringSlice("KernelContext")); + + // The context will usually be passed around by pointer, + // so we get and cache that pointer type up front. + // + m_contextStructPtrType = builder.getPtrType(m_contextStructType); + + // The first step will be to create fields in the `KernelContext` // type to represent any global parameters or global variables. // @@ -270,9 +292,9 @@ struct IntroduceExplicitGlobalContextPass // globalUniformsParam->insertBefore(firstOrdinary); } - else + else if(m_target == CodeGenTarget::CPPSource) { - // The nature of our current ABI for entry points on CPU/CUDA + // The nature of our current ABI for entry points on CPU // means that we need an explicit parameter to be *declared* // for the global uniforms, even if it is never used. // diff --git a/source/slang/slang.vcxproj b/source/slang/slang.vcxproj index f20a4a322..aaece7095 100644 --- a/source/slang/slang.vcxproj +++ b/source/slang/slang.vcxproj @@ -404,4 +404,4 @@ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <ImportGroup Label="ExtensionTargets"> </ImportGroup> -</Project>
\ No newline at end of file +</Project>
\ No newline at end of file diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp index 304784518..5acddf94f 100644 --- a/tools/render-test/cuda/cuda-compute-util.cpp +++ b/tools/render-test/cuda/cuda-compute-util.cpp @@ -979,6 +979,22 @@ static SlangResult _loadAndInvokeComputeProgram( ScopeCUDAModule cudaModule; SLANG_RETURN_ON_FAIL(cudaModule.load(kernelDesc.codeBegin)); + // The global-scope shader parameters in the input Slang program + // will be collected into a single `__constant__` global variable + // in the output CUDA module. + // + // We need to query the address of the `__constant__` variable + // so that we can copy parameter data into it when invoking + // a kernel. + // + // The Slang compiler always names this symbol `SLANG_globalParams` + // so that it is easy to look up independent of the module or + // entry point in question. + // + CUdeviceptr globalParamsSymbol = 0; + size_t globalParamsSymbolSize = 0; + cuModuleGetGlobal(&globalParamsSymbol, &globalParamsSymbolSize, cudaModule, "SLANG_globalParams"); + slang::EntryPointReflection* entryPoint = nullptr; auto entryPointCount = reflection->getEntryPointCount(); SLANG_ASSERT(entryPointCount == 1); @@ -999,25 +1015,82 @@ static SlangResult _loadAndInvokeComputeProgram( int sharedSizeInBytes; SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint)); - // Work out the args - CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue()); - CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue()); - - // NOTE! These are pointers to the cuda memory pointers - void* args[] = { &entryPointCUDAData , &uniformCUDAData }; - + // A single CUDA kernel can be invoked with thread groups + // of different shapes/sizes, but an HLSL/Slang compute + // kernel always has a fixed thread group shape baked in. + // We use reflection to query the thread-group size that + // the kernel expects, so that we can use the right size + // when invoking the kernel. + // SlangUInt numThreadsPerAxis[3]; entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); - // Launch + // The argument data for the kernel has been set up in `bindRoot`, + // which encapsulates global buffers for both the global and + // entry-point parameter data. + // + // In the case of global parameters, we just need to extract the + // device address of the parameter data, so we can copy it into + // the `SLANG_globalParams` symbol. + // + { + CUdeviceptr globalParamsCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue()); + cudaMemcpyAsync( + (void*) globalParamsSymbol, + (void*) globalParamsCUDAData, + globalParamsSymbolSize, + cudaMemcpyDeviceToDevice, + cudaStream); + } + // + // In the case of the entry-point parameters, we have to deal with + // two different wrinkles. + // + // First, the `bindRoot` will have the entry-point argument data + // stored in a GPU-memory buffer, but we actually need it to be + // in host CPU memory. We handle that for now by allocating a + // temporary host memory buffer (if needed) and copying the data + // from device to host. + // + auto entryPointBindValue = bindRoot.getEntryPointValue(); + CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(entryPointBindValue); + size_t entryPointDataSize = entryPointBindValue ? entryPointBindValue->m_sizeInBytes : 0; + void* entryPointHostData = nullptr; + if(entryPointDataSize) + { + entryPointHostData = alloca(entryPointDataSize); + cudaMemcpy(entryPointHostData, (void*)entryPointCUDAData, entryPointDataSize, cudaMemcpyDeviceToHost); + } + // + // Second, the argument data for the entry-point parameters has + // been allocated and filled in as a single buffer, but `cuLaunchKernel` + // defaults to taking pointers to each of the kernel arguments. + // + // We could loop over the entry-point parameters using the refleciton + // information, and set up a pointer to each using the offset stored + // for it in the reflection data. Such an approach would require + // us to create and fill in a dynamically-sized array here. + // + // Instead, we take advantage of a documented but seldom-used feature + // of `cuLaunchKernel` that allows the argument data for all of the + // kernel "launch parameters" to be specified as a single buffer. + // + void* extraOptions[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, (void*) entryPointHostData, + CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize, + CU_LAUNCH_PARAM_END, + }; + + // Once we have all the decessary data extracted and/or + // set up, we can launch the kernel and see what happens. + // auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint, dispatchSize[0], dispatchSize[1], dispatchSize[2], int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block 0, // Shared memory size cudaStream, // Stream. 0 is no stream. - args, // Args - nullptr); // extra - + nullptr, // Not using traditional argument passing + extraOptions); // Instead passing kernel arguments via "extra" options SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult); // Do a sync here. Makes sure any issues are detected early and not on some implicit sync |
