diff options
| author | lucy96chen <47800040+lucy96chen@users.noreply.github.com> | 2022-07-25 13:36:43 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-07-25 13:36:43 -0700 |
| commit | ce6e946f6f4882aba8a62392ae791c948633e2e3 (patch) | |
| tree | fde46847900ef80ee111a6b612f7e0931b15fdf2 /tools/gfx/cuda/cuda-command-queue.cpp | |
| parent | 129294a58d2a51308af78ad5d8d436c026863259 (diff) | |
Split render-cuda.cpp into smaller files (#2334)
* render-cuda split, compile errors galore due to missing includes etc.
* render-cuda split and fully compiles
* Ran premake.bat to disable cuda; Added all new files
* Removed render-cuda files
* CI fixes
* Rerun CI
Diffstat (limited to 'tools/gfx/cuda/cuda-command-queue.cpp')
| -rw-r--r-- | tools/gfx/cuda/cuda-command-queue.cpp | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp new file mode 100644 index 000000000..60e81246d --- /dev/null +++ b/tools/gfx/cuda/cuda-command-queue.cpp @@ -0,0 +1,225 @@ +// cuda-command-queue.cpp +#include "cuda-command-queue.h" + +#include "cuda-buffer.h" +#include "cuda-command-buffer.h" +#include "cuda-query.h" +#include "cuda-shader-object-layout.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid) +{ + if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) + return static_cast<ICommandQueue*>(this); + return nullptr; +} + +void CommandQueueImpl::init(DeviceImpl* inRenderer) +{ + renderer = inRenderer; + m_desc.type = ICommandQueue::QueueType::Graphics; + cuStreamCreate(&stream, 0); +} +CommandQueueImpl::~CommandQueueImpl() +{ + cuStreamSynchronize(stream); + cuStreamDestroy(stream); + currentPipeline = nullptr; + currentRootObject = nullptr; +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers( + GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) +{ + SLANG_UNUSED(valueToSignal); + // TODO: implement fence. + assert(fence == nullptr); + for (GfxIndex i = 0; i < count; i++) + { + execute(static_cast<CommandBufferImpl*>(commandBuffers[i])); + } +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost() +{ + auto resultCode = cuStreamSynchronize(stream); + if (resultCode != cudaSuccess) + SLANG_CUDA_HANDLE_ERROR(resultCode); +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice( + GfxCount fenceCount, IFence** fences, uint64_t* waitValues) +{ + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle) +{ + return SLANG_FAIL; +} + +void CommandQueueImpl::setPipelineState(IPipelineState* state) +{ + currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state); +} + +Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object) +{ + currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object); + if (currentRootObject) + return SLANG_OK; + return SLANG_E_INVALID_ARG; +} + +void CommandQueueImpl::dispatchCompute(int x, int y, int z) +{ + // Specialize the compute kernel based on the shader object bindings. + RefPtr<PipelineStateBase> newPipeline; + renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); + currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr()); + + // Find out thread group size from program reflection. + auto& kernelName = currentPipeline->shaderProgram->kernelName; + auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout()); + int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); + SLANG_ASSERT(kernelId != -1); + UInt threadGroupSize[3]; + programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); + + int sharedSizeInBytes; + cuFuncGetAttribute( + &sharedSizeInBytes, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + currentPipeline->shaderProgram->cudaKernel); + + // Copy global parameter data to the `SLANG_globalParams` symbol. + { + CUdeviceptr globalParamsSymbol = 0; + size_t globalParamsSymbolSize = 0; + cuModuleGetGlobal( + &globalParamsSymbol, + &globalParamsSymbolSize, + currentPipeline->shaderProgram->cudaModule, + "SLANG_globalParams"); + + CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); + cudaMemcpyAsync( + (void*)globalParamsSymbol, + (void*)globalParamsCUDAData, + globalParamsSymbolSize, + cudaMemcpyDefault, + 0); + } + // + // The argument data for the entry-point parameters are already + // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. + // + auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); + auto entryPointDataSize = + currentRootObject->entryPointObjects[kernelId]->getBufferSize(); + + void* extraOptions[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + entryPointBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, + &entryPointDataSize, + CU_LAUNCH_PARAM_END, + }; + + // Once we have all the necessary data extracted and/or + // set up, we can launch the kernel and see what happens. + // + auto cudaLaunchResult = cuLaunchKernel( + currentPipeline->shaderProgram->cudaKernel, + x, + y, + z, + int(threadGroupSize[0]), + int(threadGroupSize[1]), + int(threadGroupSize[2]), + sharedSizeInBytes, + stream, + nullptr, + extraOptions); + + SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); +} + +void CommandQueueImpl::copyBuffer( + IBufferResource* dst, + size_t dstOffset, + IBufferResource* src, + size_t srcOffset, + size_t size) +{ + auto dstImpl = static_cast<BufferResourceImpl*>(dst); + auto srcImpl = static_cast<BufferResourceImpl*>(src); + cudaMemcpy( + (uint8_t*)dstImpl->m_cudaMemory + dstOffset, + (uint8_t*)srcImpl->m_cudaMemory + srcOffset, + size, + cudaMemcpyDefault); +} + +void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) +{ + auto dstImpl = static_cast<BufferResourceImpl*>(dst); + cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); +} + +void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index) +{ + auto poolImpl = static_cast<QueryPoolImpl*>(pool); + cuEventRecord(poolImpl->m_events[index], stream); +} + +void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer) +{ + for (auto& cmd : commandBuffer->m_commands) + { + switch (cmd.name) + { + case CommandName::SetPipelineState: + setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0])); + break; + case CommandName::BindRootShaderObject: + bindRootShaderObject( + commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0])); + break; + case CommandName::DispatchCompute: + dispatchCompute( + int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); + break; + case CommandName::CopyBuffer: + copyBuffer( + commandBuffer->getObject<BufferResource>(cmd.operands[0]), + cmd.operands[1], + commandBuffer->getObject<BufferResource>(cmd.operands[2]), + cmd.operands[3], + cmd.operands[4]); + break; + case CommandName::UploadBufferData: + uploadBufferData( + commandBuffer->getObject<BufferResource>(cmd.operands[0]), + cmd.operands[1], + cmd.operands[2], + commandBuffer->getData<uint8_t>(cmd.operands[3])); + break; + case CommandName::WriteTimestamp: + writeTimestamp( + commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]), + (SlangInt)cmd.operands[1]); + } + } +} + +} // namespace cuda +#endif +} // namespace gfx |
