From ce6e946f6f4882aba8a62392ae791c948633e2e3 Mon Sep 17 00:00:00 2001 From: lucy96chen <47800040+lucy96chen@users.noreply.github.com> Date: Mon, 25 Jul 2022 13:36:43 -0700 Subject: Split render-cuda.cpp into smaller files (#2334) * render-cuda split, compile errors galore due to missing includes etc. * render-cuda split and fully compiles * Ran premake.bat to disable cuda; Added all new files * Removed render-cuda files * CI fixes * Rerun CI --- tools/gfx/cuda/cuda-command-queue.cpp | 225 ++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 tools/gfx/cuda/cuda-command-queue.cpp (limited to 'tools/gfx/cuda/cuda-command-queue.cpp') diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp new file mode 100644 index 000000000..60e81246d --- /dev/null +++ b/tools/gfx/cuda/cuda-command-queue.cpp @@ -0,0 +1,225 @@ +// cuda-command-queue.cpp +#include "cuda-command-queue.h" + +#include "cuda-buffer.h" +#include "cuda-command-buffer.h" +#include "cuda-query.h" +#include "cuda-shader-object-layout.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid) +{ + if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) + return static_cast(this); + return nullptr; +} + +void CommandQueueImpl::init(DeviceImpl* inRenderer) +{ + renderer = inRenderer; + m_desc.type = ICommandQueue::QueueType::Graphics; + cuStreamCreate(&stream, 0); +} +CommandQueueImpl::~CommandQueueImpl() +{ + cuStreamSynchronize(stream); + cuStreamDestroy(stream); + currentPipeline = nullptr; + currentRootObject = nullptr; +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers( + GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) +{ + SLANG_UNUSED(valueToSignal); + // TODO: implement fence. + assert(fence == nullptr); + for (GfxIndex i = 0; i < count; i++) + { + execute(static_cast(commandBuffers[i])); + } +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost() +{ + auto resultCode = cuStreamSynchronize(stream); + if (resultCode != cudaSuccess) + SLANG_CUDA_HANDLE_ERROR(resultCode); +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice( + GfxCount fenceCount, IFence** fences, uint64_t* waitValues) +{ + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle) +{ + return SLANG_FAIL; +} + +void CommandQueueImpl::setPipelineState(IPipelineState* state) +{ + currentPipeline = dynamic_cast(state); +} + +Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object) +{ + currentRootObject = dynamic_cast(object); + if (currentRootObject) + return SLANG_OK; + return SLANG_E_INVALID_ARG; +} + +void CommandQueueImpl::dispatchCompute(int x, int y, int z) +{ + // Specialize the compute kernel based on the shader object bindings. + RefPtr newPipeline; + renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); + currentPipeline = static_cast(newPipeline.Ptr()); + + // Find out thread group size from program reflection. + auto& kernelName = currentPipeline->shaderProgram->kernelName; + auto programLayout = static_cast(currentRootObject->getLayout()); + int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); + SLANG_ASSERT(kernelId != -1); + UInt threadGroupSize[3]; + programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); + + int sharedSizeInBytes; + cuFuncGetAttribute( + &sharedSizeInBytes, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + currentPipeline->shaderProgram->cudaKernel); + + // Copy global parameter data to the `SLANG_globalParams` symbol. + { + CUdeviceptr globalParamsSymbol = 0; + size_t globalParamsSymbolSize = 0; + cuModuleGetGlobal( + &globalParamsSymbol, + &globalParamsSymbolSize, + currentPipeline->shaderProgram->cudaModule, + "SLANG_globalParams"); + + CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); + cudaMemcpyAsync( + (void*)globalParamsSymbol, + (void*)globalParamsCUDAData, + globalParamsSymbolSize, + cudaMemcpyDefault, + 0); + } + // + // The argument data for the entry-point parameters are already + // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. + // + auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); + auto entryPointDataSize = + currentRootObject->entryPointObjects[kernelId]->getBufferSize(); + + void* extraOptions[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + entryPointBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, + &entryPointDataSize, + CU_LAUNCH_PARAM_END, + }; + + // Once we have all the necessary data extracted and/or + // set up, we can launch the kernel and see what happens. + // + auto cudaLaunchResult = cuLaunchKernel( + currentPipeline->shaderProgram->cudaKernel, + x, + y, + z, + int(threadGroupSize[0]), + int(threadGroupSize[1]), + int(threadGroupSize[2]), + sharedSizeInBytes, + stream, + nullptr, + extraOptions); + + SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); +} + +void CommandQueueImpl::copyBuffer( + IBufferResource* dst, + size_t dstOffset, + IBufferResource* src, + size_t srcOffset, + size_t size) +{ + auto dstImpl = static_cast(dst); + auto srcImpl = static_cast(src); + cudaMemcpy( + (uint8_t*)dstImpl->m_cudaMemory + dstOffset, + (uint8_t*)srcImpl->m_cudaMemory + srcOffset, + size, + cudaMemcpyDefault); +} + +void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) +{ + auto dstImpl = static_cast(dst); + cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); +} + +void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index) +{ + auto poolImpl = static_cast(pool); + cuEventRecord(poolImpl->m_events[index], stream); +} + +void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer) +{ + for (auto& cmd : commandBuffer->m_commands) + { + switch (cmd.name) + { + case CommandName::SetPipelineState: + setPipelineState(commandBuffer->getObject(cmd.operands[0])); + break; + case CommandName::BindRootShaderObject: + bindRootShaderObject( + commandBuffer->getObject(cmd.operands[0])); + break; + case CommandName::DispatchCompute: + dispatchCompute( + int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); + break; + case CommandName::CopyBuffer: + copyBuffer( + commandBuffer->getObject(cmd.operands[0]), + cmd.operands[1], + commandBuffer->getObject(cmd.operands[2]), + cmd.operands[3], + cmd.operands[4]); + break; + case CommandName::UploadBufferData: + uploadBufferData( + commandBuffer->getObject(cmd.operands[0]), + cmd.operands[1], + cmd.operands[2], + commandBuffer->getData(cmd.operands[3])); + break; + case CommandName::WriteTimestamp: + writeTimestamp( + commandBuffer->getObject(cmd.operands[0]), + (SlangInt)cmd.operands[1]); + } + } +} + +} // namespace cuda +#endif +} // namespace gfx -- cgit v1.2.3