// cuda-command-queue.cpp #include "cuda-command-queue.h" #include "cuda-buffer.h" #include "cuda-command-buffer.h" #include "cuda-query.h" #include "cuda-shader-object-layout.h" namespace gfx { #ifdef GFX_ENABLE_CUDA using namespace Slang; namespace cuda { ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid) { if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) return static_cast(this); return nullptr; } void CommandQueueImpl::init(DeviceImpl* inRenderer) { renderer = inRenderer; m_desc.type = ICommandQueue::QueueType::Graphics; cuStreamCreate(&stream, 0); } CommandQueueImpl::~CommandQueueImpl() { cuStreamSynchronize(stream); cuStreamDestroy(stream); currentPipeline = nullptr; currentRootObject = nullptr; } SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers( GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) { SLANG_UNUSED(valueToSignal); // TODO: implement fence. assert(fence == nullptr); for (GfxIndex i = 0; i < count; i++) { execute(static_cast(commandBuffers[i])); } } SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost() { auto resultCode = cuStreamSynchronize(stream); if (resultCode != CUDA_SUCCESS) SLANG_CUDA_HANDLE_ERROR(resultCode); } SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice( GfxCount fenceCount, IFence** fences, uint64_t* waitValues) { return SLANG_FAIL; } SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle) { return SLANG_FAIL; } void CommandQueueImpl::setPipelineState(IPipelineState* state) { currentPipeline = dynamic_cast(state); } Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object) { currentRootObject = dynamic_cast(object); if (currentRootObject) return SLANG_OK; return SLANG_E_INVALID_ARG; } void CommandQueueImpl::dispatchCompute(int x, int y, int z) { // Specialize the compute kernel based on the shader object bindings. RefPtr newPipeline; renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); currentPipeline = static_cast(newPipeline.Ptr()); // Find out thread group size from program reflection. auto& kernelName = currentPipeline->shaderProgram->kernelName; auto programLayout = static_cast(currentRootObject->getLayout()); int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); SLANG_ASSERT(kernelId != -1); UInt threadGroupSize[3]; programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); // Copy global parameter data to the `SLANG_globalParams` symbol. { CUdeviceptr globalParamsSymbol = 0; size_t globalParamsSymbolSize = 0; cuModuleGetGlobal( &globalParamsSymbol, &globalParamsSymbolSize, currentPipeline->shaderProgram->cudaModule, "SLANG_globalParams"); CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); cudaMemcpyAsync( (void*)globalParamsSymbol, (void*)globalParamsCUDAData, globalParamsSymbolSize, cudaMemcpyDefault, 0); } // // The argument data for the entry-point parameters are already // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. // auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); auto entryPointDataSize = currentRootObject->entryPointObjects[kernelId]->getBufferSize(); void* extraOptions[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, entryPointBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize, CU_LAUNCH_PARAM_END, }; // Once we have all the necessary data extracted and/or // set up, we can launch the kernel and see what happens. // auto cudaLaunchResult = cuLaunchKernel( currentPipeline->shaderProgram->cudaKernel, x, y, z, int(threadGroupSize[0]), int(threadGroupSize[1]), int(threadGroupSize[2]), 0, stream, nullptr, extraOptions); SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); } void CommandQueueImpl::copyBuffer( IBufferResource* dst, size_t dstOffset, IBufferResource* src, size_t srcOffset, size_t size) { auto dstImpl = static_cast(dst); auto srcImpl = static_cast(src); cudaMemcpy( (uint8_t*)dstImpl->m_cudaMemory + dstOffset, (uint8_t*)srcImpl->m_cudaMemory + srcOffset, size, cudaMemcpyDefault); } void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) { auto dstImpl = static_cast(dst); cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); } void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index) { auto poolImpl = static_cast(pool); cuEventRecord(poolImpl->m_events[index], stream); } void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer) { for (auto& cmd : commandBuffer->m_commands) { switch (cmd.name) { case CommandName::SetPipelineState: setPipelineState(commandBuffer->getObject(cmd.operands[0])); break; case CommandName::BindRootShaderObject: bindRootShaderObject( commandBuffer->getObject(cmd.operands[0])); break; case CommandName::DispatchCompute: dispatchCompute( int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); break; case CommandName::CopyBuffer: copyBuffer( commandBuffer->getObject(cmd.operands[0]), cmd.operands[1], commandBuffer->getObject(cmd.operands[2]), cmd.operands[3], cmd.operands[4]); break; case CommandName::UploadBufferData: uploadBufferData( commandBuffer->getObject(cmd.operands[0]), cmd.operands[1], cmd.operands[2], commandBuffer->getData(cmd.operands[3])); break; case CommandName::WriteTimestamp: writeTimestamp( commandBuffer->getObject(cmd.operands[0]), (SlangInt)cmd.operands[1]); } } } } // namespace cuda #endif } // namespace gfx