summaryrefslogtreecommitdiffstats
path: root/tools/gfx/cuda/cuda-command-queue.cpp
diff options
context:
space:
mode:
authorlucy96chen <47800040+lucy96chen@users.noreply.github.com>2022-07-25 13:36:43 -0700
committerGitHub <noreply@github.com>2022-07-25 13:36:43 -0700
commitce6e946f6f4882aba8a62392ae791c948633e2e3 (patch)
treefde46847900ef80ee111a6b612f7e0931b15fdf2 /tools/gfx/cuda/cuda-command-queue.cpp
parent129294a58d2a51308af78ad5d8d436c026863259 (diff)
Split render-cuda.cpp into smaller files (#2334)
* render-cuda split, compile errors galore due to missing includes etc. * render-cuda split and fully compiles * Ran premake.bat to disable cuda; Added all new files * Removed render-cuda files * CI fixes * Rerun CI
Diffstat (limited to 'tools/gfx/cuda/cuda-command-queue.cpp')
-rw-r--r--tools/gfx/cuda/cuda-command-queue.cpp225
1 files changed, 225 insertions, 0 deletions
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
new file mode 100644
index 000000000..60e81246d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -0,0 +1,225 @@
+// cuda-command-queue.cpp
+#include "cuda-command-queue.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-buffer.h"
+#include "cuda-query.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid)
+{
+ if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
+ return static_cast<ICommandQueue*>(this);
+ return nullptr;
+}
+
+void CommandQueueImpl::init(DeviceImpl* inRenderer)
+{
+ renderer = inRenderer;
+ m_desc.type = ICommandQueue::QueueType::Graphics;
+ cuStreamCreate(&stream, 0);
+}
+CommandQueueImpl::~CommandQueueImpl()
+{
+ cuStreamSynchronize(stream);
+ cuStreamDestroy(stream);
+ currentPipeline = nullptr;
+ currentRootObject = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers(
+ GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal)
+{
+ SLANG_UNUSED(valueToSignal);
+ // TODO: implement fence.
+ assert(fence == nullptr);
+ for (GfxIndex i = 0; i < count; i++)
+ {
+ execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
+ }
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost()
+{
+ auto resultCode = cuStreamSynchronize(stream);
+ if (resultCode != cudaSuccess)
+ SLANG_CUDA_HANDLE_ERROR(resultCode);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice(
+ GfxCount fenceCount, IFence** fences, uint64_t* waitValues)
+{
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle)
+{
+ return SLANG_FAIL;
+}
+
+void CommandQueueImpl::setPipelineState(IPipelineState* state)
+{
+ currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state);
+}
+
+Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object)
+{
+ currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object);
+ if (currentRootObject)
+ return SLANG_OK;
+ return SLANG_E_INVALID_ARG;
+}
+
+void CommandQueueImpl::dispatchCompute(int x, int y, int z)
+{
+ // Specialize the compute kernel based on the shader object bindings.
+ RefPtr<PipelineStateBase> newPipeline;
+ renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
+ currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr());
+
+ // Find out thread group size from program reflection.
+ auto& kernelName = currentPipeline->shaderProgram->kernelName;
+ auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout());
+ int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
+ SLANG_ASSERT(kernelId != -1);
+ UInt threadGroupSize[3];
+ programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
+
+ int sharedSizeInBytes;
+ cuFuncGetAttribute(
+ &sharedSizeInBytes,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+ currentPipeline->shaderProgram->cudaKernel);
+
+ // Copy global parameter data to the `SLANG_globalParams` symbol.
+ {
+ CUdeviceptr globalParamsSymbol = 0;
+ size_t globalParamsSymbolSize = 0;
+ cuModuleGetGlobal(
+ &globalParamsSymbol,
+ &globalParamsSymbolSize,
+ currentPipeline->shaderProgram->cudaModule,
+ "SLANG_globalParams");
+
+ CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
+ cudaMemcpyAsync(
+ (void*)globalParamsSymbol,
+ (void*)globalParamsCUDAData,
+ globalParamsSymbolSize,
+ cudaMemcpyDefault,
+ 0);
+ }
+ //
+ // The argument data for the entry-point parameters are already
+ // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
+ //
+ auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
+ auto entryPointDataSize =
+ currentRootObject->entryPointObjects[kernelId]->getBufferSize();
+
+ void* extraOptions[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ entryPointBuffer,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &entryPointDataSize,
+ CU_LAUNCH_PARAM_END,
+ };
+
+ // Once we have all the necessary data extracted and/or
+ // set up, we can launch the kernel and see what happens.
+ //
+ auto cudaLaunchResult = cuLaunchKernel(
+ currentPipeline->shaderProgram->cudaKernel,
+ x,
+ y,
+ z,
+ int(threadGroupSize[0]),
+ int(threadGroupSize[1]),
+ int(threadGroupSize[2]),
+ sharedSizeInBytes,
+ stream,
+ nullptr,
+ extraOptions);
+
+ SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
+}
+
+void CommandQueueImpl::copyBuffer(
+ IBufferResource* dst,
+ size_t dstOffset,
+ IBufferResource* src,
+ size_t srcOffset,
+ size_t size)
+{
+ auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+ auto srcImpl = static_cast<BufferResourceImpl*>(src);
+ cudaMemcpy(
+ (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
+ (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
+ size,
+ cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+{
+ auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+ cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
+{
+ auto poolImpl = static_cast<QueryPoolImpl*>(pool);
+ cuEventRecord(poolImpl->m_events[index], stream);
+}
+
+void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer)
+{
+ for (auto& cmd : commandBuffer->m_commands)
+ {
+ switch (cmd.name)
+ {
+ case CommandName::SetPipelineState:
+ setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
+ break;
+ case CommandName::BindRootShaderObject:
+ bindRootShaderObject(
+ commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
+ break;
+ case CommandName::DispatchCompute:
+ dispatchCompute(
+ int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
+ break;
+ case CommandName::CopyBuffer:
+ copyBuffer(
+ commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+ cmd.operands[1],
+ commandBuffer->getObject<BufferResource>(cmd.operands[2]),
+ cmd.operands[3],
+ cmd.operands[4]);
+ break;
+ case CommandName::UploadBufferData:
+ uploadBufferData(
+ commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+ cmd.operands[1],
+ cmd.operands[2],
+ commandBuffer->getData<uint8_t>(cmd.operands[3]));
+ break;
+ case CommandName::WriteTimestamp:
+ writeTimestamp(
+ commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
+ (SlangInt)cmd.operands[1]);
+ }
+ }
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx