From ce6e946f6f4882aba8a62392ae791c948633e2e3 Mon Sep 17 00:00:00 2001
From: lucy96chen <47800040+lucy96chen@users.noreply.github.com>
Date: Mon, 25 Jul 2022 13:36:43 -0700
Subject: Split render-cuda.cpp into smaller files (#2334)

* render-cuda split, compile errors galore due to missing includes etc.

* render-cuda split and fully compiles

* Ran premake.bat to disable cuda; Added all new files

* Removed render-cuda files

* CI fixes

* Rerun CI
---
 tools/gfx/cuda/cuda-command-queue.cpp | 225 ++++++++++++++++++++++++++++++++++
 1 file changed, 225 insertions(+)
 create mode 100644 tools/gfx/cuda/cuda-command-queue.cpp

(limited to 'tools/gfx/cuda/cuda-command-queue.cpp')

diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
new file mode 100644
index 000000000..60e81246d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -0,0 +1,225 @@
+// cuda-command-queue.cpp
+#include "cuda-command-queue.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-buffer.h"
+#include "cuda-query.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid)
+{
+    if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
+        return static_cast<ICommandQueue*>(this);
+    return nullptr;
+}
+
+void CommandQueueImpl::init(DeviceImpl* inRenderer)
+{
+    renderer = inRenderer;
+    m_desc.type = ICommandQueue::QueueType::Graphics;
+    cuStreamCreate(&stream, 0);
+}
+CommandQueueImpl::~CommandQueueImpl()
+{
+    cuStreamSynchronize(stream);
+    cuStreamDestroy(stream);
+    currentPipeline = nullptr;
+    currentRootObject = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers(
+    GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal)
+{
+    SLANG_UNUSED(valueToSignal);
+    // TODO: implement fence.
+    assert(fence == nullptr);
+    for (GfxIndex i = 0; i < count; i++)
+    {
+        execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
+    }
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost()
+{
+    auto resultCode = cuStreamSynchronize(stream);
+    if (resultCode != cudaSuccess)
+        SLANG_CUDA_HANDLE_ERROR(resultCode);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice(
+    GfxCount fenceCount, IFence** fences, uint64_t* waitValues)
+{
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle)
+{
+    return SLANG_FAIL;
+}
+
+void CommandQueueImpl::setPipelineState(IPipelineState* state)
+{
+    currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state);
+}
+
+Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object)
+{
+    currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object);
+    if (currentRootObject)
+        return SLANG_OK;
+    return SLANG_E_INVALID_ARG;
+}
+
+void CommandQueueImpl::dispatchCompute(int x, int y, int z)
+{
+    // Specialize the compute kernel based on the shader object bindings.
+    RefPtr<PipelineStateBase> newPipeline;
+    renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
+    currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr());
+
+    // Find out thread group size from program reflection.
+    auto& kernelName = currentPipeline->shaderProgram->kernelName;
+    auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout());
+    int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
+    SLANG_ASSERT(kernelId != -1);
+    UInt threadGroupSize[3];
+    programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
+
+    int sharedSizeInBytes;
+    cuFuncGetAttribute(
+        &sharedSizeInBytes,
+        CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+        currentPipeline->shaderProgram->cudaKernel);
+
+    // Copy global parameter data to the `SLANG_globalParams` symbol.
+    {
+        CUdeviceptr globalParamsSymbol = 0;
+        size_t globalParamsSymbolSize = 0;
+        cuModuleGetGlobal(
+            &globalParamsSymbol,
+            &globalParamsSymbolSize,
+            currentPipeline->shaderProgram->cudaModule,
+            "SLANG_globalParams");
+
+        CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
+        cudaMemcpyAsync(
+            (void*)globalParamsSymbol,
+            (void*)globalParamsCUDAData,
+            globalParamsSymbolSize,
+            cudaMemcpyDefault,
+            0);
+    }
+    //
+    // The argument data for the entry-point parameters are already
+    // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
+    //
+    auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
+    auto entryPointDataSize =
+        currentRootObject->entryPointObjects[kernelId]->getBufferSize();
+
+    void* extraOptions[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER,
+        entryPointBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,
+        &entryPointDataSize,
+        CU_LAUNCH_PARAM_END,
+    };
+
+    // Once we have all the necessary data extracted and/or
+    // set up, we can launch the kernel and see what happens.
+    //
+    auto cudaLaunchResult = cuLaunchKernel(
+        currentPipeline->shaderProgram->cudaKernel,
+        x,
+        y,
+        z,
+        int(threadGroupSize[0]),
+        int(threadGroupSize[1]),
+        int(threadGroupSize[2]),
+        sharedSizeInBytes,
+        stream,
+        nullptr,
+        extraOptions);
+
+    SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
+}
+
+void CommandQueueImpl::copyBuffer(
+    IBufferResource* dst,
+    size_t dstOffset,
+    IBufferResource* src,
+    size_t srcOffset,
+    size_t size)
+{
+    auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+    auto srcImpl = static_cast<BufferResourceImpl*>(src);
+    cudaMemcpy(
+        (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
+        (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
+        size,
+        cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+{
+    auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+    cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
+{
+    auto poolImpl = static_cast<QueryPoolImpl*>(pool);
+    cuEventRecord(poolImpl->m_events[index], stream);
+}
+
+void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer)
+{
+    for (auto& cmd : commandBuffer->m_commands)
+    {
+        switch (cmd.name)
+        {
+        case CommandName::SetPipelineState:
+            setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
+            break;
+        case CommandName::BindRootShaderObject:
+            bindRootShaderObject(
+                commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
+            break;
+        case CommandName::DispatchCompute:
+            dispatchCompute(
+                int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
+            break;
+        case CommandName::CopyBuffer:
+            copyBuffer(
+                commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+                cmd.operands[1],
+                commandBuffer->getObject<BufferResource>(cmd.operands[2]),
+                cmd.operands[3],
+                cmd.operands[4]);
+            break;
+        case CommandName::UploadBufferData:
+            uploadBufferData(
+                commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+                cmd.operands[1],
+                cmd.operands[2],
+                commandBuffer->getData<uint8_t>(cmd.operands[3]));
+            break;
+        case CommandName::WriteTimestamp:
+            writeTimestamp(
+                commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
+                (SlangInt)cmd.operands[1]);
+        }
+    }
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
-- 
cgit v1.2.3