diff options
33 files changed, 3457 insertions, 2528 deletions
diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj index 542c34371..25a48c339 100644 --- a/build/visual-studio/gfx/gfx.vcxproj +++ b/build/visual-studio/gfx/gfx.vcxproj @@ -303,7 +303,21 @@ <ClInclude Include="..\..\..\tools\gfx\command-encoder-com-forward.h" />
<ClInclude Include="..\..\..\tools\gfx\command-writer.h" />
<ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h" />
- <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d11\d3d11-base.h" />
@@ -390,7 +404,18 @@ </ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp" />
- <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d11\d3d11-buffer.cpp" />
diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters index d461d3d24..61708be0b 100644 --- a/build/visual-studio/gfx/gfx.vcxproj.filters +++ b/build/visual-studio/gfx/gfx.vcxproj.filters @@ -21,7 +21,49 @@ <ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h">
<Filter>Header Files</Filter>
</ClInclude>
- <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h">
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h">
@@ -278,7 +320,40 @@ <ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp">
<Filter>Source Files</Filter>
</ClCompile>
- <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp">
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp">
diff --git a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp index 0dd5ad561..e14729718 100644 --- a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp +++ b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp @@ -21,7 +21,7 @@ namespace gfx_test GFX_CHECK_CALL_ABORT(queue->getNativeHandle(&handle)); if (device->getDeviceInfo().deviceType == gfx::DeviceType::Vulkan) { - SLANG_CHECK(handle.handleValue != NULL); + SLANG_CHECK(handle.handleValue != 0); } #if SLANG_WINDOWS_FAMILY else diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h new file mode 100644 index 000000000..57a244089 --- /dev/null +++ b/tools/gfx/cuda/cuda-base.h @@ -0,0 +1,59 @@ +// cuda-base.h +// Shared header file for CUDA implementation +#pragma once + +#ifdef GFX_ENABLE_CUDA +#include <cuda.h> +#include <cuda_runtime_api.h> +#include "core/slang-basic.h" +#include "core/slang-blob.h" +#include "core/slang-std-writers.h" + +#include "slang.h" +#include "slang-com-ptr.h" +#include "slang-com-helper.h" +#include "../command-writer.h" +#include "../renderer-shared.h" +#include "../mutable-shader-object.h" +#include "../simple-transient-resource-heap.h" +#include "../slang-context.h" +#include "../command-encoder-com-forward.h" + +# ifdef RENDER_TEST_OPTIX + +// The `optix_stubs.h` header produces warnings when compiled with MSVC +# ifdef _MSC_VER +# pragma warning(disable: 4996) +# endif + +# include <optix.h> +# include <optix_function_table_definition.h> +# include <optix_stubs.h> +# endif + +#endif + +namespace gfx +{ +namespace cuda +{ + class CUDAContext; + class BufferResourceImpl; + class TextureResourceImpl; + class ResourceViewImpl; + class ShaderObjectLayoutImpl; + class RootShaderObjectLayoutImpl; + class ShaderObjectImpl; + class MutableShaderObjectImpl; + class EntryPointShaderObjectImpl; + class RootShaderObjectImpl; + class ShaderProgramImpl; + class PipelineStateImpl; + class QueryPoolImpl; + class DeviceImpl; + class CommandBufferImpl; + class ResourceCommandEncoderImpl; + class ComputeCommandEncoderImpl; + class CommandQueueImpl; +} +} diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp new file mode 100644 index 000000000..1cd162841 --- /dev/null +++ b/tools/gfx/cuda/cuda-buffer.cpp @@ -0,0 +1,51 @@ +// cuda-buffer.cpp +#include "cuda-buffer.h" + +#include "cuda-helper-functions.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +BufferResourceImpl::~BufferResourceImpl() +{ + if (m_cudaMemory) + { + SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + } +} + +uint64_t BufferResourceImpl::getBindlessHandle() { return (uint64_t)m_cudaMemory; } + +DeviceAddress BufferResourceImpl::getDeviceAddress() +{ + return (DeviceAddress)m_cudaMemory; +} + +Result BufferResourceImpl::getNativeResourceHandle(InteropHandle* outHandle) +{ + outHandle->handleValue = getBindlessHandle(); + outHandle->api = InteropHandleAPI::CUDA; + return SLANG_OK; +} + +Result BufferResourceImpl::map(MemoryRange* rangeToRead, void** outPointer) +{ + SLANG_UNUSED(rangeToRead); + SLANG_UNUSED(outPointer); + return SLANG_FAIL; +} + +Result BufferResourceImpl::unmap(MemoryRange* writtenRange) +{ + SLANG_UNUSED(writtenRange); + return SLANG_FAIL; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-buffer.h b/tools/gfx/cuda/cuda-buffer.h new file mode 100644 index 000000000..838a06555 --- /dev/null +++ b/tools/gfx/cuda/cuda-buffer.h @@ -0,0 +1,39 @@ +// cuda-buffer.h +#pragma once +#include "cuda-base.h" +#include "cuda-context.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class BufferResourceImpl : public BufferResource +{ +public: + BufferResourceImpl(const Desc& _desc) + : BufferResource(_desc) + {} + + ~BufferResourceImpl(); + + uint64_t getBindlessHandle(); + + void* m_cudaExternalMemory = nullptr; + void* m_cudaMemory = nullptr; + + RefPtr<CUDAContext> m_cudaContext; + + virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override; + virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override; + virtual SLANG_NO_THROW Result SLANG_MCALL + map(MemoryRange* rangeToRead, void** outPointer) override; + virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-buffer.cpp b/tools/gfx/cuda/cuda-command-buffer.cpp new file mode 100644 index 000000000..efb2486f4 --- /dev/null +++ b/tools/gfx/cuda/cuda-command-buffer.cpp @@ -0,0 +1,62 @@ +// cuda-command-buffer.cpp +#include "cuda-command-buffer.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ICommandBuffer* CommandBufferImpl::getInterface(const Guid& guid) +{ + if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer) + return static_cast<ICommandBuffer*>(this); + return nullptr; +} + +void CommandBufferImpl::init(DeviceImpl* device, TransientResourceHeapBase* transientHeap) +{ + m_device = device; + m_transientHeap = transientHeap; +} + +SLANG_NO_THROW void SLANG_MCALL CommandBufferImpl::encodeRenderCommands( + IRenderPassLayout* renderPass, + IFramebuffer* framebuffer, + IRenderCommandEncoder** outEncoder) +{ + SLANG_UNUSED(renderPass); + SLANG_UNUSED(framebuffer); + *outEncoder = nullptr; +} + +SLANG_NO_THROW void SLANG_MCALL + CommandBufferImpl::encodeResourceCommands(IResourceCommandEncoder** outEncoder) +{ + m_resourceCommandEncoder.init(this); + *outEncoder = &m_resourceCommandEncoder; +} + +SLANG_NO_THROW void SLANG_MCALL + CommandBufferImpl::encodeComputeCommands(IComputeCommandEncoder** outEncoder) +{ + m_computeCommandEncoder.init(this); + *outEncoder = &m_computeCommandEncoder; +} + +SLANG_NO_THROW void SLANG_MCALL + CommandBufferImpl::encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) +{ + *outEncoder = nullptr; +} + +SLANG_NO_THROW Result SLANG_MCALL CommandBufferImpl::getNativeHandle(InteropHandle* outHandle) +{ + return SLANG_FAIL; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-buffer.h b/tools/gfx/cuda/cuda-command-buffer.h new file mode 100644 index 000000000..76d9aa01f --- /dev/null +++ b/tools/gfx/cuda/cuda-command-buffer.h @@ -0,0 +1,49 @@ +// cuda-command-buffer.h +#pragma once +#include "cuda-base.h" +#include "cuda-command-encoder.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class CommandBufferImpl + : public ICommandBuffer + , public CommandWriter + , public ComObject +{ +public: + SLANG_COM_OBJECT_IUNKNOWN_ALL + ICommandBuffer* getInterface(const Guid& guid); + +public: + DeviceImpl* m_device; + TransientResourceHeapBase* m_transientHeap; + ResourceCommandEncoderImpl m_resourceCommandEncoder; + ComputeCommandEncoderImpl m_computeCommandEncoder; + + void init(DeviceImpl* device, TransientResourceHeapBase* transientHeap); + virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands( + IRenderPassLayout* renderPass, + IFramebuffer* framebuffer, + IRenderCommandEncoder** outEncoder) override; + + virtual SLANG_NO_THROW void SLANG_MCALL + encodeResourceCommands(IResourceCommandEncoder** outEncoder) override; + virtual SLANG_NO_THROW void SLANG_MCALL + encodeComputeCommands(IComputeCommandEncoder** outEncoder) override; + virtual SLANG_NO_THROW void SLANG_MCALL + encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override; + + virtual SLANG_NO_THROW void SLANG_MCALL close() override {} + + virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-encoder.cpp b/tools/gfx/cuda/cuda-command-encoder.cpp new file mode 100644 index 000000000..6911eb649 --- /dev/null +++ b/tools/gfx/cuda/cuda-command-encoder.cpp @@ -0,0 +1,210 @@ +// cuda-command-encoder.cpp +#include "cuda-command-encoder.h" + +#include "cuda-command-buffer.h" +#include "cuda-device.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +void ResourceCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer) +{ + m_writer = cmdBuffer; +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyBuffer( + IBufferResource* dst, + Offset dstOffset, + IBufferResource* src, + Offset srcOffset, + Size size) +{ + m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadBufferData( + IBufferResource* dst, Offset offset, Size size, void* data) +{ + m_writer->uploadBufferData(dst, offset, size, data); +} + +SLANG_NO_THROW void SLANG_MCALL + ResourceCommandEncoderImpl::writeTimestamp(IQueryPool* pool, GfxIndex index) +{ + m_writer->writeTimestamp(pool, index); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTexture( + ITextureResource* dst, + ResourceState dstState, + SubresourceRange dstSubresource, + ITextureResource::Offset3D dstOffset, + ITextureResource* src, + ResourceState srcState, + SubresourceRange srcSubresource, + ITextureResource::Offset3D srcOffset, + ITextureResource::Extents extent) +{ + SLANG_UNUSED(dst); + SLANG_UNUSED(dstState); + SLANG_UNUSED(dstSubresource); + SLANG_UNUSED(dstOffset); + SLANG_UNUSED(src); + SLANG_UNUSED(srcState); + SLANG_UNUSED(srcSubresource); + SLANG_UNUSED(srcOffset); + SLANG_UNUSED(extent); + SLANG_UNIMPLEMENTED_X("copyTexture"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadTextureData( + ITextureResource* dst, + SubresourceRange subResourceRange, + ITextureResource::Offset3D offset, + ITextureResource::Extents extent, + ITextureResource::SubresourceData* subResourceData, + GfxCount subResourceDataCount) +{ + SLANG_UNUSED(dst); + SLANG_UNUSED(subResourceRange); + SLANG_UNUSED(offset); + SLANG_UNUSED(extent); + SLANG_UNUSED(subResourceData); + SLANG_UNUSED(subResourceDataCount); + SLANG_UNIMPLEMENTED_X("uploadTextureData"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::clearResourceView( + IResourceView* view, + ClearValue* clearValue, + ClearResourceViewFlags::Enum flags) +{ + SLANG_UNUSED(view); + SLANG_UNUSED(clearValue); + SLANG_UNUSED(flags); + SLANG_UNIMPLEMENTED_X("clearResourceView"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveResource( + ITextureResource* source, + ResourceState sourceState, + SubresourceRange sourceRange, + ITextureResource* dest, + ResourceState destState, + SubresourceRange destRange) +{ + SLANG_UNUSED(source); + SLANG_UNUSED(sourceState); + SLANG_UNUSED(sourceRange); + SLANG_UNUSED(dest); + SLANG_UNUSED(destState); + SLANG_UNUSED(destRange); + SLANG_UNIMPLEMENTED_X("resolveResource"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveQuery( + IQueryPool* queryPool, + GfxIndex index, + GfxCount count, + IBufferResource* buffer, + Offset offset) +{ + SLANG_UNUSED(queryPool); + SLANG_UNUSED(index); + SLANG_UNUSED(count); + SLANG_UNUSED(buffer); + SLANG_UNUSED(offset); + SLANG_UNIMPLEMENTED_X("resolveQuery"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTextureToBuffer( + IBufferResource* dst, + Offset dstOffset, + Size dstSize, + Size dstRowStride, + ITextureResource* src, + ResourceState srcState, + SubresourceRange srcSubresource, + ITextureResource::Offset3D srcOffset, + ITextureResource::Extents extent) +{ + SLANG_UNUSED(dst); + SLANG_UNUSED(dstOffset); + SLANG_UNUSED(dstSize); + SLANG_UNUSED(dstRowStride); + SLANG_UNUSED(src); + SLANG_UNUSED(srcState); + SLANG_UNUSED(srcSubresource); + SLANG_UNUSED(srcOffset); + SLANG_UNUSED(extent); + SLANG_UNIMPLEMENTED_X("copyTextureToBuffer"); +} + +SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::textureSubresourceBarrier( + ITextureResource* texture, + SubresourceRange subresourceRange, + ResourceState src, + ResourceState dst) +{ + SLANG_UNUSED(texture); + SLANG_UNUSED(subresourceRange); + SLANG_UNUSED(src); + SLANG_UNUSED(dst); + SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier"); +} + +SLANG_NO_THROW void SLANG_MCALL + ResourceCommandEncoderImpl::beginDebugEvent(const char* name, float rgbColor[3]) +{ + SLANG_UNUSED(name); + SLANG_UNUSED(rgbColor); +} + +void ComputeCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer) +{ + m_writer = cmdBuffer; + m_commandBuffer = cmdBuffer; +} + +SLANG_NO_THROW Result SLANG_MCALL + ComputeCommandEncoderImpl::bindPipeline(IPipelineState* state, IShaderObject** outRootObject) +{ + m_writer->setPipelineState(state); + PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); + SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( + pipelineImpl->m_program, m_rootObject.writeRef())); + returnComPtr(outRootObject, m_rootObject); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL + ComputeCommandEncoderImpl::bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) +{ + m_writer->setPipelineState(state); + PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); + SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( + pipelineImpl->m_program, m_rootObject.writeRef())); + m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap); + return SLANG_OK; +} + +SLANG_NO_THROW void SLANG_MCALL ComputeCommandEncoderImpl::dispatchCompute(int x, int y, int z) +{ + m_writer->bindRootShaderObject(m_rootObject); + m_writer->dispatchCompute(x, y, z); +} + +SLANG_NO_THROW void SLANG_MCALL + ComputeCommandEncoderImpl::dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) +{ + SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect"); +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-encoder.h b/tools/gfx/cuda/cuda-command-encoder.h new file mode 100644 index 000000000..73660534a --- /dev/null +++ b/tools/gfx/cuda/cuda-command-encoder.h @@ -0,0 +1,135 @@ +// cuda-command-encoder.h +#pragma once +#include "cuda-base.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class ResourceCommandEncoderImpl : public IResourceCommandEncoder +{ +public: + CommandWriter* m_writer; + + void init(CommandBufferImpl* cmdBuffer); + + virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} + virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer( + IBufferResource* dst, + Offset dstOffset, + IBufferResource* src, + Offset srcOffset, + Size size) override; + + virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier( + GfxCount count, + ITextureResource* const* textures, + ResourceState src, + ResourceState dst) override + {} + + virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier( + GfxCount count, + IBufferResource* const* buffers, + ResourceState src, + ResourceState dst) override + {} + + virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData( + IBufferResource* dst, Offset offset, Size size, void* data) override; + + virtual SLANG_NO_THROW void SLANG_MCALL + writeTimestamp(IQueryPool* pool, GfxIndex index) override; + + virtual SLANG_NO_THROW void SLANG_MCALL copyTexture( + ITextureResource* dst, + ResourceState dstState, + SubresourceRange dstSubresource, + ITextureResource::Offset3D dstOffset, + ITextureResource* src, + ResourceState srcState, + SubresourceRange srcSubresource, + ITextureResource::Offset3D srcOffset, + ITextureResource::Extents extent) override; + + virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData( + ITextureResource* dst, + SubresourceRange subResourceRange, + ITextureResource::Offset3D offset, + ITextureResource::Extents extent, + ITextureResource::SubresourceData* subResourceData, + GfxCount subResourceDataCount) override; + + virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView( + IResourceView* view, + ClearValue* clearValue, + ClearResourceViewFlags::Enum flags) override; + + virtual SLANG_NO_THROW void SLANG_MCALL resolveResource( + ITextureResource* source, + ResourceState sourceState, + SubresourceRange sourceRange, + ITextureResource* dest, + ResourceState destState, + SubresourceRange destRange) override; + + virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery( + IQueryPool* queryPool, + GfxIndex index, + GfxCount count, + IBufferResource* buffer, + Offset offset) override; + + virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer( + IBufferResource* dst, + Offset dstOffset, + Size dstSize, + Size dstRowStride, + ITextureResource* src, + ResourceState srcState, + SubresourceRange srcSubresource, + ITextureResource::Offset3D srcOffset, + ITextureResource::Extents extent) override; + + virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier( + ITextureResource* texture, + SubresourceRange subresourceRange, + ResourceState src, + ResourceState dst) override; + virtual SLANG_NO_THROW void SLANG_MCALL + beginDebugEvent(const char* name, float rgbColor[3]) override; + virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {} +}; + +class ComputeCommandEncoderImpl + : public IComputeCommandEncoder + , public ResourceCommandEncoderImpl +{ +public: + SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl) +public: + CommandWriter* m_writer; + CommandBufferImpl* m_commandBuffer; + RefPtr<ShaderObjectBase> m_rootObject; + virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} + void init(CommandBufferImpl* cmdBuffer); + + virtual SLANG_NO_THROW Result SLANG_MCALL + bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL + bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override; + + virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override; + + virtual SLANG_NO_THROW void SLANG_MCALL + dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp new file mode 100644 index 000000000..60e81246d --- /dev/null +++ b/tools/gfx/cuda/cuda-command-queue.cpp @@ -0,0 +1,225 @@ +// cuda-command-queue.cpp +#include "cuda-command-queue.h" + +#include "cuda-buffer.h" +#include "cuda-command-buffer.h" +#include "cuda-query.h" +#include "cuda-shader-object-layout.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid) +{ + if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) + return static_cast<ICommandQueue*>(this); + return nullptr; +} + +void CommandQueueImpl::init(DeviceImpl* inRenderer) +{ + renderer = inRenderer; + m_desc.type = ICommandQueue::QueueType::Graphics; + cuStreamCreate(&stream, 0); +} +CommandQueueImpl::~CommandQueueImpl() +{ + cuStreamSynchronize(stream); + cuStreamDestroy(stream); + currentPipeline = nullptr; + currentRootObject = nullptr; +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers( + GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) +{ + SLANG_UNUSED(valueToSignal); + // TODO: implement fence. + assert(fence == nullptr); + for (GfxIndex i = 0; i < count; i++) + { + execute(static_cast<CommandBufferImpl*>(commandBuffers[i])); + } +} + +SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost() +{ + auto resultCode = cuStreamSynchronize(stream); + if (resultCode != cudaSuccess) + SLANG_CUDA_HANDLE_ERROR(resultCode); +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice( + GfxCount fenceCount, IFence** fences, uint64_t* waitValues) +{ + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle) +{ + return SLANG_FAIL; +} + +void CommandQueueImpl::setPipelineState(IPipelineState* state) +{ + currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state); +} + +Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object) +{ + currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object); + if (currentRootObject) + return SLANG_OK; + return SLANG_E_INVALID_ARG; +} + +void CommandQueueImpl::dispatchCompute(int x, int y, int z) +{ + // Specialize the compute kernel based on the shader object bindings. + RefPtr<PipelineStateBase> newPipeline; + renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); + currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr()); + + // Find out thread group size from program reflection. + auto& kernelName = currentPipeline->shaderProgram->kernelName; + auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout()); + int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); + SLANG_ASSERT(kernelId != -1); + UInt threadGroupSize[3]; + programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); + + int sharedSizeInBytes; + cuFuncGetAttribute( + &sharedSizeInBytes, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + currentPipeline->shaderProgram->cudaKernel); + + // Copy global parameter data to the `SLANG_globalParams` symbol. + { + CUdeviceptr globalParamsSymbol = 0; + size_t globalParamsSymbolSize = 0; + cuModuleGetGlobal( + &globalParamsSymbol, + &globalParamsSymbolSize, + currentPipeline->shaderProgram->cudaModule, + "SLANG_globalParams"); + + CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); + cudaMemcpyAsync( + (void*)globalParamsSymbol, + (void*)globalParamsCUDAData, + globalParamsSymbolSize, + cudaMemcpyDefault, + 0); + } + // + // The argument data for the entry-point parameters are already + // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. + // + auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); + auto entryPointDataSize = + currentRootObject->entryPointObjects[kernelId]->getBufferSize(); + + void* extraOptions[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + entryPointBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, + &entryPointDataSize, + CU_LAUNCH_PARAM_END, + }; + + // Once we have all the necessary data extracted and/or + // set up, we can launch the kernel and see what happens. + // + auto cudaLaunchResult = cuLaunchKernel( + currentPipeline->shaderProgram->cudaKernel, + x, + y, + z, + int(threadGroupSize[0]), + int(threadGroupSize[1]), + int(threadGroupSize[2]), + sharedSizeInBytes, + stream, + nullptr, + extraOptions); + + SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); +} + +void CommandQueueImpl::copyBuffer( + IBufferResource* dst, + size_t dstOffset, + IBufferResource* src, + size_t srcOffset, + size_t size) +{ + auto dstImpl = static_cast<BufferResourceImpl*>(dst); + auto srcImpl = static_cast<BufferResourceImpl*>(src); + cudaMemcpy( + (uint8_t*)dstImpl->m_cudaMemory + dstOffset, + (uint8_t*)srcImpl->m_cudaMemory + srcOffset, + size, + cudaMemcpyDefault); +} + +void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) +{ + auto dstImpl = static_cast<BufferResourceImpl*>(dst); + cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); +} + +void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index) +{ + auto poolImpl = static_cast<QueryPoolImpl*>(pool); + cuEventRecord(poolImpl->m_events[index], stream); +} + +void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer) +{ + for (auto& cmd : commandBuffer->m_commands) + { + switch (cmd.name) + { + case CommandName::SetPipelineState: + setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0])); + break; + case CommandName::BindRootShaderObject: + bindRootShaderObject( + commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0])); + break; + case CommandName::DispatchCompute: + dispatchCompute( + int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); + break; + case CommandName::CopyBuffer: + copyBuffer( + commandBuffer->getObject<BufferResource>(cmd.operands[0]), + cmd.operands[1], + commandBuffer->getObject<BufferResource>(cmd.operands[2]), + cmd.operands[3], + cmd.operands[4]); + break; + case CommandName::UploadBufferData: + uploadBufferData( + commandBuffer->getObject<BufferResource>(cmd.operands[0]), + cmd.operands[1], + cmd.operands[2], + commandBuffer->getData<uint8_t>(cmd.operands[3])); + break; + case CommandName::WriteTimestamp: + writeTimestamp( + commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]), + (SlangInt)cmd.operands[1]); + } + } +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-command-queue.h b/tools/gfx/cuda/cuda-command-queue.h new file mode 100644 index 000000000..10601890e --- /dev/null +++ b/tools/gfx/cuda/cuda-command-queue.h @@ -0,0 +1,69 @@ +// cuda-command-queue.h +#pragma once +#include "cuda-base.h" + +#include "cuda-device.h" +#include "cuda-pipeline-state.h" +#include "cuda-shader-object.h" +#include "cuda-helper-functions.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class CommandQueueImpl + : public ICommandQueue + , public ComObject +{ +public: + SLANG_COM_OBJECT_IUNKNOWN_ALL + ICommandQueue* getInterface(const Guid& guid); + + RefPtr<ComputePipelineStateImpl> currentPipeline; + RefPtr<RootShaderObjectImpl> currentRootObject; + RefPtr<DeviceImpl> renderer; + CUstream stream; + Desc m_desc; + + void init(DeviceImpl* inRenderer); + ~CommandQueueImpl(); + + virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override { return m_desc; } + + virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers( + GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override; + + virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override; + + virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice( + GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override; + + void setPipelineState(IPipelineState* state); + + Result bindRootShaderObject(IShaderObject* object); + + void dispatchCompute(int x, int y, int z); + + void copyBuffer( + IBufferResource* dst, + size_t dstOffset, + IBufferResource* src, + size_t srcOffset, + size_t size); + + void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data); + + void writeTimestamp(IQueryPool* pool, SlangInt index); + + void execute(CommandBufferImpl* commandBuffer); +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-context.h b/tools/gfx/cuda/cuda-context.h new file mode 100644 index 000000000..d1a53e4e9 --- /dev/null +++ b/tools/gfx/cuda/cuda-context.h @@ -0,0 +1,22 @@ +// cuda-context.h +#pragma once +#include "cuda-base.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class CUDAContext : public RefObject +{ +public: + CUcontext m_context = nullptr; + ~CUDAContext() { cuCtxDestroy(m_context); } +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp new file mode 100644 index 000000000..1a4a142d0 --- /dev/null +++ b/tools/gfx/cuda/cuda-device.cpp @@ -0,0 +1,1115 @@ +// cuda-device.cpp +#include "cuda-device.h" + +#include "cuda-buffer.h" +#include "cuda-command-queue.h" +#include "cuda-pipeline-state.h" +#include "cuda-query.h" +#include "cuda-shader-object.h" +#include "cuda-shader-object-layout.h" +#include "cuda-shader-program.h" +#include "cuda-resource-views.h" +#include "cuda-texture.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +int DeviceImpl::_calcSMCountPerMultiProcessor(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + struct SMInfo + { + int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version + int coreCount; + }; + + static const SMInfo infos[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64} }; + + const int sm = ((major << 4) + minor); + for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i) + { + if (infos[i].sm == sm) + { + return infos[i].coreCount; + } + } + + const auto& last = infos[SLANG_COUNT_OF(infos) - 1]; + + // It must be newer presumably + SLANG_ASSERT(sm > last.sm); + + // Default to the last entry + return last.coreCount; +} + +SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex) +{ + int smPerMultiproc = 0; + int maxPerfDevice = -1; + int deviceCount = 0; + int devicesProhibited = 0; + + uint64_t maxComputePerf = 0; + SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); + + // Find the best CUDA capable GPU device + for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) + { + int computeMode = -1, major = 0, minor = 0; + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != cudaComputeModeProhibited) + { + if (major == 9999 && minor == 9999) + { + smPerMultiproc = 1; + } + else + { + smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor); + } + + int multiProcessorCount = 0, clockRate = 0; + SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute( + &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); + uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; + + if (compute_perf > maxComputePerf) + { + maxComputePerf = compute_perf; + maxPerfDevice = currentDevice; + } + } + else + { + devicesProhibited++; + } + } + + if (maxPerfDevice < 0) + { + return SLANG_FAIL; + } + + *outDeviceIndex = maxPerfDevice; + return SLANG_OK; +} + +SlangResult DeviceImpl::_initCuda(CUDAReportStyle reportType) +{ + static CUresult res = cuInit(0); + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::getNativeDeviceHandles(InteropHandles* outHandles) +{ + outHandles->handles[0].handleValue = (uint64_t)m_device; + outHandles->handles[0].api = InteropHandleAPI::CUDA; + return SLANG_OK; +} + +SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc) +{ + SLANG_RETURN_ON_FAIL(slangContext.initialize( + desc.slang, + SLANG_PTX, + "sm_5_1", + makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView())); + + SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc)); + + SLANG_RETURN_ON_FAIL(_initCuda(reportType)); + + SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex)); + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType); + + m_context = new CUDAContext(); + + int count = -1; + cuDeviceGetCount(&count); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex)); + + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL( + cuCtxCreate(&m_context->m_context, 0, m_device), reportType); + + // Not clear how to detect half support on CUDA. For now we'll assume we have it + { + m_features.add("half"); + } + + // Initialize DeviceInfo + { + m_info.deviceType = DeviceType::CUDA; + m_info.bindingStyle = BindingStyle::CUDA; + m_info.projectionStyle = ProjectionStyle::DirectX; + m_info.apiName = "CUDA"; + static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 }; + ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity)); + cudaDeviceProp deviceProperties; + cudaGetDeviceProperties(&deviceProperties, m_deviceIndex); + m_adapterName = deviceProperties.name; + m_info.adapterName = m_adapterName.begin(); + m_info.timestampFrequency = 1000000; + } + + return SLANG_OK; +} + +Result DeviceImpl::getCUDAFormat(Format format, CUarray_format* outFormat) +{ + // TODO: Expand to cover all available formats that can be supported in CUDA + switch (format) + { + case Format::R32G32B32A32_FLOAT: + case Format::R32G32B32_FLOAT: + case Format::R32G32_FLOAT: + case Format::R32_FLOAT: + case Format::D32_FLOAT: + *outFormat = CU_AD_FORMAT_FLOAT; + return SLANG_OK; + case Format::R16G16B16A16_FLOAT: + case Format::R16G16_FLOAT: + case Format::R16_FLOAT: + *outFormat = CU_AD_FORMAT_HALF; + return SLANG_OK; + case Format::R32G32B32A32_UINT: + case Format::R32G32B32_UINT: + case Format::R32G32_UINT: + case Format::R32_UINT: + *outFormat = CU_AD_FORMAT_UNSIGNED_INT32; + return SLANG_OK; + case Format::R16G16B16A16_UINT: + case Format::R16G16_UINT: + case Format::R16_UINT: + *outFormat = CU_AD_FORMAT_UNSIGNED_INT16; + return SLANG_OK; + case Format::R8G8B8A8_UINT: + case Format::R8G8_UINT: + case Format::R8_UINT: + case Format::R8G8B8A8_UNORM: + *outFormat = CU_AD_FORMAT_UNSIGNED_INT8; + return SLANG_OK; + case Format::R32G32B32A32_SINT: + case Format::R32G32B32_SINT: + case Format::R32G32_SINT: + case Format::R32_SINT: + *outFormat = CU_AD_FORMAT_SIGNED_INT32; + return SLANG_OK; + case Format::R16G16B16A16_SINT: + case Format::R16G16_SINT: + case Format::R16_SINT: + *outFormat = CU_AD_FORMAT_SIGNED_INT16; + return SLANG_OK; + case Format::R8G8B8A8_SINT: + case Format::R8G8_SINT: + case Format::R8_SINT: + *outFormat = CU_AD_FORMAT_SIGNED_INT8; + return SLANG_OK; + default: + SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); + return SLANG_FAIL; + } +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureResource( + const ITextureResource::Desc& desc, + const ITextureResource::SubresourceData* initData, + ITextureResource** outResource) +{ + TextureResource::Desc srcDesc = fixupTextureDesc(desc); + + RefPtr<TextureResourceImpl> tex = new TextureResourceImpl(srcDesc); + tex->m_cudaContext = m_context; + + CUresourcetype resourceType; + + // The size of the element/texel in bytes + size_t elementSize = 0; + + // Our `ITextureResource::Desc` uses an enumeration to specify + // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's + // `cuMipmappedArrayCreate` seemingly relies on a policy where + // the extents of the array in dimenions above the rank are + // specified as zero (e.g., a 1D texture requires `height==0`). + // + // We will start by massaging the extents as specified by the + // user into a form that CUDA wants/expects, based on the + // texture shape as specified in the `desc`. + // + int width = desc.size.width; + int height = desc.size.height; + int depth = desc.size.depth; + switch (desc.type) + { + case IResource::Type::Texture1D: + height = 0; + depth = 0; + break; + + case IResource::Type::Texture2D: + depth = 0; + break; + + case IResource::Type::Texture3D: + break; + + case IResource::Type::TextureCube: + depth = 1; + break; + } + + { + CUarray_format format = CU_AD_FORMAT_FLOAT; + int numChannels = 0; + + SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format)); + FormatInfo info; + gfxGetFormatInfo(desc.format, &info); + numChannels = info.channelCount; + + switch (format) + { + case CU_AD_FORMAT_FLOAT: + { + elementSize = sizeof(float) * numChannels; + break; + } + case CU_AD_FORMAT_HALF: + { + elementSize = sizeof(uint16_t) * numChannels; + break; + } + case CU_AD_FORMAT_UNSIGNED_INT8: + { + elementSize = sizeof(uint8_t) * numChannels; + break; + } + default: + { + SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); + return SLANG_FAIL; + } + } + + if (desc.numMipLevels > 1) + { + resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; + + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Width = width; + arrayDesc.Height = height; + arrayDesc.Depth = depth; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + arrayDesc.Flags = 0; + + if (desc.arraySize > 1) + { + if (desc.type == IResource::Type::Texture1D || + desc.type == IResource::Type::Texture2D || + desc.type == IResource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED; + arrayDesc.Depth = desc.arraySize; + } + else + { + SLANG_ASSERT(!"Arrays only supported for 1D and 2D"); + return SLANG_FAIL; + } + } + + if (desc.type == IResource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + arrayDesc.Depth *= 6; + } + + SLANG_CUDA_RETURN_ON_FAIL( + cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels)); + } + else + { + resourceType = CU_RESOURCE_TYPE_ARRAY; + + if (desc.arraySize > 1) + { + if (desc.type == IResource::Type::Texture1D || + desc.type == IResource::Type::Texture2D || + desc.type == IResource::Type::TextureCube) + { + SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported"); + return SLANG_FAIL; + } + + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + // Set the depth as the array length + arrayDesc.Depth = desc.arraySize; + if (desc.type == IResource::Type::TextureCube) + { + arrayDesc.Depth *= 6; + } + + arrayDesc.Height = height; + arrayDesc.Width = width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + if (desc.type == IResource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); + } + else if (desc.type == IResource::Type::Texture3D || + desc.type == IResource::Type::TextureCube) + { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Depth = depth; + arrayDesc.Height = height; + arrayDesc.Width = width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + arrayDesc.Flags = 0; + + // Handle cube texture + if (desc.type == IResource::Type::TextureCube) + { + arrayDesc.Depth = 6; + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); + } + else + { + CUDA_ARRAY_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Height = height; + arrayDesc.Width = width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + // Allocate the array, will work for 1D or 2D case + SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); + } + } + } + + // Work space for holding data for uploading if it needs to be rearranged + if (initData) + { + List<uint8_t> workspace; + for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel) + { + int mipWidth = width >> mipLevel; + int mipHeight = height >> mipLevel; + int mipDepth = depth >> mipLevel; + + mipWidth = (mipWidth == 0) ? 1 : mipWidth; + mipHeight = (mipHeight == 0) ? 1 : mipHeight; + mipDepth = (mipDepth == 0) ? 1 : mipDepth; + + // If it's a cubemap then the depth is always 6 + if (desc.type == IResource::Type::TextureCube) + { + mipDepth = 6; + } + + auto dstArray = tex->m_cudaArray; + if (tex->m_cudaMipMappedArray) + { + // Get the array for the mip level + SLANG_CUDA_RETURN_ON_FAIL( + cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel)); + } + SLANG_ASSERT(dstArray); + + // Check using the desc to see if it's plausible + { + CUDA_ARRAY_DESCRIPTOR arrayDesc; + SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray)); + + SLANG_ASSERT(mipWidth == arrayDesc.Width); + SLANG_ASSERT( + mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0)); + } + + const void* srcDataPtr = nullptr; + + if (desc.arraySize > 1) + { + SLANG_ASSERT( + desc.type == IResource::Type::Texture1D || + desc.type == IResource::Type::Texture2D || + desc.type == IResource::Type::TextureCube); + + // TODO(JS): Here I assume that arrays are just held contiguously within a + // 'face' This seems reasonable and works with the Copy3D. + const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; + + Index faceCount = desc.arraySize; + if (desc.type == IResource::Type::TextureCube) + { + faceCount *= 6; + } + + const size_t mipSizeInBytes = faceSizeInBytes * faceCount; + workspace.setCount(mipSizeInBytes); + + // We need to add the face data from each mip + // We iterate over face count so we copy all of the cubemap faces + for (Index j = 0; j < faceCount; j++) + { + const auto srcData = initData[mipLevel + j * desc.numMipLevels].data; + // Copy over to the workspace to make contiguous + ::memcpy( + workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes); + } + + srcDataPtr = workspace.getBuffer(); + } + else + { + if (desc.type == IResource::Type::TextureCube) + { + size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; + + workspace.setCount(faceSizeInBytes * 6); + // Copy the data over to make contiguous + for (Index j = 0; j < 6; j++) + { + const auto srcData = + initData[mipLevel + j * desc.numMipLevels].data; + ::memcpy( + workspace.getBuffer() + faceSizeInBytes * j, + srcData, + faceSizeInBytes); + } + srcDataPtr = workspace.getBuffer(); + } + else + { + const auto srcData = initData[mipLevel].data; + srcDataPtr = srcData; + } + } + + if (desc.arraySize > 1) + { + SLANG_ASSERT( + desc.type == IResource::Type::Texture1D || + desc.type == IResource::Type::Texture2D || + desc.type == IResource::Type::TextureCube); + + CUDA_MEMCPY3D copyParam; + memset(©Param, 0, sizeof(copyParam)); + + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + // Set the depth to the array length + copyParam.Depth = desc.arraySize; + + if (desc.type == IResource::Type::TextureCube) + { + copyParam.Depth *= 6; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); + } + else + { + switch (desc.type) + { + case IResource::Type::Texture1D: + case IResource::Type::Texture2D: + { + CUDA_MEMCPY2D copyParam; + memset(©Param, 0, sizeof(copyParam)); + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); + break; + } + case IResource::Type::Texture3D: + case IResource::Type::TextureCube: + { + CUDA_MEMCPY3D copyParam; + memset(©Param, 0, sizeof(copyParam)); + + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + copyParam.Depth = mipDepth; + + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); + break; + } + + default: + { + SLANG_ASSERT(!"Not implemented"); + break; + } + } + } + } + } + // Set up texture sampling parameters, and create final texture obj + + { + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); + resDesc.resType = resourceType; + + if (tex->m_cudaArray) + { + resDesc.res.array.hArray = tex->m_cudaArray; + } + if (tex->m_cudaMipMappedArray) + { + resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray; + } + + // If the texture might be used as a UAV, then we need to allocate + // a CUDA "surface" for it. + // + // Note: We cannot do this unconditionally, because it will fail + // on surfaces that are not usable as UAVs (e.g., those with + // mipmaps). + // + // TODO: We should really only be allocating the array at the + // time we create a resource, and then allocate the surface or + // texture objects as part of view creation. + // + if (desc.allowedStates.contains(ResourceState::UnorderedAccess)) + { + // On CUDA surfaces only support a single MIP map + SLANG_ASSERT(desc.numMipLevels == 1); + + SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc)); + } + + + // Create handle for sampling. + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); + texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + SLANG_CUDA_RETURN_ON_FAIL( + cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); + } + + returnComPtr(outResource, tex); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource( + const IBufferResource::Desc& descIn, + const void* initData, + IBufferResource** outResource) +{ + auto desc = fixupBufferDesc(descIn); + RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc); + resource->m_cudaContext = m_context; + SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes)); + if (initData) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault)); + } + returnComPtr(outResource, resource); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle( + InteropHandle handle, + const IBufferResource::Desc& desc, + IBufferResource** outResource) +{ + if (handle.handleValue == 0) + { + *outResource = nullptr; + return SLANG_OK; + } + + RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc); + resource->m_cudaContext = m_context; + + // CUDA manages sharing of buffers through the idea of an + // "external memory" object, which represents the relationship + // with another API's objects. In order to create this external + // memory association, we first need to fill in a descriptor struct. + cudaExternalMemoryHandleDesc externalMemoryHandleDesc; + memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); + switch (handle.api) + { + case InteropHandleAPI::D3D12: + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; + break; + case InteropHandleAPI::Vulkan: + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; + break; + default: + return SLANG_FAIL; + } + externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; + externalMemoryHandleDesc.size = desc.sizeInBytes; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + + // Once we have filled in the descriptor, we can request + // that CUDA create the required association between the + // external buffer and its own memory. + cudaExternalMemory_t externalMemory; + SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); + resource->m_cudaExternalMemory = externalMemory; + + // The CUDA "external memory" handle is not itself a device + // pointer, so we need to query for a suitable device address + // for the buffer with another call. + // + // Just as for the external memory, we fill in a descriptor + // structure (although in this case we only need to specify + // the size). + cudaExternalMemoryBufferDesc bufferDesc; + memset(&bufferDesc, 0, sizeof(bufferDesc)); + bufferDesc.size = desc.sizeInBytes; + + // Finally, we can "map" the buffer to get a device address. + void* deviceAddress; + SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc)); + resource->m_cudaMemory = deviceAddress; + + returnComPtr(outResource, resource); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle( + InteropHandle handle, + const ITextureResource::Desc& desc, + const size_t size, + ITextureResource** outResource) +{ + if (handle.handleValue == 0) + { + *outResource = nullptr; + return SLANG_OK; + } + + RefPtr<TextureResourceImpl> resource = new TextureResourceImpl(desc); + resource->m_cudaContext = m_context; + + // CUDA manages sharing of buffers through the idea of an + // "external memory" object, which represents the relationship + // with another API's objects. In order to create this external + // memory association, we first need to fill in a descriptor struct. + CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc; + memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); + switch (handle.api) + { + case InteropHandleAPI::D3D12: + externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE; + break; + case InteropHandleAPI::Vulkan: + externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; + break; + default: + return SLANG_FAIL; + } + externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; + externalMemoryHandleDesc.size = size; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + + CUexternalMemory externalMemory; + SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); + resource->m_cudaExternalMemory = externalMemory; + + FormatInfo formatInfo; + SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo)); + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + arrayDesc.Depth = desc.size.depth; + arrayDesc.Height = desc.size.height; + arrayDesc.Width = desc.size.width; + arrayDesc.NumChannels = formatInfo.channelCount; + getCUDAFormat(desc.format, &arrayDesc.Format); + arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER + + CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc; + memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc)); + externalMemoryMipDesc.offset = 0; + externalMemoryMipDesc.arrayDesc = arrayDesc; + externalMemoryMipDesc.numLevels = desc.numMipLevels; + + CUmipmappedArray mipArray; + SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc)); + resource->m_cudaMipMappedArray = mipArray; + + CUarray cuArray; + SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0)); + resource->m_cudaArray = cuArray; + + CUDA_RESOURCE_DESC surfDesc; + memset(&surfDesc, 0, sizeof(surfDesc)); + surfDesc.resType = CU_RESOURCE_TYPE_ARRAY; + surfDesc.res.array.hArray = cuArray; + + CUsurfObject surface; + SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc)); + resource->m_cudaSurfObj = surface; + + returnComPtr(outResource, resource); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureView( + ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) +{ + RefPtr<ResourceViewImpl> view = new ResourceViewImpl(); + view->m_desc = desc; + view->textureResource = dynamic_cast<TextureResourceImpl*>(texture); + returnComPtr(outView, view); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferView( + IBufferResource* buffer, + IBufferResource* counterBuffer, + IResourceView::Desc const& desc, + IResourceView** outView) +{ + RefPtr<ResourceViewImpl> view = new ResourceViewImpl(); + view->m_desc = desc; + view->memoryResource = dynamic_cast<BufferResourceImpl*>(buffer); + returnComPtr(outView, view); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createQueryPool( + const IQueryPool::Desc& desc, + IQueryPool** outPool) +{ + RefPtr<QueryPoolImpl> pool = new QueryPoolImpl(); + SLANG_RETURN_ON_FAIL(pool->init(desc)); + returnComPtr(outPool, pool); + return SLANG_OK; +} + +Result DeviceImpl::createShaderObjectLayout( + slang::TypeLayoutReflection* typeLayout, + ShaderObjectLayoutBase** outLayout) +{ + RefPtr<ShaderObjectLayoutImpl> cudaLayout; + cudaLayout = new ShaderObjectLayoutImpl(this, typeLayout); + returnRefPtrMove(outLayout, cudaLayout); + return SLANG_OK; +} + +Result DeviceImpl::createShaderObject( + ShaderObjectLayoutBase* layout, + IShaderObject** outObject) +{ + RefPtr<ShaderObjectImpl> result = new ShaderObjectImpl(); + SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout))); + returnComPtr(outObject, result); + return SLANG_OK; +} + +Result DeviceImpl::createMutableShaderObject( + ShaderObjectLayoutBase* layout, + IShaderObject** outObject) +{ + RefPtr<MutableShaderObjectImpl> result = new MutableShaderObjectImpl(); + SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout))); + returnComPtr(outObject, result); + return SLANG_OK; +} + +Result DeviceImpl::createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject) +{ + auto cudaProgram = dynamic_cast<ShaderProgramImpl*>(program); + auto cudaLayout = cudaProgram->layout; + + RefPtr<RootShaderObjectImpl> result = new RootShaderObjectImpl(); + SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout)); + returnRefPtrMove(outObject, result); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram( + const IShaderProgram::Desc& desc, + IShaderProgram** outProgram, + ISlangBlob** outDiagnosticBlob) +{ + // If this is a specializable program, we just keep a reference to the slang program and + // don't actually create any kernels. This program will be specialized later when we know + // the shader object bindings. + RefPtr<ShaderProgramImpl> cudaProgram = new ShaderProgramImpl(); + cudaProgram->init(desc); + cudaProgram->cudaContext = m_context; + if (desc.slangGlobalScope->getSpecializationParamCount() != 0) + { + cudaProgram->layout = new RootShaderObjectLayoutImpl(this, desc.slangGlobalScope->getLayout()); + returnComPtr(outProgram, cudaProgram); + return SLANG_OK; + } + + ComPtr<ISlangBlob> kernelCode; + ComPtr<ISlangBlob> diagnostics; + auto compileResult = desc.slangGlobalScope->getEntryPointCode( + (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef()); + if (diagnostics) + { + getDebugCallback()->handleMessage( + compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error, + DebugMessageSource::Slang, + (char*)diagnostics->getBufferPointer()); + if (outDiagnosticBlob) + returnComPtr(outDiagnosticBlob, diagnostics); + } + SLANG_RETURN_ON_FAIL(compileResult); + + SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer())); + cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName(); + SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction( + &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer())); + + auto slangGlobalScope = desc.slangGlobalScope; + if (slangGlobalScope) + { + cudaProgram->slangGlobalScope = slangGlobalScope; + + auto slangProgramLayout = slangGlobalScope->getLayout(); + if (!slangProgramLayout) + return SLANG_FAIL; + + RefPtr<RootShaderObjectLayoutImpl> cudaLayout; + cudaLayout = new RootShaderObjectLayoutImpl(this, slangProgramLayout); + cudaLayout->programLayout = slangProgramLayout; + cudaProgram->layout = cudaLayout; + } + + returnComPtr(outProgram, cudaProgram); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createComputePipelineState( + const ComputePipelineStateDesc& desc, IPipelineState** outState) +{ + RefPtr<ComputePipelineStateImpl> state = new ComputePipelineStateImpl(); + state->shaderProgram = static_cast<ShaderProgramImpl*>(desc.program); + state->init(desc); + returnComPtr(outState, state); + return Result(); +} + +void* DeviceImpl::map(IBufferResource* buffer) +{ + return static_cast<BufferResourceImpl*>(buffer)->m_cudaMemory; +} + +void DeviceImpl::unmap(IBufferResource* buffer) +{ + SLANG_UNUSED(buffer); +} + +SLANG_NO_THROW const DeviceInfo& SLANG_MCALL DeviceImpl::getDeviceInfo() const +{ + return m_info; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTransientResourceHeap( + const ITransientResourceHeap::Desc& desc, + ITransientResourceHeap** outHeap) +{ + RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl(); + SLANG_RETURN_ON_FAIL(result->init(this, desc)); + returnComPtr(outHeap, result); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL + DeviceImpl::createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) +{ + RefPtr<CommandQueueImpl> queue = new CommandQueueImpl(); + queue->init(this); + returnComPtr(outQueue, queue); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createSwapchain( + const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(window); + SLANG_UNUSED(outSwapchain); + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createFramebufferLayout( + const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(outLayout); + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL + DeviceImpl::createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(outFramebuffer); + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createRenderPassLayout( + const IRenderPassLayout::Desc& desc, + IRenderPassLayout** outRenderPassLayout) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(outRenderPassLayout); + return SLANG_FAIL; +} + +SLANG_NO_THROW Result SLANG_MCALL + DeviceImpl::createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) +{ + SLANG_UNUSED(desc); + *outSampler = nullptr; + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createInputLayout( + IInputLayout::Desc const& desc, + IInputLayout** outLayout) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(outLayout); + return SLANG_E_NOT_AVAILABLE; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createGraphicsPipelineState( + const GraphicsPipelineStateDesc& desc, IPipelineState** outState) +{ + SLANG_UNUSED(desc); + SLANG_UNUSED(outState); + return SLANG_E_NOT_AVAILABLE; +} + +SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::readTextureResource( + ITextureResource* texture, + ResourceState state, + ISlangBlob** outBlob, + size_t* outRowPitch, + size_t* outPixelSize) +{ + auto textureImpl = static_cast<TextureResourceImpl*>(texture); + RefPtr<ListBlob> blob = new ListBlob(); + + auto desc = textureImpl->getDesc(); + auto width = desc->size.width; + auto height = desc->size.height; + FormatInfo sizeInfo; + SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo)); + size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock; + size_t rowPitch = width * pixelSize; + size_t size = height * rowPitch; + blob->m_data.setCount((Index)size); + + CUDA_MEMCPY2D copyParam; + memset(©Param, 0, sizeof(copyParam)); + + copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.srcArray = textureImpl->m_cudaArray; + + copyParam.dstMemoryType = CU_MEMORYTYPE_HOST; + copyParam.dstHost = blob->m_data.getBuffer(); + copyParam.dstPitch = rowPitch; + copyParam.WidthInBytes = copyParam.dstPitch; + copyParam.Height = height; + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); + + *outRowPitch = rowPitch; + *outPixelSize = pixelSize; + returnComPtr(outBlob, blob); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource( + IBufferResource* buffer, + size_t offset, + size_t size, + ISlangBlob** outBlob) +{ + auto bufferImpl = static_cast<BufferResourceImpl*>(buffer); + RefPtr<ListBlob> blob = new ListBlob(); + blob->m_data.setCount((Index)size); + cudaMemcpy( + blob->m_data.getBuffer(), + (uint8_t*)bufferImpl->m_cudaMemory + offset, + size, + cudaMemcpyDefault); + returnComPtr(outBlob, blob); + return SLANG_OK; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-device.h b/tools/gfx/cuda/cuda-device.h new file mode 100644 index 000000000..e6a91393b --- /dev/null +++ b/tools/gfx/cuda/cuda-device.h @@ -0,0 +1,152 @@ +// cuda-device.h +#pragma once +#include "cuda-base.h" + +#include "cuda-command-buffer.h" +#include "cuda-context.h" +#include "cuda-helper-functions.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class DeviceImpl : public RendererBase +{ +private: + static const CUDAReportStyle reportType = CUDAReportStyle::Normal; + static int _calcSMCountPerMultiProcessor(int major, int minor); + + static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex); + + static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal); + +private: + int m_deviceIndex = -1; + CUdevice m_device = 0; + RefPtr<CUDAContext> m_context; + DeviceInfo m_info; + String m_adapterName; + +public: + virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override; + + virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override; + + Result getCUDAFormat(Format format, CUarray_format* outFormat); + + virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource( + const ITextureResource::Desc& desc, + const ITextureResource::SubresourceData* initData, + ITextureResource** outResource) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource( + const IBufferResource::Desc& descIn, + const void* initData, + IBufferResource** outResource) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle( + InteropHandle handle, + const IBufferResource::Desc& desc, + IBufferResource** outResource) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle( + InteropHandle handle, + const ITextureResource::Desc& desc, + const size_t size, + ITextureResource** outResource) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView( + ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView( + IBufferResource* buffer, + IBufferResource* counterBuffer, + IResourceView::Desc const& desc, + IResourceView** outView) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool( + const IQueryPool::Desc& desc, + IQueryPool** outPool) override; + + virtual Result createShaderObjectLayout( + slang::TypeLayoutReflection* typeLayout, + ShaderObjectLayoutBase** outLayout) override; + + virtual Result createShaderObject( + ShaderObjectLayoutBase* layout, + IShaderObject** outObject) override; + + virtual Result createMutableShaderObject( + ShaderObjectLayoutBase* layout, + IShaderObject** outObject) override; + + Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject); + + virtual SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram( + const IShaderProgram::Desc& desc, + IShaderProgram** outProgram, + ISlangBlob** outDiagnosticBlob) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState( + const ComputePipelineStateDesc& desc, IPipelineState** outState) override; + + void* map(IBufferResource* buffer); + + void unmap(IBufferResource* buffer); + + virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override; + +public: + using TransientResourceHeapImpl = SimpleTransientResourceHeap<DeviceImpl, CommandBufferImpl>; + + virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap( + const ITransientResourceHeap::Desc& desc, + ITransientResourceHeap** outHeap) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL + createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain( + const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout( + const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL + createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout( + const IRenderPassLayout::Desc& desc, + IRenderPassLayout** outRenderPassLayout) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL + createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout( + IInputLayout::Desc const& desc, + IInputLayout** outLayout) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState( + const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override; + + virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource( + ITextureResource* texture, + ResourceState state, + ISlangBlob** outBlob, + size_t* outRowPitch, + size_t* outPixelSize) override; + + virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource( + IBufferResource* buffer, + size_t offset, + size_t size, + ISlangBlob** outBlob) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp new file mode 100644 index 000000000..6325d9fc6 --- /dev/null +++ b/tools/gfx/cuda/cuda-helper-functions.cpp @@ -0,0 +1,91 @@ +// cuda-helper-functions.cpp +#include "cuda-helper-functions.h" + +#include "cuda-device.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ +SlangResult CUDAErrorInfo::handle() const +{ + StringBuilder builder; + builder << "Error: " << m_filePath << " (" << m_lineNo << ") :"; + + if (m_errorName) + { + builder << m_errorName << " : "; + } + if (m_errorString) + { + builder << m_errorString; + } + + getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver, + builder.getUnownedSlice().begin()); + + // Slang::signalUnexpectedError(builder.getBuffer()); + return SLANG_FAIL; +} + +SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line) +{ + CUDAErrorInfo info(file, line); + cuGetErrorString(cuResult, &info.m_errorString); + cuGetErrorName(cuResult, &info.m_errorName); + return info.handle(); +} + +SlangResult _handleCUDAError(cudaError_t error, const char* file, int line) +{ + return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle(); +} + +# ifdef RENDER_TEST_OPTIX + +static bool _isError(OptixResult result) +{ + return result != OPTIX_SUCCESS; +} + +# if 1 +static SlangResult _handleOptixError(OptixResult result, char const* file, int line) +{ + fprintf( + stderr, + "%s(%d): optix: %s (%s)\n", + file, + line, + optixGetErrorString(result), + optixGetErrorName(result)); + return SLANG_FAIL; +} + +void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData) +{ + fprintf(stderr, "optix: %s (%s)\n", message, tag); +} +# endif +# endif +} // namespace cuda + +Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) +{ +RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl(); +SLANG_RETURN_ON_FAIL(result->initialize(*desc)); +returnComPtr(outDevice, result); +return SLANG_OK; +} +#else +Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) +{ +SLANG_UNUSED(desc); +*outDevice = nullptr; +return SLANG_FAIL; +} +#endif + +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h new file mode 100644 index 000000000..001e3946a --- /dev/null +++ b/tools/gfx/cuda/cuda-helper-functions.h @@ -0,0 +1,106 @@ +// cuda-helper-functions.h +#pragma once + +#include "slang-gfx.h" +#include "cuda-base.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ +SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; } +SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; } + +// A enum used to control if errors are reported on failure of CUDA call. +enum class CUDAReportStyle +{ + Normal, + Silent, +}; + +struct CUDAErrorInfo +{ + CUDAErrorInfo( + const char* filePath, + int lineNo, + const char* errorName = nullptr, + const char* errorString = nullptr) + : m_filePath(filePath) + , m_lineNo(lineNo) + , m_errorName(errorName) + , m_errorString(errorString) + {} + SlangResult handle() const; + + const char* m_filePath; + int m_lineNo; + const char* m_errorName; + const char* m_errorString; +}; + +// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream. + +SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line); +SlangResult _handleCUDAError(cudaError_t error, const char* file, int line); + +# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__) + +# define SLANG_CUDA_RETURN_ON_FAIL(x) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + return SLANG_CUDA_HANDLE_ERROR(_res); \ + } + +# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + { \ + return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \ + : SLANG_FAIL; \ + } \ + } + +# define SLANG_CUDA_ASSERT_ON_FAIL(x) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + { \ + SLANG_ASSERT(!"Failed CUDA call"); \ + }; \ + } + +# ifdef RENDER_TEST_OPTIX + +bool _isError(OptixResult result); + +# if 1 +SlangResult _handleOptixError(OptixResult result, char const* file, int line); + +# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__) +# else +# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL +# endif + +# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \ + do \ + { \ + auto _res = EXPR; \ + if (_isError(_res)) \ + return SLANG_OPTIX_HANDLE_ERROR(_res); \ + } while (0) + +void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData); + +# endif + +} // namespace cuda +#endif + +Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice); + +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-pipeline-state.cpp b/tools/gfx/cuda/cuda-pipeline-state.cpp new file mode 100644 index 000000000..fdbc0ea0e --- /dev/null +++ b/tools/gfx/cuda/cuda-pipeline-state.cpp @@ -0,0 +1,22 @@ +// cuda-pipeline-state.cpp +#include "cuda-pipeline-state.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +void ComputePipelineStateImpl::init(const ComputePipelineStateDesc& inDesc) +{ + PipelineStateDesc pipelineDesc; + pipelineDesc.type = PipelineType::Compute; + pipelineDesc.compute = inDesc; + initializeBase(pipelineDesc); +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-pipeline-state.h b/tools/gfx/cuda/cuda-pipeline-state.h new file mode 100644 index 000000000..e13266d8b --- /dev/null +++ b/tools/gfx/cuda/cuda-pipeline-state.h @@ -0,0 +1,28 @@ +// cuda-pipeline-state.h +#pragma once +#include "cuda-base.h" +#include "cuda-shader-program.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class PipelineStateImpl : public PipelineStateBase +{ +public: +}; + +class ComputePipelineStateImpl : public PipelineStateImpl +{ +public: + RefPtr<ShaderProgramImpl> shaderProgram; + void init(const ComputePipelineStateDesc& inDesc); +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-query.cpp b/tools/gfx/cuda/cuda-query.cpp new file mode 100644 index 000000000..7e97699f8 --- /dev/null +++ b/tools/gfx/cuda/cuda-query.cpp @@ -0,0 +1,48 @@ +// cuda-query.cpp +#include "cuda-query.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +Result QueryPoolImpl::init(const IQueryPool::Desc& desc) +{ + cuEventCreate(&m_startEvent, 0); + cuEventRecord(m_startEvent, 0); + m_events.setCount(desc.count); + for (SlangInt i = 0; i < m_events.getCount(); i++) + { + cuEventCreate(&m_events[i], 0); + } + return SLANG_OK; +} + +QueryPoolImpl::~QueryPoolImpl() +{ + for (auto& e : m_events) + { + cuEventDestroy(e); + } + cuEventDestroy(m_startEvent); +} + +SLANG_NO_THROW Result SLANG_MCALL QueryPoolImpl::getResult( + GfxIndex queryIndex, GfxCount count, uint64_t* data) +{ + for (GfxIndex i = 0; i < count; i++) + { + float time = 0.0f; + cuEventSynchronize(m_events[i + queryIndex]); + cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]); + data[i] = (uint64_t)((double)time * 1000.0f); + } + return SLANG_OK; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-query.h b/tools/gfx/cuda/cuda-query.h new file mode 100644 index 000000000..db29f488d --- /dev/null +++ b/tools/gfx/cuda/cuda-query.h @@ -0,0 +1,32 @@ +// cuda-query.h +#pragma once +#include "cuda-base.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class QueryPoolImpl : public QueryPoolBase +{ +public: + // The event object for each query. Owned by the pool. + List<CUevent> m_events; + + // The event that marks the starting point. + CUevent m_startEvent; + + Result init(const IQueryPool::Desc& desc); + + ~QueryPoolImpl(); + + virtual SLANG_NO_THROW Result SLANG_MCALL getResult( + GfxIndex queryIndex, GfxCount count, uint64_t* data) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-resource-views.h b/tools/gfx/cuda/cuda-resource-views.h new file mode 100644 index 000000000..33bf557d7 --- /dev/null +++ b/tools/gfx/cuda/cuda-resource-views.h @@ -0,0 +1,26 @@ +// cuda-resource-views.h +#pragma once +#include "cuda-base.h" + +#include "cuda-buffer.h" +#include "cuda-texture.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class ResourceViewImpl : public ResourceViewBase +{ +public: + RefPtr<BufferResourceImpl> memoryResource = nullptr; + RefPtr<TextureResourceImpl> textureResource = nullptr; + void* proxyBuffer = nullptr; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-object-layout.cpp b/tools/gfx/cuda/cuda-shader-object-layout.cpp new file mode 100644 index 000000000..0cbe23a63 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-object-layout.cpp @@ -0,0 +1,153 @@ +// cuda-shader-object-layout.cpp +#include "cuda-shader-object-layout.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ShaderObjectLayoutImpl::ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout) +{ + m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType); + + initBase(renderer, m_elementTypeLayout); + + // Compute the binding ranges that are used to store + // the logical contents of the object in memory. These will relate + // to the descriptor ranges in the various sets, but not always + // in a one-to-one fashion. + + SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount(); + for (SlangInt r = 0; r < bindingRangeCount; ++r) + { + slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r); + SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r); + slang::TypeLayoutReflection* slangLeafTypeLayout = + m_elementTypeLayout->getBindingRangeLeafTypeLayout(r); + + SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r); + SlangInt rangeIndexInDescriptorSet = + m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r); + + // TODO: This logic assumes that for any binding range that might consume + // multiple kinds of resources, the descriptor range for its uniform + // usage will be the first one in the range. + // + // We need to decide whether that assumption is one we intend to support + // applications making, or whether they should be forced to perform a + // linear search over the descriptor ranges for a specific binding range. + // + auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset( + descriptorSetIndex, rangeIndexInDescriptorSet); + + Index baseIndex = 0; + Index subObjectIndex = 0; + switch (slangBindingType) + { + case slang::BindingType::ConstantBuffer: + case slang::BindingType::ParameterBlock: + case slang::BindingType::ExistentialValue: + baseIndex = m_subObjectCount; + subObjectIndex = baseIndex; + m_subObjectCount += count; + break; + case slang::BindingType::RawBuffer: + case slang::BindingType::MutableRawBuffer: + if (slangLeafTypeLayout->getType()->getElementType() != nullptr) + { + // A structured buffer occupies both a resource slot and + // a sub-object slot. + subObjectIndex = m_subObjectCount; + m_subObjectCount += count; + } + baseIndex = m_resourceCount; + m_resourceCount += count; + break; + default: + baseIndex = m_resourceCount; + m_resourceCount += count; + break; + } + + BindingRangeInfo bindingRangeInfo; + bindingRangeInfo.bindingType = slangBindingType; + bindingRangeInfo.count = count; + bindingRangeInfo.baseIndex = baseIndex; + bindingRangeInfo.uniformOffset = uniformOffset; + bindingRangeInfo.subObjectIndex = subObjectIndex; + m_bindingRanges.add(bindingRangeInfo); + } + + SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount(); + for (SlangInt r = 0; r < subObjectRangeCount; ++r) + { + SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r); + auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex); + slang::TypeLayoutReflection* slangLeafTypeLayout = + m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex); + + // A sub-object range can either represent a sub-object of a known + // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>` + // (in which case we can pre-compute a layout to use, based on + // the type `Foo`) *or* it can represent a sub-object of some + // existential type (e.g., `IBar`) in which case we cannot + // know the appropriate type/layout of sub-object to allocate. + // + RefPtr<ShaderObjectLayoutImpl> subObjectLayout; + if (slangBindingType != slang::BindingType::ExistentialValue) + { + subObjectLayout = + new ShaderObjectLayoutImpl(renderer, slangLeafTypeLayout->getElementTypeLayout()); + } + + SubObjectRangeInfo subObjectRange; + subObjectRange.bindingRangeIndex = bindingRangeIndex; + subObjectRange.layout = subObjectLayout; + subObjectRanges.add(subObjectRange); + } +} + +Index ShaderObjectLayoutImpl::getResourceCount() const { return m_resourceCount; } +Index ShaderObjectLayoutImpl::getSubObjectCount() const { return m_subObjectCount; } +List<SubObjectRangeInfo>& ShaderObjectLayoutImpl::getSubObjectRanges() { return subObjectRanges; } +BindingRangeInfo ShaderObjectLayoutImpl::getBindingRange(Index index) { return m_bindingRanges[index]; } +Index ShaderObjectLayoutImpl::getBindingRangeCount() const { return m_bindingRanges.getCount(); } + +RootShaderObjectLayoutImpl::RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout) + : ShaderObjectLayoutImpl(renderer, inProgramLayout->getGlobalParamsTypeLayout()) + , programLayout(inProgramLayout) +{ + for (UInt i = 0; i < programLayout->getEntryPointCount(); i++) + { + entryPointLayouts.add(new ShaderObjectLayoutImpl( + renderer, + programLayout->getEntryPointByIndex(i)->getTypeLayout())); + } + +} + +int RootShaderObjectLayoutImpl::getKernelIndex(UnownedStringSlice kernelName) +{ + for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++) + { + auto entryPoint = programLayout->getEntryPointByIndex(i); + if (kernelName == entryPoint->getName()) + { + return i; + } + } + return -1; +} + +void RootShaderObjectLayoutImpl::getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes) +{ + auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex); + entryPoint->getComputeThreadGroupSize(3, threadGroupSizes); +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-object-layout.h b/tools/gfx/cuda/cuda-shader-object-layout.h new file mode 100644 index 000000000..305129109 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-object-layout.h @@ -0,0 +1,74 @@ +// cuda-shader-object-layout.h +#pragma once +#include "cuda-base.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +struct BindingRangeInfo +{ + slang::BindingType bindingType; + Index count; + Index baseIndex; // Flat index for sub-objects + Index subObjectIndex; + + // TODO: The `uniformOffset` field should be removed, + // since it cannot be supported by the Slang reflection + // API once we fix some design issues. + // + // It is only being used today for pre-allocation of sub-objects + // for constant buffers and parameter blocks (which should be + // deprecated/removed anyway). + // + // Note: We would need to bring this field back, plus + // a lot of other complexity, if we ever want to support + // setting of resources/buffers directly by a binding + // range index and array index. + // + Index uniformOffset; // Uniform offset for a resource typed field. +}; + +struct SubObjectRangeInfo +{ + RefPtr<ShaderObjectLayoutImpl> layout; + Index bindingRangeIndex; +}; + +class ShaderObjectLayoutImpl : public ShaderObjectLayoutBase +{ +public: + List<SubObjectRangeInfo> subObjectRanges; + List<BindingRangeInfo> m_bindingRanges; + + Index m_subObjectCount = 0; + Index m_resourceCount = 0; + + ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout); + + Index getResourceCount() const; + Index getSubObjectCount() const; + List<SubObjectRangeInfo>& getSubObjectRanges(); + BindingRangeInfo getBindingRange(Index index); + Index getBindingRangeCount() const; +}; + +class RootShaderObjectLayoutImpl : public ShaderObjectLayoutImpl +{ +public: + slang::ProgramLayout* programLayout = nullptr; + List<RefPtr<ShaderObjectLayoutImpl>> entryPointLayouts; + RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout); + + int getKernelIndex(UnownedStringSlice kernelName); + + void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes); +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp new file mode 100644 index 000000000..3fc55e401 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-object.cpp @@ -0,0 +1,349 @@ +// cuda-shader-object.cpp +#include "cuda-shader-object.h" + +#include "cuda-shader-object-layout.h" +#include "cuda-resource-views.h" + +#include "cuda-helper-functions.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ +Result ShaderObjectData::setCount(Index count) +{ + if (isHostOnly) + { + m_cpuBuffer.setCount(count); + if (!m_bufferView) + { + IResourceView::Desc viewDesc = {}; + viewDesc.type = IResourceView::Type::UnorderedAccess; + m_bufferView = new ResourceViewImpl(); + m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer(); + m_bufferView->m_desc = viewDesc; + } + return SLANG_OK; + } + + if (!m_bufferResource) + { + IBufferResource::Desc desc; + desc.type = IResource::Type::Buffer; + desc.sizeInBytes = count; + m_bufferResource = new BufferResourceImpl(desc); + if (count) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count)); + } + IResourceView::Desc viewDesc = {}; + viewDesc.type = IResourceView::Type::UnorderedAccess; + m_bufferView = new ResourceViewImpl(); + m_bufferView->memoryResource = m_bufferResource; + m_bufferView->m_desc = viewDesc; + } + auto oldSize = m_bufferResource->getDesc()->sizeInBytes; + if ((size_t)count != oldSize) + { + void* newMemory = nullptr; + if (count) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count)); + } + if (oldSize) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( + newMemory, + m_bufferResource->m_cudaMemory, + Math::Min((size_t)count, oldSize), + cudaMemcpyDefault)); + } + cudaFree(m_bufferResource->m_cudaMemory); + m_bufferResource->m_cudaMemory = newMemory; + m_bufferResource->getDesc()->sizeInBytes = count; + } + return SLANG_OK; +} + +Slang::Index ShaderObjectData::getCount() +{ + if (isHostOnly) + return m_cpuBuffer.getCount(); + if (m_bufferResource) + return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes); + else + return 0; +} + +void* ShaderObjectData::getBuffer() +{ + if (isHostOnly) + return m_cpuBuffer.getBuffer(); + + if (m_bufferResource) + return m_bufferResource->m_cudaMemory; + return nullptr; +} + +/// Returns a resource view for GPU access into the buffer content. +ResourceViewBase* ShaderObjectData::getResourceView( + RendererBase* device, + slang::TypeLayoutReflection* elementLayout, + slang::BindingType bindingType) +{ + SLANG_UNUSED(device); + m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize(); + return m_bufferView.Ptr(); +} + +SlangResult ShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout) +{ + m_layout = typeLayout; + + // If the layout tells us that there is any uniform data, + // then we need to allocate a constant buffer to hold that data. + // + // TODO: Do we need to allocate a shadow copy for use from + // the CPU? + // + // TODO: When/where do we bind this constant buffer into + // a descriptor set for later use? + // + auto slangLayout = getLayout()->getElementTypeLayout(); + size_t uniformSize = slangLayout->getSize(); + if (uniformSize) + { + m_data.setCount((Index)uniformSize); + } + + // If the layout specifies that we have any resources or sub-objects, + // then we need to size the appropriate arrays to account for them. + // + // Note: the counts here are the *total* number of resources/sub-objects + // and not just the number of resource/sub-object ranges. + // + resources.setCount(typeLayout->getResourceCount()); + m_objects.setCount(typeLayout->getSubObjectCount()); + + for (auto subObjectRange : getLayout()->subObjectRanges) + { + RefPtr<ShaderObjectLayoutImpl> subObjectLayout = subObjectRange.layout; + + // In the case where the sub-object range represents an + // existential-type leaf field (e.g., an `IBar`), we + // cannot pre-allocate the object(s) to go into that + // range, since we can't possibly know what to allocate + // at this point. + // + if (!subObjectLayout) + continue; + // + // Otherwise, we will allocate a sub-object to fill + // in each entry in this range, based on the layout + // information we already have. + + auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex]; + for (Index i = 0; i < bindingRangeInfo.count; ++i) + { + RefPtr<ShaderObjectImpl> subObject = new ShaderObjectImpl(); + SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout)); + + ShaderOffset offset; + offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i; + offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex; + offset.bindingArrayIndex = (GfxIndex)i; + + SLANG_RETURN_ON_FAIL(setObject(offset, subObject)); + } + } + return SLANG_OK; +} + +SLANG_NO_THROW GfxCount SLANG_MCALL ShaderObjectImpl::getEntryPointCount() +{ + return 0; +} + +SLANG_NO_THROW Result SLANG_MCALL + ShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) +{ + *outEntryPoint = nullptr; + return SLANG_OK; +} + +SLANG_NO_THROW const void* SLANG_MCALL ShaderObjectImpl::getRawData() +{ + return m_data.getBuffer(); +} + +SLANG_NO_THROW Size SLANG_MCALL ShaderObjectImpl::getSize() +{ + return (Size)m_data.getCount(); +} + +SLANG_NO_THROW Result SLANG_MCALL + ShaderObjectImpl::setData(ShaderOffset const& offset, void const* data, Size size) +{ + Size temp = m_data.getCount() - (Size)offset.uniformOffset; + size = Math::Min(size, temp); + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( + (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault)); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL + ShaderObjectImpl::setResource(ShaderOffset const& offset, IResourceView* resourceView) +{ + if (!resourceView) + return SLANG_OK; + + auto layout = getLayout(); + + auto bindingRangeIndex = offset.bindingRangeIndex; + SLANG_ASSERT(bindingRangeIndex >= 0); + SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount()); + + auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex]; + + auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex; + auto cudaView = static_cast<ResourceViewImpl*>(resourceView); + + resources[viewIndex] = cudaView; + + if (cudaView->textureResource) + { + if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess) + { + auto handle = cudaView->textureResource->m_cudaSurfObj; + setData(offset, &handle, sizeof(uint64_t)); + } + else + { + auto handle = cudaView->textureResource->getBindlessHandle(); + setData(offset, &handle, sizeof(uint64_t)); + } + } + else if (cudaView->memoryResource) + { + auto handle = cudaView->memoryResource->getBindlessHandle(); + setData(offset, &handle, sizeof(handle)); + auto sizeOffset = offset; + sizeOffset.uniformOffset += sizeof(handle); + auto& desc = *cudaView->memoryResource->getDesc(); + size_t size = desc.sizeInBytes; + if (desc.elementSize > 1) + size /= desc.elementSize; + setData(sizeOffset, &size, sizeof(size)); + } + else if (cudaView->proxyBuffer) + { + auto handle = cudaView->proxyBuffer; + setData(offset, &handle, sizeof(handle)); + auto sizeOffset = offset; + sizeOffset.uniformOffset += sizeof(handle); + auto& desc = *cudaView->memoryResource->getDesc(); + size_t size = desc.sizeInBytes; + if (desc.elementSize > 1) + size /= desc.elementSize; + setData(sizeOffset, &size, sizeof(size)); + } + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL + ShaderObjectImpl::setObject(ShaderOffset const& offset, IShaderObject* object) +{ + SLANG_RETURN_ON_FAIL(Super::setObject(offset, object)); + + auto bindingRangeIndex = offset.bindingRangeIndex; + auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex]; + + ShaderObjectImpl* subObject = static_cast<ShaderObjectImpl*>(object); + switch (bindingRange.bindingType) + { + default: + { + void* subObjectDataBuffer = subObject->getBuffer(); + SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*))); + } + break; + case slang::BindingType::ExistentialValue: + case slang::BindingType::RawBuffer: + case slang::BindingType::MutableRawBuffer: + break; + } + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL + ShaderObjectImpl::setSampler(ShaderOffset const& offset, ISamplerState* sampler) +{ + SLANG_UNUSED(sampler); + SLANG_UNUSED(offset); + return SLANG_OK; +} + +SLANG_NO_THROW Result SLANG_MCALL ShaderObjectImpl::setCombinedTextureSampler( + ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) +{ + SLANG_UNUSED(sampler); + setResource(offset, textureView); + return SLANG_OK; +} + +EntryPointShaderObjectImpl::EntryPointShaderObjectImpl() +{ + m_data.isHostOnly = true; +} + +SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::addRef() +{ + return 1; +} + +SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::release() +{ + return 1; +} + +SlangResult RootShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout) +{ + SLANG_RETURN_ON_FAIL(ShaderObjectImpl::init(device, typeLayout)); + auto programLayout = dynamic_cast<RootShaderObjectLayoutImpl*>(typeLayout); + for (auto& entryPoint : programLayout->entryPointLayouts) + { + RefPtr<EntryPointShaderObjectImpl> object = new EntryPointShaderObjectImpl(); + SLANG_RETURN_ON_FAIL(object->init(device, entryPoint)); + entryPointObjects.add(object); + } + return SLANG_OK; +} + +SLANG_NO_THROW GfxCount SLANG_MCALL RootShaderObjectImpl::getEntryPointCount() +{ + return (GfxCount)entryPointObjects.getCount(); +} + +SLANG_NO_THROW Result SLANG_MCALL + RootShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) +{ + returnComPtr(outEntryPoint, entryPointObjects[index]); + return SLANG_OK; +} + +Result RootShaderObjectImpl::collectSpecializationArgs(ExtendedShaderObjectTypeList& args) +{ + SLANG_RETURN_ON_FAIL(ShaderObjectImpl::collectSpecializationArgs(args)); + for (auto& entryPoint : entryPointObjects) + { + SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args)); + } + return SLANG_OK; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-object.h b/tools/gfx/cuda/cuda-shader-object.h new file mode 100644 index 000000000..f564f5eb5 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-object.h @@ -0,0 +1,93 @@ +// cuda-shader-object.h +#pragma once +#include "cuda-base.h" + +#include "cuda-buffer.h" +#include "cuda-resource-views.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class ShaderObjectData +{ +public: + bool isHostOnly = false; + Slang::RefPtr<BufferResourceImpl> m_bufferResource; + Slang::RefPtr<ResourceViewImpl> m_bufferView; + Slang::List<uint8_t> m_cpuBuffer; + + Result setCount(Index count); + Slang::Index getCount(); + void* getBuffer(); + + /// Returns a resource view for GPU access into the buffer content. + ResourceViewBase* getResourceView( + RendererBase* device, + slang::TypeLayoutReflection* elementLayout, + slang::BindingType bindingType); +}; + +class ShaderObjectImpl + : public ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData> +{ + typedef ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData> + Super; + +public: + List<RefPtr<ResourceViewImpl>> resources; + + virtual SLANG_NO_THROW Result SLANG_MCALL + init(IDevice* device, ShaderObjectLayoutImpl* typeLayout); + + virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override; + virtual SLANG_NO_THROW Result SLANG_MCALL + getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override; + + virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override; + + virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override; + + virtual SLANG_NO_THROW Result SLANG_MCALL + setData(ShaderOffset const& offset, void const* data, Size size) override; + virtual SLANG_NO_THROW Result SLANG_MCALL + setResource(ShaderOffset const& offset, IResourceView* resourceView) override; + virtual SLANG_NO_THROW Result SLANG_MCALL + setObject(ShaderOffset const& offset, IShaderObject* object) override; + virtual SLANG_NO_THROW Result SLANG_MCALL + setSampler(ShaderOffset const& offset, ISamplerState* sampler) override; + virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler( + ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override; +}; + +class MutableShaderObjectImpl : public MutableShaderObject< MutableShaderObjectImpl, ShaderObjectLayoutImpl> +{}; + +class EntryPointShaderObjectImpl : public ShaderObjectImpl +{ +public: + EntryPointShaderObjectImpl(); +}; + +class RootShaderObjectImpl : public ShaderObjectImpl +{ +public: + virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override; + virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override; +public: + List<RefPtr<EntryPointShaderObjectImpl>> entryPointObjects; + virtual SLANG_NO_THROW Result SLANG_MCALL + init(IDevice* device, ShaderObjectLayoutImpl* typeLayout) override; + virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override; + virtual SLANG_NO_THROW Result SLANG_MCALL + getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override; + virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-program.cpp b/tools/gfx/cuda/cuda-shader-program.cpp new file mode 100644 index 000000000..73e0c3c19 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-program.cpp @@ -0,0 +1,20 @@ +// cuda-shader-program.cpp +#include "cuda-shader-program.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +ShaderProgramImpl::~ShaderProgramImpl() +{ + if (cudaModule) + cuModuleUnload(cudaModule); +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-shader-program.h b/tools/gfx/cuda/cuda-shader-program.h new file mode 100644 index 000000000..b0961bfc3 --- /dev/null +++ b/tools/gfx/cuda/cuda-shader-program.h @@ -0,0 +1,29 @@ +// cuda-shader-program.h +#pragma once +#include "cuda-base.h" + +#include "cuda-context.h" +#include "cuda-shader-object-layout.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class ShaderProgramImpl : public ShaderProgramBase +{ +public: + CUmodule cudaModule = nullptr; + CUfunction cudaKernel; + String kernelName; + RefPtr<RootShaderObjectLayoutImpl> layout; + RefPtr<CUDAContext> cudaContext; + ~ShaderProgramImpl(); +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-texture.cpp b/tools/gfx/cuda/cuda-texture.cpp new file mode 100644 index 000000000..6e6e42db5 --- /dev/null +++ b/tools/gfx/cuda/cuda-texture.cpp @@ -0,0 +1,48 @@ +// cuda-texture.cpp +#include "cuda-texture.h" + +#include "cuda-helper-functions.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +TextureResourceImpl::~TextureResourceImpl() +{ + if (m_cudaSurfObj) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj)); + } + if (m_cudaTexObj) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); + } + if (m_cudaArray) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); + } + if (m_cudaMipMappedArray) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray)); + } +} + +uint64_t TextureResourceImpl::getBindlessHandle() +{ + return (uint64_t)m_cudaTexObj; +} + +Result TextureResourceImpl::getNativeResourceHandle(InteropHandle* outHandle) +{ + outHandle->handleValue = getBindlessHandle(); + outHandle->api = InteropHandleAPI::CUDA; + return SLANG_OK; +} + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/cuda-texture.h b/tools/gfx/cuda/cuda-texture.h new file mode 100644 index 000000000..c9cc8569a --- /dev/null +++ b/tools/gfx/cuda/cuda-texture.h @@ -0,0 +1,44 @@ +// cuda-texture.h +#pragma once +#include "cuda-base.h" + +#include "cuda-context.h" + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +namespace cuda +{ + +class TextureResourceImpl : public TextureResource +{ +public: + TextureResourceImpl(const TextureResource::Desc& desc) + : TextureResource(desc) + {} + ~TextureResourceImpl(); + + uint64_t getBindlessHandle(); + + // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by + // a long long + CUtexObject m_cudaTexObj = CUtexObject(); + + // The surfObj is for reading/writing 'texture like' things, but not for sampling. + CUsurfObject m_cudaSurfObj = CUsurfObject(); + + CUarray m_cudaArray = CUarray(); + CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray(); + + void* m_cudaExternalMemory = nullptr; + + RefPtr<CUDAContext> m_cudaContext; + + virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override; +}; + +} // namespace cuda +#endif +} // namespace gfx diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp deleted file mode 100644 index 114751a3d..000000000 --- a/tools/gfx/cuda/render-cuda.cpp +++ /dev/null @@ -1,2513 +0,0 @@ -#include "render-cuda.h" - -#ifdef GFX_ENABLE_CUDA -#include <cuda.h> -#include <cuda_runtime_api.h> -#include "core/slang-basic.h" -#include "core/slang-blob.h" -#include "core/slang-std-writers.h" - -#include "slang.h" -#include "slang-com-ptr.h" -#include "slang-com-helper.h" -#include "../command-writer.h" -#include "../renderer-shared.h" -#include "../mutable-shader-object.h" -#include "../simple-transient-resource-heap.h" -#include "../slang-context.h" -#include "../command-encoder-com-forward.h" - -# ifdef RENDER_TEST_OPTIX - -// The `optix_stubs.h` header produces warnings when compiled with MSVC -# ifdef _MSC_VER -# pragma warning(disable: 4996) -# endif - -# include <optix.h> -# include <optix_function_table_definition.h> -# include <optix_stubs.h> -# endif - -#endif - -namespace gfx -{ -#ifdef GFX_ENABLE_CUDA -using namespace Slang; - -SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } -SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } - -// A enum used to control if errors are reported on failure of CUDA call. -enum class CUDAReportStyle -{ - Normal, - Silent, -}; - -struct CUDAErrorInfo -{ - CUDAErrorInfo( - const char* filePath, - int lineNo, - const char* errorName = nullptr, - const char* errorString = nullptr) - : m_filePath(filePath) - , m_lineNo(lineNo) - , m_errorName(errorName) - , m_errorString(errorString) - {} - SlangResult handle() const - { - StringBuilder builder; - builder << "Error: " << m_filePath << " (" << m_lineNo << ") :"; - - if (m_errorName) - { - builder << m_errorName << " : "; - } - if (m_errorString) - { - builder << m_errorString; - } - - getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver, - builder.getUnownedSlice().begin()); - - // Slang::signalUnexpectedError(builder.getBuffer()); - return SLANG_FAIL; - } - - const char* m_filePath; - int m_lineNo; - const char* m_errorName; - const char* m_errorString; -}; - -// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream. - -static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line) -{ - CUDAErrorInfo info(file, line); - cuGetErrorString(cuResult, &info.m_errorString); - cuGetErrorName(cuResult, &info.m_errorName); - return info.handle(); -} - -static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line) -{ - return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle(); -} - -# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__) - -# define SLANG_CUDA_RETURN_ON_FAIL(x) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - return SLANG_CUDA_HANDLE_ERROR(_res); \ - } -# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - { \ - return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \ - : SLANG_FAIL; \ - } \ - } - -# define SLANG_CUDA_ASSERT_ON_FAIL(x) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - { \ - SLANG_ASSERT(!"Failed CUDA call"); \ - }; \ - } - -# ifdef RENDER_TEST_OPTIX - -static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; } - -# if 1 -static SlangResult _handleOptixError(OptixResult result, char const* file, int line) -{ - fprintf( - stderr, - "%s(%d): optix: %s (%s)\n", - file, - line, - optixGetErrorString(result), - optixGetErrorName(result)); - return SLANG_FAIL; -} -# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__) -# else -# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL -# endif - -# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \ - do \ - { \ - auto _res = EXPR; \ - if (_isError(_res)) \ - return SLANG_OPTIX_HANDLE_ERROR(_res); \ - } while (0) - -void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData) -{ - fprintf(stderr, "optix: %s (%s)\n", message, tag); -} - -# endif - -class CUDAContext : public RefObject -{ -public: - CUcontext m_context = nullptr; - ~CUDAContext() { cuCtxDestroy(m_context); } -}; - -class MemoryCUDAResource : public BufferResource -{ -public: - MemoryCUDAResource(const Desc& _desc) - : BufferResource(_desc) - {} - - ~MemoryCUDAResource() - { - if (m_cudaMemory) - { - SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); - } - } - - uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; } - - void* m_cudaExternalMemory = nullptr; - void* m_cudaMemory = nullptr; - - RefPtr<CUDAContext> m_cudaContext; - - virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override - { - return (DeviceAddress)m_cudaMemory; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override - { - outHandle->handleValue = getBindlessHandle(); - outHandle->api = InteropHandleAPI::CUDA; - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - map(MemoryRange* rangeToRead, void** outPointer) override - { - SLANG_UNUSED(rangeToRead); - SLANG_UNUSED(outPointer); - return SLANG_FAIL; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override - { - SLANG_UNUSED(writtenRange); - return SLANG_FAIL; - } -}; - -class TextureCUDAResource : public TextureResource -{ -public: - TextureCUDAResource(const TextureResource::Desc& desc) - : TextureResource(desc) - {} - ~TextureCUDAResource() - { - if (m_cudaSurfObj) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj)); - } - if (m_cudaTexObj) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); - } - if (m_cudaArray) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); - } - if (m_cudaMipMappedArray) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray)); - } - } - - uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; } - - // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by - // a long long - CUtexObject m_cudaTexObj = CUtexObject(); - - // The surfObj is for reading/writing 'texture like' things, but not for sampling. - CUsurfObject m_cudaSurfObj = CUsurfObject(); - - CUarray m_cudaArray = CUarray(); - CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray(); - - void* m_cudaExternalMemory = nullptr; - - RefPtr<CUDAContext> m_cudaContext; - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override - { - outHandle->handleValue = getBindlessHandle(); - outHandle->api = InteropHandleAPI::CUDA; - return SLANG_OK; - } -}; - -class CUDAResourceView : public ResourceViewBase -{ -public: - RefPtr<MemoryCUDAResource> memoryResource = nullptr; - RefPtr<TextureCUDAResource> textureResource = nullptr; - void* proxyBuffer = nullptr; -}; - -class CUDAShaderObjectLayout : public ShaderObjectLayoutBase -{ -public: - struct BindingRangeInfo - { - slang::BindingType bindingType; - Index count; - Index baseIndex; // Flat index for sub-ojects - Index subObjectIndex; - - // TODO: The `uniformOffset` field should be removed, - // since it cannot be supported by the Slang reflection - // API once we fix some design issues. - // - // It is only being used today for pre-allocation of sub-objects - // for constant buffers and parameter blocks (which should be - // deprecated/removed anyway). - // - // Note: We would need to bring this field back, plus - // a lot of other complexity, if we ever want to support - // setting of resources/buffers directly by a binding - // range index and array index. - // - Index uniformOffset; // Uniform offset for a resource typed field. - }; - - struct SubObjectRangeInfo - { - RefPtr<CUDAShaderObjectLayout> layout; - Index bindingRangeIndex; - }; - - List<SubObjectRangeInfo> subObjectRanges; - List<BindingRangeInfo> m_bindingRanges; - - Index m_subObjectCount = 0; - Index m_resourceCount = 0; - - CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout) - { - m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType); - - initBase(renderer, m_elementTypeLayout); - - // Compute the binding ranges that are used to store - // the logical contents of the object in memory. These will relate - // to the descriptor ranges in the various sets, but not always - // in a one-to-one fashion. - - SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount(); - for (SlangInt r = 0; r < bindingRangeCount; ++r) - { - slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r); - SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r); - slang::TypeLayoutReflection* slangLeafTypeLayout = - m_elementTypeLayout->getBindingRangeLeafTypeLayout(r); - - SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r); - SlangInt rangeIndexInDescriptorSet = - m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r); - - // TODO: This logic assumes that for any binding range that might consume - // multiple kinds of resources, the descriptor range for its uniform - // usage will be the first one in the range. - // - // We need to decide whether that assumption is one we intend to support - // applications making, or whether they should be forced to perform a - // linear search over the descriptor ranges for a specific binding range. - // - auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset( - descriptorSetIndex, rangeIndexInDescriptorSet); - - Index baseIndex = 0; - Index subObjectIndex = 0; - switch (slangBindingType) - { - case slang::BindingType::ConstantBuffer: - case slang::BindingType::ParameterBlock: - case slang::BindingType::ExistentialValue: - baseIndex = m_subObjectCount; - subObjectIndex = baseIndex; - m_subObjectCount += count; - break; - case slang::BindingType::RawBuffer: - case slang::BindingType::MutableRawBuffer: - if (slangLeafTypeLayout->getType()->getElementType() != nullptr) - { - // A structured buffer occupies both a resource slot and - // a sub-object slot. - subObjectIndex = m_subObjectCount; - m_subObjectCount += count; - } - baseIndex = m_resourceCount; - m_resourceCount += count; - break; - default: - baseIndex = m_resourceCount; - m_resourceCount += count; - break; - } - - BindingRangeInfo bindingRangeInfo; - bindingRangeInfo.bindingType = slangBindingType; - bindingRangeInfo.count = count; - bindingRangeInfo.baseIndex = baseIndex; - bindingRangeInfo.uniformOffset = uniformOffset; - bindingRangeInfo.subObjectIndex = subObjectIndex; - m_bindingRanges.add(bindingRangeInfo); - } - - SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount(); - for (SlangInt r = 0; r < subObjectRangeCount; ++r) - { - SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r); - auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex); - slang::TypeLayoutReflection* slangLeafTypeLayout = - m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex); - - // A sub-object range can either represent a sub-object of a known - // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>` - // (in which case we can pre-compute a layout to use, based on - // the type `Foo`) *or* it can represent a sub-object of some - // existential type (e.g., `IBar`) in which case we cannot - // know the appropraite type/layout of sub-object to allocate. - // - RefPtr<CUDAShaderObjectLayout> subObjectLayout; - if (slangBindingType != slang::BindingType::ExistentialValue) - { - subObjectLayout = - new CUDAShaderObjectLayout(renderer, slangLeafTypeLayout->getElementTypeLayout()); - } - - SubObjectRangeInfo subObjectRange; - subObjectRange.bindingRangeIndex = bindingRangeIndex; - subObjectRange.layout = subObjectLayout; - subObjectRanges.add(subObjectRange); - } - } - - Index getResourceCount() const { return m_resourceCount; } - Index getSubObjectCount() const { return m_subObjectCount; } - List<SubObjectRangeInfo>& getSubObjectRanges() { return subObjectRanges; } - BindingRangeInfo getBindingRange(Index index) { return m_bindingRanges[index]; } - Index getBindingRangeCount() const { return m_bindingRanges.getCount(); } -}; - -class CUDAProgramLayout : public CUDAShaderObjectLayout -{ -public: - slang::ProgramLayout* programLayout = nullptr; - List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts; - CUDAProgramLayout(RendererBase* renderer, slang::ProgramLayout* inProgramLayout) - : CUDAShaderObjectLayout(renderer, inProgramLayout->getGlobalParamsTypeLayout()) - , programLayout(inProgramLayout) - { - for (UInt i =0; i< programLayout->getEntryPointCount(); i++) - { - entryPointLayouts.add(new CUDAShaderObjectLayout( - renderer, - programLayout->getEntryPointByIndex(i)->getTypeLayout())); - } - - } - - int getKernelIndex(UnownedStringSlice kernelName) - { - for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++) - { - auto entryPoint = programLayout->getEntryPointByIndex(i); - if (kernelName == entryPoint->getName()) - { - return i; - } - } - return -1; - } - - void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes) - { - auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex); - entryPoint->getComputeThreadGroupSize(3, threadGroupSizes); - } -}; - -class CUDAShaderObjectData -{ -public: - bool isHostOnly = false; - Slang::RefPtr<MemoryCUDAResource> m_bufferResource; - Slang::RefPtr<CUDAResourceView> m_bufferView; - Slang::List<uint8_t> m_cpuBuffer; - Result setCount(Index count) - { - if (isHostOnly) - { - m_cpuBuffer.setCount(count); - if (!m_bufferView) - { - IResourceView::Desc viewDesc = {}; - viewDesc.type = IResourceView::Type::UnorderedAccess; - m_bufferView = new CUDAResourceView(); - m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer(); - m_bufferView->m_desc = viewDesc; - } - return SLANG_OK; - } - - if (!m_bufferResource) - { - IBufferResource::Desc desc; - desc.type = IResource::Type::Buffer; - desc.sizeInBytes = count; - m_bufferResource = new MemoryCUDAResource(desc); - if (count) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count)); - } - IResourceView::Desc viewDesc = {}; - viewDesc.type = IResourceView::Type::UnorderedAccess; - m_bufferView = new CUDAResourceView(); - m_bufferView->memoryResource = m_bufferResource; - m_bufferView->m_desc = viewDesc; - } - auto oldSize = m_bufferResource->getDesc()->sizeInBytes; - if ((size_t)count != oldSize) - { - void* newMemory = nullptr; - if (count) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count)); - } - if (oldSize) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - newMemory, - m_bufferResource->m_cudaMemory, - Math::Min((size_t)count, oldSize), - cudaMemcpyDefault)); - } - cudaFree(m_bufferResource->m_cudaMemory); - m_bufferResource->m_cudaMemory = newMemory; - m_bufferResource->getDesc()->sizeInBytes = count; - } - return SLANG_OK; - } - - Slang::Index getCount() - { - if (isHostOnly) - return m_cpuBuffer.getCount(); - if (m_bufferResource) - return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes); - else - return 0; - } - - void* getBuffer() - { - if (isHostOnly) - return m_cpuBuffer.getBuffer(); - - if (m_bufferResource) - return m_bufferResource->m_cudaMemory; - return nullptr; - } - - /// Returns a resource view for GPU access into the buffer content. - ResourceViewBase* getResourceView( - RendererBase* device, - slang::TypeLayoutReflection* elementLayout, - slang::BindingType bindingType) - { - SLANG_UNUSED(device); - m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize(); - return m_bufferView.Ptr(); - } -}; - -class CUDAShaderObject - : public ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData> -{ - typedef ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData> - Super; - -public: - List<RefPtr<CUDAResourceView>> resources; - - virtual SLANG_NO_THROW Result SLANG_MCALL - init(IDevice* device, CUDAShaderObjectLayout* typeLayout); - - virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return 0; } - virtual SLANG_NO_THROW Result SLANG_MCALL - getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override - { - *outEntryPoint = nullptr; - return SLANG_OK; - } - - virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override - { - return m_data.getBuffer(); - } - - virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override - { - return (Size)m_data.getCount(); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - setData(ShaderOffset const& offset, void const* data, Size size) override - { - size = Math::Min(size, (Size)m_data.getCount() - offset.uniformOffset); - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault)); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setResource(ShaderOffset const& offset, IResourceView* resourceView) override - { - if (!resourceView) - return SLANG_OK; - - auto layout = getLayout(); - - auto bindingRangeIndex = offset.bindingRangeIndex; - SLANG_ASSERT(bindingRangeIndex >= 0); - SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount()); - - auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex]; - - auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex; - auto cudaView = static_cast<CUDAResourceView*>(resourceView); - - resources[viewIndex] = cudaView; - - if (cudaView->textureResource) - { - if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess) - { - auto handle = cudaView->textureResource->m_cudaSurfObj; - setData(offset, &handle, sizeof(uint64_t)); - } - else - { - auto handle = cudaView->textureResource->getBindlessHandle(); - setData(offset, &handle, sizeof(uint64_t)); - } - } - else if (cudaView->memoryResource) - { - auto handle = cudaView->memoryResource->getBindlessHandle(); - setData(offset, &handle, sizeof(handle)); - auto sizeOffset = offset; - sizeOffset.uniformOffset += sizeof(handle); - auto& desc = *cudaView->memoryResource->getDesc(); - size_t size = desc.sizeInBytes; - if (desc.elementSize > 1) - size /= desc.elementSize; - setData(sizeOffset, &size, sizeof(size)); - } - else if (cudaView->proxyBuffer) - { - auto handle = cudaView->proxyBuffer; - setData(offset, &handle, sizeof(handle)); - auto sizeOffset = offset; - sizeOffset.uniformOffset += sizeof(handle); - auto& desc = *cudaView->memoryResource->getDesc(); - size_t size = desc.sizeInBytes; - if (desc.elementSize > 1) - size /= desc.elementSize; - setData(sizeOffset, &size, sizeof(size)); - } - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setObject(ShaderOffset const& offset, IShaderObject* object) override - { - SLANG_RETURN_ON_FAIL(Super::setObject(offset, object)); - - auto bindingRangeIndex = offset.bindingRangeIndex; - auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex]; - - CUDAShaderObject* subObject = static_cast<CUDAShaderObject*>(object); - switch (bindingRange.bindingType) - { - default: - { - void* subObjectDataBuffer = subObject->getBuffer(); - SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*))); - } - break; - case slang::BindingType::ExistentialValue: - case slang::BindingType::RawBuffer: - case slang::BindingType::MutableRawBuffer: - break; - } - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setSampler(ShaderOffset const& offset, ISamplerState* sampler) override - { - SLANG_UNUSED(sampler); - SLANG_UNUSED(offset); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler( - ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override - { - SLANG_UNUSED(sampler); - setResource(offset, textureView); - return SLANG_OK; - } -}; - -class CUDAMutableShaderObject : public MutableShaderObject< CUDAMutableShaderObject, CUDAShaderObjectLayout> -{}; - -class CUDAEntryPointShaderObject : public CUDAShaderObject -{ -public: - CUDAEntryPointShaderObject() { m_data.isHostOnly = true; } -}; - -class CUDARootShaderObject : public CUDAShaderObject -{ -public: - virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override { return 1; } - virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override { return 1; } -public: - List<RefPtr<CUDAEntryPointShaderObject>> entryPointObjects; - virtual SLANG_NO_THROW Result SLANG_MCALL - init(IDevice* device, CUDAShaderObjectLayout* typeLayout) override; - virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return (GfxCount)entryPointObjects.getCount(); } - virtual SLANG_NO_THROW Result SLANG_MCALL - getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override - { - returnComPtr(outEntryPoint, entryPointObjects[index]); - return SLANG_OK; - } - virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override - { - SLANG_RETURN_ON_FAIL(CUDAShaderObject::collectSpecializationArgs(args)); - for (auto& entryPoint : entryPointObjects) - { - SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args)); - } - return SLANG_OK; - } -}; - -class CUDAShaderProgram : public ShaderProgramBase -{ -public: - CUmodule cudaModule = nullptr; - CUfunction cudaKernel; - String kernelName; - RefPtr<CUDAProgramLayout> layout; - RefPtr<CUDAContext> cudaContext; - ~CUDAShaderProgram() - { - if (cudaModule) - cuModuleUnload(cudaModule); - } -}; - -class CUDAPipelineState : public PipelineStateBase -{ -public: - RefPtr<CUDAShaderProgram> shaderProgram; - void init(const ComputePipelineStateDesc& inDesc) - { - PipelineStateDesc pipelineDesc; - pipelineDesc.type = PipelineType::Compute; - pipelineDesc.compute = inDesc; - initializeBase(pipelineDesc); - } -}; - -class CUDAQueryPool : public QueryPoolBase -{ -public: - // The event object for each query. Owned by the pool. - List<CUevent> m_events; - - // The event that marks the starting point. - CUevent m_startEvent; - - Result init(const IQueryPool::Desc& desc) - { - cuEventCreate(&m_startEvent, 0); - cuEventRecord(m_startEvent, 0); - m_events.setCount(desc.count); - for (SlangInt i = 0; i < m_events.getCount(); i++) - { - cuEventCreate(&m_events[i], 0); - } - return SLANG_OK; - } - - ~CUDAQueryPool() - { - for (auto& e : m_events) - { - cuEventDestroy(e); - } - cuEventDestroy(m_startEvent); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getResult( - GfxIndex queryIndex, GfxCount count, uint64_t* data) override - { - for (GfxIndex i = 0; i < count; i++) - { - float time = 0.0f; - cuEventSynchronize(m_events[i + queryIndex]); - cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]); - data[i] = (uint64_t)((double)time * 1000.0f); - } - return SLANG_OK; - } -}; - -class CUDADevice : public RendererBase -{ -private: - static const CUDAReportStyle reportType = CUDAReportStyle::Normal; - static int _calcSMCountPerMultiProcessor(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine - // the # of cores per SM - struct SMInfo - { - int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version - int coreCount; - }; - - static const SMInfo infos[] = { - {0x30, 192}, - {0x32, 192}, - {0x35, 192}, - {0x37, 192}, - {0x50, 128}, - {0x52, 128}, - {0x53, 128}, - {0x60, 64}, - {0x61, 128}, - {0x62, 128}, - {0x70, 64}, - {0x72, 64}, - {0x75, 64}}; - - const int sm = ((major << 4) + minor); - for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i) - { - if (infos[i].sm == sm) - { - return infos[i].coreCount; - } - } - - const auto& last = infos[SLANG_COUNT_OF(infos) - 1]; - - // It must be newer presumably - SLANG_ASSERT(sm > last.sm); - - // Default to the last entry - return last.coreCount; - } - - static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex) - { - int smPerMultiproc = 0; - int maxPerfDevice = -1; - int deviceCount = 0; - int devicesProhibited = 0; - - uint64_t maxComputePerf = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); - - // Find the best CUDA capable GPU device - for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) - { - int computeMode = -1, major = 0, minor = 0; - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); - - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - if (computeMode != cudaComputeModeProhibited) - { - if (major == 9999 && minor == 9999) - { - smPerMultiproc = 1; - } - else - { - smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor); - } - - int multiProcessorCount = 0, clockRate = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute( - &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); - uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; - - if (compute_perf > maxComputePerf) - { - maxComputePerf = compute_perf; - maxPerfDevice = currentDevice; - } - } - else - { - devicesProhibited++; - } - } - - if (maxPerfDevice < 0) - { - return SLANG_FAIL; - } - - *outDeviceIndex = maxPerfDevice; - return SLANG_OK; - } - - static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal) - { - static CUresult res = cuInit(0); - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType); - return SLANG_OK; - } - -private: - int m_deviceIndex = -1; - CUdevice m_device = 0; - RefPtr<CUDAContext> m_context; - DeviceInfo m_info; - String m_adapterName; - -public: - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override - { - outHandles->handles[0].handleValue = (uint64_t)m_device; - outHandles->handles[0].api = InteropHandleAPI::CUDA; - return SLANG_OK; - } - - class CommandQueueImpl; - - class CommandBufferImpl - : public ICommandBuffer - , public CommandWriter - , public ComObject - { - public: - SLANG_COM_OBJECT_IUNKNOWN_ALL - ICommandBuffer* getInterface(const Guid& guid) - { - if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer) - return static_cast<ICommandBuffer*>(this); - return nullptr; - } - public: - CUDADevice* m_device; - TransientResourceHeapBase* m_transientHeap; - - void init(CUDADevice* device, TransientResourceHeapBase* transientHeap) - { - m_device = device; - m_transientHeap = transientHeap; - } - - virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands( - IRenderPassLayout* renderPass, - IFramebuffer* framebuffer, - IRenderCommandEncoder** outEncoder) override - { - SLANG_UNUSED(renderPass); - SLANG_UNUSED(framebuffer); - *outEncoder = nullptr; - } - - class ResourceCommandEncoderImpl : public IResourceCommandEncoder - { - public: - CommandWriter* m_writer; - - void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; } - - virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} - virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer( - IBufferResource* dst, - Offset dstOffset, - IBufferResource* src, - Offset srcOffset, - Size size) override - { - m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size); - } - - virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier( - GfxCount count, - ITextureResource* const* textures, - ResourceState src, - ResourceState dst) override - {} - - virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier( - GfxCount count, - IBufferResource* const* buffers, - ResourceState src, - ResourceState dst) override - {} - - virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData( - IBufferResource* dst, Offset offset, Size size, void* data) override - { - m_writer->uploadBufferData(dst, offset, size, data); - } - - virtual SLANG_NO_THROW void SLANG_MCALL - writeTimestamp(IQueryPool* pool, GfxIndex index) override - { - m_writer->writeTimestamp(pool, index); - } - - virtual SLANG_NO_THROW void SLANG_MCALL copyTexture( - ITextureResource* dst, - ResourceState dstState, - SubresourceRange dstSubresource, - ITextureResource::Offset3D dstOffset, - ITextureResource* src, - ResourceState srcState, - SubresourceRange srcSubresource, - ITextureResource::Offset3D srcOffset, - ITextureResource::Extents extent) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(dstState); - SLANG_UNUSED(dstSubresource); - SLANG_UNUSED(dstOffset); - SLANG_UNUSED(src); - SLANG_UNUSED(srcState); - SLANG_UNUSED(srcSubresource); - SLANG_UNUSED(srcOffset); - SLANG_UNUSED(extent); - SLANG_UNIMPLEMENTED_X("copyTexture"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData( - ITextureResource* dst, - SubresourceRange subResourceRange, - ITextureResource::Offset3D offset, - ITextureResource::Extents extent, - ITextureResource::SubresourceData* subResourceData, - GfxCount subResourceDataCount) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(subResourceRange); - SLANG_UNUSED(offset); - SLANG_UNUSED(extent); - SLANG_UNUSED(subResourceData); - SLANG_UNUSED(subResourceDataCount); - SLANG_UNIMPLEMENTED_X("uploadTextureData"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView( - IResourceView* view, - ClearValue* clearValue, - ClearResourceViewFlags::Enum flags) override - { - SLANG_UNUSED(view); - SLANG_UNUSED(clearValue); - SLANG_UNUSED(flags); - SLANG_UNIMPLEMENTED_X("clearResourceView"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL resolveResource( - ITextureResource* source, - ResourceState sourceState, - SubresourceRange sourceRange, - ITextureResource* dest, - ResourceState destState, - SubresourceRange destRange) override - { - SLANG_UNUSED(source); - SLANG_UNUSED(sourceState); - SLANG_UNUSED(sourceRange); - SLANG_UNUSED(dest); - SLANG_UNUSED(destState); - SLANG_UNUSED(destRange); - SLANG_UNIMPLEMENTED_X("resolveResource"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery( - IQueryPool* queryPool, - GfxIndex index, - GfxCount count, - IBufferResource* buffer, - Offset offset) override - { - SLANG_UNUSED(queryPool); - SLANG_UNUSED(index); - SLANG_UNUSED(count); - SLANG_UNUSED(buffer); - SLANG_UNUSED(offset); - SLANG_UNIMPLEMENTED_X("resolveQuery"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer( - IBufferResource* dst, - Offset dstOffset, - Size dstSize, - Size dstRowStride, - ITextureResource* src, - ResourceState srcState, - SubresourceRange srcSubresource, - ITextureResource::Offset3D srcOffset, - ITextureResource::Extents extent) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(dstOffset); - SLANG_UNUSED(dstSize); - SLANG_UNUSED(dstRowStride); - SLANG_UNUSED(src); - SLANG_UNUSED(srcState); - SLANG_UNUSED(srcSubresource); - SLANG_UNUSED(srcOffset); - SLANG_UNUSED(extent); - SLANG_UNIMPLEMENTED_X("copyTextureToBuffer"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier( - ITextureResource* texture, - SubresourceRange subresourceRange, - ResourceState src, - ResourceState dst) override - { - SLANG_UNUSED(texture); - SLANG_UNUSED(subresourceRange); - SLANG_UNUSED(src); - SLANG_UNUSED(dst); - SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier"); - } - virtual SLANG_NO_THROW void SLANG_MCALL - beginDebugEvent(const char* name, float rgbColor[3]) override - { - SLANG_UNUSED(name); - SLANG_UNUSED(rgbColor); - } - virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {} - }; - - ResourceCommandEncoderImpl m_resourceCommandEncoder; - - virtual SLANG_NO_THROW void SLANG_MCALL - encodeResourceCommands(IResourceCommandEncoder** outEncoder) override - { - m_resourceCommandEncoder.init(this); - *outEncoder = &m_resourceCommandEncoder; - } - - class ComputeCommandEncoderImpl - : public IComputeCommandEncoder - , public ResourceCommandEncoderImpl - { - public: - SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl) - public: - CommandWriter* m_writer; - CommandBufferImpl* m_commandBuffer; - RefPtr<ShaderObjectBase> m_rootObject; - virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} - void init(CommandBufferImpl* cmdBuffer) - { - m_writer = cmdBuffer; - m_commandBuffer = cmdBuffer; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override - { - m_writer->setPipelineState(state); - PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); - SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( - pipelineImpl->m_program, m_rootObject.writeRef())); - returnComPtr(outRootObject, m_rootObject); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override - { - m_writer->setPipelineState(state); - PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); - SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( - pipelineImpl->m_program, m_rootObject.writeRef())); - m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap); - return SLANG_OK; - } - - virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override - { - m_writer->bindRootShaderObject(m_rootObject); - m_writer->dispatchCompute(x, y, z); - } - - virtual SLANG_NO_THROW void SLANG_MCALL - dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override - { - SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect"); - } - }; - - ComputeCommandEncoderImpl m_computeCommandEncoder; - virtual SLANG_NO_THROW void SLANG_MCALL - encodeComputeCommands(IComputeCommandEncoder** outEncoder) override - { - m_computeCommandEncoder.init(this); - *outEncoder = &m_computeCommandEncoder; - } - - virtual SLANG_NO_THROW void SLANG_MCALL - encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override - { - *outEncoder = nullptr; - } - - virtual SLANG_NO_THROW void SLANG_MCALL close() override {} - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override - { - return SLANG_FAIL; - } - }; - - class CommandQueueImpl - : public ICommandQueue - , public ComObject - { - public: - SLANG_COM_OBJECT_IUNKNOWN_ALL - ICommandQueue* getInterface(const Guid& guid) - { - if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) - return static_cast<ICommandQueue*>(this); - return nullptr; - } - - public: - RefPtr<CUDAPipelineState> currentPipeline; - RefPtr<CUDARootShaderObject> currentRootObject; - RefPtr<CUDADevice> renderer; - CUstream stream; - Desc m_desc; - public: - void init(CUDADevice* inRenderer) - { - renderer = inRenderer; - m_desc.type = ICommandQueue::QueueType::Graphics; - cuStreamCreate(&stream, 0); - } - ~CommandQueueImpl() - { - cuStreamSynchronize(stream); - cuStreamDestroy(stream); - currentPipeline = nullptr; - currentRootObject = nullptr; - } - - public: - virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override - { - return m_desc; - } - - virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers( - GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override - { - SLANG_UNUSED(valueToSignal); - // TODO: implement fence. - assert(fence == nullptr); - for (GfxIndex i = 0; i < count; i++) - { - execute(static_cast<CommandBufferImpl*>(commandBuffers[i])); - } - } - - virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override - { - auto resultCode = cuStreamSynchronize(stream); - if (resultCode != cudaSuccess) - SLANG_CUDA_HANDLE_ERROR(resultCode); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice( - GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override - { - return SLANG_FAIL; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override - { - return SLANG_FAIL; - } - - public: - void setPipelineState(IPipelineState* state) - { - currentPipeline = dynamic_cast<CUDAPipelineState*>(state); - } - - Result bindRootShaderObject(IShaderObject* object) - { - currentRootObject = dynamic_cast<CUDARootShaderObject*>(object); - if (currentRootObject) - return SLANG_OK; - return SLANG_E_INVALID_ARG; - } - - void dispatchCompute(int x, int y, int z) - { - // Specialize the compute kernel based on the shader object bindings. - RefPtr<PipelineStateBase> newPipeline; - renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); - currentPipeline = static_cast<CUDAPipelineState*>(newPipeline.Ptr()); - - // Find out thread group size from program reflection. - auto& kernelName = currentPipeline->shaderProgram->kernelName; - auto programLayout = static_cast<CUDAProgramLayout*>(currentRootObject->getLayout()); - int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); - SLANG_ASSERT(kernelId != -1); - UInt threadGroupSize[3]; - programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); - - int sharedSizeInBytes; - cuFuncGetAttribute( - &sharedSizeInBytes, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - currentPipeline->shaderProgram->cudaKernel); - - // Copy global parameter data to the `SLANG_globalParams` symbol. - { - CUdeviceptr globalParamsSymbol = 0; - size_t globalParamsSymbolSize = 0; - cuModuleGetGlobal( - &globalParamsSymbol, - &globalParamsSymbolSize, - currentPipeline->shaderProgram->cudaModule, - "SLANG_globalParams"); - - CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); - cudaMemcpyAsync( - (void*)globalParamsSymbol, - (void*)globalParamsCUDAData, - globalParamsSymbolSize, - cudaMemcpyDefault, - 0); - } - // - // The argument data for the entry-point parameters are already - // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. - // - auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); - auto entryPointDataSize = - currentRootObject->entryPointObjects[kernelId]->getBufferSize(); - - void* extraOptions[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, - entryPointBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, - &entryPointDataSize, - CU_LAUNCH_PARAM_END, - }; - - // Once we have all the decessary data extracted and/or - // set up, we can launch the kernel and see what happens. - // - auto cudaLaunchResult = cuLaunchKernel( - currentPipeline->shaderProgram->cudaKernel, - x, - y, - z, - int(threadGroupSize[0]), - int(threadGroupSize[1]), - int(threadGroupSize[2]), - sharedSizeInBytes, - stream, - nullptr, - extraOptions); - - SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); - } - - void copyBuffer( - IBufferResource* dst, - size_t dstOffset, - IBufferResource* src, - size_t srcOffset, - size_t size) - { - auto dstImpl = static_cast<MemoryCUDAResource*>(dst); - auto srcImpl = static_cast<MemoryCUDAResource*>(src); - cudaMemcpy( - (uint8_t*)dstImpl->m_cudaMemory + dstOffset, - (uint8_t*)srcImpl->m_cudaMemory + srcOffset, - size, - cudaMemcpyDefault); - } - - void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) - { - auto dstImpl = static_cast<MemoryCUDAResource*>(dst); - cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); - } - - void writeTimestamp(IQueryPool* pool, SlangInt index) - { - auto poolImpl = static_cast<CUDAQueryPool*>(pool); - cuEventRecord(poolImpl->m_events[index], stream); - } - - void execute(CommandBufferImpl* commandBuffer) - { - for (auto& cmd : commandBuffer->m_commands) - { - switch (cmd.name) - { - case CommandName::SetPipelineState: - setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0])); - break; - case CommandName::BindRootShaderObject: - bindRootShaderObject( - commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0])); - break; - case CommandName::DispatchCompute: - dispatchCompute( - int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); - break; - case CommandName::CopyBuffer: - copyBuffer( - commandBuffer->getObject<BufferResource>(cmd.operands[0]), - cmd.operands[1], - commandBuffer->getObject<BufferResource>(cmd.operands[2]), - cmd.operands[3], - cmd.operands[4]); - break; - case CommandName::UploadBufferData: - uploadBufferData( - commandBuffer->getObject<BufferResource>(cmd.operands[0]), - cmd.operands[1], - cmd.operands[2], - commandBuffer->getData<uint8_t>(cmd.operands[3])); - break; - case CommandName::WriteTimestamp: - writeTimestamp( - commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]), - (SlangInt)cmd.operands[1]); - } - } - } - }; - - using TransientResourceHeapImpl = SimpleTransientResourceHeap<CUDADevice, CommandBufferImpl>; - -public: - virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override - { - SLANG_RETURN_ON_FAIL(slangContext.initialize( - desc.slang, - SLANG_PTX, - "sm_5_1", - makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView())); - - SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc)); - - SLANG_RETURN_ON_FAIL(_initCuda(reportType)); - - SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex)); - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType); - - m_context = new CUDAContext(); - - int count = -1; - cuDeviceGetCount(&count); - SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex)); - - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL( - cuCtxCreate(&m_context->m_context, 0, m_device), reportType); - - // Not clear how to detect half support on CUDA. For now we'll assume we have it - { - m_features.add("half"); - } - - // Initialize DeviceInfo - { - m_info.deviceType = DeviceType::CUDA; - m_info.bindingStyle = BindingStyle::CUDA; - m_info.projectionStyle = ProjectionStyle::DirectX; - m_info.apiName = "CUDA"; - static const float kIdentity[] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; - ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity)); - cudaDeviceProp deviceProperties; - cudaGetDeviceProperties(&deviceProperties, m_deviceIndex); - m_adapterName = deviceProperties.name; - m_info.adapterName = m_adapterName.begin(); - m_info.timestampFrequency = 1000000; - } - - return SLANG_OK; - } - - Result getCUDAFormat(Format format, CUarray_format* outFormat) - { - // TODO: Expand to cover all available formats that can be supported in CUDA - switch (format) - { - case Format::R32G32B32A32_FLOAT: - case Format::R32G32B32_FLOAT: - case Format::R32G32_FLOAT: - case Format::R32_FLOAT: - case Format::D32_FLOAT: - *outFormat = CU_AD_FORMAT_FLOAT; - return SLANG_OK; - case Format::R16G16B16A16_FLOAT: - case Format::R16G16_FLOAT: - case Format::R16_FLOAT: - *outFormat = CU_AD_FORMAT_HALF; - return SLANG_OK; - case Format::R32G32B32A32_UINT: - case Format::R32G32B32_UINT: - case Format::R32G32_UINT: - case Format::R32_UINT: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT32; - return SLANG_OK; - case Format::R16G16B16A16_UINT: - case Format::R16G16_UINT: - case Format::R16_UINT: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT16; - return SLANG_OK; - case Format::R8G8B8A8_UINT: - case Format::R8G8_UINT: - case Format::R8_UINT: - case Format::R8G8B8A8_UNORM: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT8; - return SLANG_OK; - case Format::R32G32B32A32_SINT: - case Format::R32G32B32_SINT: - case Format::R32G32_SINT: - case Format::R32_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT32; - return SLANG_OK; - case Format::R16G16B16A16_SINT: - case Format::R16G16_SINT: - case Format::R16_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT16; - return SLANG_OK; - case Format::R8G8B8A8_SINT: - case Format::R8G8_SINT: - case Format::R8_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT8; - return SLANG_OK; - default: - SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); - return SLANG_FAIL; - } - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource( - const ITextureResource::Desc& desc, - const ITextureResource::SubresourceData* initData, - ITextureResource** outResource) override - { - TextureResource::Desc srcDesc = fixupTextureDesc(desc); - - RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(srcDesc); - tex->m_cudaContext = m_context; - - CUresourcetype resourceType; - - // The size of the element/texel in bytes - size_t elementSize = 0; - - // Our `ITextureResource::Desc` uses an enumeration to specify - // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's - // `cuMipmappedArrayCreate` seemingly relies on a policy where - // the extents of the array in dimenions above the rank are - // specified as zero (e.g., a 1D texture requires `height==0`). - // - // We will start by massaging the extents as specified by the - // user into a form that CUDA wants/expects, based on the - // texture shape as specified in the `desc`. - // - int width = desc.size.width; - int height = desc.size.height; - int depth = desc.size.depth; - switch (desc.type) - { - case IResource::Type::Texture1D: - height = 0; - depth = 0; - break; - - case IResource::Type::Texture2D: - depth = 0; - break; - - case IResource::Type::Texture3D: - break; - - case IResource::Type::TextureCube: - depth = 1; - break; - } - - { - CUarray_format format = CU_AD_FORMAT_FLOAT; - int numChannels = 0; - - SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format)); - FormatInfo info; - gfxGetFormatInfo(desc.format, &info); - numChannels = info.channelCount; - - switch (format) - { - case CU_AD_FORMAT_FLOAT: - { - elementSize = sizeof(float) * numChannels; - break; - } - case CU_AD_FORMAT_HALF: - { - elementSize = sizeof(uint16_t) * numChannels; - break; - } - case CU_AD_FORMAT_UNSIGNED_INT8: - { - elementSize = sizeof(uint8_t) * numChannels; - break; - } - default: - { - SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); - return SLANG_FAIL; - } - } - - if (desc.numMipLevels > 1) - { - resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; - - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Width = width; - arrayDesc.Height = height; - arrayDesc.Depth = depth; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - arrayDesc.Flags = 0; - - if (desc.arraySize > 1) - { - if (desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED; - arrayDesc.Depth = desc.arraySize; - } - else - { - SLANG_ASSERT(!"Arrays only supported for 1D and 2D"); - return SLANG_FAIL; - } - } - - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - arrayDesc.Depth *= 6; - } - - SLANG_CUDA_RETURN_ON_FAIL( - cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels)); - } - else - { - resourceType = CU_RESOURCE_TYPE_ARRAY; - - if (desc.arraySize > 1) - { - if (desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube) - { - SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported"); - return SLANG_FAIL; - } - - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - // Set the depth as the array length - arrayDesc.Depth = desc.arraySize; - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Depth *= 6; - } - - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); - } - else if (desc.type == IResource::Type::Texture3D || - desc.type == IResource::Type::TextureCube) - { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Depth = depth; - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - arrayDesc.Flags = 0; - - // Handle cube texture - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Depth = 6; - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); - } - else - { - CUDA_ARRAY_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - // Allocate the array, will work for 1D or 2D case - SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); - } - } - } - - // Work space for holding data for uploading if it needs to be rearranged - if (initData) - { - List<uint8_t> workspace; - for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel) - { - int mipWidth = width >> mipLevel; - int mipHeight = height >> mipLevel; - int mipDepth = depth >> mipLevel; - - mipWidth = (mipWidth == 0) ? 1 : mipWidth; - mipHeight = (mipHeight == 0) ? 1 : mipHeight; - mipDepth = (mipDepth == 0) ? 1 : mipDepth; - - // If it's a cubemap then the depth is always 6 - if (desc.type == IResource::Type::TextureCube) - { - mipDepth = 6; - } - - auto dstArray = tex->m_cudaArray; - if (tex->m_cudaMipMappedArray) - { - // Get the array for the mip level - SLANG_CUDA_RETURN_ON_FAIL( - cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel)); - } - SLANG_ASSERT(dstArray); - - // Check using the desc to see if it's plausible - { - CUDA_ARRAY_DESCRIPTOR arrayDesc; - SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray)); - - SLANG_ASSERT(mipWidth == arrayDesc.Width); - SLANG_ASSERT( - mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0)); - } - - const void* srcDataPtr = nullptr; - - if (desc.arraySize > 1) - { - SLANG_ASSERT( - desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube); - - // TODO(JS): Here I assume that arrays are just held contiguously within a - // 'face' This seems reasonable and works with the Copy3D. - const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; - - Index faceCount = desc.arraySize; - if (desc.type == IResource::Type::TextureCube) - { - faceCount *= 6; - } - - const size_t mipSizeInBytes = faceSizeInBytes * faceCount; - workspace.setCount(mipSizeInBytes); - - // We need to add the face data from each mip - // We iterate over face count so we copy all of the cubemap faces - for (Index j = 0; j < faceCount; j++) - { - const auto srcData = initData[mipLevel + j * desc.numMipLevels].data; - // Copy over to the workspace to make contiguous - ::memcpy( - workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes); - } - - srcDataPtr = workspace.getBuffer(); - } - else - { - if (desc.type == IResource::Type::TextureCube) - { - size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; - - workspace.setCount(faceSizeInBytes * 6); - // Copy the data over to make contiguous - for (Index j = 0; j < 6; j++) - { - const auto srcData = - initData[mipLevel + j * desc.numMipLevels].data; - ::memcpy( - workspace.getBuffer() + faceSizeInBytes * j, - srcData, - faceSizeInBytes); - } - srcDataPtr = workspace.getBuffer(); - } - else - { - const auto srcData = initData[mipLevel].data; - srcDataPtr = srcData; - } - } - - if (desc.arraySize > 1) - { - SLANG_ASSERT( - desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube); - - CUDA_MEMCPY3D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - // Set the depth to the array length - copyParam.Depth = desc.arraySize; - - if (desc.type == IResource::Type::TextureCube) - { - copyParam.Depth *= 6; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); - } - else - { - switch (desc.type) - { - case IResource::Type::Texture1D: - case IResource::Type::Texture2D: - { - CUDA_MEMCPY2D copyParam; - memset(©Param, 0, sizeof(copyParam)); - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); - break; - } - case IResource::Type::Texture3D: - case IResource::Type::TextureCube: - { - CUDA_MEMCPY3D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - copyParam.Depth = mipDepth; - - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); - break; - } - - default: - { - SLANG_ASSERT(!"Not implemented"); - break; - } - } - } - } - } - // Set up texture sampling parameters, and create final texture obj - - { - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); - resDesc.resType = resourceType; - - if (tex->m_cudaArray) - { - resDesc.res.array.hArray = tex->m_cudaArray; - } - if (tex->m_cudaMipMappedArray) - { - resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray; - } - - // If the texture might be used as a UAV, then we need to allocate - // a CUDA "surface" for it. - // - // Note: We cannot do this unconditionally, because it will fail - // on surfaces that are not usable as UAVs (e.g., those with - // mipmaps). - // - // TODO: We should really only be allocating the array at the - // time we create a resource, and then allocate the surface or - // texture objects as part of view creation. - // - if (desc.allowedStates.contains(ResourceState::UnorderedAccess)) - { - // On CUDA surfaces only support a single MIP map - SLANG_ASSERT(desc.numMipLevels == 1); - - SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc)); - } - - - // Create handle for sampling. - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); - texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - SLANG_CUDA_RETURN_ON_FAIL( - cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); - } - - returnComPtr(outResource, tex); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource( - const IBufferResource::Desc& descIn, - const void* initData, - IBufferResource** outResource) override - { - auto desc = fixupBufferDesc(descIn); - RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc); - resource->m_cudaContext = m_context; - SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes)); - if (initData) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault)); - } - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle( - InteropHandle handle, - const IBufferResource::Desc& desc, - IBufferResource** outResource) override - { - if (handle.handleValue == 0) - { - *outResource = nullptr; - return SLANG_OK; - } - - RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc); - resource->m_cudaContext = m_context; - - // CUDA manages sharing of buffers through the idea of an - // "external memory" object, which represents the relationship - // with another API's objects. In order to create this external - // memory association, we first need to fill in a descriptor struct. - cudaExternalMemoryHandleDesc externalMemoryHandleDesc; - memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - switch (handle.api) - { - case InteropHandleAPI::D3D12: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; - break; - case InteropHandleAPI::Vulkan: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; - break; - default: - return SLANG_FAIL; - } - externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; - externalMemoryHandleDesc.size = desc.sizeInBytes; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; - - // Once we have filled in the descriptor, we can request - // that CUDA create the required association between the - // external buffer and its own memory. - cudaExternalMemory_t externalMemory; - SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); - resource->m_cudaExternalMemory = externalMemory; - - // The CUDA "external memory" handle is not itself a device - // pointer, so we need to query for a suitable device address - // for the buffer with another call. - // - // Just as for the external memory, we fill in a descriptor - // structure (although in this case we only need to specify - // the size). - cudaExternalMemoryBufferDesc bufferDesc; - memset(&bufferDesc, 0, sizeof(bufferDesc)); - bufferDesc.size = desc.sizeInBytes; - - // Finally, we can "map" the buffer to get a device address. - void* deviceAddress; - SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc)); - resource->m_cudaMemory = deviceAddress; - - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle( - InteropHandle handle, - const ITextureResource::Desc& desc, - const size_t size, - ITextureResource** outResource) override - { - if (handle.handleValue == 0) - { - *outResource = nullptr; - return SLANG_OK; - } - - RefPtr<TextureCUDAResource> resource = new TextureCUDAResource(desc); - resource->m_cudaContext = m_context; - - // CUDA manages sharing of buffers through the idea of an - // "external memory" object, which represents the relationship - // with another API's objects. In order to create this external - // memory association, we first need to fill in a descriptor struct. - CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc; - memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - switch (handle.api) - { - case InteropHandleAPI::D3D12: - externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE; - break; - case InteropHandleAPI::Vulkan: - externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; - break; - default: - return SLANG_FAIL; - } - externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; - externalMemoryHandleDesc.size = size; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; - - CUexternalMemory externalMemory; - SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); - resource->m_cudaExternalMemory = externalMemory; - - FormatInfo formatInfo; - SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo)); - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - arrayDesc.Depth = desc.size.depth; - arrayDesc.Height = desc.size.height; - arrayDesc.Width = desc.size.width; - arrayDesc.NumChannels = formatInfo.channelCount; - getCUDAFormat(desc.format, &arrayDesc.Format); - arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER - - CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc; - memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc)); - externalMemoryMipDesc.offset = 0; - externalMemoryMipDesc.arrayDesc = arrayDesc; - externalMemoryMipDesc.numLevels = desc.numMipLevels; - - CUmipmappedArray mipArray; - SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc)); - resource->m_cudaMipMappedArray = mipArray; - - CUarray cuArray; - SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0)); - resource->m_cudaArray = cuArray; - - CUDA_RESOURCE_DESC surfDesc; - memset(&surfDesc, 0, sizeof(surfDesc)); - surfDesc.resType = CU_RESOURCE_TYPE_ARRAY; - surfDesc.res.array.hArray = cuArray; - - CUsurfObject surface; - SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc)); - resource->m_cudaSurfObj = surface; - - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView( - ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override - { - RefPtr<CUDAResourceView> view = new CUDAResourceView(); - view->m_desc = desc; - view->textureResource = dynamic_cast<TextureCUDAResource*>(texture); - returnComPtr(outView, view); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView( - IBufferResource* buffer, - IBufferResource* counterBuffer, - IResourceView::Desc const& desc, - IResourceView** outView) override - { - RefPtr<CUDAResourceView> view = new CUDAResourceView(); - view->m_desc = desc; - view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer); - returnComPtr(outView, view); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool( - const IQueryPool::Desc& desc, - IQueryPool** outPool) override - { - RefPtr<CUDAQueryPool> pool = new CUDAQueryPool(); - SLANG_RETURN_ON_FAIL(pool->init(desc)); - returnComPtr(outPool, pool); - return SLANG_OK; - } - - virtual Result createShaderObjectLayout( - slang::TypeLayoutReflection* typeLayout, - ShaderObjectLayoutBase** outLayout) override - { - RefPtr<CUDAShaderObjectLayout> cudaLayout; - cudaLayout = new CUDAShaderObjectLayout(this, typeLayout); - returnRefPtrMove(outLayout, cudaLayout); - return SLANG_OK; - } - - virtual Result createShaderObject( - ShaderObjectLayoutBase* layout, - IShaderObject** outObject) override - { - RefPtr<CUDAShaderObject> result = new CUDAShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout))); - returnComPtr(outObject, result); - return SLANG_OK; - } - - virtual Result createMutableShaderObject( - ShaderObjectLayoutBase* layout, - IShaderObject** outObject) override - { - RefPtr<CUDAMutableShaderObject> result = new CUDAMutableShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout))); - returnComPtr(outObject, result); - return SLANG_OK; - } - - Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject) - { - auto cudaProgram = dynamic_cast<CUDAShaderProgram*>(program); - auto cudaLayout = cudaProgram->layout; - - RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout)); - returnRefPtrMove(outObject, result); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createProgram( - const IShaderProgram::Desc& desc, - IShaderProgram** outProgram, - ISlangBlob** outDiagnosticBlob) override - { - // If this is a specializable program, we just keep a reference to the slang program and - // don't actually create any kernels. This program will be specialized later when we know - // the shader object bindings. - RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram(); - cudaProgram->init(desc); - cudaProgram->cudaContext = m_context; - if (desc.slangGlobalScope->getSpecializationParamCount() != 0) - { - cudaProgram->layout = new CUDAProgramLayout(this, desc.slangGlobalScope->getLayout()); - returnComPtr(outProgram, cudaProgram); - return SLANG_OK; - } - - ComPtr<ISlangBlob> kernelCode; - ComPtr<ISlangBlob> diagnostics; - auto compileResult = desc.slangGlobalScope->getEntryPointCode( - (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef()); - if (diagnostics) - { - getDebugCallback()->handleMessage( - compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error, - DebugMessageSource::Slang, - (char*)diagnostics->getBufferPointer()); - if (outDiagnosticBlob) - returnComPtr(outDiagnosticBlob, diagnostics); - } - SLANG_RETURN_ON_FAIL(compileResult); - - SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer())); - cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName(); - SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction( - &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer())); - - auto slangGlobalScope = desc.slangGlobalScope; - if( slangGlobalScope ) - { - cudaProgram->slangGlobalScope = slangGlobalScope; - - auto slangProgramLayout = slangGlobalScope->getLayout(); - if(!slangProgramLayout) - return SLANG_FAIL; - - RefPtr<CUDAProgramLayout> cudaLayout; - cudaLayout = new CUDAProgramLayout(this, slangProgramLayout); - cudaLayout->programLayout = slangProgramLayout; - cudaProgram->layout = cudaLayout; - } - - returnComPtr(outProgram, cudaProgram); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState( - const ComputePipelineStateDesc& desc, IPipelineState** outState) override - { - RefPtr<CUDAPipelineState> state = new CUDAPipelineState(); - state->shaderProgram = static_cast<CUDAShaderProgram*>(desc.program); - state->init(desc); - returnComPtr(outState, state); - return Result(); - } - - void* map(IBufferResource* buffer) - { - return static_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory; - } - - void unmap(IBufferResource* buffer) - { - SLANG_UNUSED(buffer); - } - - virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override - { - return m_info; - } - -public: - virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap( - const ITransientResourceHeap::Desc& desc, - ITransientResourceHeap** outHeap) override - { - RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl(); - SLANG_RETURN_ON_FAIL(result->init(this, desc)); - returnComPtr(outHeap, result); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override - { - RefPtr<CommandQueueImpl> queue = new CommandQueueImpl(); - queue->init(this); - returnComPtr(outQueue, queue); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain( - const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(window); - SLANG_UNUSED(outSwapchain); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout( - const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outLayout); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outFramebuffer); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout( - const IRenderPassLayout::Desc& desc, - IRenderPassLayout** outRenderPassLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outRenderPassLayout); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override - { - SLANG_UNUSED(desc); - *outSampler = nullptr; - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout( - IInputLayout::Desc const& desc, - IInputLayout** outLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outLayout); - return SLANG_E_NOT_AVAILABLE; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState( - const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outState); - return SLANG_E_NOT_AVAILABLE; - } - - virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource( - ITextureResource* texture, - ResourceState state, - ISlangBlob** outBlob, - size_t* outRowPitch, - size_t* outPixelSize) override - { - auto textureImpl = static_cast<TextureCUDAResource*>(texture); - RefPtr<ListBlob> blob = new ListBlob(); - - auto desc = textureImpl->getDesc(); - auto width = desc->size.width; - auto height = desc->size.height; - FormatInfo sizeInfo; - SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo)); - size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock; - size_t rowPitch = width * pixelSize; - size_t size = height * rowPitch; - blob->m_data.setCount((Index)size); - - CUDA_MEMCPY2D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.srcArray = textureImpl->m_cudaArray; - - copyParam.dstMemoryType = CU_MEMORYTYPE_HOST; - copyParam.dstHost = blob->m_data.getBuffer(); - copyParam.dstPitch = rowPitch; - copyParam.WidthInBytes = copyParam.dstPitch; - copyParam.Height = height; - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); - - *outRowPitch = rowPitch; - *outPixelSize = pixelSize; - returnComPtr(outBlob, blob); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource( - IBufferResource* buffer, - size_t offset, - size_t size, - ISlangBlob** outBlob) override - { - auto bufferImpl = static_cast<MemoryCUDAResource*>(buffer); - RefPtr<ListBlob> blob = new ListBlob(); - blob->m_data.setCount((Index)size); - cudaMemcpy( - blob->m_data.getBuffer(), - (uint8_t*)bufferImpl->m_cudaMemory + offset, - size, - cudaMemcpyDefault); - returnComPtr(outBlob, blob); - return SLANG_OK; - } -}; - -SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout) -{ - m_layout = typeLayout; - - // If the layout tells us that there is any uniform data, - // then we need to allocate a constant buffer to hold that data. - // - // TODO: Do we need to allocate a shadow copy for use from - // the CPU? - // - // TODO: When/where do we bind this constant buffer into - // a descriptor set for later use? - // - auto slangLayout = getLayout()->getElementTypeLayout(); - size_t uniformSize = slangLayout->getSize(); - if (uniformSize) - { - m_data.setCount((Index)uniformSize); - } - - // If the layout specifies that we have any resources or sub-objects, - // then we need to size the appropriate arrays to account for them. - // - // Note: the counts here are the *total* number of resources/sub-objects - // and not just the number of resource/sub-object ranges. - // - resources.setCount(typeLayout->getResourceCount()); - m_objects.setCount(typeLayout->getSubObjectCount()); - - for (auto subObjectRange : getLayout()->subObjectRanges) - { - RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout; - - // In the case where the sub-object range represents an - // existential-type leaf field (e.g., an `IBar`), we - // cannot pre-allocate the object(s) to go into that - // range, since we can't possibly know what to allocate - // at this point. - // - if (!subObjectLayout) - continue; - // - // Otherwise, we will allocate a sub-object to fill - // in each entry in this range, based on the layout - // information we already have. - - auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex]; - for (Index i = 0; i < bindingRangeInfo.count; ++i) - { - RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject(); - SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout)); - - ShaderOffset offset; - offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i; - offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex; - offset.bindingArrayIndex = (GfxIndex)i; - - SLANG_RETURN_ON_FAIL(setObject(offset, subObject)); - } - } - return SLANG_OK; -} - -SlangResult CUDARootShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout) -{ - SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(device, typeLayout)); - auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout); - for (auto& entryPoint : programLayout->entryPointLayouts) - { - RefPtr<CUDAEntryPointShaderObject> object = new CUDAEntryPointShaderObject(); - SLANG_RETURN_ON_FAIL(object->init(device, entryPoint)); - entryPointObjects.add(object); - } - return SLANG_OK; -} - -SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) -{ - RefPtr<CUDADevice> result = new CUDADevice(); - SLANG_RETURN_ON_FAIL(result->initialize(*desc)); - returnComPtr(outDevice, result); - return SLANG_OK; -} -#else -SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) -{ - SLANG_UNUSED(desc); - *outDevice = nullptr; - return SLANG_FAIL; -} -#endif - -} diff --git a/tools/gfx/cuda/render-cuda.h b/tools/gfx/cuda/render-cuda.h deleted file mode 100644 index 5c477f513..000000000 --- a/tools/gfx/cuda/render-cuda.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include "../renderer-shared.h" - -namespace gfx -{ - -SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice); -} diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp index ef4edb341..880e61c61 100644 --- a/tools/gfx/render.cpp +++ b/tools/gfx/render.cpp @@ -2,7 +2,6 @@ #include "renderer-shared.h" #include "../../source/core/slang-math.h" #include "open-gl/render-gl.h" -#include "cuda/render-cuda.h" #include "cpu/render-cpu.h" #include "debug-layer.h" @@ -14,6 +13,7 @@ using namespace Slang; Result SLANG_MCALL createD3D11Device(const IDevice::Desc* desc, IDevice** outDevice); Result SLANG_MCALL createD3D12Device(const IDevice::Desc* desc, IDevice** outDevice); Result SLANG_MCALL createVKDevice(const IDevice::Desc* desc, IDevice** outDevice); +Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice); static bool debugLayerEnabled = false; |
