summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--build/visual-studio/gfx/gfx.vcxproj29
-rw-r--r--build/visual-studio/gfx/gfx.vcxproj.filters79
-rw-r--r--tools/gfx-unit-test/get-cmd-queue-handle-test.cpp2
-rw-r--r--tools/gfx/cuda/cuda-base.h59
-rw-r--r--tools/gfx/cuda/cuda-buffer.cpp51
-rw-r--r--tools/gfx/cuda/cuda-buffer.h39
-rw-r--r--tools/gfx/cuda/cuda-command-buffer.cpp62
-rw-r--r--tools/gfx/cuda/cuda-command-buffer.h49
-rw-r--r--tools/gfx/cuda/cuda-command-encoder.cpp210
-rw-r--r--tools/gfx/cuda/cuda-command-encoder.h135
-rw-r--r--tools/gfx/cuda/cuda-command-queue.cpp225
-rw-r--r--tools/gfx/cuda/cuda-command-queue.h69
-rw-r--r--tools/gfx/cuda/cuda-context.h22
-rw-r--r--tools/gfx/cuda/cuda-device.cpp1115
-rw-r--r--tools/gfx/cuda/cuda-device.h152
-rw-r--r--tools/gfx/cuda/cuda-helper-functions.cpp91
-rw-r--r--tools/gfx/cuda/cuda-helper-functions.h106
-rw-r--r--tools/gfx/cuda/cuda-pipeline-state.cpp22
-rw-r--r--tools/gfx/cuda/cuda-pipeline-state.h28
-rw-r--r--tools/gfx/cuda/cuda-query.cpp48
-rw-r--r--tools/gfx/cuda/cuda-query.h32
-rw-r--r--tools/gfx/cuda/cuda-resource-views.h26
-rw-r--r--tools/gfx/cuda/cuda-shader-object-layout.cpp153
-rw-r--r--tools/gfx/cuda/cuda-shader-object-layout.h74
-rw-r--r--tools/gfx/cuda/cuda-shader-object.cpp349
-rw-r--r--tools/gfx/cuda/cuda-shader-object.h93
-rw-r--r--tools/gfx/cuda/cuda-shader-program.cpp20
-rw-r--r--tools/gfx/cuda/cuda-shader-program.h29
-rw-r--r--tools/gfx/cuda/cuda-texture.cpp48
-rw-r--r--tools/gfx/cuda/cuda-texture.h44
-rw-r--r--tools/gfx/cuda/render-cuda.cpp2513
-rw-r--r--tools/gfx/cuda/render-cuda.h9
-rw-r--r--tools/gfx/render.cpp2
33 files changed, 3457 insertions, 2528 deletions
diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj
index 542c34371..25a48c339 100644
--- a/build/visual-studio/gfx/gfx.vcxproj
+++ b/build/visual-studio/gfx/gfx.vcxproj
@@ -303,7 +303,21 @@
<ClInclude Include="..\..\..\tools\gfx\command-encoder-com-forward.h" />
<ClInclude Include="..\..\..\tools\gfx\command-writer.h" />
<ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h" />
- <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h" />
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h" />
<ClInclude Include="..\..\..\tools\gfx\d3d11\d3d11-base.h" />
@@ -390,7 +404,18 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp" />
- <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp" />
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp" />
<ClCompile Include="..\..\..\tools\gfx\d3d11\d3d11-buffer.cpp" />
diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters
index d461d3d24..61708be0b 100644
--- a/build/visual-studio/gfx/gfx.vcxproj.filters
+++ b/build/visual-studio/gfx/gfx.vcxproj.filters
@@ -21,7 +21,49 @@
<ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h">
<Filter>Header Files</Filter>
</ClInclude>
- <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h">
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h">
@@ -278,7 +320,40 @@
<ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp">
<Filter>Source Files</Filter>
</ClCompile>
- <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp">
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp">
diff --git a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
index 0dd5ad561..e14729718 100644
--- a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
+++ b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
@@ -21,7 +21,7 @@ namespace gfx_test
GFX_CHECK_CALL_ABORT(queue->getNativeHandle(&handle));
if (device->getDeviceInfo().deviceType == gfx::DeviceType::Vulkan)
{
- SLANG_CHECK(handle.handleValue != NULL);
+ SLANG_CHECK(handle.handleValue != 0);
}
#if SLANG_WINDOWS_FAMILY
else
diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h
new file mode 100644
index 000000000..57a244089
--- /dev/null
+++ b/tools/gfx/cuda/cuda-base.h
@@ -0,0 +1,59 @@
+// cuda-base.h
+// Shared header file for CUDA implementation
+#pragma once
+
+#ifdef GFX_ENABLE_CUDA
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "core/slang-basic.h"
+#include "core/slang-blob.h"
+#include "core/slang-std-writers.h"
+
+#include "slang.h"
+#include "slang-com-ptr.h"
+#include "slang-com-helper.h"
+#include "../command-writer.h"
+#include "../renderer-shared.h"
+#include "../mutable-shader-object.h"
+#include "../simple-transient-resource-heap.h"
+#include "../slang-context.h"
+#include "../command-encoder-com-forward.h"
+
+# ifdef RENDER_TEST_OPTIX
+
+// The `optix_stubs.h` header produces warnings when compiled with MSVC
+# ifdef _MSC_VER
+# pragma warning(disable: 4996)
+# endif
+
+# include <optix.h>
+# include <optix_function_table_definition.h>
+# include <optix_stubs.h>
+# endif
+
+#endif
+
+namespace gfx
+{
+namespace cuda
+{
+ class CUDAContext;
+ class BufferResourceImpl;
+ class TextureResourceImpl;
+ class ResourceViewImpl;
+ class ShaderObjectLayoutImpl;
+ class RootShaderObjectLayoutImpl;
+ class ShaderObjectImpl;
+ class MutableShaderObjectImpl;
+ class EntryPointShaderObjectImpl;
+ class RootShaderObjectImpl;
+ class ShaderProgramImpl;
+ class PipelineStateImpl;
+ class QueryPoolImpl;
+ class DeviceImpl;
+ class CommandBufferImpl;
+ class ResourceCommandEncoderImpl;
+ class ComputeCommandEncoderImpl;
+ class CommandQueueImpl;
+}
+}
diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp
new file mode 100644
index 000000000..1cd162841
--- /dev/null
+++ b/tools/gfx/cuda/cuda-buffer.cpp
@@ -0,0 +1,51 @@
+// cuda-buffer.cpp
+#include "cuda-buffer.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+BufferResourceImpl::~BufferResourceImpl()
+{
+ if (m_cudaMemory)
+ {
+ SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+ }
+}
+
+uint64_t BufferResourceImpl::getBindlessHandle() { return (uint64_t)m_cudaMemory; }
+
+DeviceAddress BufferResourceImpl::getDeviceAddress()
+{
+ return (DeviceAddress)m_cudaMemory;
+}
+
+Result BufferResourceImpl::getNativeResourceHandle(InteropHandle* outHandle)
+{
+ outHandle->handleValue = getBindlessHandle();
+ outHandle->api = InteropHandleAPI::CUDA;
+ return SLANG_OK;
+}
+
+Result BufferResourceImpl::map(MemoryRange* rangeToRead, void** outPointer)
+{
+ SLANG_UNUSED(rangeToRead);
+ SLANG_UNUSED(outPointer);
+ return SLANG_FAIL;
+}
+
+Result BufferResourceImpl::unmap(MemoryRange* writtenRange)
+{
+ SLANG_UNUSED(writtenRange);
+ return SLANG_FAIL;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-buffer.h b/tools/gfx/cuda/cuda-buffer.h
new file mode 100644
index 000000000..838a06555
--- /dev/null
+++ b/tools/gfx/cuda/cuda-buffer.h
@@ -0,0 +1,39 @@
+// cuda-buffer.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-context.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class BufferResourceImpl : public BufferResource
+{
+public:
+ BufferResourceImpl(const Desc& _desc)
+ : BufferResource(_desc)
+ {}
+
+ ~BufferResourceImpl();
+
+ uint64_t getBindlessHandle();
+
+ void* m_cudaExternalMemory = nullptr;
+ void* m_cudaMemory = nullptr;
+
+ RefPtr<CUDAContext> m_cudaContext;
+
+ virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ map(MemoryRange* rangeToRead, void** outPointer) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-buffer.cpp b/tools/gfx/cuda/cuda-command-buffer.cpp
new file mode 100644
index 000000000..efb2486f4
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-buffer.cpp
@@ -0,0 +1,62 @@
+// cuda-command-buffer.cpp
+#include "cuda-command-buffer.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandBuffer* CommandBufferImpl::getInterface(const Guid& guid)
+{
+ if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer)
+ return static_cast<ICommandBuffer*>(this);
+ return nullptr;
+}
+
+void CommandBufferImpl::init(DeviceImpl* device, TransientResourceHeapBase* transientHeap)
+{
+ m_device = device;
+ m_transientHeap = transientHeap;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandBufferImpl::encodeRenderCommands(
+ IRenderPassLayout* renderPass,
+ IFramebuffer* framebuffer,
+ IRenderCommandEncoder** outEncoder)
+{
+ SLANG_UNUSED(renderPass);
+ SLANG_UNUSED(framebuffer);
+ *outEncoder = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ CommandBufferImpl::encodeResourceCommands(IResourceCommandEncoder** outEncoder)
+{
+ m_resourceCommandEncoder.init(this);
+ *outEncoder = &m_resourceCommandEncoder;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ CommandBufferImpl::encodeComputeCommands(IComputeCommandEncoder** outEncoder)
+{
+ m_computeCommandEncoder.init(this);
+ *outEncoder = &m_computeCommandEncoder;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ CommandBufferImpl::encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder)
+{
+ *outEncoder = nullptr;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandBufferImpl::getNativeHandle(InteropHandle* outHandle)
+{
+ return SLANG_FAIL;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-buffer.h b/tools/gfx/cuda/cuda-command-buffer.h
new file mode 100644
index 000000000..76d9aa01f
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-buffer.h
@@ -0,0 +1,49 @@
+// cuda-command-buffer.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-command-encoder.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CommandBufferImpl
+ : public ICommandBuffer
+ , public CommandWriter
+ , public ComObject
+{
+public:
+ SLANG_COM_OBJECT_IUNKNOWN_ALL
+ ICommandBuffer* getInterface(const Guid& guid);
+
+public:
+ DeviceImpl* m_device;
+ TransientResourceHeapBase* m_transientHeap;
+ ResourceCommandEncoderImpl m_resourceCommandEncoder;
+ ComputeCommandEncoderImpl m_computeCommandEncoder;
+
+ void init(DeviceImpl* device, TransientResourceHeapBase* transientHeap);
+ virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
+ IRenderPassLayout* renderPass,
+ IFramebuffer* framebuffer,
+ IRenderCommandEncoder** outEncoder) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ encodeResourceCommands(IResourceCommandEncoder** outEncoder) override;
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ encodeComputeCommands(IComputeCommandEncoder** outEncoder) override;
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-encoder.cpp b/tools/gfx/cuda/cuda-command-encoder.cpp
new file mode 100644
index 000000000..6911eb649
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-encoder.cpp
@@ -0,0 +1,210 @@
+// cuda-command-encoder.cpp
+#include "cuda-command-encoder.h"
+
+#include "cuda-command-buffer.h"
+#include "cuda-device.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+void ResourceCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer)
+{
+ m_writer = cmdBuffer;
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyBuffer(
+ IBufferResource* dst,
+ Offset dstOffset,
+ IBufferResource* src,
+ Offset srcOffset,
+ Size size)
+{
+ m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadBufferData(
+ IBufferResource* dst, Offset offset, Size size, void* data)
+{
+ m_writer->uploadBufferData(dst, offset, size, data);
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ ResourceCommandEncoderImpl::writeTimestamp(IQueryPool* pool, GfxIndex index)
+{
+ m_writer->writeTimestamp(pool, index);
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTexture(
+ ITextureResource* dst,
+ ResourceState dstState,
+ SubresourceRange dstSubresource,
+ ITextureResource::Offset3D dstOffset,
+ ITextureResource* src,
+ ResourceState srcState,
+ SubresourceRange srcSubresource,
+ ITextureResource::Offset3D srcOffset,
+ ITextureResource::Extents extent)
+{
+ SLANG_UNUSED(dst);
+ SLANG_UNUSED(dstState);
+ SLANG_UNUSED(dstSubresource);
+ SLANG_UNUSED(dstOffset);
+ SLANG_UNUSED(src);
+ SLANG_UNUSED(srcState);
+ SLANG_UNUSED(srcSubresource);
+ SLANG_UNUSED(srcOffset);
+ SLANG_UNUSED(extent);
+ SLANG_UNIMPLEMENTED_X("copyTexture");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadTextureData(
+ ITextureResource* dst,
+ SubresourceRange subResourceRange,
+ ITextureResource::Offset3D offset,
+ ITextureResource::Extents extent,
+ ITextureResource::SubresourceData* subResourceData,
+ GfxCount subResourceDataCount)
+{
+ SLANG_UNUSED(dst);
+ SLANG_UNUSED(subResourceRange);
+ SLANG_UNUSED(offset);
+ SLANG_UNUSED(extent);
+ SLANG_UNUSED(subResourceData);
+ SLANG_UNUSED(subResourceDataCount);
+ SLANG_UNIMPLEMENTED_X("uploadTextureData");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::clearResourceView(
+ IResourceView* view,
+ ClearValue* clearValue,
+ ClearResourceViewFlags::Enum flags)
+{
+ SLANG_UNUSED(view);
+ SLANG_UNUSED(clearValue);
+ SLANG_UNUSED(flags);
+ SLANG_UNIMPLEMENTED_X("clearResourceView");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveResource(
+ ITextureResource* source,
+ ResourceState sourceState,
+ SubresourceRange sourceRange,
+ ITextureResource* dest,
+ ResourceState destState,
+ SubresourceRange destRange)
+{
+ SLANG_UNUSED(source);
+ SLANG_UNUSED(sourceState);
+ SLANG_UNUSED(sourceRange);
+ SLANG_UNUSED(dest);
+ SLANG_UNUSED(destState);
+ SLANG_UNUSED(destRange);
+ SLANG_UNIMPLEMENTED_X("resolveResource");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveQuery(
+ IQueryPool* queryPool,
+ GfxIndex index,
+ GfxCount count,
+ IBufferResource* buffer,
+ Offset offset)
+{
+ SLANG_UNUSED(queryPool);
+ SLANG_UNUSED(index);
+ SLANG_UNUSED(count);
+ SLANG_UNUSED(buffer);
+ SLANG_UNUSED(offset);
+ SLANG_UNIMPLEMENTED_X("resolveQuery");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTextureToBuffer(
+ IBufferResource* dst,
+ Offset dstOffset,
+ Size dstSize,
+ Size dstRowStride,
+ ITextureResource* src,
+ ResourceState srcState,
+ SubresourceRange srcSubresource,
+ ITextureResource::Offset3D srcOffset,
+ ITextureResource::Extents extent)
+{
+ SLANG_UNUSED(dst);
+ SLANG_UNUSED(dstOffset);
+ SLANG_UNUSED(dstSize);
+ SLANG_UNUSED(dstRowStride);
+ SLANG_UNUSED(src);
+ SLANG_UNUSED(srcState);
+ SLANG_UNUSED(srcSubresource);
+ SLANG_UNUSED(srcOffset);
+ SLANG_UNUSED(extent);
+ SLANG_UNIMPLEMENTED_X("copyTextureToBuffer");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::textureSubresourceBarrier(
+ ITextureResource* texture,
+ SubresourceRange subresourceRange,
+ ResourceState src,
+ ResourceState dst)
+{
+ SLANG_UNUSED(texture);
+ SLANG_UNUSED(subresourceRange);
+ SLANG_UNUSED(src);
+ SLANG_UNUSED(dst);
+ SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier");
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ ResourceCommandEncoderImpl::beginDebugEvent(const char* name, float rgbColor[3])
+{
+ SLANG_UNUSED(name);
+ SLANG_UNUSED(rgbColor);
+}
+
+void ComputeCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer)
+{
+ m_writer = cmdBuffer;
+ m_commandBuffer = cmdBuffer;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ComputeCommandEncoderImpl::bindPipeline(IPipelineState* state, IShaderObject** outRootObject)
+{
+ m_writer->setPipelineState(state);
+ PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
+ SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
+ pipelineImpl->m_program, m_rootObject.writeRef()));
+ returnComPtr(outRootObject, m_rootObject);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ComputeCommandEncoderImpl::bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject)
+{
+ m_writer->setPipelineState(state);
+ PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
+ SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
+ pipelineImpl->m_program, m_rootObject.writeRef()));
+ m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW void SLANG_MCALL ComputeCommandEncoderImpl::dispatchCompute(int x, int y, int z)
+{
+ m_writer->bindRootShaderObject(m_rootObject);
+ m_writer->dispatchCompute(x, y, z);
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+ ComputeCommandEncoderImpl::dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset)
+{
+ SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect");
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-encoder.h b/tools/gfx/cuda/cuda-command-encoder.h
new file mode 100644
index 000000000..73660534a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-encoder.h
@@ -0,0 +1,135 @@
+// cuda-command-encoder.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ResourceCommandEncoderImpl : public IResourceCommandEncoder
+{
+public:
+ CommandWriter* m_writer;
+
+ void init(CommandBufferImpl* cmdBuffer);
+
+ virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+ virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
+ IBufferResource* dst,
+ Offset dstOffset,
+ IBufferResource* src,
+ Offset srcOffset,
+ Size size) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier(
+ GfxCount count,
+ ITextureResource* const* textures,
+ ResourceState src,
+ ResourceState dst) override
+ {}
+
+ virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier(
+ GfxCount count,
+ IBufferResource* const* buffers,
+ ResourceState src,
+ ResourceState dst) override
+ {}
+
+ virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData(
+ IBufferResource* dst, Offset offset, Size size, void* data) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ writeTimestamp(IQueryPool* pool, GfxIndex index) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL copyTexture(
+ ITextureResource* dst,
+ ResourceState dstState,
+ SubresourceRange dstSubresource,
+ ITextureResource::Offset3D dstOffset,
+ ITextureResource* src,
+ ResourceState srcState,
+ SubresourceRange srcSubresource,
+ ITextureResource::Offset3D srcOffset,
+ ITextureResource::Extents extent) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData(
+ ITextureResource* dst,
+ SubresourceRange subResourceRange,
+ ITextureResource::Offset3D offset,
+ ITextureResource::Extents extent,
+ ITextureResource::SubresourceData* subResourceData,
+ GfxCount subResourceDataCount) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView(
+ IResourceView* view,
+ ClearValue* clearValue,
+ ClearResourceViewFlags::Enum flags) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL resolveResource(
+ ITextureResource* source,
+ ResourceState sourceState,
+ SubresourceRange sourceRange,
+ ITextureResource* dest,
+ ResourceState destState,
+ SubresourceRange destRange) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery(
+ IQueryPool* queryPool,
+ GfxIndex index,
+ GfxCount count,
+ IBufferResource* buffer,
+ Offset offset) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer(
+ IBufferResource* dst,
+ Offset dstOffset,
+ Size dstSize,
+ Size dstRowStride,
+ ITextureResource* src,
+ ResourceState srcState,
+ SubresourceRange srcSubresource,
+ ITextureResource::Offset3D srcOffset,
+ ITextureResource::Extents extent) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier(
+ ITextureResource* texture,
+ SubresourceRange subresourceRange,
+ ResourceState src,
+ ResourceState dst) override;
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ beginDebugEvent(const char* name, float rgbColor[3]) override;
+ virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {}
+};
+
+class ComputeCommandEncoderImpl
+ : public IComputeCommandEncoder
+ , public ResourceCommandEncoderImpl
+{
+public:
+ SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl)
+public:
+ CommandWriter* m_writer;
+ CommandBufferImpl* m_commandBuffer;
+ RefPtr<ShaderObjectBase> m_rootObject;
+ virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+ void init(CommandBufferImpl* cmdBuffer);
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL
+ dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
new file mode 100644
index 000000000..60e81246d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -0,0 +1,225 @@
+// cuda-command-queue.cpp
+#include "cuda-command-queue.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-buffer.h"
+#include "cuda-query.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid)
+{
+ if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
+ return static_cast<ICommandQueue*>(this);
+ return nullptr;
+}
+
+void CommandQueueImpl::init(DeviceImpl* inRenderer)
+{
+ renderer = inRenderer;
+ m_desc.type = ICommandQueue::QueueType::Graphics;
+ cuStreamCreate(&stream, 0);
+}
+CommandQueueImpl::~CommandQueueImpl()
+{
+ cuStreamSynchronize(stream);
+ cuStreamDestroy(stream);
+ currentPipeline = nullptr;
+ currentRootObject = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers(
+ GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal)
+{
+ SLANG_UNUSED(valueToSignal);
+ // TODO: implement fence.
+ assert(fence == nullptr);
+ for (GfxIndex i = 0; i < count; i++)
+ {
+ execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
+ }
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost()
+{
+ auto resultCode = cuStreamSynchronize(stream);
+ if (resultCode != cudaSuccess)
+ SLANG_CUDA_HANDLE_ERROR(resultCode);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice(
+ GfxCount fenceCount, IFence** fences, uint64_t* waitValues)
+{
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle)
+{
+ return SLANG_FAIL;
+}
+
+void CommandQueueImpl::setPipelineState(IPipelineState* state)
+{
+ currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state);
+}
+
+Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object)
+{
+ currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object);
+ if (currentRootObject)
+ return SLANG_OK;
+ return SLANG_E_INVALID_ARG;
+}
+
+void CommandQueueImpl::dispatchCompute(int x, int y, int z)
+{
+ // Specialize the compute kernel based on the shader object bindings.
+ RefPtr<PipelineStateBase> newPipeline;
+ renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
+ currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr());
+
+ // Find out thread group size from program reflection.
+ auto& kernelName = currentPipeline->shaderProgram->kernelName;
+ auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout());
+ int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
+ SLANG_ASSERT(kernelId != -1);
+ UInt threadGroupSize[3];
+ programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
+
+ int sharedSizeInBytes;
+ cuFuncGetAttribute(
+ &sharedSizeInBytes,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+ currentPipeline->shaderProgram->cudaKernel);
+
+ // Copy global parameter data to the `SLANG_globalParams` symbol.
+ {
+ CUdeviceptr globalParamsSymbol = 0;
+ size_t globalParamsSymbolSize = 0;
+ cuModuleGetGlobal(
+ &globalParamsSymbol,
+ &globalParamsSymbolSize,
+ currentPipeline->shaderProgram->cudaModule,
+ "SLANG_globalParams");
+
+ CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
+ cudaMemcpyAsync(
+ (void*)globalParamsSymbol,
+ (void*)globalParamsCUDAData,
+ globalParamsSymbolSize,
+ cudaMemcpyDefault,
+ 0);
+ }
+ //
+ // The argument data for the entry-point parameters are already
+ // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
+ //
+ auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
+ auto entryPointDataSize =
+ currentRootObject->entryPointObjects[kernelId]->getBufferSize();
+
+ void* extraOptions[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER,
+ entryPointBuffer,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ &entryPointDataSize,
+ CU_LAUNCH_PARAM_END,
+ };
+
+ // Once we have all the necessary data extracted and/or
+ // set up, we can launch the kernel and see what happens.
+ //
+ auto cudaLaunchResult = cuLaunchKernel(
+ currentPipeline->shaderProgram->cudaKernel,
+ x,
+ y,
+ z,
+ int(threadGroupSize[0]),
+ int(threadGroupSize[1]),
+ int(threadGroupSize[2]),
+ sharedSizeInBytes,
+ stream,
+ nullptr,
+ extraOptions);
+
+ SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
+}
+
+void CommandQueueImpl::copyBuffer(
+ IBufferResource* dst,
+ size_t dstOffset,
+ IBufferResource* src,
+ size_t srcOffset,
+ size_t size)
+{
+ auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+ auto srcImpl = static_cast<BufferResourceImpl*>(src);
+ cudaMemcpy(
+ (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
+ (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
+ size,
+ cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+{
+ auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+ cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
+{
+ auto poolImpl = static_cast<QueryPoolImpl*>(pool);
+ cuEventRecord(poolImpl->m_events[index], stream);
+}
+
+void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer)
+{
+ for (auto& cmd : commandBuffer->m_commands)
+ {
+ switch (cmd.name)
+ {
+ case CommandName::SetPipelineState:
+ setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
+ break;
+ case CommandName::BindRootShaderObject:
+ bindRootShaderObject(
+ commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
+ break;
+ case CommandName::DispatchCompute:
+ dispatchCompute(
+ int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
+ break;
+ case CommandName::CopyBuffer:
+ copyBuffer(
+ commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+ cmd.operands[1],
+ commandBuffer->getObject<BufferResource>(cmd.operands[2]),
+ cmd.operands[3],
+ cmd.operands[4]);
+ break;
+ case CommandName::UploadBufferData:
+ uploadBufferData(
+ commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+ cmd.operands[1],
+ cmd.operands[2],
+ commandBuffer->getData<uint8_t>(cmd.operands[3]));
+ break;
+ case CommandName::WriteTimestamp:
+ writeTimestamp(
+ commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
+ (SlangInt)cmd.operands[1]);
+ }
+ }
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-queue.h b/tools/gfx/cuda/cuda-command-queue.h
new file mode 100644
index 000000000..10601890e
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.h
@@ -0,0 +1,69 @@
+// cuda-command-queue.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-device.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-shader-object.h"
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CommandQueueImpl
+ : public ICommandQueue
+ , public ComObject
+{
+public:
+ SLANG_COM_OBJECT_IUNKNOWN_ALL
+ ICommandQueue* getInterface(const Guid& guid);
+
+ RefPtr<ComputePipelineStateImpl> currentPipeline;
+ RefPtr<RootShaderObjectImpl> currentRootObject;
+ RefPtr<DeviceImpl> renderer;
+ CUstream stream;
+ Desc m_desc;
+
+ void init(DeviceImpl* inRenderer);
+ ~CommandQueueImpl();
+
+ virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override { return m_desc; }
+
+ virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers(
+ GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override;
+
+ virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice(
+ GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override;
+
+ void setPipelineState(IPipelineState* state);
+
+ Result bindRootShaderObject(IShaderObject* object);
+
+ void dispatchCompute(int x, int y, int z);
+
+ void copyBuffer(
+ IBufferResource* dst,
+ size_t dstOffset,
+ IBufferResource* src,
+ size_t srcOffset,
+ size_t size);
+
+ void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data);
+
+ void writeTimestamp(IQueryPool* pool, SlangInt index);
+
+ void execute(CommandBufferImpl* commandBuffer);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-context.h b/tools/gfx/cuda/cuda-context.h
new file mode 100644
index 000000000..d1a53e4e9
--- /dev/null
+++ b/tools/gfx/cuda/cuda-context.h
@@ -0,0 +1,22 @@
+// cuda-context.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CUDAContext : public RefObject
+{
+public:
+ CUcontext m_context = nullptr;
+ ~CUDAContext() { cuCtxDestroy(m_context); }
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
new file mode 100644
index 000000000..1a4a142d0
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -0,0 +1,1115 @@
+// cuda-device.cpp
+#include "cuda-device.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-queue.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-query.h"
+#include "cuda-shader-object.h"
+#include "cuda-shader-object-layout.h"
+#include "cuda-shader-program.h"
+#include "cuda-resource-views.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+int DeviceImpl::_calcSMCountPerMultiProcessor(int major, int minor)
+{
+ // Defines for GPU Architecture types (using the SM version to determine
+ // the # of cores per SM
+ struct SMInfo
+ {
+ int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+ int coreCount;
+ };
+
+ static const SMInfo infos[] = {
+ {0x30, 192},
+ {0x32, 192},
+ {0x35, 192},
+ {0x37, 192},
+ {0x50, 128},
+ {0x52, 128},
+ {0x53, 128},
+ {0x60, 64},
+ {0x61, 128},
+ {0x62, 128},
+ {0x70, 64},
+ {0x72, 64},
+ {0x75, 64} };
+
+ const int sm = ((major << 4) + minor);
+ for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+ {
+ if (infos[i].sm == sm)
+ {
+ return infos[i].coreCount;
+ }
+ }
+
+ const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+ // It must be newer presumably
+ SLANG_ASSERT(sm > last.sm);
+
+ // Default to the last entry
+ return last.coreCount;
+}
+
+SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
+{
+ int smPerMultiproc = 0;
+ int maxPerfDevice = -1;
+ int deviceCount = 0;
+ int devicesProhibited = 0;
+
+ uint64_t maxComputePerf = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+ // Find the best CUDA capable GPU device
+ for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+ {
+ int computeMode = -1, major = 0, minor = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+ // If this GPU is not running on Compute Mode prohibited,
+ // then we can add it to the list
+ if (computeMode != cudaComputeModeProhibited)
+ {
+ if (major == 9999 && minor == 9999)
+ {
+ smPerMultiproc = 1;
+ }
+ else
+ {
+ smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+ }
+
+ int multiProcessorCount = 0, clockRate = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
+ &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+ uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+ if (compute_perf > maxComputePerf)
+ {
+ maxComputePerf = compute_perf;
+ maxPerfDevice = currentDevice;
+ }
+ }
+ else
+ {
+ devicesProhibited++;
+ }
+ }
+
+ if (maxPerfDevice < 0)
+ {
+ return SLANG_FAIL;
+ }
+
+ *outDeviceIndex = maxPerfDevice;
+ return SLANG_OK;
+}
+
+SlangResult DeviceImpl::_initCuda(CUDAReportStyle reportType)
+{
+ static CUresult res = cuInit(0);
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::getNativeDeviceHandles(InteropHandles* outHandles)
+{
+ outHandles->handles[0].handleValue = (uint64_t)m_device;
+ outHandles->handles[0].api = InteropHandleAPI::CUDA;
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
+{
+ SLANG_RETURN_ON_FAIL(slangContext.initialize(
+ desc.slang,
+ SLANG_PTX,
+ "sm_5_1",
+ makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
+
+ SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
+
+ SLANG_RETURN_ON_FAIL(_initCuda(reportType));
+
+ SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
+
+ m_context = new CUDAContext();
+
+ int count = -1;
+ cuDeviceGetCount(&count);
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
+
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
+ cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
+
+ // Not clear how to detect half support on CUDA. For now we'll assume we have it
+ {
+ m_features.add("half");
+ }
+
+ // Initialize DeviceInfo
+ {
+ m_info.deviceType = DeviceType::CUDA;
+ m_info.bindingStyle = BindingStyle::CUDA;
+ m_info.projectionStyle = ProjectionStyle::DirectX;
+ m_info.apiName = "CUDA";
+ static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
+ ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
+ cudaDeviceProp deviceProperties;
+ cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
+ m_adapterName = deviceProperties.name;
+ m_info.adapterName = m_adapterName.begin();
+ m_info.timestampFrequency = 1000000;
+ }
+
+ return SLANG_OK;
+}
+
+Result DeviceImpl::getCUDAFormat(Format format, CUarray_format* outFormat)
+{
+ // TODO: Expand to cover all available formats that can be supported in CUDA
+ switch (format)
+ {
+ case Format::R32G32B32A32_FLOAT:
+ case Format::R32G32B32_FLOAT:
+ case Format::R32G32_FLOAT:
+ case Format::R32_FLOAT:
+ case Format::D32_FLOAT:
+ *outFormat = CU_AD_FORMAT_FLOAT;
+ return SLANG_OK;
+ case Format::R16G16B16A16_FLOAT:
+ case Format::R16G16_FLOAT:
+ case Format::R16_FLOAT:
+ *outFormat = CU_AD_FORMAT_HALF;
+ return SLANG_OK;
+ case Format::R32G32B32A32_UINT:
+ case Format::R32G32B32_UINT:
+ case Format::R32G32_UINT:
+ case Format::R32_UINT:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
+ return SLANG_OK;
+ case Format::R16G16B16A16_UINT:
+ case Format::R16G16_UINT:
+ case Format::R16_UINT:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
+ return SLANG_OK;
+ case Format::R8G8B8A8_UINT:
+ case Format::R8G8_UINT:
+ case Format::R8_UINT:
+ case Format::R8G8B8A8_UNORM:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+ return SLANG_OK;
+ case Format::R32G32B32A32_SINT:
+ case Format::R32G32B32_SINT:
+ case Format::R32G32_SINT:
+ case Format::R32_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT32;
+ return SLANG_OK;
+ case Format::R16G16B16A16_SINT:
+ case Format::R16G16_SINT:
+ case Format::R16_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT16;
+ return SLANG_OK;
+ case Format::R8G8B8A8_SINT:
+ case Format::R8G8_SINT:
+ case Format::R8_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT8;
+ return SLANG_OK;
+ default:
+ SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+ return SLANG_FAIL;
+ }
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureResource(
+ const ITextureResource::Desc& desc,
+ const ITextureResource::SubresourceData* initData,
+ ITextureResource** outResource)
+{
+ TextureResource::Desc srcDesc = fixupTextureDesc(desc);
+
+ RefPtr<TextureResourceImpl> tex = new TextureResourceImpl(srcDesc);
+ tex->m_cudaContext = m_context;
+
+ CUresourcetype resourceType;
+
+ // The size of the element/texel in bytes
+ size_t elementSize = 0;
+
+ // Our `ITextureResource::Desc` uses an enumeration to specify
+ // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
+ // `cuMipmappedArrayCreate` seemingly relies on a policy where
+ // the extents of the array in dimenions above the rank are
+ // specified as zero (e.g., a 1D texture requires `height==0`).
+ //
+ // We will start by massaging the extents as specified by the
+ // user into a form that CUDA wants/expects, based on the
+ // texture shape as specified in the `desc`.
+ //
+ int width = desc.size.width;
+ int height = desc.size.height;
+ int depth = desc.size.depth;
+ switch (desc.type)
+ {
+ case IResource::Type::Texture1D:
+ height = 0;
+ depth = 0;
+ break;
+
+ case IResource::Type::Texture2D:
+ depth = 0;
+ break;
+
+ case IResource::Type::Texture3D:
+ break;
+
+ case IResource::Type::TextureCube:
+ depth = 1;
+ break;
+ }
+
+ {
+ CUarray_format format = CU_AD_FORMAT_FLOAT;
+ int numChannels = 0;
+
+ SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
+ FormatInfo info;
+ gfxGetFormatInfo(desc.format, &info);
+ numChannels = info.channelCount;
+
+ switch (format)
+ {
+ case CU_AD_FORMAT_FLOAT:
+ {
+ elementSize = sizeof(float) * numChannels;
+ break;
+ }
+ case CU_AD_FORMAT_HALF:
+ {
+ elementSize = sizeof(uint16_t) * numChannels;
+ break;
+ }
+ case CU_AD_FORMAT_UNSIGNED_INT8:
+ {
+ elementSize = sizeof(uint8_t) * numChannels;
+ break;
+ }
+ default:
+ {
+ SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+ return SLANG_FAIL;
+ }
+ }
+
+ if (desc.numMipLevels > 1)
+ {
+ resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Width = width;
+ arrayDesc.Height = height;
+ arrayDesc.Depth = depth;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+ arrayDesc.Flags = 0;
+
+ if (desc.arraySize > 1)
+ {
+ if (desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
+ arrayDesc.Depth = desc.arraySize;
+ }
+ else
+ {
+ SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
+ return SLANG_FAIL;
+ }
+ }
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ arrayDesc.Depth *= 6;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
+ }
+ else
+ {
+ resourceType = CU_RESOURCE_TYPE_ARRAY;
+
+ if (desc.arraySize > 1)
+ {
+ if (desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
+ return SLANG_FAIL;
+ }
+
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ // Set the depth as the array length
+ arrayDesc.Depth = desc.arraySize;
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Depth *= 6;
+ }
+
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ else if (desc.type == IResource::Type::Texture3D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Depth = depth;
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ arrayDesc.Flags = 0;
+
+ // Handle cube texture
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Depth = 6;
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ else
+ {
+ CUDA_ARRAY_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ // Allocate the array, will work for 1D or 2D case
+ SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ }
+ }
+
+ // Work space for holding data for uploading if it needs to be rearranged
+ if (initData)
+ {
+ List<uint8_t> workspace;
+ for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
+ {
+ int mipWidth = width >> mipLevel;
+ int mipHeight = height >> mipLevel;
+ int mipDepth = depth >> mipLevel;
+
+ mipWidth = (mipWidth == 0) ? 1 : mipWidth;
+ mipHeight = (mipHeight == 0) ? 1 : mipHeight;
+ mipDepth = (mipDepth == 0) ? 1 : mipDepth;
+
+ // If it's a cubemap then the depth is always 6
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ mipDepth = 6;
+ }
+
+ auto dstArray = tex->m_cudaArray;
+ if (tex->m_cudaMipMappedArray)
+ {
+ // Get the array for the mip level
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
+ }
+ SLANG_ASSERT(dstArray);
+
+ // Check using the desc to see if it's plausible
+ {
+ CUDA_ARRAY_DESCRIPTOR arrayDesc;
+ SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
+
+ SLANG_ASSERT(mipWidth == arrayDesc.Width);
+ SLANG_ASSERT(
+ mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
+ }
+
+ const void* srcDataPtr = nullptr;
+
+ if (desc.arraySize > 1)
+ {
+ SLANG_ASSERT(
+ desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube);
+
+ // TODO(JS): Here I assume that arrays are just held contiguously within a
+ // 'face' This seems reasonable and works with the Copy3D.
+ const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+ Index faceCount = desc.arraySize;
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ faceCount *= 6;
+ }
+
+ const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
+ workspace.setCount(mipSizeInBytes);
+
+ // We need to add the face data from each mip
+ // We iterate over face count so we copy all of the cubemap faces
+ for (Index j = 0; j < faceCount; j++)
+ {
+ const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
+ // Copy over to the workspace to make contiguous
+ ::memcpy(
+ workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
+ }
+
+ srcDataPtr = workspace.getBuffer();
+ }
+ else
+ {
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+ workspace.setCount(faceSizeInBytes * 6);
+ // Copy the data over to make contiguous
+ for (Index j = 0; j < 6; j++)
+ {
+ const auto srcData =
+ initData[mipLevel + j * desc.numMipLevels].data;
+ ::memcpy(
+ workspace.getBuffer() + faceSizeInBytes * j,
+ srcData,
+ faceSizeInBytes);
+ }
+ srcDataPtr = workspace.getBuffer();
+ }
+ else
+ {
+ const auto srcData = initData[mipLevel].data;
+ srcDataPtr = srcData;
+ }
+ }
+
+ if (desc.arraySize > 1)
+ {
+ SLANG_ASSERT(
+ desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube);
+
+ CUDA_MEMCPY3D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ // Set the depth to the array length
+ copyParam.Depth = desc.arraySize;
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ copyParam.Depth *= 6;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+ }
+ else
+ {
+ switch (desc.type)
+ {
+ case IResource::Type::Texture1D:
+ case IResource::Type::Texture2D:
+ {
+ CUDA_MEMCPY2D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+ break;
+ }
+ case IResource::Type::Texture3D:
+ case IResource::Type::TextureCube:
+ {
+ CUDA_MEMCPY3D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ copyParam.Depth = mipDepth;
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+ break;
+ }
+
+ default:
+ {
+ SLANG_ASSERT(!"Not implemented");
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Set up texture sampling parameters, and create final texture obj
+
+ {
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+ resDesc.resType = resourceType;
+
+ if (tex->m_cudaArray)
+ {
+ resDesc.res.array.hArray = tex->m_cudaArray;
+ }
+ if (tex->m_cudaMipMappedArray)
+ {
+ resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
+ }
+
+ // If the texture might be used as a UAV, then we need to allocate
+ // a CUDA "surface" for it.
+ //
+ // Note: We cannot do this unconditionally, because it will fail
+ // on surfaces that are not usable as UAVs (e.g., those with
+ // mipmaps).
+ //
+ // TODO: We should really only be allocating the array at the
+ // time we create a resource, and then allocate the surface or
+ // texture objects as part of view creation.
+ //
+ if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
+ {
+ // On CUDA surfaces only support a single MIP map
+ SLANG_ASSERT(desc.numMipLevels == 1);
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+ }
+
+
+ // Create handle for sampling.
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+ texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+ }
+
+ returnComPtr(outResource, tex);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
+ const IBufferResource::Desc& descIn,
+ const void* initData,
+ IBufferResource** outResource)
+{
+ auto desc = fixupBufferDesc(descIn);
+ RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+ if (initData)
+ {
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+ }
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
+ InteropHandle handle,
+ const IBufferResource::Desc& desc,
+ IBufferResource** outResource)
+{
+ if (handle.handleValue == 0)
+ {
+ *outResource = nullptr;
+ return SLANG_OK;
+ }
+
+ RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+
+ // CUDA manages sharing of buffers through the idea of an
+ // "external memory" object, which represents the relationship
+ // with another API's objects. In order to create this external
+ // memory association, we first need to fill in a descriptor struct.
+ cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+ memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+ switch (handle.api)
+ {
+ case InteropHandleAPI::D3D12:
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+ break;
+ case InteropHandleAPI::Vulkan:
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+ break;
+ default:
+ return SLANG_FAIL;
+ }
+ externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+ externalMemoryHandleDesc.size = desc.sizeInBytes;
+ externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+ // Once we have filled in the descriptor, we can request
+ // that CUDA create the required association between the
+ // external buffer and its own memory.
+ cudaExternalMemory_t externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ resource->m_cudaExternalMemory = externalMemory;
+
+ // The CUDA "external memory" handle is not itself a device
+ // pointer, so we need to query for a suitable device address
+ // for the buffer with another call.
+ //
+ // Just as for the external memory, we fill in a descriptor
+ // structure (although in this case we only need to specify
+ // the size).
+ cudaExternalMemoryBufferDesc bufferDesc;
+ memset(&bufferDesc, 0, sizeof(bufferDesc));
+ bufferDesc.size = desc.sizeInBytes;
+
+ // Finally, we can "map" the buffer to get a device address.
+ void* deviceAddress;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+ resource->m_cudaMemory = deviceAddress;
+
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
+ InteropHandle handle,
+ const ITextureResource::Desc& desc,
+ const size_t size,
+ ITextureResource** outResource)
+{
+ if (handle.handleValue == 0)
+ {
+ *outResource = nullptr;
+ return SLANG_OK;
+ }
+
+ RefPtr<TextureResourceImpl> resource = new TextureResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+
+ // CUDA manages sharing of buffers through the idea of an
+ // "external memory" object, which represents the relationship
+ // with another API's objects. In order to create this external
+ // memory association, we first need to fill in a descriptor struct.
+ CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
+ memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+ switch (handle.api)
+ {
+ case InteropHandleAPI::D3D12:
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
+ break;
+ case InteropHandleAPI::Vulkan:
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
+ break;
+ default:
+ return SLANG_FAIL;
+ }
+ externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+ externalMemoryHandleDesc.size = size;
+ externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+ CUexternalMemory externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ resource->m_cudaExternalMemory = externalMemory;
+
+ FormatInfo formatInfo;
+ SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ arrayDesc.Depth = desc.size.depth;
+ arrayDesc.Height = desc.size.height;
+ arrayDesc.Width = desc.size.width;
+ arrayDesc.NumChannels = formatInfo.channelCount;
+ getCUDAFormat(desc.format, &arrayDesc.Format);
+ arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
+
+ CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
+ memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
+ externalMemoryMipDesc.offset = 0;
+ externalMemoryMipDesc.arrayDesc = arrayDesc;
+ externalMemoryMipDesc.numLevels = desc.numMipLevels;
+
+ CUmipmappedArray mipArray;
+ SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
+ resource->m_cudaMipMappedArray = mipArray;
+
+ CUarray cuArray;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
+ resource->m_cudaArray = cuArray;
+
+ CUDA_RESOURCE_DESC surfDesc;
+ memset(&surfDesc, 0, sizeof(surfDesc));
+ surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ surfDesc.res.array.hArray = cuArray;
+
+ CUsurfObject surface;
+ SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
+ resource->m_cudaSurfObj = surface;
+
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureView(
+ ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView)
+{
+ RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+ view->m_desc = desc;
+ view->textureResource = dynamic_cast<TextureResourceImpl*>(texture);
+ returnComPtr(outView, view);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferView(
+ IBufferResource* buffer,
+ IBufferResource* counterBuffer,
+ IResourceView::Desc const& desc,
+ IResourceView** outView)
+{
+ RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+ view->m_desc = desc;
+ view->memoryResource = dynamic_cast<BufferResourceImpl*>(buffer);
+ returnComPtr(outView, view);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createQueryPool(
+ const IQueryPool::Desc& desc,
+ IQueryPool** outPool)
+{
+ RefPtr<QueryPoolImpl> pool = new QueryPoolImpl();
+ SLANG_RETURN_ON_FAIL(pool->init(desc));
+ returnComPtr(outPool, pool);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObjectLayout(
+ slang::TypeLayoutReflection* typeLayout,
+ ShaderObjectLayoutBase** outLayout)
+{
+ RefPtr<ShaderObjectLayoutImpl> cudaLayout;
+ cudaLayout = new ShaderObjectLayoutImpl(this, typeLayout);
+ returnRefPtrMove(outLayout, cudaLayout);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject)
+{
+ RefPtr<ShaderObjectImpl> result = new ShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+ returnComPtr(outObject, result);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createMutableShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject)
+{
+ RefPtr<MutableShaderObjectImpl> result = new MutableShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+ returnComPtr(outObject, result);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
+{
+ auto cudaProgram = dynamic_cast<ShaderProgramImpl*>(program);
+ auto cudaLayout = cudaProgram->layout;
+
+ RefPtr<RootShaderObjectImpl> result = new RootShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
+ returnRefPtrMove(outObject, result);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+ const IShaderProgram::Desc& desc,
+ IShaderProgram** outProgram,
+ ISlangBlob** outDiagnosticBlob)
+{
+ // If this is a specializable program, we just keep a reference to the slang program and
+ // don't actually create any kernels. This program will be specialized later when we know
+ // the shader object bindings.
+ RefPtr<ShaderProgramImpl> cudaProgram = new ShaderProgramImpl();
+ cudaProgram->init(desc);
+ cudaProgram->cudaContext = m_context;
+ if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
+ {
+ cudaProgram->layout = new RootShaderObjectLayoutImpl(this, desc.slangGlobalScope->getLayout());
+ returnComPtr(outProgram, cudaProgram);
+ return SLANG_OK;
+ }
+
+ ComPtr<ISlangBlob> kernelCode;
+ ComPtr<ISlangBlob> diagnostics;
+ auto compileResult = desc.slangGlobalScope->getEntryPointCode(
+ (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
+ if (diagnostics)
+ {
+ getDebugCallback()->handleMessage(
+ compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
+ DebugMessageSource::Slang,
+ (char*)diagnostics->getBufferPointer());
+ if (outDiagnosticBlob)
+ returnComPtr(outDiagnosticBlob, diagnostics);
+ }
+ SLANG_RETURN_ON_FAIL(compileResult);
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
+ cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
+ SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
+ &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
+
+ auto slangGlobalScope = desc.slangGlobalScope;
+ if (slangGlobalScope)
+ {
+ cudaProgram->slangGlobalScope = slangGlobalScope;
+
+ auto slangProgramLayout = slangGlobalScope->getLayout();
+ if (!slangProgramLayout)
+ return SLANG_FAIL;
+
+ RefPtr<RootShaderObjectLayoutImpl> cudaLayout;
+ cudaLayout = new RootShaderObjectLayoutImpl(this, slangProgramLayout);
+ cudaLayout->programLayout = slangProgramLayout;
+ cudaProgram->layout = cudaLayout;
+ }
+
+ returnComPtr(outProgram, cudaProgram);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createComputePipelineState(
+ const ComputePipelineStateDesc& desc, IPipelineState** outState)
+{
+ RefPtr<ComputePipelineStateImpl> state = new ComputePipelineStateImpl();
+ state->shaderProgram = static_cast<ShaderProgramImpl*>(desc.program);
+ state->init(desc);
+ returnComPtr(outState, state);
+ return Result();
+}
+
+void* DeviceImpl::map(IBufferResource* buffer)
+{
+ return static_cast<BufferResourceImpl*>(buffer)->m_cudaMemory;
+}
+
+void DeviceImpl::unmap(IBufferResource* buffer)
+{
+ SLANG_UNUSED(buffer);
+}
+
+SLANG_NO_THROW const DeviceInfo& SLANG_MCALL DeviceImpl::getDeviceInfo() const
+{
+ return m_info;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTransientResourceHeap(
+ const ITransientResourceHeap::Desc& desc,
+ ITransientResourceHeap** outHeap)
+{
+ RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, desc));
+ returnComPtr(outHeap, result);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue)
+{
+ RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
+ queue->init(this);
+ returnComPtr(outQueue, queue);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createSwapchain(
+ const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(window);
+ SLANG_UNUSED(outSwapchain);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createFramebufferLayout(
+ const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outLayout);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outFramebuffer);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createRenderPassLayout(
+ const IRenderPassLayout::Desc& desc,
+ IRenderPassLayout** outRenderPassLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outRenderPassLayout);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler)
+{
+ SLANG_UNUSED(desc);
+ *outSampler = nullptr;
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createInputLayout(
+ IInputLayout::Desc const& desc,
+ IInputLayout** outLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outLayout);
+ return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createGraphicsPipelineState(
+ const GraphicsPipelineStateDesc& desc, IPipelineState** outState)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outState);
+ return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::readTextureResource(
+ ITextureResource* texture,
+ ResourceState state,
+ ISlangBlob** outBlob,
+ size_t* outRowPitch,
+ size_t* outPixelSize)
+{
+ auto textureImpl = static_cast<TextureResourceImpl*>(texture);
+ RefPtr<ListBlob> blob = new ListBlob();
+
+ auto desc = textureImpl->getDesc();
+ auto width = desc->size.width;
+ auto height = desc->size.height;
+ FormatInfo sizeInfo;
+ SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
+ size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
+ size_t rowPitch = width * pixelSize;
+ size_t size = height * rowPitch;
+ blob->m_data.setCount((Index)size);
+
+ CUDA_MEMCPY2D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.srcArray = textureImpl->m_cudaArray;
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.dstHost = blob->m_data.getBuffer();
+ copyParam.dstPitch = rowPitch;
+ copyParam.WidthInBytes = copyParam.dstPitch;
+ copyParam.Height = height;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+
+ *outRowPitch = rowPitch;
+ *outPixelSize = pixelSize;
+ returnComPtr(outBlob, blob);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
+ IBufferResource* buffer,
+ size_t offset,
+ size_t size,
+ ISlangBlob** outBlob)
+{
+ auto bufferImpl = static_cast<BufferResourceImpl*>(buffer);
+ RefPtr<ListBlob> blob = new ListBlob();
+ blob->m_data.setCount((Index)size);
+ cudaMemcpy(
+ blob->m_data.getBuffer(),
+ (uint8_t*)bufferImpl->m_cudaMemory + offset,
+ size,
+ cudaMemcpyDefault);
+ returnComPtr(outBlob, blob);
+ return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-device.h b/tools/gfx/cuda/cuda-device.h
new file mode 100644
index 000000000..e6a91393b
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.h
@@ -0,0 +1,152 @@
+// cuda-device.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-command-buffer.h"
+#include "cuda-context.h"
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class DeviceImpl : public RendererBase
+{
+private:
+ static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
+ static int _calcSMCountPerMultiProcessor(int major, int minor);
+
+ static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex);
+
+ static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal);
+
+private:
+ int m_deviceIndex = -1;
+ CUdevice m_device = 0;
+ RefPtr<CUDAContext> m_context;
+ DeviceInfo m_info;
+ String m_adapterName;
+
+public:
+ virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override;
+
+ virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override;
+
+ Result getCUDAFormat(Format format, CUarray_format* outFormat);
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
+ const ITextureResource::Desc& desc,
+ const ITextureResource::SubresourceData* initData,
+ ITextureResource** outResource) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
+ const IBufferResource::Desc& descIn,
+ const void* initData,
+ IBufferResource** outResource) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle(
+ InteropHandle handle,
+ const IBufferResource::Desc& desc,
+ IBufferResource** outResource) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle(
+ InteropHandle handle,
+ const ITextureResource::Desc& desc,
+ const size_t size,
+ ITextureResource** outResource) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
+ ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
+ IBufferResource* buffer,
+ IBufferResource* counterBuffer,
+ IResourceView::Desc const& desc,
+ IResourceView** outView) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool(
+ const IQueryPool::Desc& desc,
+ IQueryPool** outPool) override;
+
+ virtual Result createShaderObjectLayout(
+ slang::TypeLayoutReflection* typeLayout,
+ ShaderObjectLayoutBase** outLayout) override;
+
+ virtual Result createShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject) override;
+
+ virtual Result createMutableShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject) override;
+
+ Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject);
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+ const IShaderProgram::Desc& desc,
+ IShaderProgram** outProgram,
+ ISlangBlob** outDiagnosticBlob) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
+ const ComputePipelineStateDesc& desc, IPipelineState** outState) override;
+
+ void* map(IBufferResource* buffer);
+
+ void unmap(IBufferResource* buffer);
+
+ virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override;
+
+public:
+ using TransientResourceHeapImpl = SimpleTransientResourceHeap<DeviceImpl, CommandBufferImpl>;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap(
+ const ITransientResourceHeap::Desc& desc,
+ ITransientResourceHeap** outHeap) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
+ const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
+ const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
+ const IRenderPassLayout::Desc& desc,
+ IRenderPassLayout** outRenderPassLayout) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
+ IInputLayout::Desc const& desc,
+ IInputLayout** outLayout) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
+ const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override;
+
+ virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
+ ITextureResource* texture,
+ ResourceState state,
+ ISlangBlob** outBlob,
+ size_t* outRowPitch,
+ size_t* outPixelSize) override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
+ IBufferResource* buffer,
+ size_t offset,
+ size_t size,
+ ISlangBlob** outBlob) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp
new file mode 100644
index 000000000..6325d9fc6
--- /dev/null
+++ b/tools/gfx/cuda/cuda-helper-functions.cpp
@@ -0,0 +1,91 @@
+// cuda-helper-functions.cpp
+#include "cuda-helper-functions.h"
+
+#include "cuda-device.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+SlangResult CUDAErrorInfo::handle() const
+{
+ StringBuilder builder;
+ builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
+
+ if (m_errorName)
+ {
+ builder << m_errorName << " : ";
+ }
+ if (m_errorString)
+ {
+ builder << m_errorString;
+ }
+
+ getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver,
+ builder.getUnownedSlice().begin());
+
+ // Slang::signalUnexpectedError(builder.getBuffer());
+ return SLANG_FAIL;
+}
+
+SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
+{
+ CUDAErrorInfo info(file, line);
+ cuGetErrorString(cuResult, &info.m_errorString);
+ cuGetErrorName(cuResult, &info.m_errorName);
+ return info.handle();
+}
+
+SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
+{
+ return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
+}
+
+# ifdef RENDER_TEST_OPTIX
+
+static bool _isError(OptixResult result)
+{
+ return result != OPTIX_SUCCESS;
+}
+
+# if 1
+static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
+{
+ fprintf(
+ stderr,
+ "%s(%d): optix: %s (%s)\n",
+ file,
+ line,
+ optixGetErrorString(result),
+ optixGetErrorName(result));
+ return SLANG_FAIL;
+}
+
+void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
+{
+ fprintf(stderr, "optix: %s (%s)\n", message, tag);
+}
+# endif
+# endif
+} // namespace cuda
+
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
+{
+RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl();
+SLANG_RETURN_ON_FAIL(result->initialize(*desc));
+returnComPtr(outDevice, result);
+return SLANG_OK;
+}
+#else
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
+{
+SLANG_UNUSED(desc);
+*outDevice = nullptr;
+return SLANG_FAIL;
+}
+#endif
+
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h
new file mode 100644
index 000000000..001e3946a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-helper-functions.h
@@ -0,0 +1,106 @@
+// cuda-helper-functions.h
+#pragma once
+
+#include "slang-gfx.h"
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; }
+SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; }
+
+// A enum used to control if errors are reported on failure of CUDA call.
+enum class CUDAReportStyle
+{
+ Normal,
+ Silent,
+};
+
+struct CUDAErrorInfo
+{
+ CUDAErrorInfo(
+ const char* filePath,
+ int lineNo,
+ const char* errorName = nullptr,
+ const char* errorString = nullptr)
+ : m_filePath(filePath)
+ , m_lineNo(lineNo)
+ , m_errorName(errorName)
+ , m_errorString(errorString)
+ {}
+ SlangResult handle() const;
+
+ const char* m_filePath;
+ int m_lineNo;
+ const char* m_errorName;
+ const char* m_errorString;
+};
+
+// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
+
+SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line);
+SlangResult _handleCUDAError(cudaError_t error, const char* file, int line);
+
+# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
+
+# define SLANG_CUDA_RETURN_ON_FAIL(x) \
+ { \
+ auto _res = x; \
+ if (_isError(_res)) \
+ return SLANG_CUDA_HANDLE_ERROR(_res); \
+ }
+
+# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \
+ { \
+ auto _res = x; \
+ if (_isError(_res)) \
+ { \
+ return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
+ : SLANG_FAIL; \
+ } \
+ }
+
+# define SLANG_CUDA_ASSERT_ON_FAIL(x) \
+ { \
+ auto _res = x; \
+ if (_isError(_res)) \
+ { \
+ SLANG_ASSERT(!"Failed CUDA call"); \
+ }; \
+ }
+
+# ifdef RENDER_TEST_OPTIX
+
+bool _isError(OptixResult result);
+
+# if 1
+SlangResult _handleOptixError(OptixResult result, char const* file, int line);
+
+# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
+# else
+# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
+# endif
+
+# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \
+ do \
+ { \
+ auto _res = EXPR; \
+ if (_isError(_res)) \
+ return SLANG_OPTIX_HANDLE_ERROR(_res); \
+ } while (0)
+
+void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData);
+
+# endif
+
+} // namespace cuda
+#endif
+
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
+
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-pipeline-state.cpp b/tools/gfx/cuda/cuda-pipeline-state.cpp
new file mode 100644
index 000000000..fdbc0ea0e
--- /dev/null
+++ b/tools/gfx/cuda/cuda-pipeline-state.cpp
@@ -0,0 +1,22 @@
+// cuda-pipeline-state.cpp
+#include "cuda-pipeline-state.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+void ComputePipelineStateImpl::init(const ComputePipelineStateDesc& inDesc)
+{
+ PipelineStateDesc pipelineDesc;
+ pipelineDesc.type = PipelineType::Compute;
+ pipelineDesc.compute = inDesc;
+ initializeBase(pipelineDesc);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-pipeline-state.h b/tools/gfx/cuda/cuda-pipeline-state.h
new file mode 100644
index 000000000..e13266d8b
--- /dev/null
+++ b/tools/gfx/cuda/cuda-pipeline-state.h
@@ -0,0 +1,28 @@
+// cuda-pipeline-state.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-shader-program.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class PipelineStateImpl : public PipelineStateBase
+{
+public:
+};
+
+class ComputePipelineStateImpl : public PipelineStateImpl
+{
+public:
+ RefPtr<ShaderProgramImpl> shaderProgram;
+ void init(const ComputePipelineStateDesc& inDesc);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-query.cpp b/tools/gfx/cuda/cuda-query.cpp
new file mode 100644
index 000000000..7e97699f8
--- /dev/null
+++ b/tools/gfx/cuda/cuda-query.cpp
@@ -0,0 +1,48 @@
+// cuda-query.cpp
+#include "cuda-query.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+Result QueryPoolImpl::init(const IQueryPool::Desc& desc)
+{
+ cuEventCreate(&m_startEvent, 0);
+ cuEventRecord(m_startEvent, 0);
+ m_events.setCount(desc.count);
+ for (SlangInt i = 0; i < m_events.getCount(); i++)
+ {
+ cuEventCreate(&m_events[i], 0);
+ }
+ return SLANG_OK;
+}
+
+QueryPoolImpl::~QueryPoolImpl()
+{
+ for (auto& e : m_events)
+ {
+ cuEventDestroy(e);
+ }
+ cuEventDestroy(m_startEvent);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL QueryPoolImpl::getResult(
+ GfxIndex queryIndex, GfxCount count, uint64_t* data)
+{
+ for (GfxIndex i = 0; i < count; i++)
+ {
+ float time = 0.0f;
+ cuEventSynchronize(m_events[i + queryIndex]);
+ cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]);
+ data[i] = (uint64_t)((double)time * 1000.0f);
+ }
+ return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-query.h b/tools/gfx/cuda/cuda-query.h
new file mode 100644
index 000000000..db29f488d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-query.h
@@ -0,0 +1,32 @@
+// cuda-query.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class QueryPoolImpl : public QueryPoolBase
+{
+public:
+ // The event object for each query. Owned by the pool.
+ List<CUevent> m_events;
+
+ // The event that marks the starting point.
+ CUevent m_startEvent;
+
+ Result init(const IQueryPool::Desc& desc);
+
+ ~QueryPoolImpl();
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL getResult(
+ GfxIndex queryIndex, GfxCount count, uint64_t* data) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-resource-views.h b/tools/gfx/cuda/cuda-resource-views.h
new file mode 100644
index 000000000..33bf557d7
--- /dev/null
+++ b/tools/gfx/cuda/cuda-resource-views.h
@@ -0,0 +1,26 @@
+// cuda-resource-views.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-buffer.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ResourceViewImpl : public ResourceViewBase
+{
+public:
+ RefPtr<BufferResourceImpl> memoryResource = nullptr;
+ RefPtr<TextureResourceImpl> textureResource = nullptr;
+ void* proxyBuffer = nullptr;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object-layout.cpp b/tools/gfx/cuda/cuda-shader-object-layout.cpp
new file mode 100644
index 000000000..0cbe23a63
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object-layout.cpp
@@ -0,0 +1,153 @@
+// cuda-shader-object-layout.cpp
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ShaderObjectLayoutImpl::ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout)
+{
+ m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType);
+
+ initBase(renderer, m_elementTypeLayout);
+
+ // Compute the binding ranges that are used to store
+ // the logical contents of the object in memory. These will relate
+ // to the descriptor ranges in the various sets, but not always
+ // in a one-to-one fashion.
+
+ SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount();
+ for (SlangInt r = 0; r < bindingRangeCount; ++r)
+ {
+ slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r);
+ SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r);
+ slang::TypeLayoutReflection* slangLeafTypeLayout =
+ m_elementTypeLayout->getBindingRangeLeafTypeLayout(r);
+
+ SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r);
+ SlangInt rangeIndexInDescriptorSet =
+ m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
+
+ // TODO: This logic assumes that for any binding range that might consume
+ // multiple kinds of resources, the descriptor range for its uniform
+ // usage will be the first one in the range.
+ //
+ // We need to decide whether that assumption is one we intend to support
+ // applications making, or whether they should be forced to perform a
+ // linear search over the descriptor ranges for a specific binding range.
+ //
+ auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
+ descriptorSetIndex, rangeIndexInDescriptorSet);
+
+ Index baseIndex = 0;
+ Index subObjectIndex = 0;
+ switch (slangBindingType)
+ {
+ case slang::BindingType::ConstantBuffer:
+ case slang::BindingType::ParameterBlock:
+ case slang::BindingType::ExistentialValue:
+ baseIndex = m_subObjectCount;
+ subObjectIndex = baseIndex;
+ m_subObjectCount += count;
+ break;
+ case slang::BindingType::RawBuffer:
+ case slang::BindingType::MutableRawBuffer:
+ if (slangLeafTypeLayout->getType()->getElementType() != nullptr)
+ {
+ // A structured buffer occupies both a resource slot and
+ // a sub-object slot.
+ subObjectIndex = m_subObjectCount;
+ m_subObjectCount += count;
+ }
+ baseIndex = m_resourceCount;
+ m_resourceCount += count;
+ break;
+ default:
+ baseIndex = m_resourceCount;
+ m_resourceCount += count;
+ break;
+ }
+
+ BindingRangeInfo bindingRangeInfo;
+ bindingRangeInfo.bindingType = slangBindingType;
+ bindingRangeInfo.count = count;
+ bindingRangeInfo.baseIndex = baseIndex;
+ bindingRangeInfo.uniformOffset = uniformOffset;
+ bindingRangeInfo.subObjectIndex = subObjectIndex;
+ m_bindingRanges.add(bindingRangeInfo);
+ }
+
+ SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
+ for (SlangInt r = 0; r < subObjectRangeCount; ++r)
+ {
+ SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r);
+ auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
+ slang::TypeLayoutReflection* slangLeafTypeLayout =
+ m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
+
+ // A sub-object range can either represent a sub-object of a known
+ // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
+ // (in which case we can pre-compute a layout to use, based on
+ // the type `Foo`) *or* it can represent a sub-object of some
+ // existential type (e.g., `IBar`) in which case we cannot
+ // know the appropriate type/layout of sub-object to allocate.
+ //
+ RefPtr<ShaderObjectLayoutImpl> subObjectLayout;
+ if (slangBindingType != slang::BindingType::ExistentialValue)
+ {
+ subObjectLayout =
+ new ShaderObjectLayoutImpl(renderer, slangLeafTypeLayout->getElementTypeLayout());
+ }
+
+ SubObjectRangeInfo subObjectRange;
+ subObjectRange.bindingRangeIndex = bindingRangeIndex;
+ subObjectRange.layout = subObjectLayout;
+ subObjectRanges.add(subObjectRange);
+ }
+}
+
+Index ShaderObjectLayoutImpl::getResourceCount() const { return m_resourceCount; }
+Index ShaderObjectLayoutImpl::getSubObjectCount() const { return m_subObjectCount; }
+List<SubObjectRangeInfo>& ShaderObjectLayoutImpl::getSubObjectRanges() { return subObjectRanges; }
+BindingRangeInfo ShaderObjectLayoutImpl::getBindingRange(Index index) { return m_bindingRanges[index]; }
+Index ShaderObjectLayoutImpl::getBindingRangeCount() const { return m_bindingRanges.getCount(); }
+
+RootShaderObjectLayoutImpl::RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout)
+ : ShaderObjectLayoutImpl(renderer, inProgramLayout->getGlobalParamsTypeLayout())
+ , programLayout(inProgramLayout)
+{
+ for (UInt i = 0; i < programLayout->getEntryPointCount(); i++)
+ {
+ entryPointLayouts.add(new ShaderObjectLayoutImpl(
+ renderer,
+ programLayout->getEntryPointByIndex(i)->getTypeLayout()));
+ }
+
+}
+
+int RootShaderObjectLayoutImpl::getKernelIndex(UnownedStringSlice kernelName)
+{
+ for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
+ {
+ auto entryPoint = programLayout->getEntryPointByIndex(i);
+ if (kernelName == entryPoint->getName())
+ {
+ return i;
+ }
+ }
+ return -1;
+}
+
+void RootShaderObjectLayoutImpl::getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
+{
+ auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
+ entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object-layout.h b/tools/gfx/cuda/cuda-shader-object-layout.h
new file mode 100644
index 000000000..305129109
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object-layout.h
@@ -0,0 +1,74 @@
+// cuda-shader-object-layout.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+struct BindingRangeInfo
+{
+ slang::BindingType bindingType;
+ Index count;
+ Index baseIndex; // Flat index for sub-objects
+ Index subObjectIndex;
+
+ // TODO: The `uniformOffset` field should be removed,
+ // since it cannot be supported by the Slang reflection
+ // API once we fix some design issues.
+ //
+ // It is only being used today for pre-allocation of sub-objects
+ // for constant buffers and parameter blocks (which should be
+ // deprecated/removed anyway).
+ //
+ // Note: We would need to bring this field back, plus
+ // a lot of other complexity, if we ever want to support
+ // setting of resources/buffers directly by a binding
+ // range index and array index.
+ //
+ Index uniformOffset; // Uniform offset for a resource typed field.
+};
+
+struct SubObjectRangeInfo
+{
+ RefPtr<ShaderObjectLayoutImpl> layout;
+ Index bindingRangeIndex;
+};
+
+class ShaderObjectLayoutImpl : public ShaderObjectLayoutBase
+{
+public:
+ List<SubObjectRangeInfo> subObjectRanges;
+ List<BindingRangeInfo> m_bindingRanges;
+
+ Index m_subObjectCount = 0;
+ Index m_resourceCount = 0;
+
+ ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout);
+
+ Index getResourceCount() const;
+ Index getSubObjectCount() const;
+ List<SubObjectRangeInfo>& getSubObjectRanges();
+ BindingRangeInfo getBindingRange(Index index);
+ Index getBindingRangeCount() const;
+};
+
+class RootShaderObjectLayoutImpl : public ShaderObjectLayoutImpl
+{
+public:
+ slang::ProgramLayout* programLayout = nullptr;
+ List<RefPtr<ShaderObjectLayoutImpl>> entryPointLayouts;
+ RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout);
+
+ int getKernelIndex(UnownedStringSlice kernelName);
+
+ void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp
new file mode 100644
index 000000000..3fc55e401
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object.cpp
@@ -0,0 +1,349 @@
+// cuda-shader-object.cpp
+#include "cuda-shader-object.h"
+
+#include "cuda-shader-object-layout.h"
+#include "cuda-resource-views.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+Result ShaderObjectData::setCount(Index count)
+{
+ if (isHostOnly)
+ {
+ m_cpuBuffer.setCount(count);
+ if (!m_bufferView)
+ {
+ IResourceView::Desc viewDesc = {};
+ viewDesc.type = IResourceView::Type::UnorderedAccess;
+ m_bufferView = new ResourceViewImpl();
+ m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer();
+ m_bufferView->m_desc = viewDesc;
+ }
+ return SLANG_OK;
+ }
+
+ if (!m_bufferResource)
+ {
+ IBufferResource::Desc desc;
+ desc.type = IResource::Type::Buffer;
+ desc.sizeInBytes = count;
+ m_bufferResource = new BufferResourceImpl(desc);
+ if (count)
+ {
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
+ }
+ IResourceView::Desc viewDesc = {};
+ viewDesc.type = IResourceView::Type::UnorderedAccess;
+ m_bufferView = new ResourceViewImpl();
+ m_bufferView->memoryResource = m_bufferResource;
+ m_bufferView->m_desc = viewDesc;
+ }
+ auto oldSize = m_bufferResource->getDesc()->sizeInBytes;
+ if ((size_t)count != oldSize)
+ {
+ void* newMemory = nullptr;
+ if (count)
+ {
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
+ }
+ if (oldSize)
+ {
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+ newMemory,
+ m_bufferResource->m_cudaMemory,
+ Math::Min((size_t)count, oldSize),
+ cudaMemcpyDefault));
+ }
+ cudaFree(m_bufferResource->m_cudaMemory);
+ m_bufferResource->m_cudaMemory = newMemory;
+ m_bufferResource->getDesc()->sizeInBytes = count;
+ }
+ return SLANG_OK;
+}
+
+Slang::Index ShaderObjectData::getCount()
+{
+ if (isHostOnly)
+ return m_cpuBuffer.getCount();
+ if (m_bufferResource)
+ return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes);
+ else
+ return 0;
+}
+
+void* ShaderObjectData::getBuffer()
+{
+ if (isHostOnly)
+ return m_cpuBuffer.getBuffer();
+
+ if (m_bufferResource)
+ return m_bufferResource->m_cudaMemory;
+ return nullptr;
+}
+
+/// Returns a resource view for GPU access into the buffer content.
+ResourceViewBase* ShaderObjectData::getResourceView(
+ RendererBase* device,
+ slang::TypeLayoutReflection* elementLayout,
+ slang::BindingType bindingType)
+{
+ SLANG_UNUSED(device);
+ m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize();
+ return m_bufferView.Ptr();
+}
+
+SlangResult ShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout)
+{
+ m_layout = typeLayout;
+
+ // If the layout tells us that there is any uniform data,
+ // then we need to allocate a constant buffer to hold that data.
+ //
+ // TODO: Do we need to allocate a shadow copy for use from
+ // the CPU?
+ //
+ // TODO: When/where do we bind this constant buffer into
+ // a descriptor set for later use?
+ //
+ auto slangLayout = getLayout()->getElementTypeLayout();
+ size_t uniformSize = slangLayout->getSize();
+ if (uniformSize)
+ {
+ m_data.setCount((Index)uniformSize);
+ }
+
+ // If the layout specifies that we have any resources or sub-objects,
+ // then we need to size the appropriate arrays to account for them.
+ //
+ // Note: the counts here are the *total* number of resources/sub-objects
+ // and not just the number of resource/sub-object ranges.
+ //
+ resources.setCount(typeLayout->getResourceCount());
+ m_objects.setCount(typeLayout->getSubObjectCount());
+
+ for (auto subObjectRange : getLayout()->subObjectRanges)
+ {
+ RefPtr<ShaderObjectLayoutImpl> subObjectLayout = subObjectRange.layout;
+
+ // In the case where the sub-object range represents an
+ // existential-type leaf field (e.g., an `IBar`), we
+ // cannot pre-allocate the object(s) to go into that
+ // range, since we can't possibly know what to allocate
+ // at this point.
+ //
+ if (!subObjectLayout)
+ continue;
+ //
+ // Otherwise, we will allocate a sub-object to fill
+ // in each entry in this range, based on the layout
+ // information we already have.
+
+ auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
+ for (Index i = 0; i < bindingRangeInfo.count; ++i)
+ {
+ RefPtr<ShaderObjectImpl> subObject = new ShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
+
+ ShaderOffset offset;
+ offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
+ offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex;
+ offset.bindingArrayIndex = (GfxIndex)i;
+
+ SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
+ }
+ }
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW GfxCount SLANG_MCALL ShaderObjectImpl::getEntryPointCount()
+{
+ return 0;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint)
+{
+ *outEntryPoint = nullptr;
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW const void* SLANG_MCALL ShaderObjectImpl::getRawData()
+{
+ return m_data.getBuffer();
+}
+
+SLANG_NO_THROW Size SLANG_MCALL ShaderObjectImpl::getSize()
+{
+ return (Size)m_data.getCount();
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ShaderObjectImpl::setData(ShaderOffset const& offset, void const* data, Size size)
+{
+ Size temp = m_data.getCount() - (Size)offset.uniformOffset;
+ size = Math::Min(size, temp);
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+ (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ShaderObjectImpl::setResource(ShaderOffset const& offset, IResourceView* resourceView)
+{
+ if (!resourceView)
+ return SLANG_OK;
+
+ auto layout = getLayout();
+
+ auto bindingRangeIndex = offset.bindingRangeIndex;
+ SLANG_ASSERT(bindingRangeIndex >= 0);
+ SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+ auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+
+ auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
+ auto cudaView = static_cast<ResourceViewImpl*>(resourceView);
+
+ resources[viewIndex] = cudaView;
+
+ if (cudaView->textureResource)
+ {
+ if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess)
+ {
+ auto handle = cudaView->textureResource->m_cudaSurfObj;
+ setData(offset, &handle, sizeof(uint64_t));
+ }
+ else
+ {
+ auto handle = cudaView->textureResource->getBindlessHandle();
+ setData(offset, &handle, sizeof(uint64_t));
+ }
+ }
+ else if (cudaView->memoryResource)
+ {
+ auto handle = cudaView->memoryResource->getBindlessHandle();
+ setData(offset, &handle, sizeof(handle));
+ auto sizeOffset = offset;
+ sizeOffset.uniformOffset += sizeof(handle);
+ auto& desc = *cudaView->memoryResource->getDesc();
+ size_t size = desc.sizeInBytes;
+ if (desc.elementSize > 1)
+ size /= desc.elementSize;
+ setData(sizeOffset, &size, sizeof(size));
+ }
+ else if (cudaView->proxyBuffer)
+ {
+ auto handle = cudaView->proxyBuffer;
+ setData(offset, &handle, sizeof(handle));
+ auto sizeOffset = offset;
+ sizeOffset.uniformOffset += sizeof(handle);
+ auto& desc = *cudaView->memoryResource->getDesc();
+ size_t size = desc.sizeInBytes;
+ if (desc.elementSize > 1)
+ size /= desc.elementSize;
+ setData(sizeOffset, &size, sizeof(size));
+ }
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ShaderObjectImpl::setObject(ShaderOffset const& offset, IShaderObject* object)
+{
+ SLANG_RETURN_ON_FAIL(Super::setObject(offset, object));
+
+ auto bindingRangeIndex = offset.bindingRangeIndex;
+ auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex];
+
+ ShaderObjectImpl* subObject = static_cast<ShaderObjectImpl*>(object);
+ switch (bindingRange.bindingType)
+ {
+ default:
+ {
+ void* subObjectDataBuffer = subObject->getBuffer();
+ SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*)));
+ }
+ break;
+ case slang::BindingType::ExistentialValue:
+ case slang::BindingType::RawBuffer:
+ case slang::BindingType::MutableRawBuffer:
+ break;
+ }
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ ShaderObjectImpl::setSampler(ShaderOffset const& offset, ISamplerState* sampler)
+{
+ SLANG_UNUSED(sampler);
+ SLANG_UNUSED(offset);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL ShaderObjectImpl::setCombinedTextureSampler(
+ ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler)
+{
+ SLANG_UNUSED(sampler);
+ setResource(offset, textureView);
+ return SLANG_OK;
+}
+
+EntryPointShaderObjectImpl::EntryPointShaderObjectImpl()
+{
+ m_data.isHostOnly = true;
+}
+
+SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::addRef()
+{
+ return 1;
+}
+
+SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::release()
+{
+ return 1;
+}
+
+SlangResult RootShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout)
+{
+ SLANG_RETURN_ON_FAIL(ShaderObjectImpl::init(device, typeLayout));
+ auto programLayout = dynamic_cast<RootShaderObjectLayoutImpl*>(typeLayout);
+ for (auto& entryPoint : programLayout->entryPointLayouts)
+ {
+ RefPtr<EntryPointShaderObjectImpl> object = new EntryPointShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(object->init(device, entryPoint));
+ entryPointObjects.add(object);
+ }
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW GfxCount SLANG_MCALL RootShaderObjectImpl::getEntryPointCount()
+{
+ return (GfxCount)entryPointObjects.getCount();
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ RootShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint)
+{
+ returnComPtr(outEntryPoint, entryPointObjects[index]);
+ return SLANG_OK;
+}
+
+Result RootShaderObjectImpl::collectSpecializationArgs(ExtendedShaderObjectTypeList& args)
+{
+ SLANG_RETURN_ON_FAIL(ShaderObjectImpl::collectSpecializationArgs(args));
+ for (auto& entryPoint : entryPointObjects)
+ {
+ SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args));
+ }
+ return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object.h b/tools/gfx/cuda/cuda-shader-object.h
new file mode 100644
index 000000000..f564f5eb5
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object.h
@@ -0,0 +1,93 @@
+// cuda-shader-object.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-buffer.h"
+#include "cuda-resource-views.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ShaderObjectData
+{
+public:
+ bool isHostOnly = false;
+ Slang::RefPtr<BufferResourceImpl> m_bufferResource;
+ Slang::RefPtr<ResourceViewImpl> m_bufferView;
+ Slang::List<uint8_t> m_cpuBuffer;
+
+ Result setCount(Index count);
+ Slang::Index getCount();
+ void* getBuffer();
+
+ /// Returns a resource view for GPU access into the buffer content.
+ ResourceViewBase* getResourceView(
+ RendererBase* device,
+ slang::TypeLayoutReflection* elementLayout,
+ slang::BindingType bindingType);
+};
+
+class ShaderObjectImpl
+ : public ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData>
+{
+ typedef ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData>
+ Super;
+
+public:
+ List<RefPtr<ResourceViewImpl>> resources;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ init(IDevice* device, ShaderObjectLayoutImpl* typeLayout);
+
+ virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override;
+
+ virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override;
+
+ virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ setData(ShaderOffset const& offset, void const* data, Size size) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ setResource(ShaderOffset const& offset, IResourceView* resourceView) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ setObject(ShaderOffset const& offset, IShaderObject* object) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ setSampler(ShaderOffset const& offset, ISamplerState* sampler) override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
+ ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override;
+};
+
+class MutableShaderObjectImpl : public MutableShaderObject< MutableShaderObjectImpl, ShaderObjectLayoutImpl>
+{};
+
+class EntryPointShaderObjectImpl : public ShaderObjectImpl
+{
+public:
+ EntryPointShaderObjectImpl();
+};
+
+class RootShaderObjectImpl : public ShaderObjectImpl
+{
+public:
+ virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override;
+ virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override;
+public:
+ List<RefPtr<EntryPointShaderObjectImpl>> entryPointObjects;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ init(IDevice* device, ShaderObjectLayoutImpl* typeLayout) override;
+ virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override;
+ virtual SLANG_NO_THROW Result SLANG_MCALL
+ getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override;
+ virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-program.cpp b/tools/gfx/cuda/cuda-shader-program.cpp
new file mode 100644
index 000000000..73e0c3c19
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-program.cpp
@@ -0,0 +1,20 @@
+// cuda-shader-program.cpp
+#include "cuda-shader-program.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ShaderProgramImpl::~ShaderProgramImpl()
+{
+ if (cudaModule)
+ cuModuleUnload(cudaModule);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-program.h b/tools/gfx/cuda/cuda-shader-program.h
new file mode 100644
index 000000000..b0961bfc3
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-program.h
@@ -0,0 +1,29 @@
+// cuda-shader-program.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-context.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ShaderProgramImpl : public ShaderProgramBase
+{
+public:
+ CUmodule cudaModule = nullptr;
+ CUfunction cudaKernel;
+ String kernelName;
+ RefPtr<RootShaderObjectLayoutImpl> layout;
+ RefPtr<CUDAContext> cudaContext;
+ ~ShaderProgramImpl();
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-texture.cpp b/tools/gfx/cuda/cuda-texture.cpp
new file mode 100644
index 000000000..6e6e42db5
--- /dev/null
+++ b/tools/gfx/cuda/cuda-texture.cpp
@@ -0,0 +1,48 @@
+// cuda-texture.cpp
+#include "cuda-texture.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+TextureResourceImpl::~TextureResourceImpl()
+{
+ if (m_cudaSurfObj)
+ {
+ SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
+ }
+ if (m_cudaTexObj)
+ {
+ SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
+ }
+ if (m_cudaArray)
+ {
+ SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
+ }
+ if (m_cudaMipMappedArray)
+ {
+ SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
+ }
+}
+
+uint64_t TextureResourceImpl::getBindlessHandle()
+{
+ return (uint64_t)m_cudaTexObj;
+}
+
+Result TextureResourceImpl::getNativeResourceHandle(InteropHandle* outHandle)
+{
+ outHandle->handleValue = getBindlessHandle();
+ outHandle->api = InteropHandleAPI::CUDA;
+ return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-texture.h b/tools/gfx/cuda/cuda-texture.h
new file mode 100644
index 000000000..c9cc8569a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-texture.h
@@ -0,0 +1,44 @@
+// cuda-texture.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-context.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class TextureResourceImpl : public TextureResource
+{
+public:
+ TextureResourceImpl(const TextureResource::Desc& desc)
+ : TextureResource(desc)
+ {}
+ ~TextureResourceImpl();
+
+ uint64_t getBindlessHandle();
+
+ // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
+ // a long long
+ CUtexObject m_cudaTexObj = CUtexObject();
+
+ // The surfObj is for reading/writing 'texture like' things, but not for sampling.
+ CUsurfObject m_cudaSurfObj = CUsurfObject();
+
+ CUarray m_cudaArray = CUarray();
+ CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
+
+ void* m_cudaExternalMemory = nullptr;
+
+ RefPtr<CUDAContext> m_cudaContext;
+
+ virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
deleted file mode 100644
index 114751a3d..000000000
--- a/tools/gfx/cuda/render-cuda.cpp
+++ /dev/null
@@ -1,2513 +0,0 @@
-#include "render-cuda.h"
-
-#ifdef GFX_ENABLE_CUDA
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include "core/slang-basic.h"
-#include "core/slang-blob.h"
-#include "core/slang-std-writers.h"
-
-#include "slang.h"
-#include "slang-com-ptr.h"
-#include "slang-com-helper.h"
-#include "../command-writer.h"
-#include "../renderer-shared.h"
-#include "../mutable-shader-object.h"
-#include "../simple-transient-resource-heap.h"
-#include "../slang-context.h"
-#include "../command-encoder-com-forward.h"
-
-# ifdef RENDER_TEST_OPTIX
-
-// The `optix_stubs.h` header produces warnings when compiled with MSVC
-# ifdef _MSC_VER
-# pragma warning(disable: 4996)
-# endif
-
-# include <optix.h>
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-# endif
-
-#endif
-
-namespace gfx
-{
-#ifdef GFX_ENABLE_CUDA
-using namespace Slang;
-
-SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
-
-// A enum used to control if errors are reported on failure of CUDA call.
-enum class CUDAReportStyle
-{
- Normal,
- Silent,
-};
-
-struct CUDAErrorInfo
-{
- CUDAErrorInfo(
- const char* filePath,
- int lineNo,
- const char* errorName = nullptr,
- const char* errorString = nullptr)
- : m_filePath(filePath)
- , m_lineNo(lineNo)
- , m_errorName(errorName)
- , m_errorString(errorString)
- {}
- SlangResult handle() const
- {
- StringBuilder builder;
- builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
-
- if (m_errorName)
- {
- builder << m_errorName << " : ";
- }
- if (m_errorString)
- {
- builder << m_errorString;
- }
-
- getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver,
- builder.getUnownedSlice().begin());
-
- // Slang::signalUnexpectedError(builder.getBuffer());
- return SLANG_FAIL;
- }
-
- const char* m_filePath;
- int m_lineNo;
- const char* m_errorName;
- const char* m_errorString;
-};
-
-// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
-
-static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
-{
- CUDAErrorInfo info(file, line);
- cuGetErrorString(cuResult, &info.m_errorString);
- cuGetErrorName(cuResult, &info.m_errorName);
- return info.handle();
-}
-
-static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
- return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
-# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
-
-# define SLANG_CUDA_RETURN_ON_FAIL(x) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- return SLANG_CUDA_HANDLE_ERROR(_res); \
- }
-# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- { \
- return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
- : SLANG_FAIL; \
- } \
- }
-
-# define SLANG_CUDA_ASSERT_ON_FAIL(x) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- { \
- SLANG_ASSERT(!"Failed CUDA call"); \
- }; \
- }
-
-# ifdef RENDER_TEST_OPTIX
-
-static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }
-
-# if 1
-static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
-{
- fprintf(
- stderr,
- "%s(%d): optix: %s (%s)\n",
- file,
- line,
- optixGetErrorString(result),
- optixGetErrorName(result));
- return SLANG_FAIL;
-}
-# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
-# else
-# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
-# endif
-
-# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \
- do \
- { \
- auto _res = EXPR; \
- if (_isError(_res)) \
- return SLANG_OPTIX_HANDLE_ERROR(_res); \
- } while (0)
-
-void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
-{
- fprintf(stderr, "optix: %s (%s)\n", message, tag);
-}
-
-# endif
-
-class CUDAContext : public RefObject
-{
-public:
- CUcontext m_context = nullptr;
- ~CUDAContext() { cuCtxDestroy(m_context); }
-};
-
-class MemoryCUDAResource : public BufferResource
-{
-public:
- MemoryCUDAResource(const Desc& _desc)
- : BufferResource(_desc)
- {}
-
- ~MemoryCUDAResource()
- {
- if (m_cudaMemory)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
- }
- }
-
- uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; }
-
- void* m_cudaExternalMemory = nullptr;
- void* m_cudaMemory = nullptr;
-
- RefPtr<CUDAContext> m_cudaContext;
-
- virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override
- {
- return (DeviceAddress)m_cudaMemory;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
- {
- outHandle->handleValue = getBindlessHandle();
- outHandle->api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- map(MemoryRange* rangeToRead, void** outPointer) override
- {
- SLANG_UNUSED(rangeToRead);
- SLANG_UNUSED(outPointer);
- return SLANG_FAIL;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override
- {
- SLANG_UNUSED(writtenRange);
- return SLANG_FAIL;
- }
-};
-
-class TextureCUDAResource : public TextureResource
-{
-public:
- TextureCUDAResource(const TextureResource::Desc& desc)
- : TextureResource(desc)
- {}
- ~TextureCUDAResource()
- {
- if (m_cudaSurfObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
- }
- if (m_cudaTexObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
- }
- if (m_cudaArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
- }
- if (m_cudaMipMappedArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
- }
- }
-
- uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; }
-
- // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
- // a long long
- CUtexObject m_cudaTexObj = CUtexObject();
-
- // The surfObj is for reading/writing 'texture like' things, but not for sampling.
- CUsurfObject m_cudaSurfObj = CUsurfObject();
-
- CUarray m_cudaArray = CUarray();
- CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
-
- void* m_cudaExternalMemory = nullptr;
-
- RefPtr<CUDAContext> m_cudaContext;
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
- {
- outHandle->handleValue = getBindlessHandle();
- outHandle->api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-};
-
-class CUDAResourceView : public ResourceViewBase
-{
-public:
- RefPtr<MemoryCUDAResource> memoryResource = nullptr;
- RefPtr<TextureCUDAResource> textureResource = nullptr;
- void* proxyBuffer = nullptr;
-};
-
-class CUDAShaderObjectLayout : public ShaderObjectLayoutBase
-{
-public:
- struct BindingRangeInfo
- {
- slang::BindingType bindingType;
- Index count;
- Index baseIndex; // Flat index for sub-ojects
- Index subObjectIndex;
-
- // TODO: The `uniformOffset` field should be removed,
- // since it cannot be supported by the Slang reflection
- // API once we fix some design issues.
- //
- // It is only being used today for pre-allocation of sub-objects
- // for constant buffers and parameter blocks (which should be
- // deprecated/removed anyway).
- //
- // Note: We would need to bring this field back, plus
- // a lot of other complexity, if we ever want to support
- // setting of resources/buffers directly by a binding
- // range index and array index.
- //
- Index uniformOffset; // Uniform offset for a resource typed field.
- };
-
- struct SubObjectRangeInfo
- {
- RefPtr<CUDAShaderObjectLayout> layout;
- Index bindingRangeIndex;
- };
-
- List<SubObjectRangeInfo> subObjectRanges;
- List<BindingRangeInfo> m_bindingRanges;
-
- Index m_subObjectCount = 0;
- Index m_resourceCount = 0;
-
- CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout)
- {
- m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType);
-
- initBase(renderer, m_elementTypeLayout);
-
- // Compute the binding ranges that are used to store
- // the logical contents of the object in memory. These will relate
- // to the descriptor ranges in the various sets, but not always
- // in a one-to-one fashion.
-
- SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount();
- for (SlangInt r = 0; r < bindingRangeCount; ++r)
- {
- slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r);
- SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r);
- slang::TypeLayoutReflection* slangLeafTypeLayout =
- m_elementTypeLayout->getBindingRangeLeafTypeLayout(r);
-
- SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r);
- SlangInt rangeIndexInDescriptorSet =
- m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
-
- // TODO: This logic assumes that for any binding range that might consume
- // multiple kinds of resources, the descriptor range for its uniform
- // usage will be the first one in the range.
- //
- // We need to decide whether that assumption is one we intend to support
- // applications making, or whether they should be forced to perform a
- // linear search over the descriptor ranges for a specific binding range.
- //
- auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
- descriptorSetIndex, rangeIndexInDescriptorSet);
-
- Index baseIndex = 0;
- Index subObjectIndex = 0;
- switch (slangBindingType)
- {
- case slang::BindingType::ConstantBuffer:
- case slang::BindingType::ParameterBlock:
- case slang::BindingType::ExistentialValue:
- baseIndex = m_subObjectCount;
- subObjectIndex = baseIndex;
- m_subObjectCount += count;
- break;
- case slang::BindingType::RawBuffer:
- case slang::BindingType::MutableRawBuffer:
- if (slangLeafTypeLayout->getType()->getElementType() != nullptr)
- {
- // A structured buffer occupies both a resource slot and
- // a sub-object slot.
- subObjectIndex = m_subObjectCount;
- m_subObjectCount += count;
- }
- baseIndex = m_resourceCount;
- m_resourceCount += count;
- break;
- default:
- baseIndex = m_resourceCount;
- m_resourceCount += count;
- break;
- }
-
- BindingRangeInfo bindingRangeInfo;
- bindingRangeInfo.bindingType = slangBindingType;
- bindingRangeInfo.count = count;
- bindingRangeInfo.baseIndex = baseIndex;
- bindingRangeInfo.uniformOffset = uniformOffset;
- bindingRangeInfo.subObjectIndex = subObjectIndex;
- m_bindingRanges.add(bindingRangeInfo);
- }
-
- SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
- for (SlangInt r = 0; r < subObjectRangeCount; ++r)
- {
- SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r);
- auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
- slang::TypeLayoutReflection* slangLeafTypeLayout =
- m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
-
- // A sub-object range can either represent a sub-object of a known
- // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
- // (in which case we can pre-compute a layout to use, based on
- // the type `Foo`) *or* it can represent a sub-object of some
- // existential type (e.g., `IBar`) in which case we cannot
- // know the appropraite type/layout of sub-object to allocate.
- //
- RefPtr<CUDAShaderObjectLayout> subObjectLayout;
- if (slangBindingType != slang::BindingType::ExistentialValue)
- {
- subObjectLayout =
- new CUDAShaderObjectLayout(renderer, slangLeafTypeLayout->getElementTypeLayout());
- }
-
- SubObjectRangeInfo subObjectRange;
- subObjectRange.bindingRangeIndex = bindingRangeIndex;
- subObjectRange.layout = subObjectLayout;
- subObjectRanges.add(subObjectRange);
- }
- }
-
- Index getResourceCount() const { return m_resourceCount; }
- Index getSubObjectCount() const { return m_subObjectCount; }
- List<SubObjectRangeInfo>& getSubObjectRanges() { return subObjectRanges; }
- BindingRangeInfo getBindingRange(Index index) { return m_bindingRanges[index]; }
- Index getBindingRangeCount() const { return m_bindingRanges.getCount(); }
-};
-
-class CUDAProgramLayout : public CUDAShaderObjectLayout
-{
-public:
- slang::ProgramLayout* programLayout = nullptr;
- List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts;
- CUDAProgramLayout(RendererBase* renderer, slang::ProgramLayout* inProgramLayout)
- : CUDAShaderObjectLayout(renderer, inProgramLayout->getGlobalParamsTypeLayout())
- , programLayout(inProgramLayout)
- {
- for (UInt i =0; i< programLayout->getEntryPointCount(); i++)
- {
- entryPointLayouts.add(new CUDAShaderObjectLayout(
- renderer,
- programLayout->getEntryPointByIndex(i)->getTypeLayout()));
- }
-
- }
-
- int getKernelIndex(UnownedStringSlice kernelName)
- {
- for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
- {
- auto entryPoint = programLayout->getEntryPointByIndex(i);
- if (kernelName == entryPoint->getName())
- {
- return i;
- }
- }
- return -1;
- }
-
- void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
- {
- auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
- entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
- }
-};
-
-class CUDAShaderObjectData
-{
-public:
- bool isHostOnly = false;
- Slang::RefPtr<MemoryCUDAResource> m_bufferResource;
- Slang::RefPtr<CUDAResourceView> m_bufferView;
- Slang::List<uint8_t> m_cpuBuffer;
- Result setCount(Index count)
- {
- if (isHostOnly)
- {
- m_cpuBuffer.setCount(count);
- if (!m_bufferView)
- {
- IResourceView::Desc viewDesc = {};
- viewDesc.type = IResourceView::Type::UnorderedAccess;
- m_bufferView = new CUDAResourceView();
- m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer();
- m_bufferView->m_desc = viewDesc;
- }
- return SLANG_OK;
- }
-
- if (!m_bufferResource)
- {
- IBufferResource::Desc desc;
- desc.type = IResource::Type::Buffer;
- desc.sizeInBytes = count;
- m_bufferResource = new MemoryCUDAResource(desc);
- if (count)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
- }
- IResourceView::Desc viewDesc = {};
- viewDesc.type = IResourceView::Type::UnorderedAccess;
- m_bufferView = new CUDAResourceView();
- m_bufferView->memoryResource = m_bufferResource;
- m_bufferView->m_desc = viewDesc;
- }
- auto oldSize = m_bufferResource->getDesc()->sizeInBytes;
- if ((size_t)count != oldSize)
- {
- void* newMemory = nullptr;
- if (count)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
- }
- if (oldSize)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- newMemory,
- m_bufferResource->m_cudaMemory,
- Math::Min((size_t)count, oldSize),
- cudaMemcpyDefault));
- }
- cudaFree(m_bufferResource->m_cudaMemory);
- m_bufferResource->m_cudaMemory = newMemory;
- m_bufferResource->getDesc()->sizeInBytes = count;
- }
- return SLANG_OK;
- }
-
- Slang::Index getCount()
- {
- if (isHostOnly)
- return m_cpuBuffer.getCount();
- if (m_bufferResource)
- return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes);
- else
- return 0;
- }
-
- void* getBuffer()
- {
- if (isHostOnly)
- return m_cpuBuffer.getBuffer();
-
- if (m_bufferResource)
- return m_bufferResource->m_cudaMemory;
- return nullptr;
- }
-
- /// Returns a resource view for GPU access into the buffer content.
- ResourceViewBase* getResourceView(
- RendererBase* device,
- slang::TypeLayoutReflection* elementLayout,
- slang::BindingType bindingType)
- {
- SLANG_UNUSED(device);
- m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize();
- return m_bufferView.Ptr();
- }
-};
-
-class CUDAShaderObject
- : public ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
-{
- typedef ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
- Super;
-
-public:
- List<RefPtr<CUDAResourceView>> resources;
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- init(IDevice* device, CUDAShaderObjectLayout* typeLayout);
-
- virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return 0; }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
- {
- *outEntryPoint = nullptr;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override
- {
- return m_data.getBuffer();
- }
-
- virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override
- {
- return (Size)m_data.getCount();
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setData(ShaderOffset const& offset, void const* data, Size size) override
- {
- size = Math::Min(size, (Size)m_data.getCount() - offset.uniformOffset);
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setResource(ShaderOffset const& offset, IResourceView* resourceView) override
- {
- if (!resourceView)
- return SLANG_OK;
-
- auto layout = getLayout();
-
- auto bindingRangeIndex = offset.bindingRangeIndex;
- SLANG_ASSERT(bindingRangeIndex >= 0);
- SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
-
- auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
-
- auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
- auto cudaView = static_cast<CUDAResourceView*>(resourceView);
-
- resources[viewIndex] = cudaView;
-
- if (cudaView->textureResource)
- {
- if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess)
- {
- auto handle = cudaView->textureResource->m_cudaSurfObj;
- setData(offset, &handle, sizeof(uint64_t));
- }
- else
- {
- auto handle = cudaView->textureResource->getBindlessHandle();
- setData(offset, &handle, sizeof(uint64_t));
- }
- }
- else if (cudaView->memoryResource)
- {
- auto handle = cudaView->memoryResource->getBindlessHandle();
- setData(offset, &handle, sizeof(handle));
- auto sizeOffset = offset;
- sizeOffset.uniformOffset += sizeof(handle);
- auto& desc = *cudaView->memoryResource->getDesc();
- size_t size = desc.sizeInBytes;
- if (desc.elementSize > 1)
- size /= desc.elementSize;
- setData(sizeOffset, &size, sizeof(size));
- }
- else if (cudaView->proxyBuffer)
- {
- auto handle = cudaView->proxyBuffer;
- setData(offset, &handle, sizeof(handle));
- auto sizeOffset = offset;
- sizeOffset.uniformOffset += sizeof(handle);
- auto& desc = *cudaView->memoryResource->getDesc();
- size_t size = desc.sizeInBytes;
- if (desc.elementSize > 1)
- size /= desc.elementSize;
- setData(sizeOffset, &size, sizeof(size));
- }
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setObject(ShaderOffset const& offset, IShaderObject* object) override
- {
- SLANG_RETURN_ON_FAIL(Super::setObject(offset, object));
-
- auto bindingRangeIndex = offset.bindingRangeIndex;
- auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex];
-
- CUDAShaderObject* subObject = static_cast<CUDAShaderObject*>(object);
- switch (bindingRange.bindingType)
- {
- default:
- {
- void* subObjectDataBuffer = subObject->getBuffer();
- SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*)));
- }
- break;
- case slang::BindingType::ExistentialValue:
- case slang::BindingType::RawBuffer:
- case slang::BindingType::MutableRawBuffer:
- break;
- }
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setSampler(ShaderOffset const& offset, ISamplerState* sampler) override
- {
- SLANG_UNUSED(sampler);
- SLANG_UNUSED(offset);
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
- ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override
- {
- SLANG_UNUSED(sampler);
- setResource(offset, textureView);
- return SLANG_OK;
- }
-};
-
-class CUDAMutableShaderObject : public MutableShaderObject< CUDAMutableShaderObject, CUDAShaderObjectLayout>
-{};
-
-class CUDAEntryPointShaderObject : public CUDAShaderObject
-{
-public:
- CUDAEntryPointShaderObject() { m_data.isHostOnly = true; }
-};
-
-class CUDARootShaderObject : public CUDAShaderObject
-{
-public:
- virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override { return 1; }
- virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override { return 1; }
-public:
- List<RefPtr<CUDAEntryPointShaderObject>> entryPointObjects;
- virtual SLANG_NO_THROW Result SLANG_MCALL
- init(IDevice* device, CUDAShaderObjectLayout* typeLayout) override;
- virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return (GfxCount)entryPointObjects.getCount(); }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
- {
- returnComPtr(outEntryPoint, entryPointObjects[index]);
- return SLANG_OK;
- }
- virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override
- {
- SLANG_RETURN_ON_FAIL(CUDAShaderObject::collectSpecializationArgs(args));
- for (auto& entryPoint : entryPointObjects)
- {
- SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args));
- }
- return SLANG_OK;
- }
-};
-
-class CUDAShaderProgram : public ShaderProgramBase
-{
-public:
- CUmodule cudaModule = nullptr;
- CUfunction cudaKernel;
- String kernelName;
- RefPtr<CUDAProgramLayout> layout;
- RefPtr<CUDAContext> cudaContext;
- ~CUDAShaderProgram()
- {
- if (cudaModule)
- cuModuleUnload(cudaModule);
- }
-};
-
-class CUDAPipelineState : public PipelineStateBase
-{
-public:
- RefPtr<CUDAShaderProgram> shaderProgram;
- void init(const ComputePipelineStateDesc& inDesc)
- {
- PipelineStateDesc pipelineDesc;
- pipelineDesc.type = PipelineType::Compute;
- pipelineDesc.compute = inDesc;
- initializeBase(pipelineDesc);
- }
-};
-
-class CUDAQueryPool : public QueryPoolBase
-{
-public:
- // The event object for each query. Owned by the pool.
- List<CUevent> m_events;
-
- // The event that marks the starting point.
- CUevent m_startEvent;
-
- Result init(const IQueryPool::Desc& desc)
- {
- cuEventCreate(&m_startEvent, 0);
- cuEventRecord(m_startEvent, 0);
- m_events.setCount(desc.count);
- for (SlangInt i = 0; i < m_events.getCount(); i++)
- {
- cuEventCreate(&m_events[i], 0);
- }
- return SLANG_OK;
- }
-
- ~CUDAQueryPool()
- {
- for (auto& e : m_events)
- {
- cuEventDestroy(e);
- }
- cuEventDestroy(m_startEvent);
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getResult(
- GfxIndex queryIndex, GfxCount count, uint64_t* data) override
- {
- for (GfxIndex i = 0; i < count; i++)
- {
- float time = 0.0f;
- cuEventSynchronize(m_events[i + queryIndex]);
- cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]);
- data[i] = (uint64_t)((double)time * 1000.0f);
- }
- return SLANG_OK;
- }
-};
-
-class CUDADevice : public RendererBase
-{
-private:
- static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
- static int _calcSMCountPerMultiProcessor(int major, int minor)
- {
- // Defines for GPU Architecture types (using the SM version to determine
- // the # of cores per SM
- struct SMInfo
- {
- int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
- int coreCount;
- };
-
- static const SMInfo infos[] = {
- {0x30, 192},
- {0x32, 192},
- {0x35, 192},
- {0x37, 192},
- {0x50, 128},
- {0x52, 128},
- {0x53, 128},
- {0x60, 64},
- {0x61, 128},
- {0x62, 128},
- {0x70, 64},
- {0x72, 64},
- {0x75, 64}};
-
- const int sm = ((major << 4) + minor);
- for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
- {
- if (infos[i].sm == sm)
- {
- return infos[i].coreCount;
- }
- }
-
- const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
-
- // It must be newer presumably
- SLANG_ASSERT(sm > last.sm);
-
- // Default to the last entry
- return last.coreCount;
- }
-
- static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
- {
- int smPerMultiproc = 0;
- int maxPerfDevice = -1;
- int deviceCount = 0;
- int devicesProhibited = 0;
-
- uint64_t maxComputePerf = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
-
- // Find the best CUDA capable GPU device
- for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
- {
- int computeMode = -1, major = 0, minor = 0;
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
-
- // If this GPU is not running on Compute Mode prohibited,
- // then we can add it to the list
- if (computeMode != cudaComputeModeProhibited)
- {
- if (major == 9999 && minor == 9999)
- {
- smPerMultiproc = 1;
- }
- else
- {
- smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
- }
-
- int multiProcessorCount = 0, clockRate = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
- &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
- uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
-
- if (compute_perf > maxComputePerf)
- {
- maxComputePerf = compute_perf;
- maxPerfDevice = currentDevice;
- }
- }
- else
- {
- devicesProhibited++;
- }
- }
-
- if (maxPerfDevice < 0)
- {
- return SLANG_FAIL;
- }
-
- *outDeviceIndex = maxPerfDevice;
- return SLANG_OK;
- }
-
- static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
- {
- static CUresult res = cuInit(0);
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
- return SLANG_OK;
- }
-
-private:
- int m_deviceIndex = -1;
- CUdevice m_device = 0;
- RefPtr<CUDAContext> m_context;
- DeviceInfo m_info;
- String m_adapterName;
-
-public:
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override
- {
- outHandles->handles[0].handleValue = (uint64_t)m_device;
- outHandles->handles[0].api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-
- class CommandQueueImpl;
-
- class CommandBufferImpl
- : public ICommandBuffer
- , public CommandWriter
- , public ComObject
- {
- public:
- SLANG_COM_OBJECT_IUNKNOWN_ALL
- ICommandBuffer* getInterface(const Guid& guid)
- {
- if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer)
- return static_cast<ICommandBuffer*>(this);
- return nullptr;
- }
- public:
- CUDADevice* m_device;
- TransientResourceHeapBase* m_transientHeap;
-
- void init(CUDADevice* device, TransientResourceHeapBase* transientHeap)
- {
- m_device = device;
- m_transientHeap = transientHeap;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
- IRenderPassLayout* renderPass,
- IFramebuffer* framebuffer,
- IRenderCommandEncoder** outEncoder) override
- {
- SLANG_UNUSED(renderPass);
- SLANG_UNUSED(framebuffer);
- *outEncoder = nullptr;
- }
-
- class ResourceCommandEncoderImpl : public IResourceCommandEncoder
- {
- public:
- CommandWriter* m_writer;
-
- void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; }
-
- virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
- virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
- IBufferResource* dst,
- Offset dstOffset,
- IBufferResource* src,
- Offset srcOffset,
- Size size) override
- {
- m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier(
- GfxCount count,
- ITextureResource* const* textures,
- ResourceState src,
- ResourceState dst) override
- {}
-
- virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier(
- GfxCount count,
- IBufferResource* const* buffers,
- ResourceState src,
- ResourceState dst) override
- {}
-
- virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData(
- IBufferResource* dst, Offset offset, Size size, void* data) override
- {
- m_writer->uploadBufferData(dst, offset, size, data);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- writeTimestamp(IQueryPool* pool, GfxIndex index) override
- {
- m_writer->writeTimestamp(pool, index);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL copyTexture(
- ITextureResource* dst,
- ResourceState dstState,
- SubresourceRange dstSubresource,
- ITextureResource::Offset3D dstOffset,
- ITextureResource* src,
- ResourceState srcState,
- SubresourceRange srcSubresource,
- ITextureResource::Offset3D srcOffset,
- ITextureResource::Extents extent) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(dstState);
- SLANG_UNUSED(dstSubresource);
- SLANG_UNUSED(dstOffset);
- SLANG_UNUSED(src);
- SLANG_UNUSED(srcState);
- SLANG_UNUSED(srcSubresource);
- SLANG_UNUSED(srcOffset);
- SLANG_UNUSED(extent);
- SLANG_UNIMPLEMENTED_X("copyTexture");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData(
- ITextureResource* dst,
- SubresourceRange subResourceRange,
- ITextureResource::Offset3D offset,
- ITextureResource::Extents extent,
- ITextureResource::SubresourceData* subResourceData,
- GfxCount subResourceDataCount) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(subResourceRange);
- SLANG_UNUSED(offset);
- SLANG_UNUSED(extent);
- SLANG_UNUSED(subResourceData);
- SLANG_UNUSED(subResourceDataCount);
- SLANG_UNIMPLEMENTED_X("uploadTextureData");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView(
- IResourceView* view,
- ClearValue* clearValue,
- ClearResourceViewFlags::Enum flags) override
- {
- SLANG_UNUSED(view);
- SLANG_UNUSED(clearValue);
- SLANG_UNUSED(flags);
- SLANG_UNIMPLEMENTED_X("clearResourceView");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL resolveResource(
- ITextureResource* source,
- ResourceState sourceState,
- SubresourceRange sourceRange,
- ITextureResource* dest,
- ResourceState destState,
- SubresourceRange destRange) override
- {
- SLANG_UNUSED(source);
- SLANG_UNUSED(sourceState);
- SLANG_UNUSED(sourceRange);
- SLANG_UNUSED(dest);
- SLANG_UNUSED(destState);
- SLANG_UNUSED(destRange);
- SLANG_UNIMPLEMENTED_X("resolveResource");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery(
- IQueryPool* queryPool,
- GfxIndex index,
- GfxCount count,
- IBufferResource* buffer,
- Offset offset) override
- {
- SLANG_UNUSED(queryPool);
- SLANG_UNUSED(index);
- SLANG_UNUSED(count);
- SLANG_UNUSED(buffer);
- SLANG_UNUSED(offset);
- SLANG_UNIMPLEMENTED_X("resolveQuery");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer(
- IBufferResource* dst,
- Offset dstOffset,
- Size dstSize,
- Size dstRowStride,
- ITextureResource* src,
- ResourceState srcState,
- SubresourceRange srcSubresource,
- ITextureResource::Offset3D srcOffset,
- ITextureResource::Extents extent) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(dstOffset);
- SLANG_UNUSED(dstSize);
- SLANG_UNUSED(dstRowStride);
- SLANG_UNUSED(src);
- SLANG_UNUSED(srcState);
- SLANG_UNUSED(srcSubresource);
- SLANG_UNUSED(srcOffset);
- SLANG_UNUSED(extent);
- SLANG_UNIMPLEMENTED_X("copyTextureToBuffer");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier(
- ITextureResource* texture,
- SubresourceRange subresourceRange,
- ResourceState src,
- ResourceState dst) override
- {
- SLANG_UNUSED(texture);
- SLANG_UNUSED(subresourceRange);
- SLANG_UNUSED(src);
- SLANG_UNUSED(dst);
- SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier");
- }
- virtual SLANG_NO_THROW void SLANG_MCALL
- beginDebugEvent(const char* name, float rgbColor[3]) override
- {
- SLANG_UNUSED(name);
- SLANG_UNUSED(rgbColor);
- }
- virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {}
- };
-
- ResourceCommandEncoderImpl m_resourceCommandEncoder;
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeResourceCommands(IResourceCommandEncoder** outEncoder) override
- {
- m_resourceCommandEncoder.init(this);
- *outEncoder = &m_resourceCommandEncoder;
- }
-
- class ComputeCommandEncoderImpl
- : public IComputeCommandEncoder
- , public ResourceCommandEncoderImpl
- {
- public:
- SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl)
- public:
- CommandWriter* m_writer;
- CommandBufferImpl* m_commandBuffer;
- RefPtr<ShaderObjectBase> m_rootObject;
- virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
- void init(CommandBufferImpl* cmdBuffer)
- {
- m_writer = cmdBuffer;
- m_commandBuffer = cmdBuffer;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override
- {
- m_writer->setPipelineState(state);
- PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
- SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
- pipelineImpl->m_program, m_rootObject.writeRef()));
- returnComPtr(outRootObject, m_rootObject);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override
- {
- m_writer->setPipelineState(state);
- PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
- SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
- pipelineImpl->m_program, m_rootObject.writeRef()));
- m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override
- {
- m_writer->bindRootShaderObject(m_rootObject);
- m_writer->dispatchCompute(x, y, z);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override
- {
- SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect");
- }
- };
-
- ComputeCommandEncoderImpl m_computeCommandEncoder;
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeComputeCommands(IComputeCommandEncoder** outEncoder) override
- {
- m_computeCommandEncoder.init(this);
- *outEncoder = &m_computeCommandEncoder;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override
- {
- *outEncoder = nullptr;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
- {
- return SLANG_FAIL;
- }
- };
-
- class CommandQueueImpl
- : public ICommandQueue
- , public ComObject
- {
- public:
- SLANG_COM_OBJECT_IUNKNOWN_ALL
- ICommandQueue* getInterface(const Guid& guid)
- {
- if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
- return static_cast<ICommandQueue*>(this);
- return nullptr;
- }
-
- public:
- RefPtr<CUDAPipelineState> currentPipeline;
- RefPtr<CUDARootShaderObject> currentRootObject;
- RefPtr<CUDADevice> renderer;
- CUstream stream;
- Desc m_desc;
- public:
- void init(CUDADevice* inRenderer)
- {
- renderer = inRenderer;
- m_desc.type = ICommandQueue::QueueType::Graphics;
- cuStreamCreate(&stream, 0);
- }
- ~CommandQueueImpl()
- {
- cuStreamSynchronize(stream);
- cuStreamDestroy(stream);
- currentPipeline = nullptr;
- currentRootObject = nullptr;
- }
-
- public:
- virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override
- {
- return m_desc;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers(
- GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override
- {
- SLANG_UNUSED(valueToSignal);
- // TODO: implement fence.
- assert(fence == nullptr);
- for (GfxIndex i = 0; i < count; i++)
- {
- execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
- }
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override
- {
- auto resultCode = cuStreamSynchronize(stream);
- if (resultCode != cudaSuccess)
- SLANG_CUDA_HANDLE_ERROR(resultCode);
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice(
- GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override
- {
- return SLANG_FAIL;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
- {
- return SLANG_FAIL;
- }
-
- public:
- void setPipelineState(IPipelineState* state)
- {
- currentPipeline = dynamic_cast<CUDAPipelineState*>(state);
- }
-
- Result bindRootShaderObject(IShaderObject* object)
- {
- currentRootObject = dynamic_cast<CUDARootShaderObject*>(object);
- if (currentRootObject)
- return SLANG_OK;
- return SLANG_E_INVALID_ARG;
- }
-
- void dispatchCompute(int x, int y, int z)
- {
- // Specialize the compute kernel based on the shader object bindings.
- RefPtr<PipelineStateBase> newPipeline;
- renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
- currentPipeline = static_cast<CUDAPipelineState*>(newPipeline.Ptr());
-
- // Find out thread group size from program reflection.
- auto& kernelName = currentPipeline->shaderProgram->kernelName;
- auto programLayout = static_cast<CUDAProgramLayout*>(currentRootObject->getLayout());
- int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
- SLANG_ASSERT(kernelId != -1);
- UInt threadGroupSize[3];
- programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
-
- int sharedSizeInBytes;
- cuFuncGetAttribute(
- &sharedSizeInBytes,
- CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
- currentPipeline->shaderProgram->cudaKernel);
-
- // Copy global parameter data to the `SLANG_globalParams` symbol.
- {
- CUdeviceptr globalParamsSymbol = 0;
- size_t globalParamsSymbolSize = 0;
- cuModuleGetGlobal(
- &globalParamsSymbol,
- &globalParamsSymbolSize,
- currentPipeline->shaderProgram->cudaModule,
- "SLANG_globalParams");
-
- CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
- cudaMemcpyAsync(
- (void*)globalParamsSymbol,
- (void*)globalParamsCUDAData,
- globalParamsSymbolSize,
- cudaMemcpyDefault,
- 0);
- }
- //
- // The argument data for the entry-point parameters are already
- // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
- //
- auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
- auto entryPointDataSize =
- currentRootObject->entryPointObjects[kernelId]->getBufferSize();
-
- void* extraOptions[] = {
- CU_LAUNCH_PARAM_BUFFER_POINTER,
- entryPointBuffer,
- CU_LAUNCH_PARAM_BUFFER_SIZE,
- &entryPointDataSize,
- CU_LAUNCH_PARAM_END,
- };
-
- // Once we have all the decessary data extracted and/or
- // set up, we can launch the kernel and see what happens.
- //
- auto cudaLaunchResult = cuLaunchKernel(
- currentPipeline->shaderProgram->cudaKernel,
- x,
- y,
- z,
- int(threadGroupSize[0]),
- int(threadGroupSize[1]),
- int(threadGroupSize[2]),
- sharedSizeInBytes,
- stream,
- nullptr,
- extraOptions);
-
- SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
- }
-
- void copyBuffer(
- IBufferResource* dst,
- size_t dstOffset,
- IBufferResource* src,
- size_t srcOffset,
- size_t size)
- {
- auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
- auto srcImpl = static_cast<MemoryCUDAResource*>(src);
- cudaMemcpy(
- (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
- (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
- size,
- cudaMemcpyDefault);
- }
-
- void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
- {
- auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
- cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
- }
-
- void writeTimestamp(IQueryPool* pool, SlangInt index)
- {
- auto poolImpl = static_cast<CUDAQueryPool*>(pool);
- cuEventRecord(poolImpl->m_events[index], stream);
- }
-
- void execute(CommandBufferImpl* commandBuffer)
- {
- for (auto& cmd : commandBuffer->m_commands)
- {
- switch (cmd.name)
- {
- case CommandName::SetPipelineState:
- setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
- break;
- case CommandName::BindRootShaderObject:
- bindRootShaderObject(
- commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
- break;
- case CommandName::DispatchCompute:
- dispatchCompute(
- int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
- break;
- case CommandName::CopyBuffer:
- copyBuffer(
- commandBuffer->getObject<BufferResource>(cmd.operands[0]),
- cmd.operands[1],
- commandBuffer->getObject<BufferResource>(cmd.operands[2]),
- cmd.operands[3],
- cmd.operands[4]);
- break;
- case CommandName::UploadBufferData:
- uploadBufferData(
- commandBuffer->getObject<BufferResource>(cmd.operands[0]),
- cmd.operands[1],
- cmd.operands[2],
- commandBuffer->getData<uint8_t>(cmd.operands[3]));
- break;
- case CommandName::WriteTimestamp:
- writeTimestamp(
- commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
- (SlangInt)cmd.operands[1]);
- }
- }
- }
- };
-
- using TransientResourceHeapImpl = SimpleTransientResourceHeap<CUDADevice, CommandBufferImpl>;
-
-public:
- virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override
- {
- SLANG_RETURN_ON_FAIL(slangContext.initialize(
- desc.slang,
- SLANG_PTX,
- "sm_5_1",
- makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
-
- SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
-
- SLANG_RETURN_ON_FAIL(_initCuda(reportType));
-
- SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
- m_context = new CUDAContext();
-
- int count = -1;
- cuDeviceGetCount(&count);
- SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
-
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
- cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
-
- // Not clear how to detect half support on CUDA. For now we'll assume we have it
- {
- m_features.add("half");
- }
-
- // Initialize DeviceInfo
- {
- m_info.deviceType = DeviceType::CUDA;
- m_info.bindingStyle = BindingStyle::CUDA;
- m_info.projectionStyle = ProjectionStyle::DirectX;
- m_info.apiName = "CUDA";
- static const float kIdentity[] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
- ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
- cudaDeviceProp deviceProperties;
- cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
- m_adapterName = deviceProperties.name;
- m_info.adapterName = m_adapterName.begin();
- m_info.timestampFrequency = 1000000;
- }
-
- return SLANG_OK;
- }
-
- Result getCUDAFormat(Format format, CUarray_format* outFormat)
- {
- // TODO: Expand to cover all available formats that can be supported in CUDA
- switch (format)
- {
- case Format::R32G32B32A32_FLOAT:
- case Format::R32G32B32_FLOAT:
- case Format::R32G32_FLOAT:
- case Format::R32_FLOAT:
- case Format::D32_FLOAT:
- *outFormat = CU_AD_FORMAT_FLOAT;
- return SLANG_OK;
- case Format::R16G16B16A16_FLOAT:
- case Format::R16G16_FLOAT:
- case Format::R16_FLOAT:
- *outFormat = CU_AD_FORMAT_HALF;
- return SLANG_OK;
- case Format::R32G32B32A32_UINT:
- case Format::R32G32B32_UINT:
- case Format::R32G32_UINT:
- case Format::R32_UINT:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
- return SLANG_OK;
- case Format::R16G16B16A16_UINT:
- case Format::R16G16_UINT:
- case Format::R16_UINT:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
- return SLANG_OK;
- case Format::R8G8B8A8_UINT:
- case Format::R8G8_UINT:
- case Format::R8_UINT:
- case Format::R8G8B8A8_UNORM:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
- return SLANG_OK;
- case Format::R32G32B32A32_SINT:
- case Format::R32G32B32_SINT:
- case Format::R32G32_SINT:
- case Format::R32_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT32;
- return SLANG_OK;
- case Format::R16G16B16A16_SINT:
- case Format::R16G16_SINT:
- case Format::R16_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT16;
- return SLANG_OK;
- case Format::R8G8B8A8_SINT:
- case Format::R8G8_SINT:
- case Format::R8_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT8;
- return SLANG_OK;
- default:
- SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
- return SLANG_FAIL;
- }
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
- const ITextureResource::Desc& desc,
- const ITextureResource::SubresourceData* initData,
- ITextureResource** outResource) override
- {
- TextureResource::Desc srcDesc = fixupTextureDesc(desc);
-
- RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(srcDesc);
- tex->m_cudaContext = m_context;
-
- CUresourcetype resourceType;
-
- // The size of the element/texel in bytes
- size_t elementSize = 0;
-
- // Our `ITextureResource::Desc` uses an enumeration to specify
- // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
- // `cuMipmappedArrayCreate` seemingly relies on a policy where
- // the extents of the array in dimenions above the rank are
- // specified as zero (e.g., a 1D texture requires `height==0`).
- //
- // We will start by massaging the extents as specified by the
- // user into a form that CUDA wants/expects, based on the
- // texture shape as specified in the `desc`.
- //
- int width = desc.size.width;
- int height = desc.size.height;
- int depth = desc.size.depth;
- switch (desc.type)
- {
- case IResource::Type::Texture1D:
- height = 0;
- depth = 0;
- break;
-
- case IResource::Type::Texture2D:
- depth = 0;
- break;
-
- case IResource::Type::Texture3D:
- break;
-
- case IResource::Type::TextureCube:
- depth = 1;
- break;
- }
-
- {
- CUarray_format format = CU_AD_FORMAT_FLOAT;
- int numChannels = 0;
-
- SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
- FormatInfo info;
- gfxGetFormatInfo(desc.format, &info);
- numChannels = info.channelCount;
-
- switch (format)
- {
- case CU_AD_FORMAT_FLOAT:
- {
- elementSize = sizeof(float) * numChannels;
- break;
- }
- case CU_AD_FORMAT_HALF:
- {
- elementSize = sizeof(uint16_t) * numChannels;
- break;
- }
- case CU_AD_FORMAT_UNSIGNED_INT8:
- {
- elementSize = sizeof(uint8_t) * numChannels;
- break;
- }
- default:
- {
- SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
- return SLANG_FAIL;
- }
- }
-
- if (desc.numMipLevels > 1)
- {
- resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Width = width;
- arrayDesc.Height = height;
- arrayDesc.Depth = depth;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
- arrayDesc.Flags = 0;
-
- if (desc.arraySize > 1)
- {
- if (desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
- arrayDesc.Depth = desc.arraySize;
- }
- else
- {
- SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
- return SLANG_FAIL;
- }
- }
-
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- arrayDesc.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(
- cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
- }
- else
- {
- resourceType = CU_RESOURCE_TYPE_ARRAY;
-
- if (desc.arraySize > 1)
- {
- if (desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube)
- {
- SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
- return SLANG_FAIL;
- }
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- // Set the depth as the array length
- arrayDesc.Depth = desc.arraySize;
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Depth *= 6;
- }
-
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else if (desc.type == IResource::Type::Texture3D ||
- desc.type == IResource::Type::TextureCube)
- {
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Depth = depth;
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- arrayDesc.Flags = 0;
-
- // Handle cube texture
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Depth = 6;
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- // Allocate the array, will work for 1D or 2D case
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
- }
- }
- }
-
- // Work space for holding data for uploading if it needs to be rearranged
- if (initData)
- {
- List<uint8_t> workspace;
- for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
- {
- int mipWidth = width >> mipLevel;
- int mipHeight = height >> mipLevel;
- int mipDepth = depth >> mipLevel;
-
- mipWidth = (mipWidth == 0) ? 1 : mipWidth;
- mipHeight = (mipHeight == 0) ? 1 : mipHeight;
- mipDepth = (mipDepth == 0) ? 1 : mipDepth;
-
- // If it's a cubemap then the depth is always 6
- if (desc.type == IResource::Type::TextureCube)
- {
- mipDepth = 6;
- }
-
- auto dstArray = tex->m_cudaArray;
- if (tex->m_cudaMipMappedArray)
- {
- // Get the array for the mip level
- SLANG_CUDA_RETURN_ON_FAIL(
- cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
- }
- SLANG_ASSERT(dstArray);
-
- // Check using the desc to see if it's plausible
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
-
- SLANG_ASSERT(mipWidth == arrayDesc.Width);
- SLANG_ASSERT(
- mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
- }
-
- const void* srcDataPtr = nullptr;
-
- if (desc.arraySize > 1)
- {
- SLANG_ASSERT(
- desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube);
-
- // TODO(JS): Here I assume that arrays are just held contiguously within a
- // 'face' This seems reasonable and works with the Copy3D.
- const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- Index faceCount = desc.arraySize;
- if (desc.type == IResource::Type::TextureCube)
- {
- faceCount *= 6;
- }
-
- const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
- workspace.setCount(mipSizeInBytes);
-
- // We need to add the face data from each mip
- // We iterate over face count so we copy all of the cubemap faces
- for (Index j = 0; j < faceCount; j++)
- {
- const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
- // Copy over to the workspace to make contiguous
- ::memcpy(
- workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
- }
-
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- if (desc.type == IResource::Type::TextureCube)
- {
- size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- workspace.setCount(faceSizeInBytes * 6);
- // Copy the data over to make contiguous
- for (Index j = 0; j < 6; j++)
- {
- const auto srcData =
- initData[mipLevel + j * desc.numMipLevels].data;
- ::memcpy(
- workspace.getBuffer() + faceSizeInBytes * j,
- srcData,
- faceSizeInBytes);
- }
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- const auto srcData = initData[mipLevel].data;
- srcDataPtr = srcData;
- }
- }
-
- if (desc.arraySize > 1)
- {
- SLANG_ASSERT(
- desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube);
-
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- // Set the depth to the array length
- copyParam.Depth = desc.arraySize;
-
- if (desc.type == IResource::Type::TextureCube)
- {
- copyParam.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- }
- else
- {
- switch (desc.type)
- {
- case IResource::Type::Texture1D:
- case IResource::Type::Texture2D:
- {
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
- break;
- }
- case IResource::Type::Texture3D:
- case IResource::Type::TextureCube:
- {
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- copyParam.Depth = mipDepth;
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- break;
- }
-
- default:
- {
- SLANG_ASSERT(!"Not implemented");
- break;
- }
- }
- }
- }
- }
- // Set up texture sampling parameters, and create final texture obj
-
- {
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
- resDesc.resType = resourceType;
-
- if (tex->m_cudaArray)
- {
- resDesc.res.array.hArray = tex->m_cudaArray;
- }
- if (tex->m_cudaMipMappedArray)
- {
- resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
- }
-
- // If the texture might be used as a UAV, then we need to allocate
- // a CUDA "surface" for it.
- //
- // Note: We cannot do this unconditionally, because it will fail
- // on surfaces that are not usable as UAVs (e.g., those with
- // mipmaps).
- //
- // TODO: We should really only be allocating the array at the
- // time we create a resource, and then allocate the surface or
- // texture objects as part of view creation.
- //
- if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
- {
- // On CUDA surfaces only support a single MIP map
- SLANG_ASSERT(desc.numMipLevels == 1);
-
- SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
- }
-
-
- // Create handle for sampling.
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
- texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- SLANG_CUDA_RETURN_ON_FAIL(
- cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
- }
-
- returnComPtr(outResource, tex);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
- const IBufferResource::Desc& descIn,
- const void* initData,
- IBufferResource** outResource) override
- {
- auto desc = fixupBufferDesc(descIn);
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
- resource->m_cudaContext = m_context;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
- if (initData)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
- }
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle(
- InteropHandle handle,
- const IBufferResource::Desc& desc,
- IBufferResource** outResource) override
- {
- if (handle.handleValue == 0)
- {
- *outResource = nullptr;
- return SLANG_OK;
- }
-
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
- resource->m_cudaContext = m_context;
-
- // CUDA manages sharing of buffers through the idea of an
- // "external memory" object, which represents the relationship
- // with another API's objects. In order to create this external
- // memory association, we first need to fill in a descriptor struct.
- cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
- memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
- switch (handle.api)
- {
- case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
- break;
- case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
- break;
- default:
- return SLANG_FAIL;
- }
- externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
- externalMemoryHandleDesc.size = desc.sizeInBytes;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
- // Once we have filled in the descriptor, we can request
- // that CUDA create the required association between the
- // external buffer and its own memory.
- cudaExternalMemory_t externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
- resource->m_cudaExternalMemory = externalMemory;
-
- // The CUDA "external memory" handle is not itself a device
- // pointer, so we need to query for a suitable device address
- // for the buffer with another call.
- //
- // Just as for the external memory, we fill in a descriptor
- // structure (although in this case we only need to specify
- // the size).
- cudaExternalMemoryBufferDesc bufferDesc;
- memset(&bufferDesc, 0, sizeof(bufferDesc));
- bufferDesc.size = desc.sizeInBytes;
-
- // Finally, we can "map" the buffer to get a device address.
- void* deviceAddress;
- SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
- resource->m_cudaMemory = deviceAddress;
-
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle(
- InteropHandle handle,
- const ITextureResource::Desc& desc,
- const size_t size,
- ITextureResource** outResource) override
- {
- if (handle.handleValue == 0)
- {
- *outResource = nullptr;
- return SLANG_OK;
- }
-
- RefPtr<TextureCUDAResource> resource = new TextureCUDAResource(desc);
- resource->m_cudaContext = m_context;
-
- // CUDA manages sharing of buffers through the idea of an
- // "external memory" object, which represents the relationship
- // with another API's objects. In order to create this external
- // memory association, we first need to fill in a descriptor struct.
- CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
- memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
- switch (handle.api)
- {
- case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
- break;
- case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
- break;
- default:
- return SLANG_FAIL;
- }
- externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
- externalMemoryHandleDesc.size = size;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
- CUexternalMemory externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
- resource->m_cudaExternalMemory = externalMemory;
-
- FormatInfo formatInfo;
- SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- arrayDesc.Depth = desc.size.depth;
- arrayDesc.Height = desc.size.height;
- arrayDesc.Width = desc.size.width;
- arrayDesc.NumChannels = formatInfo.channelCount;
- getCUDAFormat(desc.format, &arrayDesc.Format);
- arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
-
- CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
- memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
- externalMemoryMipDesc.offset = 0;
- externalMemoryMipDesc.arrayDesc = arrayDesc;
- externalMemoryMipDesc.numLevels = desc.numMipLevels;
-
- CUmipmappedArray mipArray;
- SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
- resource->m_cudaMipMappedArray = mipArray;
-
- CUarray cuArray;
- SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
- resource->m_cudaArray = cuArray;
-
- CUDA_RESOURCE_DESC surfDesc;
- memset(&surfDesc, 0, sizeof(surfDesc));
- surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- surfDesc.res.array.hArray = cuArray;
-
- CUsurfObject surface;
- SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
- resource->m_cudaSurfObj = surface;
-
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
- ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override
- {
- RefPtr<CUDAResourceView> view = new CUDAResourceView();
- view->m_desc = desc;
- view->textureResource = dynamic_cast<TextureCUDAResource*>(texture);
- returnComPtr(outView, view);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
- IBufferResource* buffer,
- IBufferResource* counterBuffer,
- IResourceView::Desc const& desc,
- IResourceView** outView) override
- {
- RefPtr<CUDAResourceView> view = new CUDAResourceView();
- view->m_desc = desc;
- view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer);
- returnComPtr(outView, view);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool(
- const IQueryPool::Desc& desc,
- IQueryPool** outPool) override
- {
- RefPtr<CUDAQueryPool> pool = new CUDAQueryPool();
- SLANG_RETURN_ON_FAIL(pool->init(desc));
- returnComPtr(outPool, pool);
- return SLANG_OK;
- }
-
- virtual Result createShaderObjectLayout(
- slang::TypeLayoutReflection* typeLayout,
- ShaderObjectLayoutBase** outLayout) override
- {
- RefPtr<CUDAShaderObjectLayout> cudaLayout;
- cudaLayout = new CUDAShaderObjectLayout(this, typeLayout);
- returnRefPtrMove(outLayout, cudaLayout);
- return SLANG_OK;
- }
-
- virtual Result createShaderObject(
- ShaderObjectLayoutBase* layout,
- IShaderObject** outObject) override
- {
- RefPtr<CUDAShaderObject> result = new CUDAShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
- returnComPtr(outObject, result);
- return SLANG_OK;
- }
-
- virtual Result createMutableShaderObject(
- ShaderObjectLayoutBase* layout,
- IShaderObject** outObject) override
- {
- RefPtr<CUDAMutableShaderObject> result = new CUDAMutableShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
- returnComPtr(outObject, result);
- return SLANG_OK;
- }
-
- Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
- {
- auto cudaProgram = dynamic_cast<CUDAShaderProgram*>(program);
- auto cudaLayout = cudaProgram->layout;
-
- RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
- returnRefPtrMove(outObject, result);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createProgram(
- const IShaderProgram::Desc& desc,
- IShaderProgram** outProgram,
- ISlangBlob** outDiagnosticBlob) override
- {
- // If this is a specializable program, we just keep a reference to the slang program and
- // don't actually create any kernels. This program will be specialized later when we know
- // the shader object bindings.
- RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram();
- cudaProgram->init(desc);
- cudaProgram->cudaContext = m_context;
- if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
- {
- cudaProgram->layout = new CUDAProgramLayout(this, desc.slangGlobalScope->getLayout());
- returnComPtr(outProgram, cudaProgram);
- return SLANG_OK;
- }
-
- ComPtr<ISlangBlob> kernelCode;
- ComPtr<ISlangBlob> diagnostics;
- auto compileResult = desc.slangGlobalScope->getEntryPointCode(
- (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
- if (diagnostics)
- {
- getDebugCallback()->handleMessage(
- compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
- DebugMessageSource::Slang,
- (char*)diagnostics->getBufferPointer());
- if (outDiagnosticBlob)
- returnComPtr(outDiagnosticBlob, diagnostics);
- }
- SLANG_RETURN_ON_FAIL(compileResult);
-
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
- cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
- &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
-
- auto slangGlobalScope = desc.slangGlobalScope;
- if( slangGlobalScope )
- {
- cudaProgram->slangGlobalScope = slangGlobalScope;
-
- auto slangProgramLayout = slangGlobalScope->getLayout();
- if(!slangProgramLayout)
- return SLANG_FAIL;
-
- RefPtr<CUDAProgramLayout> cudaLayout;
- cudaLayout = new CUDAProgramLayout(this, slangProgramLayout);
- cudaLayout->programLayout = slangProgramLayout;
- cudaProgram->layout = cudaLayout;
- }
-
- returnComPtr(outProgram, cudaProgram);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
- const ComputePipelineStateDesc& desc, IPipelineState** outState) override
- {
- RefPtr<CUDAPipelineState> state = new CUDAPipelineState();
- state->shaderProgram = static_cast<CUDAShaderProgram*>(desc.program);
- state->init(desc);
- returnComPtr(outState, state);
- return Result();
- }
-
- void* map(IBufferResource* buffer)
- {
- return static_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory;
- }
-
- void unmap(IBufferResource* buffer)
- {
- SLANG_UNUSED(buffer);
- }
-
- virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override
- {
- return m_info;
- }
-
-public:
- virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap(
- const ITransientResourceHeap::Desc& desc,
- ITransientResourceHeap** outHeap) override
- {
- RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
- SLANG_RETURN_ON_FAIL(result->init(this, desc));
- returnComPtr(outHeap, result);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override
- {
- RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
- queue->init(this);
- returnComPtr(outQueue, queue);
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
- const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(window);
- SLANG_UNUSED(outSwapchain);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
- const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outLayout);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outFramebuffer);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
- const IRenderPassLayout::Desc& desc,
- IRenderPassLayout** outRenderPassLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outRenderPassLayout);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override
- {
- SLANG_UNUSED(desc);
- *outSampler = nullptr;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
- IInputLayout::Desc const& desc,
- IInputLayout** outLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outLayout);
- return SLANG_E_NOT_AVAILABLE;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
- const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outState);
- return SLANG_E_NOT_AVAILABLE;
- }
-
- virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
- ITextureResource* texture,
- ResourceState state,
- ISlangBlob** outBlob,
- size_t* outRowPitch,
- size_t* outPixelSize) override
- {
- auto textureImpl = static_cast<TextureCUDAResource*>(texture);
- RefPtr<ListBlob> blob = new ListBlob();
-
- auto desc = textureImpl->getDesc();
- auto width = desc->size.width;
- auto height = desc->size.height;
- FormatInfo sizeInfo;
- SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
- size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
- size_t rowPitch = width * pixelSize;
- size_t size = height * rowPitch;
- blob->m_data.setCount((Index)size);
-
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.srcArray = textureImpl->m_cudaArray;
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.dstHost = blob->m_data.getBuffer();
- copyParam.dstPitch = rowPitch;
- copyParam.WidthInBytes = copyParam.dstPitch;
- copyParam.Height = height;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
-
- *outRowPitch = rowPitch;
- *outPixelSize = pixelSize;
- returnComPtr(outBlob, blob);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
- IBufferResource* buffer,
- size_t offset,
- size_t size,
- ISlangBlob** outBlob) override
- {
- auto bufferImpl = static_cast<MemoryCUDAResource*>(buffer);
- RefPtr<ListBlob> blob = new ListBlob();
- blob->m_data.setCount((Index)size);
- cudaMemcpy(
- blob->m_data.getBuffer(),
- (uint8_t*)bufferImpl->m_cudaMemory + offset,
- size,
- cudaMemcpyDefault);
- returnComPtr(outBlob, blob);
- return SLANG_OK;
- }
-};
-
-SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
- m_layout = typeLayout;
-
- // If the layout tells us that there is any uniform data,
- // then we need to allocate a constant buffer to hold that data.
- //
- // TODO: Do we need to allocate a shadow copy for use from
- // the CPU?
- //
- // TODO: When/where do we bind this constant buffer into
- // a descriptor set for later use?
- //
- auto slangLayout = getLayout()->getElementTypeLayout();
- size_t uniformSize = slangLayout->getSize();
- if (uniformSize)
- {
- m_data.setCount((Index)uniformSize);
- }
-
- // If the layout specifies that we have any resources or sub-objects,
- // then we need to size the appropriate arrays to account for them.
- //
- // Note: the counts here are the *total* number of resources/sub-objects
- // and not just the number of resource/sub-object ranges.
- //
- resources.setCount(typeLayout->getResourceCount());
- m_objects.setCount(typeLayout->getSubObjectCount());
-
- for (auto subObjectRange : getLayout()->subObjectRanges)
- {
- RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout;
-
- // In the case where the sub-object range represents an
- // existential-type leaf field (e.g., an `IBar`), we
- // cannot pre-allocate the object(s) to go into that
- // range, since we can't possibly know what to allocate
- // at this point.
- //
- if (!subObjectLayout)
- continue;
- //
- // Otherwise, we will allocate a sub-object to fill
- // in each entry in this range, based on the layout
- // information we already have.
-
- auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
- for (Index i = 0; i < bindingRangeInfo.count; ++i)
- {
- RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject();
- SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
-
- ShaderOffset offset;
- offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
- offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex;
- offset.bindingArrayIndex = (GfxIndex)i;
-
- SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
- }
- }
- return SLANG_OK;
-}
-
-SlangResult CUDARootShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
- SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(device, typeLayout));
- auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout);
- for (auto& entryPoint : programLayout->entryPointLayouts)
- {
- RefPtr<CUDAEntryPointShaderObject> object = new CUDAEntryPointShaderObject();
- SLANG_RETURN_ON_FAIL(object->init(device, entryPoint));
- entryPointObjects.add(object);
- }
- return SLANG_OK;
-}
-
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
- RefPtr<CUDADevice> result = new CUDADevice();
- SLANG_RETURN_ON_FAIL(result->initialize(*desc));
- returnComPtr(outDevice, result);
- return SLANG_OK;
-}
-#else
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
- SLANG_UNUSED(desc);
- *outDevice = nullptr;
- return SLANG_FAIL;
-}
-#endif
-
-}
diff --git a/tools/gfx/cuda/render-cuda.h b/tools/gfx/cuda/render-cuda.h
deleted file mode 100644
index 5c477f513..000000000
--- a/tools/gfx/cuda/render-cuda.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-#include "../renderer-shared.h"
-
-namespace gfx
-{
-
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
-}
diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp
index ef4edb341..880e61c61 100644
--- a/tools/gfx/render.cpp
+++ b/tools/gfx/render.cpp
@@ -2,7 +2,6 @@
#include "renderer-shared.h"
#include "../../source/core/slang-math.h"
#include "open-gl/render-gl.h"
-#include "cuda/render-cuda.h"
#include "cpu/render-cpu.h"
#include "debug-layer.h"
@@ -14,6 +13,7 @@ using namespace Slang;
Result SLANG_MCALL createD3D11Device(const IDevice::Desc* desc, IDevice** outDevice);
Result SLANG_MCALL createD3D12Device(const IDevice::Desc* desc, IDevice** outDevice);
Result SLANG_MCALL createVKDevice(const IDevice::Desc* desc, IDevice** outDevice);
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
static bool debugLayerEnabled = false;