33 files changed, 3457 insertions, 2528 deletions
diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj
index 542c34371..25a48c339 100644
--- a/build/visual-studio/gfx/gfx.vcxproj
+++ b/build/visual-studio/gfx/gfx.vcxproj
@@ -303,7 +303,21 @@
     <ClInclude Include="..\..\..\tools\gfx\command-encoder-com-forward.h" />
     <ClInclude Include="..\..\..\tools\gfx\command-writer.h" />
     <ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h" />
-    <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d11\d3d11-base.h" />
@@ -390,7 +404,18 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp" />
-    <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp" />
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d11\d3d11-buffer.cpp" />
diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters
index d461d3d24..61708be0b 100644
--- a/build/visual-studio/gfx/gfx.vcxproj.filters
+++ b/build/visual-studio/gfx/gfx.vcxproj.filters
@@ -21,7 +21,49 @@
     <ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h">
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-base.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-buffer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-command-queue.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-context.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-device.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-query.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-resource-views.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-object.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-shader-program.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\cuda-texture.h">
       <Filter>Header Files</Filter>
     </ClInclude>
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h">
@@ -278,7 +320,40 @@
     <ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp">
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-buffer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-buffer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-encoder.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-command-queue.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-device.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-helper-functions.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-pipeline-state.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-query.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object-layout.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-object.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-shader-program.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\cuda-texture.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp">
diff --git a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
index 0dd5ad561..e14729718 100644
--- a/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
+++ b/tools/gfx-unit-test/get-cmd-queue-handle-test.cpp
@@ -21,7 +21,7 @@ namespace gfx_test
         GFX_CHECK_CALL_ABORT(queue->getNativeHandle(&handle));
         if (device->getDeviceInfo().deviceType == gfx::DeviceType::Vulkan)
         {
-            SLANG_CHECK(handle.handleValue != NULL);
+            SLANG_CHECK(handle.handleValue != 0);
         }
 #if SLANG_WINDOWS_FAMILY
         else
diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h
new file mode 100644
index 000000000..57a244089
--- /dev/null
+++ b/tools/gfx/cuda/cuda-base.h
@@ -0,0 +1,59 @@
+// cuda-base.h
+// Shared header file for CUDA implementation
+#pragma once
+
+#ifdef GFX_ENABLE_CUDA
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "core/slang-basic.h"
+#include "core/slang-blob.h"
+#include "core/slang-std-writers.h"
+
+#include "slang.h"
+#include "slang-com-ptr.h"
+#include "slang-com-helper.h"
+#include "../command-writer.h"
+#include "../renderer-shared.h"
+#include "../mutable-shader-object.h"
+#include "../simple-transient-resource-heap.h"
+#include "../slang-context.h"
+#include "../command-encoder-com-forward.h"
+
+#   ifdef RENDER_TEST_OPTIX
+
+// The `optix_stubs.h` header produces warnings when compiled with MSVC
+#       ifdef _MSC_VER
+#           pragma warning(disable: 4996)
+#       endif
+
+#       include <optix.h>
+#       include <optix_function_table_definition.h>
+#       include <optix_stubs.h>
+#   endif
+
+#endif
+
+namespace gfx
+{
+namespace cuda
+{
+    class CUDAContext;
+    class BufferResourceImpl;
+    class TextureResourceImpl;
+    class ResourceViewImpl;
+    class ShaderObjectLayoutImpl;
+    class RootShaderObjectLayoutImpl;
+    class ShaderObjectImpl;
+    class MutableShaderObjectImpl;
+    class EntryPointShaderObjectImpl;
+    class RootShaderObjectImpl;
+    class ShaderProgramImpl;
+    class PipelineStateImpl;
+    class QueryPoolImpl;
+    class DeviceImpl;
+    class CommandBufferImpl;
+    class ResourceCommandEncoderImpl;
+    class ComputeCommandEncoderImpl;
+    class CommandQueueImpl;
+}
+}
diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp
new file mode 100644
index 000000000..1cd162841
--- /dev/null
+++ b/tools/gfx/cuda/cuda-buffer.cpp
@@ -0,0 +1,51 @@
+// cuda-buffer.cpp
+#include "cuda-buffer.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+BufferResourceImpl::~BufferResourceImpl()
+{
+    if (m_cudaMemory)
+    {
+        SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+    }
+}
+
+uint64_t BufferResourceImpl::getBindlessHandle() { return (uint64_t)m_cudaMemory; }
+
+DeviceAddress BufferResourceImpl::getDeviceAddress()
+{
+    return (DeviceAddress)m_cudaMemory;
+}
+
+Result BufferResourceImpl::getNativeResourceHandle(InteropHandle* outHandle)
+{
+    outHandle->handleValue = getBindlessHandle();
+    outHandle->api = InteropHandleAPI::CUDA;
+    return SLANG_OK;
+}
+
+Result BufferResourceImpl::map(MemoryRange* rangeToRead, void** outPointer)
+{
+    SLANG_UNUSED(rangeToRead);
+    SLANG_UNUSED(outPointer);
+    return SLANG_FAIL;
+}
+
+Result BufferResourceImpl::unmap(MemoryRange* writtenRange)
+{
+    SLANG_UNUSED(writtenRange);
+    return SLANG_FAIL;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-buffer.h b/tools/gfx/cuda/cuda-buffer.h
new file mode 100644
index 000000000..838a06555
--- /dev/null
+++ b/tools/gfx/cuda/cuda-buffer.h
@@ -0,0 +1,39 @@
+// cuda-buffer.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-context.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class BufferResourceImpl : public BufferResource
+{
+public:
+    BufferResourceImpl(const Desc& _desc)
+        : BufferResource(_desc)
+    {}
+
+    ~BufferResourceImpl();
+
+    uint64_t getBindlessHandle();
+
+    void* m_cudaExternalMemory = nullptr;
+    void* m_cudaMemory = nullptr;
+
+    RefPtr<CUDAContext> m_cudaContext;
+
+    virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        map(MemoryRange* rangeToRead, void** outPointer) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-buffer.cpp b/tools/gfx/cuda/cuda-command-buffer.cpp
new file mode 100644
index 000000000..efb2486f4
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-buffer.cpp
@@ -0,0 +1,62 @@
+// cuda-command-buffer.cpp
+#include "cuda-command-buffer.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandBuffer* CommandBufferImpl::getInterface(const Guid& guid)
+{
+    if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer)
+        return static_cast<ICommandBuffer*>(this);
+    return nullptr;
+}
+
+void CommandBufferImpl::init(DeviceImpl* device, TransientResourceHeapBase* transientHeap)
+{
+    m_device = device;
+    m_transientHeap = transientHeap;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandBufferImpl::encodeRenderCommands(
+    IRenderPassLayout* renderPass,
+    IFramebuffer* framebuffer,
+    IRenderCommandEncoder** outEncoder)
+{
+    SLANG_UNUSED(renderPass);
+    SLANG_UNUSED(framebuffer);
+    *outEncoder = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    CommandBufferImpl::encodeResourceCommands(IResourceCommandEncoder** outEncoder)
+{
+    m_resourceCommandEncoder.init(this);
+    *outEncoder = &m_resourceCommandEncoder;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    CommandBufferImpl::encodeComputeCommands(IComputeCommandEncoder** outEncoder)
+{
+    m_computeCommandEncoder.init(this);
+    *outEncoder = &m_computeCommandEncoder;
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    CommandBufferImpl::encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder)
+{
+    *outEncoder = nullptr;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandBufferImpl::getNativeHandle(InteropHandle* outHandle)
+{
+    return SLANG_FAIL;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-buffer.h b/tools/gfx/cuda/cuda-command-buffer.h
new file mode 100644
index 000000000..76d9aa01f
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-buffer.h
@@ -0,0 +1,49 @@
+// cuda-command-buffer.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-command-encoder.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CommandBufferImpl
+    : public ICommandBuffer
+    , public CommandWriter
+    , public ComObject
+{
+public:
+    SLANG_COM_OBJECT_IUNKNOWN_ALL
+        ICommandBuffer* getInterface(const Guid& guid);
+
+public:
+    DeviceImpl* m_device;
+    TransientResourceHeapBase* m_transientHeap;
+    ResourceCommandEncoderImpl m_resourceCommandEncoder;
+    ComputeCommandEncoderImpl m_computeCommandEncoder;
+
+    void init(DeviceImpl* device, TransientResourceHeapBase* transientHeap);
+    virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
+        IRenderPassLayout* renderPass,
+        IFramebuffer* framebuffer,
+        IRenderCommandEncoder** outEncoder) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        encodeResourceCommands(IResourceCommandEncoder** outEncoder) override;
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        encodeComputeCommands(IComputeCommandEncoder** outEncoder) override;
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-encoder.cpp b/tools/gfx/cuda/cuda-command-encoder.cpp
new file mode 100644
index 000000000..6911eb649
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-encoder.cpp
@@ -0,0 +1,210 @@
+// cuda-command-encoder.cpp
+#include "cuda-command-encoder.h"
+
+#include "cuda-command-buffer.h"
+#include "cuda-device.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+void ResourceCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer)
+{ 
+    m_writer = cmdBuffer;
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyBuffer(
+    IBufferResource* dst,
+    Offset dstOffset,
+    IBufferResource* src,
+    Offset srcOffset,
+    Size size)
+{
+    m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadBufferData(
+    IBufferResource* dst, Offset offset, Size size, void* data)
+{
+    m_writer->uploadBufferData(dst, offset, size, data);
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    ResourceCommandEncoderImpl::writeTimestamp(IQueryPool* pool, GfxIndex index)
+{
+    m_writer->writeTimestamp(pool, index);
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTexture(
+    ITextureResource* dst,
+    ResourceState dstState,
+    SubresourceRange dstSubresource,
+    ITextureResource::Offset3D dstOffset,
+    ITextureResource* src,
+    ResourceState srcState,
+    SubresourceRange srcSubresource,
+    ITextureResource::Offset3D srcOffset,
+    ITextureResource::Extents extent)
+{
+    SLANG_UNUSED(dst);
+    SLANG_UNUSED(dstState);
+    SLANG_UNUSED(dstSubresource);
+    SLANG_UNUSED(dstOffset);
+    SLANG_UNUSED(src);
+    SLANG_UNUSED(srcState);
+    SLANG_UNUSED(srcSubresource);
+    SLANG_UNUSED(srcOffset);
+    SLANG_UNUSED(extent);
+    SLANG_UNIMPLEMENTED_X("copyTexture");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::uploadTextureData(
+    ITextureResource* dst,
+    SubresourceRange subResourceRange,
+    ITextureResource::Offset3D offset,
+    ITextureResource::Extents extent,
+    ITextureResource::SubresourceData* subResourceData,
+    GfxCount subResourceDataCount)
+{
+    SLANG_UNUSED(dst);
+    SLANG_UNUSED(subResourceRange);
+    SLANG_UNUSED(offset);
+    SLANG_UNUSED(extent);
+    SLANG_UNUSED(subResourceData);
+    SLANG_UNUSED(subResourceDataCount);
+    SLANG_UNIMPLEMENTED_X("uploadTextureData");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::clearResourceView(
+    IResourceView* view,
+    ClearValue* clearValue,
+    ClearResourceViewFlags::Enum flags)
+{
+    SLANG_UNUSED(view);
+    SLANG_UNUSED(clearValue);
+    SLANG_UNUSED(flags);
+    SLANG_UNIMPLEMENTED_X("clearResourceView");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveResource(
+    ITextureResource* source,
+    ResourceState sourceState,
+    SubresourceRange sourceRange,
+    ITextureResource* dest,
+    ResourceState destState,
+    SubresourceRange destRange)
+{
+    SLANG_UNUSED(source);
+    SLANG_UNUSED(sourceState);
+    SLANG_UNUSED(sourceRange);
+    SLANG_UNUSED(dest);
+    SLANG_UNUSED(destState);
+    SLANG_UNUSED(destRange);
+    SLANG_UNIMPLEMENTED_X("resolveResource");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::resolveQuery(
+    IQueryPool* queryPool,
+    GfxIndex index,
+    GfxCount count,
+    IBufferResource* buffer,
+    Offset offset)
+{
+    SLANG_UNUSED(queryPool);
+    SLANG_UNUSED(index);
+    SLANG_UNUSED(count);
+    SLANG_UNUSED(buffer);
+    SLANG_UNUSED(offset);
+    SLANG_UNIMPLEMENTED_X("resolveQuery");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::copyTextureToBuffer(
+    IBufferResource* dst,
+    Offset dstOffset,
+    Size dstSize,
+    Size dstRowStride,
+    ITextureResource* src,
+    ResourceState srcState,
+    SubresourceRange srcSubresource,
+    ITextureResource::Offset3D srcOffset,
+    ITextureResource::Extents extent)
+{
+    SLANG_UNUSED(dst);
+    SLANG_UNUSED(dstOffset);
+    SLANG_UNUSED(dstSize);
+    SLANG_UNUSED(dstRowStride);
+    SLANG_UNUSED(src);
+    SLANG_UNUSED(srcState);
+    SLANG_UNUSED(srcSubresource);
+    SLANG_UNUSED(srcOffset);
+    SLANG_UNUSED(extent);
+    SLANG_UNIMPLEMENTED_X("copyTextureToBuffer");
+}
+
+SLANG_NO_THROW void SLANG_MCALL ResourceCommandEncoderImpl::textureSubresourceBarrier(
+    ITextureResource* texture,
+    SubresourceRange subresourceRange,
+    ResourceState src,
+    ResourceState dst)
+{
+    SLANG_UNUSED(texture);
+    SLANG_UNUSED(subresourceRange);
+    SLANG_UNUSED(src);
+    SLANG_UNUSED(dst);
+    SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier");
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    ResourceCommandEncoderImpl::beginDebugEvent(const char* name, float rgbColor[3])
+{
+    SLANG_UNUSED(name);
+    SLANG_UNUSED(rgbColor);
+}
+
+void ComputeCommandEncoderImpl::init(CommandBufferImpl* cmdBuffer)
+{
+    m_writer = cmdBuffer;
+    m_commandBuffer = cmdBuffer;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ComputeCommandEncoderImpl::bindPipeline(IPipelineState* state, IShaderObject** outRootObject)
+{
+    m_writer->setPipelineState(state);
+    PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
+    SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
+        pipelineImpl->m_program, m_rootObject.writeRef()));
+    returnComPtr(outRootObject, m_rootObject);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ComputeCommandEncoderImpl::bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject)
+{
+    m_writer->setPipelineState(state);
+    PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
+    SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
+        pipelineImpl->m_program, m_rootObject.writeRef()));
+    m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW void SLANG_MCALL ComputeCommandEncoderImpl::dispatchCompute(int x, int y, int z)
+{
+    m_writer->bindRootShaderObject(m_rootObject);
+    m_writer->dispatchCompute(x, y, z);
+}
+
+SLANG_NO_THROW void SLANG_MCALL
+    ComputeCommandEncoderImpl::dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset)
+{
+    SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect");
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-encoder.h b/tools/gfx/cuda/cuda-command-encoder.h
new file mode 100644
index 000000000..73660534a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-encoder.h
@@ -0,0 +1,135 @@
+// cuda-command-encoder.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ResourceCommandEncoderImpl : public IResourceCommandEncoder
+{
+public:
+    CommandWriter* m_writer;
+
+    void init(CommandBufferImpl* cmdBuffer);
+
+    virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+    virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
+        IBufferResource* dst,
+        Offset dstOffset,
+        IBufferResource* src,
+        Offset srcOffset,
+        Size size) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier(
+        GfxCount count,
+        ITextureResource* const* textures,
+        ResourceState src,
+        ResourceState dst) override
+    {}
+
+    virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier(
+        GfxCount count,
+        IBufferResource* const* buffers,
+        ResourceState src,
+        ResourceState dst) override
+    {}
+
+    virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData(
+        IBufferResource* dst, Offset offset, Size size, void* data) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        writeTimestamp(IQueryPool* pool, GfxIndex index) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL copyTexture(
+        ITextureResource* dst,
+        ResourceState dstState,
+        SubresourceRange dstSubresource,
+        ITextureResource::Offset3D dstOffset,
+        ITextureResource* src,
+        ResourceState srcState,
+        SubresourceRange srcSubresource,
+        ITextureResource::Offset3D srcOffset,
+        ITextureResource::Extents extent) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData(
+        ITextureResource* dst,
+        SubresourceRange subResourceRange,
+        ITextureResource::Offset3D offset,
+        ITextureResource::Extents extent,
+        ITextureResource::SubresourceData* subResourceData,
+        GfxCount subResourceDataCount) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView(
+        IResourceView* view,
+        ClearValue* clearValue,
+        ClearResourceViewFlags::Enum flags) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL resolveResource(
+        ITextureResource* source,
+        ResourceState sourceState,
+        SubresourceRange sourceRange,
+        ITextureResource* dest,
+        ResourceState destState,
+        SubresourceRange destRange) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery(
+        IQueryPool* queryPool,
+        GfxIndex index,
+        GfxCount count,
+        IBufferResource* buffer,
+        Offset offset) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer(
+        IBufferResource* dst,
+        Offset dstOffset,
+        Size dstSize,
+        Size dstRowStride,
+        ITextureResource* src,
+        ResourceState srcState,
+        SubresourceRange srcSubresource,
+        ITextureResource::Offset3D srcOffset,
+        ITextureResource::Extents extent) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier(
+        ITextureResource* texture,
+        SubresourceRange subresourceRange,
+        ResourceState src,
+        ResourceState dst) override;
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        beginDebugEvent(const char* name, float rgbColor[3]) override;
+    virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {}
+};
+
+class ComputeCommandEncoderImpl
+    : public IComputeCommandEncoder
+    , public ResourceCommandEncoderImpl
+{
+public:
+    SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl)
+public:
+    CommandWriter* m_writer;
+    CommandBufferImpl* m_commandBuffer;
+    RefPtr<ShaderObjectBase> m_rootObject;
+    virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+    void init(CommandBufferImpl* cmdBuffer);
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL
+        dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
new file mode 100644
index 000000000..60e81246d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -0,0 +1,225 @@
+// cuda-command-queue.cpp
+#include "cuda-command-queue.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-buffer.h"
+#include "cuda-query.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ICommandQueue* CommandQueueImpl::getInterface(const Guid& guid)
+{
+    if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
+        return static_cast<ICommandQueue*>(this);
+    return nullptr;
+}
+
+void CommandQueueImpl::init(DeviceImpl* inRenderer)
+{
+    renderer = inRenderer;
+    m_desc.type = ICommandQueue::QueueType::Graphics;
+    cuStreamCreate(&stream, 0);
+}
+CommandQueueImpl::~CommandQueueImpl()
+{
+    cuStreamSynchronize(stream);
+    cuStreamDestroy(stream);
+    currentPipeline = nullptr;
+    currentRootObject = nullptr;
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::executeCommandBuffers(
+    GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal)
+{
+    SLANG_UNUSED(valueToSignal);
+    // TODO: implement fence.
+    assert(fence == nullptr);
+    for (GfxIndex i = 0; i < count; i++)
+    {
+        execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
+    }
+}
+
+SLANG_NO_THROW void SLANG_MCALL CommandQueueImpl::waitOnHost()
+{
+    auto resultCode = cuStreamSynchronize(stream);
+    if (resultCode != cudaSuccess)
+        SLANG_CUDA_HANDLE_ERROR(resultCode);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::waitForFenceValuesOnDevice(
+    GfxCount fenceCount, IFence** fences, uint64_t* waitValues)
+{
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL CommandQueueImpl::getNativeHandle(InteropHandle* outHandle)
+{
+    return SLANG_FAIL;
+}
+
+void CommandQueueImpl::setPipelineState(IPipelineState* state)
+{
+    currentPipeline = dynamic_cast<ComputePipelineStateImpl*>(state);
+}
+
+Result CommandQueueImpl::bindRootShaderObject(IShaderObject* object)
+{
+    currentRootObject = dynamic_cast<RootShaderObjectImpl*>(object);
+    if (currentRootObject)
+        return SLANG_OK;
+    return SLANG_E_INVALID_ARG;
+}
+
+void CommandQueueImpl::dispatchCompute(int x, int y, int z)
+{
+    // Specialize the compute kernel based on the shader object bindings.
+    RefPtr<PipelineStateBase> newPipeline;
+    renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
+    currentPipeline = static_cast<ComputePipelineStateImpl*>(newPipeline.Ptr());
+
+    // Find out thread group size from program reflection.
+    auto& kernelName = currentPipeline->shaderProgram->kernelName;
+    auto programLayout = static_cast<RootShaderObjectLayoutImpl*>(currentRootObject->getLayout());
+    int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
+    SLANG_ASSERT(kernelId != -1);
+    UInt threadGroupSize[3];
+    programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
+
+    int sharedSizeInBytes;
+    cuFuncGetAttribute(
+        &sharedSizeInBytes,
+        CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+        currentPipeline->shaderProgram->cudaKernel);
+
+    // Copy global parameter data to the `SLANG_globalParams` symbol.
+    {
+        CUdeviceptr globalParamsSymbol = 0;
+        size_t globalParamsSymbolSize = 0;
+        cuModuleGetGlobal(
+            &globalParamsSymbol,
+            &globalParamsSymbolSize,
+            currentPipeline->shaderProgram->cudaModule,
+            "SLANG_globalParams");
+
+        CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
+        cudaMemcpyAsync(
+            (void*)globalParamsSymbol,
+            (void*)globalParamsCUDAData,
+            globalParamsSymbolSize,
+            cudaMemcpyDefault,
+            0);
+    }
+    //
+    // The argument data for the entry-point parameters are already
+    // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
+    //
+    auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
+    auto entryPointDataSize =
+        currentRootObject->entryPointObjects[kernelId]->getBufferSize();
+
+    void* extraOptions[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER,
+        entryPointBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,
+        &entryPointDataSize,
+        CU_LAUNCH_PARAM_END,
+    };
+
+    // Once we have all the necessary data extracted and/or
+    // set up, we can launch the kernel and see what happens.
+    //
+    auto cudaLaunchResult = cuLaunchKernel(
+        currentPipeline->shaderProgram->cudaKernel,
+        x,
+        y,
+        z,
+        int(threadGroupSize[0]),
+        int(threadGroupSize[1]),
+        int(threadGroupSize[2]),
+        sharedSizeInBytes,
+        stream,
+        nullptr,
+        extraOptions);
+
+    SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
+}
+
+void CommandQueueImpl::copyBuffer(
+    IBufferResource* dst,
+    size_t dstOffset,
+    IBufferResource* src,
+    size_t srcOffset,
+    size_t size)
+{
+    auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+    auto srcImpl = static_cast<BufferResourceImpl*>(src);
+    cudaMemcpy(
+        (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
+        (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
+        size,
+        cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+{
+    auto dstImpl = static_cast<BufferResourceImpl*>(dst);
+    cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+}
+
+void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
+{
+    auto poolImpl = static_cast<QueryPoolImpl*>(pool);
+    cuEventRecord(poolImpl->m_events[index], stream);
+}
+
+void CommandQueueImpl::execute(CommandBufferImpl* commandBuffer)
+{
+    for (auto& cmd : commandBuffer->m_commands)
+    {
+        switch (cmd.name)
+        {
+        case CommandName::SetPipelineState:
+            setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
+            break;
+        case CommandName::BindRootShaderObject:
+            bindRootShaderObject(
+                commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
+            break;
+        case CommandName::DispatchCompute:
+            dispatchCompute(
+                int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
+            break;
+        case CommandName::CopyBuffer:
+            copyBuffer(
+                commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+                cmd.operands[1],
+                commandBuffer->getObject<BufferResource>(cmd.operands[2]),
+                cmd.operands[3],
+                cmd.operands[4]);
+            break;
+        case CommandName::UploadBufferData:
+            uploadBufferData(
+                commandBuffer->getObject<BufferResource>(cmd.operands[0]),
+                cmd.operands[1],
+                cmd.operands[2],
+                commandBuffer->getData<uint8_t>(cmd.operands[3]));
+            break;
+        case CommandName::WriteTimestamp:
+            writeTimestamp(
+                commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
+                (SlangInt)cmd.operands[1]);
+        }
+    }
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-command-queue.h b/tools/gfx/cuda/cuda-command-queue.h
new file mode 100644
index 000000000..10601890e
--- /dev/null
+++ b/tools/gfx/cuda/cuda-command-queue.h
@@ -0,0 +1,69 @@
+// cuda-command-queue.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-device.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-shader-object.h"
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CommandQueueImpl
+    : public ICommandQueue
+    , public ComObject
+{
+public:
+    SLANG_COM_OBJECT_IUNKNOWN_ALL
+        ICommandQueue* getInterface(const Guid& guid);
+
+    RefPtr<ComputePipelineStateImpl> currentPipeline;
+    RefPtr<RootShaderObjectImpl> currentRootObject;
+    RefPtr<DeviceImpl> renderer;
+    CUstream stream;
+    Desc m_desc;
+
+    void init(DeviceImpl* inRenderer);
+    ~CommandQueueImpl();
+
+    virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override { return m_desc; }
+
+    virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers(
+        GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override;
+
+    virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice(
+        GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override;
+
+    void setPipelineState(IPipelineState* state);
+
+    Result bindRootShaderObject(IShaderObject* object);
+
+    void dispatchCompute(int x, int y, int z);
+
+    void copyBuffer(
+        IBufferResource* dst,
+        size_t dstOffset,
+        IBufferResource* src,
+        size_t srcOffset,
+        size_t size);
+
+    void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data);
+
+    void writeTimestamp(IQueryPool* pool, SlangInt index);
+
+    void execute(CommandBufferImpl* commandBuffer);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-context.h b/tools/gfx/cuda/cuda-context.h
new file mode 100644
index 000000000..d1a53e4e9
--- /dev/null
+++ b/tools/gfx/cuda/cuda-context.h
@@ -0,0 +1,22 @@
+// cuda-context.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class CUDAContext : public RefObject
+{
+public:
+    CUcontext m_context = nullptr;
+    ~CUDAContext() { cuCtxDestroy(m_context); }
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
new file mode 100644
index 000000000..1a4a142d0
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -0,0 +1,1115 @@
+// cuda-device.cpp
+#include "cuda-device.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-queue.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-query.h"
+#include "cuda-shader-object.h"
+#include "cuda-shader-object-layout.h"
+#include "cuda-shader-program.h"
+#include "cuda-resource-views.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+int DeviceImpl::_calcSMCountPerMultiProcessor(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM
+    struct SMInfo
+    {
+        int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int coreCount;
+    };
+
+    static const SMInfo infos[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60, 64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70, 64},
+        {0x72, 64},
+        {0x75, 64} };
+
+    const int sm = ((major << 4) + minor);
+    for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+    {
+        if (infos[i].sm == sm)
+        {
+            return infos[i].coreCount;
+        }
+    }
+
+    const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+    // It must be newer presumably
+    SLANG_ASSERT(sm > last.sm);
+
+    // Default to the last entry
+    return last.coreCount;
+}
+
+SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
+{
+    int smPerMultiproc = 0;
+    int maxPerfDevice = -1;
+    int deviceCount = 0;
+    int devicesProhibited = 0;
+
+    uint64_t maxComputePerf = 0;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+    // Find the best CUDA capable GPU device
+    for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+    {
+        int computeMode = -1, major = 0, minor = 0;
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+        // If this GPU is not running on Compute Mode prohibited,
+        // then we can add it to the list
+        if (computeMode != cudaComputeModeProhibited)
+        {
+            if (major == 9999 && minor == 9999)
+            {
+                smPerMultiproc = 1;
+            }
+            else
+            {
+                smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+            }
+
+            int multiProcessorCount = 0, clockRate = 0;
+            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
+                &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+            uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+            if (compute_perf > maxComputePerf)
+            {
+                maxComputePerf = compute_perf;
+                maxPerfDevice = currentDevice;
+            }
+        }
+        else
+        {
+            devicesProhibited++;
+        }
+    }
+
+    if (maxPerfDevice < 0)
+    {
+        return SLANG_FAIL;
+    }
+
+    *outDeviceIndex = maxPerfDevice;
+    return SLANG_OK;
+}
+
+SlangResult DeviceImpl::_initCuda(CUDAReportStyle reportType)
+{
+    static CUresult res = cuInit(0);
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::getNativeDeviceHandles(InteropHandles* outHandles)
+{
+    outHandles->handles[0].handleValue = (uint64_t)m_device;
+    outHandles->handles[0].api = InteropHandleAPI::CUDA;
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
+{
+    SLANG_RETURN_ON_FAIL(slangContext.initialize(
+        desc.slang,
+        SLANG_PTX,
+        "sm_5_1",
+        makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
+
+    SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
+
+    SLANG_RETURN_ON_FAIL(_initCuda(reportType));
+
+    SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
+
+    m_context = new CUDAContext();
+
+    int count = -1;
+    cuDeviceGetCount(&count);
+    SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
+
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
+        cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
+
+    // Not clear how to detect half support on CUDA. For now we'll assume we have it
+    {
+        m_features.add("half");
+    }
+
+    // Initialize DeviceInfo
+    {
+        m_info.deviceType = DeviceType::CUDA;
+        m_info.bindingStyle = BindingStyle::CUDA;
+        m_info.projectionStyle = ProjectionStyle::DirectX;
+        m_info.apiName = "CUDA";
+        static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
+        ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
+        cudaDeviceProp deviceProperties;
+        cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
+        m_adapterName = deviceProperties.name;
+        m_info.adapterName = m_adapterName.begin();
+        m_info.timestampFrequency = 1000000;
+    }
+
+    return SLANG_OK;
+}
+
+Result DeviceImpl::getCUDAFormat(Format format, CUarray_format* outFormat)
+{
+    // TODO: Expand to cover all available formats that can be supported in CUDA
+    switch (format)
+    {
+    case Format::R32G32B32A32_FLOAT:
+    case Format::R32G32B32_FLOAT:
+    case Format::R32G32_FLOAT:
+    case Format::R32_FLOAT:
+    case Format::D32_FLOAT:
+        *outFormat = CU_AD_FORMAT_FLOAT;
+        return SLANG_OK;
+    case Format::R16G16B16A16_FLOAT:
+    case Format::R16G16_FLOAT:
+    case Format::R16_FLOAT:
+        *outFormat = CU_AD_FORMAT_HALF;
+        return SLANG_OK;
+    case Format::R32G32B32A32_UINT:
+    case Format::R32G32B32_UINT:
+    case Format::R32G32_UINT:
+    case Format::R32_UINT:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
+        return SLANG_OK;
+    case Format::R16G16B16A16_UINT:
+    case Format::R16G16_UINT:
+    case Format::R16_UINT:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
+        return SLANG_OK;
+    case Format::R8G8B8A8_UINT:
+    case Format::R8G8_UINT:
+    case Format::R8_UINT:
+    case Format::R8G8B8A8_UNORM:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+        return SLANG_OK;
+    case Format::R32G32B32A32_SINT:
+    case Format::R32G32B32_SINT:
+    case Format::R32G32_SINT:
+    case Format::R32_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT32;
+        return SLANG_OK;
+    case Format::R16G16B16A16_SINT:
+    case Format::R16G16_SINT:
+    case Format::R16_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT16;
+        return SLANG_OK;
+    case Format::R8G8B8A8_SINT:
+    case Format::R8G8_SINT:
+    case Format::R8_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT8;
+        return SLANG_OK;
+    default:
+        SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+        return SLANG_FAIL;
+    }
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureResource(
+    const ITextureResource::Desc& desc,
+    const ITextureResource::SubresourceData* initData,
+    ITextureResource** outResource)
+{
+    TextureResource::Desc srcDesc = fixupTextureDesc(desc);
+
+    RefPtr<TextureResourceImpl> tex = new TextureResourceImpl(srcDesc);
+    tex->m_cudaContext = m_context;
+
+    CUresourcetype resourceType;
+
+    // The size of the element/texel in bytes
+    size_t elementSize = 0;
+
+    // Our `ITextureResource::Desc` uses an enumeration to specify
+    // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
+    // `cuMipmappedArrayCreate` seemingly relies on a policy where
+    // the extents of the array in dimenions above the rank are
+    // specified as zero (e.g., a 1D texture requires `height==0`).
+    //
+    // We will start by massaging the extents as specified by the
+    // user into a form that CUDA wants/expects, based on the
+    // texture shape as specified in the `desc`.
+    //
+    int width = desc.size.width;
+    int height = desc.size.height;
+    int depth = desc.size.depth;
+    switch (desc.type)
+    {
+    case IResource::Type::Texture1D:
+        height = 0;
+        depth = 0;
+        break;
+
+    case IResource::Type::Texture2D:
+        depth = 0;
+        break;
+
+    case IResource::Type::Texture3D:
+        break;
+
+    case IResource::Type::TextureCube:
+        depth = 1;
+        break;
+    }
+
+    {
+        CUarray_format format = CU_AD_FORMAT_FLOAT;
+        int numChannels = 0;
+
+        SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
+        FormatInfo info;
+        gfxGetFormatInfo(desc.format, &info);
+        numChannels = info.channelCount;
+
+        switch (format)
+        {
+        case CU_AD_FORMAT_FLOAT:
+        {
+            elementSize = sizeof(float) * numChannels;
+            break;
+        }
+        case CU_AD_FORMAT_HALF:
+        {
+            elementSize = sizeof(uint16_t) * numChannels;
+            break;
+        }
+        case CU_AD_FORMAT_UNSIGNED_INT8:
+        {
+            elementSize = sizeof(uint8_t) * numChannels;
+            break;
+        }
+        default:
+        {
+            SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+            return SLANG_FAIL;
+        }
+        }
+
+        if (desc.numMipLevels > 1)
+        {
+            resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+            memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+            arrayDesc.Width = width;
+            arrayDesc.Height = height;
+            arrayDesc.Depth = depth;
+            arrayDesc.Format = format;
+            arrayDesc.NumChannels = numChannels;
+            arrayDesc.Flags = 0;
+
+            if (desc.arraySize > 1)
+            {
+                if (desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
+                    arrayDesc.Depth = desc.arraySize;
+                }
+                else
+                {
+                    SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
+                    return SLANG_FAIL;
+                }
+            }
+
+            if (desc.type == IResource::Type::TextureCube)
+            {
+                arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                arrayDesc.Depth *= 6;
+            }
+
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
+        }
+        else
+        {
+            resourceType = CU_RESOURCE_TYPE_ARRAY;
+
+            if (desc.arraySize > 1)
+            {
+                if (desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube)
+                {
+                    SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
+                    return SLANG_FAIL;
+                }
+
+                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                // Set the depth as the array length
+                arrayDesc.Depth = desc.arraySize;
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Depth *= 6;
+                }
+
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+            else if (desc.type == IResource::Type::Texture3D ||
+                desc.type == IResource::Type::TextureCube)
+            {
+                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                arrayDesc.Depth = depth;
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                arrayDesc.Flags = 0;
+
+                // Handle cube texture
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Depth = 6;
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+            else
+            {
+                CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                // Allocate the array, will work for 1D or 2D case
+                SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+        }
+    }
+
+    // Work space for holding data for uploading if it needs to be rearranged
+    if (initData)
+    {
+        List<uint8_t> workspace;
+        for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
+        {
+            int mipWidth = width >> mipLevel;
+            int mipHeight = height >> mipLevel;
+            int mipDepth = depth >> mipLevel;
+
+            mipWidth = (mipWidth == 0) ? 1 : mipWidth;
+            mipHeight = (mipHeight == 0) ? 1 : mipHeight;
+            mipDepth = (mipDepth == 0) ? 1 : mipDepth;
+
+            // If it's a cubemap then the depth is always 6
+            if (desc.type == IResource::Type::TextureCube)
+            {
+                mipDepth = 6;
+            }
+
+            auto dstArray = tex->m_cudaArray;
+            if (tex->m_cudaMipMappedArray)
+            {
+                // Get the array for the mip level
+                SLANG_CUDA_RETURN_ON_FAIL(
+                    cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
+            }
+            SLANG_ASSERT(dstArray);
+
+            // Check using the desc to see if it's plausible
+            {
+                CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
+
+                SLANG_ASSERT(mipWidth == arrayDesc.Width);
+                SLANG_ASSERT(
+                    mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
+            }
+
+            const void* srcDataPtr = nullptr;
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube);
+
+                // TODO(JS): Here I assume that arrays are just held contiguously within a
+                // 'face' This seems reasonable and works with the Copy3D.
+                const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                Index faceCount = desc.arraySize;
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    faceCount *= 6;
+                }
+
+                const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
+                workspace.setCount(mipSizeInBytes);
+
+                // We need to add the face data from each mip
+                // We iterate over face count so we copy all of the cubemap faces
+                for (Index j = 0; j < faceCount; j++)
+                {
+                    const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
+                    // Copy over to the workspace to make contiguous
+                    ::memcpy(
+                        workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
+                }
+
+                srcDataPtr = workspace.getBuffer();
+            }
+            else
+            {
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                    workspace.setCount(faceSizeInBytes * 6);
+                    // Copy the data over to make contiguous
+                    for (Index j = 0; j < 6; j++)
+                    {
+                        const auto srcData =
+                            initData[mipLevel + j * desc.numMipLevels].data;
+                        ::memcpy(
+                            workspace.getBuffer() + faceSizeInBytes * j,
+                            srcData,
+                            faceSizeInBytes);
+                    }
+                    srcDataPtr = workspace.getBuffer();
+                }
+                else
+                {
+                    const auto srcData = initData[mipLevel].data;
+                    srcDataPtr = srcData;
+                }
+            }
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube);
+
+                CUDA_MEMCPY3D copyParam;
+                memset(&copyParam, 0, sizeof(copyParam));
+
+                copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                copyParam.dstArray = dstArray;
+
+                copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                copyParam.srcHost = srcDataPtr;
+                copyParam.srcPitch = mipWidth * elementSize;
+                copyParam.WidthInBytes = copyParam.srcPitch;
+                copyParam.Height = mipHeight;
+                // Set the depth to the array length
+                copyParam.Depth = desc.arraySize;
+
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    copyParam.Depth *= 6;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+            }
+            else
+            {
+                switch (desc.type)
+                {
+                case IResource::Type::Texture1D:
+                case IResource::Type::Texture2D:
+                {
+                    CUDA_MEMCPY2D copyParam;
+                    memset(&copyParam, 0, sizeof(copyParam));
+                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                    copyParam.dstArray = dstArray;
+                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                    copyParam.srcHost = srcDataPtr;
+                    copyParam.srcPitch = mipWidth * elementSize;
+                    copyParam.WidthInBytes = copyParam.srcPitch;
+                    copyParam.Height = mipHeight;
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+                    break;
+                }
+                case IResource::Type::Texture3D:
+                case IResource::Type::TextureCube:
+                {
+                    CUDA_MEMCPY3D copyParam;
+                    memset(&copyParam, 0, sizeof(copyParam));
+
+                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                    copyParam.dstArray = dstArray;
+
+                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                    copyParam.srcHost = srcDataPtr;
+                    copyParam.srcPitch = mipWidth * elementSize;
+                    copyParam.WidthInBytes = copyParam.srcPitch;
+                    copyParam.Height = mipHeight;
+                    copyParam.Depth = mipDepth;
+
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+                    break;
+                }
+
+                default:
+                {
+                    SLANG_ASSERT(!"Not implemented");
+                    break;
+                }
+                }
+            }
+        }
+    }
+    // Set up texture sampling parameters, and create final texture obj
+
+    {
+        CUDA_RESOURCE_DESC resDesc;
+        memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+        resDesc.resType = resourceType;
+
+        if (tex->m_cudaArray)
+        {
+            resDesc.res.array.hArray = tex->m_cudaArray;
+        }
+        if (tex->m_cudaMipMappedArray)
+        {
+            resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
+        }
+
+        // If the texture might be used as a UAV, then we need to allocate
+        // a CUDA "surface" for it.
+        //
+        // Note: We cannot do this unconditionally, because it will fail
+        // on surfaces that are not usable as UAVs (e.g., those with
+        // mipmaps).
+        //
+        // TODO: We should really only be allocating the array at the
+        // time we create a resource, and then allocate the surface or
+        // texture objects as part of view creation.
+        //
+        if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
+        {
+            // On CUDA surfaces only support a single MIP map
+            SLANG_ASSERT(desc.numMipLevels == 1);
+
+            SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+        }
+
+
+        // Create handle for sampling.
+        CUDA_TEXTURE_DESC texDesc;
+        memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+        texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+        texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+    }
+
+    returnComPtr(outResource, tex);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
+    const IBufferResource::Desc& descIn,
+    const void* initData,
+    IBufferResource** outResource)
+{
+    auto desc = fixupBufferDesc(descIn);
+    RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+    if (initData)
+    {
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+    }
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
+    InteropHandle handle,
+    const IBufferResource::Desc& desc,
+    IBufferResource** outResource)
+{
+    if (handle.handleValue == 0)
+    {
+        *outResource = nullptr;
+        return SLANG_OK;
+    }
+
+    RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+
+    // CUDA manages sharing of buffers through the idea of an
+    // "external memory" object, which represents the relationship
+    // with another API's objects. In order to create this external
+    // memory association, we first need to fill in a descriptor struct.
+    cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+    switch (handle.api)
+    {
+    case InteropHandleAPI::D3D12:
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+        break;
+    case InteropHandleAPI::Vulkan:
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+        break;
+    default:
+        return SLANG_FAIL;
+    }
+    externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+    externalMemoryHandleDesc.size = desc.sizeInBytes;
+    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+    // Once we have filled in the descriptor, we can request
+    // that CUDA create the required association between the
+    // external buffer and its own memory.
+    cudaExternalMemory_t externalMemory;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+    resource->m_cudaExternalMemory = externalMemory;
+
+    // The CUDA "external memory" handle is not itself a device
+    // pointer, so we need to query for a suitable device address
+    // for the buffer with another call.
+    //
+    // Just as for the external memory, we fill in a descriptor
+    // structure (although in this case we only need to specify
+    // the size).
+    cudaExternalMemoryBufferDesc bufferDesc;
+    memset(&bufferDesc, 0, sizeof(bufferDesc));
+    bufferDesc.size = desc.sizeInBytes;
+
+    // Finally, we can "map" the buffer to get a device address.
+    void* deviceAddress;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+    resource->m_cudaMemory = deviceAddress;
+
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
+    InteropHandle handle,
+    const ITextureResource::Desc& desc,
+    const size_t size,
+    ITextureResource** outResource)
+{
+    if (handle.handleValue == 0)
+    {
+        *outResource = nullptr;
+        return SLANG_OK;
+    }
+
+    RefPtr<TextureResourceImpl> resource = new TextureResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+
+    // CUDA manages sharing of buffers through the idea of an
+    // "external memory" object, which represents the relationship
+    // with another API's objects. In order to create this external
+    // memory association, we first need to fill in a descriptor struct.
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
+    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+    switch (handle.api)
+    {
+    case InteropHandleAPI::D3D12:
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
+        break;
+    case InteropHandleAPI::Vulkan:
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
+        break;
+    default:
+        return SLANG_FAIL;
+    }
+    externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+    externalMemoryHandleDesc.size = size;
+    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+    CUexternalMemory externalMemory;
+    SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+    resource->m_cudaExternalMemory = externalMemory;
+
+    FormatInfo formatInfo;
+    SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
+    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+    arrayDesc.Depth = desc.size.depth;
+    arrayDesc.Height = desc.size.height;
+    arrayDesc.Width = desc.size.width;
+    arrayDesc.NumChannels = formatInfo.channelCount;
+    getCUDAFormat(desc.format, &arrayDesc.Format);
+    arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
+
+    CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
+    memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
+    externalMemoryMipDesc.offset = 0;
+    externalMemoryMipDesc.arrayDesc = arrayDesc;
+    externalMemoryMipDesc.numLevels = desc.numMipLevels;
+
+    CUmipmappedArray mipArray;
+    SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
+    resource->m_cudaMipMappedArray = mipArray;
+
+    CUarray cuArray;
+    SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
+    resource->m_cudaArray = cuArray;
+
+    CUDA_RESOURCE_DESC surfDesc;
+    memset(&surfDesc, 0, sizeof(surfDesc));
+    surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+    surfDesc.res.array.hArray = cuArray;
+
+    CUsurfObject surface;
+    SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
+    resource->m_cudaSurfObj = surface;
+
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureView(
+    ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView)
+{
+    RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+    view->m_desc = desc;
+    view->textureResource = dynamic_cast<TextureResourceImpl*>(texture);
+    returnComPtr(outView, view);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferView(
+    IBufferResource* buffer,
+    IBufferResource* counterBuffer,
+    IResourceView::Desc const& desc,
+    IResourceView** outView)
+{
+    RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+    view->m_desc = desc;
+    view->memoryResource = dynamic_cast<BufferResourceImpl*>(buffer);
+    returnComPtr(outView, view);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createQueryPool(
+    const IQueryPool::Desc& desc,
+    IQueryPool** outPool)
+{
+    RefPtr<QueryPoolImpl> pool = new QueryPoolImpl();
+    SLANG_RETURN_ON_FAIL(pool->init(desc));
+    returnComPtr(outPool, pool);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObjectLayout(
+    slang::TypeLayoutReflection* typeLayout,
+    ShaderObjectLayoutBase** outLayout)
+{
+    RefPtr<ShaderObjectLayoutImpl> cudaLayout;
+    cudaLayout = new ShaderObjectLayoutImpl(this, typeLayout);
+    returnRefPtrMove(outLayout, cudaLayout);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObject(
+    ShaderObjectLayoutBase* layout,
+    IShaderObject** outObject)
+{
+    RefPtr<ShaderObjectImpl> result = new ShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+    returnComPtr(outObject, result);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createMutableShaderObject(
+    ShaderObjectLayoutBase* layout,
+    IShaderObject** outObject)
+{
+    RefPtr<MutableShaderObjectImpl> result = new MutableShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+    returnComPtr(outObject, result);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
+{
+    auto cudaProgram = dynamic_cast<ShaderProgramImpl*>(program);
+    auto cudaLayout = cudaProgram->layout;
+
+    RefPtr<RootShaderObjectImpl> result = new RootShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
+    returnRefPtrMove(outObject, result);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+    const IShaderProgram::Desc& desc,
+    IShaderProgram** outProgram,
+    ISlangBlob** outDiagnosticBlob)
+{
+    // If this is a specializable program, we just keep a reference to the slang program and
+    // don't actually create any kernels. This program will be specialized later when we know
+    // the shader object bindings.
+    RefPtr<ShaderProgramImpl> cudaProgram = new ShaderProgramImpl();
+    cudaProgram->init(desc);
+    cudaProgram->cudaContext = m_context;
+    if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
+    {
+        cudaProgram->layout = new RootShaderObjectLayoutImpl(this, desc.slangGlobalScope->getLayout());
+        returnComPtr(outProgram, cudaProgram);
+        return SLANG_OK;
+    }
+
+    ComPtr<ISlangBlob> kernelCode;
+    ComPtr<ISlangBlob> diagnostics;
+    auto compileResult = desc.slangGlobalScope->getEntryPointCode(
+        (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
+    if (diagnostics)
+    {
+        getDebugCallback()->handleMessage(
+            compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
+            DebugMessageSource::Slang,
+            (char*)diagnostics->getBufferPointer());
+        if (outDiagnosticBlob)
+            returnComPtr(outDiagnosticBlob, diagnostics);
+    }
+    SLANG_RETURN_ON_FAIL(compileResult);
+
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
+    cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
+        &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
+
+    auto slangGlobalScope = desc.slangGlobalScope;
+    if (slangGlobalScope)
+    {
+        cudaProgram->slangGlobalScope = slangGlobalScope;
+
+        auto slangProgramLayout = slangGlobalScope->getLayout();
+        if (!slangProgramLayout)
+            return SLANG_FAIL;
+
+        RefPtr<RootShaderObjectLayoutImpl> cudaLayout;
+        cudaLayout = new RootShaderObjectLayoutImpl(this, slangProgramLayout);
+        cudaLayout->programLayout = slangProgramLayout;
+        cudaProgram->layout = cudaLayout;
+    }
+
+    returnComPtr(outProgram, cudaProgram);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createComputePipelineState(
+    const ComputePipelineStateDesc& desc, IPipelineState** outState)
+{
+    RefPtr<ComputePipelineStateImpl> state = new ComputePipelineStateImpl();
+    state->shaderProgram = static_cast<ShaderProgramImpl*>(desc.program);
+    state->init(desc);
+    returnComPtr(outState, state);
+    return Result();
+}
+
+void* DeviceImpl::map(IBufferResource* buffer)
+{
+    return static_cast<BufferResourceImpl*>(buffer)->m_cudaMemory;
+}
+
+void DeviceImpl::unmap(IBufferResource* buffer)
+{
+    SLANG_UNUSED(buffer);
+}
+
+SLANG_NO_THROW const DeviceInfo& SLANG_MCALL DeviceImpl::getDeviceInfo() const
+{
+    return m_info;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTransientResourceHeap(
+    const ITransientResourceHeap::Desc& desc,
+    ITransientResourceHeap** outHeap)
+{
+    RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, desc));
+    returnComPtr(outHeap, result);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue)
+{
+    RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
+    queue->init(this);
+    returnComPtr(outQueue, queue);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createSwapchain(
+    const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(window);
+    SLANG_UNUSED(outSwapchain);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createFramebufferLayout(
+    const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outLayout);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outFramebuffer);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createRenderPassLayout(
+    const IRenderPassLayout::Desc& desc,
+    IRenderPassLayout** outRenderPassLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outRenderPassLayout);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler)
+{
+    SLANG_UNUSED(desc);
+    *outSampler = nullptr;
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createInputLayout(
+    IInputLayout::Desc const& desc,
+    IInputLayout** outLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outLayout);
+    return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createGraphicsPipelineState(
+    const GraphicsPipelineStateDesc& desc, IPipelineState** outState)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outState);
+    return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::readTextureResource(
+    ITextureResource* texture,
+    ResourceState state,
+    ISlangBlob** outBlob,
+    size_t* outRowPitch,
+    size_t* outPixelSize)
+{
+    auto textureImpl = static_cast<TextureResourceImpl*>(texture);
+    RefPtr<ListBlob> blob = new ListBlob();
+
+    auto desc = textureImpl->getDesc();
+    auto width = desc->size.width;
+    auto height = desc->size.height;
+    FormatInfo sizeInfo;
+    SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
+    size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
+    size_t rowPitch = width * pixelSize;
+    size_t size = height * rowPitch;
+    blob->m_data.setCount((Index)size);
+
+    CUDA_MEMCPY2D copyParam;
+    memset(&copyParam, 0, sizeof(copyParam));
+
+    copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copyParam.srcArray = textureImpl->m_cudaArray;
+
+    copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
+    copyParam.dstHost = blob->m_data.getBuffer();
+    copyParam.dstPitch = rowPitch;
+    copyParam.WidthInBytes = copyParam.dstPitch;
+    copyParam.Height = height;
+    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+
+    *outRowPitch = rowPitch;
+    *outPixelSize = pixelSize;
+    returnComPtr(outBlob, blob);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
+    IBufferResource* buffer,
+    size_t offset,
+    size_t size,
+    ISlangBlob** outBlob)
+{
+    auto bufferImpl = static_cast<BufferResourceImpl*>(buffer);
+    RefPtr<ListBlob> blob = new ListBlob();
+    blob->m_data.setCount((Index)size);
+    cudaMemcpy(
+        blob->m_data.getBuffer(),
+        (uint8_t*)bufferImpl->m_cudaMemory + offset,
+        size,
+        cudaMemcpyDefault);
+    returnComPtr(outBlob, blob);
+    return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-device.h b/tools/gfx/cuda/cuda-device.h
new file mode 100644
index 000000000..e6a91393b
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.h
@@ -0,0 +1,152 @@
+// cuda-device.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-command-buffer.h"
+#include "cuda-context.h"
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class DeviceImpl : public RendererBase
+{
+private:
+    static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
+    static int _calcSMCountPerMultiProcessor(int major, int minor);
+
+    static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex);
+
+    static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal);
+
+private:
+    int m_deviceIndex = -1;
+    CUdevice m_device = 0;
+    RefPtr<CUDAContext> m_context;
+    DeviceInfo m_info;
+    String m_adapterName;
+
+public:
+    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override;
+
+    virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override;
+
+    Result getCUDAFormat(Format format, CUarray_format* outFormat);
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
+        const ITextureResource::Desc& desc,
+        const ITextureResource::SubresourceData* initData,
+        ITextureResource** outResource) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
+        const IBufferResource::Desc& descIn,
+        const void* initData,
+        IBufferResource** outResource) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle(
+        InteropHandle handle,
+        const IBufferResource::Desc& desc,
+        IBufferResource** outResource) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle(
+        InteropHandle handle,
+        const ITextureResource::Desc& desc,
+        const size_t size,
+        ITextureResource** outResource) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
+        ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
+        IBufferResource* buffer,
+        IBufferResource* counterBuffer,
+        IResourceView::Desc const& desc,
+        IResourceView** outView) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool(
+        const IQueryPool::Desc& desc,
+        IQueryPool** outPool) override;
+
+    virtual Result createShaderObjectLayout(
+        slang::TypeLayoutReflection* typeLayout,
+        ShaderObjectLayoutBase** outLayout) override;
+
+    virtual Result createShaderObject(
+        ShaderObjectLayoutBase* layout,
+        IShaderObject** outObject) override;
+
+    virtual Result createMutableShaderObject(
+        ShaderObjectLayoutBase* layout,
+        IShaderObject** outObject) override;
+
+    Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject);
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+        const IShaderProgram::Desc& desc,
+        IShaderProgram** outProgram,
+        ISlangBlob** outDiagnosticBlob) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
+        const ComputePipelineStateDesc& desc, IPipelineState** outState) override;
+
+    void* map(IBufferResource* buffer);
+
+    void unmap(IBufferResource* buffer);
+
+    virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override;
+
+public:
+    using TransientResourceHeapImpl = SimpleTransientResourceHeap<DeviceImpl, CommandBufferImpl>;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap(
+        const ITransientResourceHeap::Desc& desc,
+        ITransientResourceHeap** outHeap) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
+        const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
+        const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
+        const IRenderPassLayout::Desc& desc,
+        IRenderPassLayout** outRenderPassLayout) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override;
+    
+    virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
+        IInputLayout::Desc const& desc,
+        IInputLayout** outLayout) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
+        const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override;
+
+    virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
+        ITextureResource* texture,
+        ResourceState state,
+        ISlangBlob** outBlob,
+        size_t* outRowPitch,
+        size_t* outPixelSize) override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
+        IBufferResource* buffer,
+        size_t offset,
+        size_t size,
+        ISlangBlob** outBlob) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp
new file mode 100644
index 000000000..6325d9fc6
--- /dev/null
+++ b/tools/gfx/cuda/cuda-helper-functions.cpp
@@ -0,0 +1,91 @@
+// cuda-helper-functions.cpp
+#include "cuda-helper-functions.h"
+
+#include "cuda-device.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+SlangResult CUDAErrorInfo::handle() const
+{
+    StringBuilder builder;
+    builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
+
+    if (m_errorName)
+    {
+        builder << m_errorName << " : ";
+    }
+    if (m_errorString)
+    {
+        builder << m_errorString;
+    }
+
+    getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver,
+        builder.getUnownedSlice().begin());
+
+    // Slang::signalUnexpectedError(builder.getBuffer());
+    return SLANG_FAIL;
+}
+
+SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
+{
+    CUDAErrorInfo info(file, line);
+    cuGetErrorString(cuResult, &info.m_errorString);
+    cuGetErrorName(cuResult, &info.m_errorName);
+    return info.handle();
+}
+
+SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
+{
+    return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
+}
+
+#    ifdef RENDER_TEST_OPTIX
+
+static bool _isError(OptixResult result)
+{
+    return result != OPTIX_SUCCESS;
+}
+
+#        if 1
+static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
+{
+    fprintf(
+        stderr,
+        "%s(%d): optix: %s (%s)\n",
+        file,
+        line,
+        optixGetErrorString(result),
+        optixGetErrorName(result));
+    return SLANG_FAIL;
+}
+
+void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
+{
+    fprintf(stderr, "optix: %s (%s)\n", message, tag);
+}
+#       endif
+#    endif
+} // namespace cuda
+
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
+{
+RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl();
+SLANG_RETURN_ON_FAIL(result->initialize(*desc));
+returnComPtr(outDevice, result);
+return SLANG_OK;
+}
+#else
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
+{
+SLANG_UNUSED(desc);
+*outDevice = nullptr;
+return SLANG_FAIL;
+}
+#endif
+
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h
new file mode 100644
index 000000000..001e3946a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-helper-functions.h
@@ -0,0 +1,106 @@
+// cuda-helper-functions.h
+#pragma once
+
+#include "slang-gfx.h"
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; }
+SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; }
+
+// A enum used to control if errors are reported on failure of CUDA call.
+enum class CUDAReportStyle
+{
+    Normal,
+    Silent,
+};
+
+struct CUDAErrorInfo
+{
+    CUDAErrorInfo(
+        const char* filePath,
+        int lineNo,
+        const char* errorName = nullptr,
+        const char* errorString = nullptr)
+        : m_filePath(filePath)
+        , m_lineNo(lineNo)
+        , m_errorName(errorName)
+        , m_errorString(errorString)
+    {}
+    SlangResult handle() const;
+
+    const char* m_filePath;
+    int m_lineNo;
+    const char* m_errorName;
+    const char* m_errorString;
+};
+
+// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
+
+SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line);
+SlangResult _handleCUDAError(cudaError_t error, const char* file, int line);
+
+#    define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
+
+#    define SLANG_CUDA_RETURN_ON_FAIL(x)              \
+        {                                             \
+            auto _res = x;                            \
+            if (_isError(_res))                       \
+                return SLANG_CUDA_HANDLE_ERROR(_res); \
+        }
+
+#    define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r)                               \
+        {                                                                             \
+            auto _res = x;                                                            \
+            if (_isError(_res))                                                       \
+            {                                                                         \
+                return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
+                                                        : SLANG_FAIL;                 \
+            }                                                                         \
+        }
+
+#    define SLANG_CUDA_ASSERT_ON_FAIL(x)           \
+        {                                          \
+            auto _res = x;                         \
+            if (_isError(_res))                    \
+            {                                      \
+                SLANG_ASSERT(!"Failed CUDA call"); \
+            };                                     \
+        }
+
+#    ifdef RENDER_TEST_OPTIX
+
+bool _isError(OptixResult result);
+
+#        if 1
+SlangResult _handleOptixError(OptixResult result, char const* file, int line);
+
+#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
+#        else
+#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
+#        endif
+
+#        define SLANG_OPTIX_RETURN_ON_FAIL(EXPR)           \
+            do                                             \
+            {                                              \
+                auto _res = EXPR;                          \
+                if (_isError(_res))                        \
+                    return SLANG_OPTIX_HANDLE_ERROR(_res); \
+            } while (0)
+
+void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData);
+
+#    endif
+
+} // namespace cuda
+#endif
+
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
+
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-pipeline-state.cpp b/tools/gfx/cuda/cuda-pipeline-state.cpp
new file mode 100644
index 000000000..fdbc0ea0e
--- /dev/null
+++ b/tools/gfx/cuda/cuda-pipeline-state.cpp
@@ -0,0 +1,22 @@
+// cuda-pipeline-state.cpp
+#include "cuda-pipeline-state.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+void ComputePipelineStateImpl::init(const ComputePipelineStateDesc& inDesc)
+{
+    PipelineStateDesc pipelineDesc;
+    pipelineDesc.type = PipelineType::Compute;
+    pipelineDesc.compute = inDesc;
+    initializeBase(pipelineDesc);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-pipeline-state.h b/tools/gfx/cuda/cuda-pipeline-state.h
new file mode 100644
index 000000000..e13266d8b
--- /dev/null
+++ b/tools/gfx/cuda/cuda-pipeline-state.h
@@ -0,0 +1,28 @@
+// cuda-pipeline-state.h
+#pragma once
+#include "cuda-base.h"
+#include "cuda-shader-program.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class PipelineStateImpl : public PipelineStateBase
+{
+public:
+};
+
+class ComputePipelineStateImpl : public PipelineStateImpl
+{
+public:
+    RefPtr<ShaderProgramImpl> shaderProgram;
+    void init(const ComputePipelineStateDesc& inDesc);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-query.cpp b/tools/gfx/cuda/cuda-query.cpp
new file mode 100644
index 000000000..7e97699f8
--- /dev/null
+++ b/tools/gfx/cuda/cuda-query.cpp
@@ -0,0 +1,48 @@
+// cuda-query.cpp
+#include "cuda-query.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+Result QueryPoolImpl::init(const IQueryPool::Desc& desc)
+{
+    cuEventCreate(&m_startEvent, 0);
+    cuEventRecord(m_startEvent, 0);
+    m_events.setCount(desc.count);
+    for (SlangInt i = 0; i < m_events.getCount(); i++)
+    {
+        cuEventCreate(&m_events[i], 0);
+    }
+    return SLANG_OK;
+}
+
+QueryPoolImpl::~QueryPoolImpl()
+{
+    for (auto& e : m_events)
+    {
+        cuEventDestroy(e);
+    }
+    cuEventDestroy(m_startEvent);
+}
+
+SLANG_NO_THROW Result SLANG_MCALL QueryPoolImpl::getResult(
+    GfxIndex queryIndex, GfxCount count, uint64_t* data)
+{
+    for (GfxIndex i = 0; i < count; i++)
+    {
+        float time = 0.0f;
+        cuEventSynchronize(m_events[i + queryIndex]);
+        cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]);
+        data[i] = (uint64_t)((double)time * 1000.0f);
+    }
+    return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-query.h b/tools/gfx/cuda/cuda-query.h
new file mode 100644
index 000000000..db29f488d
--- /dev/null
+++ b/tools/gfx/cuda/cuda-query.h
@@ -0,0 +1,32 @@
+// cuda-query.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class QueryPoolImpl : public QueryPoolBase
+{
+public:
+    // The event object for each query. Owned by the pool.
+    List<CUevent> m_events;
+
+    // The event that marks the starting point.
+    CUevent m_startEvent;
+
+    Result init(const IQueryPool::Desc& desc);
+
+    ~QueryPoolImpl();
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL getResult(
+        GfxIndex queryIndex, GfxCount count, uint64_t* data) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-resource-views.h b/tools/gfx/cuda/cuda-resource-views.h
new file mode 100644
index 000000000..33bf557d7
--- /dev/null
+++ b/tools/gfx/cuda/cuda-resource-views.h
@@ -0,0 +1,26 @@
+// cuda-resource-views.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-buffer.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ResourceViewImpl : public ResourceViewBase
+{
+public:
+    RefPtr<BufferResourceImpl> memoryResource = nullptr;
+    RefPtr<TextureResourceImpl> textureResource = nullptr;
+    void* proxyBuffer = nullptr;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object-layout.cpp b/tools/gfx/cuda/cuda-shader-object-layout.cpp
new file mode 100644
index 000000000..0cbe23a63
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object-layout.cpp
@@ -0,0 +1,153 @@
+// cuda-shader-object-layout.cpp
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ShaderObjectLayoutImpl::ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout)
+{
+    m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType);
+
+    initBase(renderer, m_elementTypeLayout);
+
+    // Compute the binding ranges that are used to store
+    // the logical contents of the object in memory. These will relate
+    // to the descriptor ranges in the various sets, but not always
+    // in a one-to-one fashion.
+
+    SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount();
+    for (SlangInt r = 0; r < bindingRangeCount; ++r)
+    {
+        slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r);
+        SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r);
+        slang::TypeLayoutReflection* slangLeafTypeLayout =
+            m_elementTypeLayout->getBindingRangeLeafTypeLayout(r);
+
+        SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r);
+        SlangInt rangeIndexInDescriptorSet =
+            m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
+
+        // TODO: This logic assumes that for any binding range that might consume
+        // multiple kinds of resources, the descriptor range for its uniform
+        // usage will be the first one in the range.
+        //
+        // We need to decide whether that assumption is one we intend to support
+        // applications making, or whether they should be forced to perform a
+        // linear search over the descriptor ranges for a specific binding range.
+        //
+        auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
+            descriptorSetIndex, rangeIndexInDescriptorSet);
+
+        Index baseIndex = 0;
+        Index subObjectIndex = 0;
+        switch (slangBindingType)
+        {
+        case slang::BindingType::ConstantBuffer:
+        case slang::BindingType::ParameterBlock:
+        case slang::BindingType::ExistentialValue:
+            baseIndex = m_subObjectCount;
+            subObjectIndex = baseIndex;
+            m_subObjectCount += count;
+            break;
+        case slang::BindingType::RawBuffer:
+        case slang::BindingType::MutableRawBuffer:
+            if (slangLeafTypeLayout->getType()->getElementType() != nullptr)
+            {
+                // A structured buffer occupies both a resource slot and
+                // a sub-object slot.
+                subObjectIndex = m_subObjectCount;
+                m_subObjectCount += count;
+            }
+            baseIndex = m_resourceCount;
+            m_resourceCount += count;
+            break;
+        default:
+            baseIndex = m_resourceCount;
+            m_resourceCount += count;
+            break;
+        }
+
+        BindingRangeInfo bindingRangeInfo;
+        bindingRangeInfo.bindingType = slangBindingType;
+        bindingRangeInfo.count = count;
+        bindingRangeInfo.baseIndex = baseIndex;
+        bindingRangeInfo.uniformOffset = uniformOffset;
+        bindingRangeInfo.subObjectIndex = subObjectIndex;
+        m_bindingRanges.add(bindingRangeInfo);
+    }
+
+    SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
+    for (SlangInt r = 0; r < subObjectRangeCount; ++r)
+    {
+        SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r);
+        auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
+        slang::TypeLayoutReflection* slangLeafTypeLayout =
+            m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
+
+        // A sub-object range can either represent a sub-object of a known
+        // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
+        // (in which case we can pre-compute a layout to use, based on
+        // the type `Foo`) *or* it can represent a sub-object of some
+        // existential type (e.g., `IBar`) in which case we cannot
+        // know the appropriate type/layout of sub-object to allocate.
+        //
+        RefPtr<ShaderObjectLayoutImpl> subObjectLayout;
+        if (slangBindingType != slang::BindingType::ExistentialValue)
+        {
+            subObjectLayout =
+                new ShaderObjectLayoutImpl(renderer, slangLeafTypeLayout->getElementTypeLayout());
+        }
+
+        SubObjectRangeInfo subObjectRange;
+        subObjectRange.bindingRangeIndex = bindingRangeIndex;
+        subObjectRange.layout = subObjectLayout;
+        subObjectRanges.add(subObjectRange);
+    }
+}
+
+Index ShaderObjectLayoutImpl::getResourceCount() const { return m_resourceCount; }
+Index ShaderObjectLayoutImpl::getSubObjectCount() const { return m_subObjectCount; }
+List<SubObjectRangeInfo>& ShaderObjectLayoutImpl::getSubObjectRanges() { return subObjectRanges; }
+BindingRangeInfo ShaderObjectLayoutImpl::getBindingRange(Index index) { return m_bindingRanges[index]; }
+Index ShaderObjectLayoutImpl::getBindingRangeCount() const { return m_bindingRanges.getCount(); }
+
+RootShaderObjectLayoutImpl::RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout)
+    : ShaderObjectLayoutImpl(renderer, inProgramLayout->getGlobalParamsTypeLayout())
+    , programLayout(inProgramLayout)
+{
+    for (UInt i = 0; i < programLayout->getEntryPointCount(); i++)
+    {
+        entryPointLayouts.add(new ShaderObjectLayoutImpl(
+            renderer,
+            programLayout->getEntryPointByIndex(i)->getTypeLayout()));
+    }
+
+}
+
+int RootShaderObjectLayoutImpl::getKernelIndex(UnownedStringSlice kernelName)
+{
+    for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
+    {
+        auto entryPoint = programLayout->getEntryPointByIndex(i);
+        if (kernelName == entryPoint->getName())
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+void RootShaderObjectLayoutImpl::getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
+{
+    auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
+    entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object-layout.h b/tools/gfx/cuda/cuda-shader-object-layout.h
new file mode 100644
index 000000000..305129109
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object-layout.h
@@ -0,0 +1,74 @@
+// cuda-shader-object-layout.h
+#pragma once
+#include "cuda-base.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+struct BindingRangeInfo
+{
+    slang::BindingType bindingType;
+    Index count;
+    Index baseIndex; // Flat index for sub-objects
+    Index subObjectIndex;
+
+    // TODO: The `uniformOffset` field should be removed,
+    // since it cannot be supported by the Slang reflection
+    // API once we fix some design issues.
+    //
+    // It is only being used today for pre-allocation of sub-objects
+    // for constant buffers and parameter blocks (which should be
+    // deprecated/removed anyway).
+    //
+    // Note: We would need to bring this field back, plus
+    // a lot of other complexity, if we ever want to support
+    // setting of resources/buffers directly by a binding
+    // range index and array index.
+    //
+    Index uniformOffset; // Uniform offset for a resource typed field.
+};
+
+struct SubObjectRangeInfo
+{
+    RefPtr<ShaderObjectLayoutImpl> layout;
+    Index bindingRangeIndex;
+};
+
+class ShaderObjectLayoutImpl : public ShaderObjectLayoutBase
+{
+public:
+    List<SubObjectRangeInfo> subObjectRanges;
+    List<BindingRangeInfo> m_bindingRanges;
+
+    Index m_subObjectCount = 0;
+    Index m_resourceCount = 0;
+
+    ShaderObjectLayoutImpl(RendererBase* renderer, slang::TypeLayoutReflection* layout);
+
+    Index getResourceCount() const;
+    Index getSubObjectCount() const;
+    List<SubObjectRangeInfo>& getSubObjectRanges();
+    BindingRangeInfo getBindingRange(Index index);
+    Index getBindingRangeCount() const;
+};
+
+class RootShaderObjectLayoutImpl : public ShaderObjectLayoutImpl
+{
+public:
+    slang::ProgramLayout* programLayout = nullptr;
+    List<RefPtr<ShaderObjectLayoutImpl>> entryPointLayouts;
+    RootShaderObjectLayoutImpl(RendererBase* renderer, slang::ProgramLayout* inProgramLayout);
+
+    int getKernelIndex(UnownedStringSlice kernelName);
+
+    void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes);
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp
new file mode 100644
index 000000000..3fc55e401
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object.cpp
@@ -0,0 +1,349 @@
+// cuda-shader-object.cpp
+#include "cuda-shader-object.h"
+
+#include "cuda-shader-object-layout.h"
+#include "cuda-resource-views.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+Result ShaderObjectData::setCount(Index count)
+{
+    if (isHostOnly)
+    {
+        m_cpuBuffer.setCount(count);
+        if (!m_bufferView)
+        {
+            IResourceView::Desc viewDesc = {};
+            viewDesc.type = IResourceView::Type::UnorderedAccess;
+            m_bufferView = new ResourceViewImpl();
+            m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer();
+            m_bufferView->m_desc = viewDesc;
+        }
+        return SLANG_OK;
+    }
+
+    if (!m_bufferResource)
+    {
+        IBufferResource::Desc desc;
+        desc.type = IResource::Type::Buffer;
+        desc.sizeInBytes = count;
+        m_bufferResource = new BufferResourceImpl(desc);
+        if (count)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
+        }
+        IResourceView::Desc viewDesc = {};
+        viewDesc.type = IResourceView::Type::UnorderedAccess;
+        m_bufferView = new ResourceViewImpl();
+        m_bufferView->memoryResource = m_bufferResource;
+        m_bufferView->m_desc = viewDesc;
+    }
+    auto oldSize = m_bufferResource->getDesc()->sizeInBytes;
+    if ((size_t)count != oldSize)
+    {
+        void* newMemory = nullptr;
+        if (count)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
+        }
+        if (oldSize)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+                newMemory,
+                m_bufferResource->m_cudaMemory,
+                Math::Min((size_t)count, oldSize),
+                cudaMemcpyDefault));
+        }
+        cudaFree(m_bufferResource->m_cudaMemory);
+        m_bufferResource->m_cudaMemory = newMemory;
+        m_bufferResource->getDesc()->sizeInBytes = count;
+    }
+    return SLANG_OK;
+}
+
+Slang::Index ShaderObjectData::getCount()
+{
+    if (isHostOnly)
+        return m_cpuBuffer.getCount();
+    if (m_bufferResource)
+        return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes);
+    else
+        return 0;
+}
+
+void* ShaderObjectData::getBuffer()
+{
+    if (isHostOnly)
+        return m_cpuBuffer.getBuffer();
+
+    if (m_bufferResource)
+        return m_bufferResource->m_cudaMemory;
+    return nullptr;
+}
+
+/// Returns a resource view for GPU access into the buffer content.
+ResourceViewBase* ShaderObjectData::getResourceView(
+    RendererBase* device,
+    slang::TypeLayoutReflection* elementLayout,
+    slang::BindingType bindingType)
+{
+    SLANG_UNUSED(device);
+    m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize();
+    return m_bufferView.Ptr();
+}
+
+SlangResult ShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout)
+{
+    m_layout = typeLayout;
+
+    // If the layout tells us that there is any uniform data,
+    // then we need to allocate a constant buffer to hold that data.
+    //
+    // TODO: Do we need to allocate a shadow copy for use from
+    // the CPU?
+    //
+    // TODO: When/where do we bind this constant buffer into
+    // a descriptor set for later use?
+    //
+    auto slangLayout = getLayout()->getElementTypeLayout();
+    size_t uniformSize = slangLayout->getSize();
+    if (uniformSize)
+    {
+        m_data.setCount((Index)uniformSize);
+    }
+
+    // If the layout specifies that we have any resources or sub-objects,
+    // then we need to size the appropriate arrays to account for them.
+    //
+    // Note: the counts here are the *total* number of resources/sub-objects
+    // and not just the number of resource/sub-object ranges.
+    //
+    resources.setCount(typeLayout->getResourceCount());
+    m_objects.setCount(typeLayout->getSubObjectCount());
+
+    for (auto subObjectRange : getLayout()->subObjectRanges)
+    {
+        RefPtr<ShaderObjectLayoutImpl> subObjectLayout = subObjectRange.layout;
+
+        // In the case where the sub-object range represents an
+        // existential-type leaf field (e.g., an `IBar`), we
+        // cannot pre-allocate the object(s) to go into that
+        // range, since we can't possibly know what to allocate
+        // at this point.
+        //
+        if (!subObjectLayout)
+            continue;
+        //
+        // Otherwise, we will allocate a sub-object to fill
+        // in each entry in this range, based on the layout
+        // information we already have.
+
+        auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
+        for (Index i = 0; i < bindingRangeInfo.count; ++i)
+        {
+            RefPtr<ShaderObjectImpl> subObject = new ShaderObjectImpl();
+            SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
+
+            ShaderOffset offset;
+            offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
+            offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex;
+            offset.bindingArrayIndex = (GfxIndex)i;
+
+            SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
+        }
+    }
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW GfxCount SLANG_MCALL ShaderObjectImpl::getEntryPointCount()
+{
+    return 0;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint)
+{
+    *outEntryPoint = nullptr;
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW const void* SLANG_MCALL ShaderObjectImpl::getRawData()
+{
+    return m_data.getBuffer();
+}
+
+SLANG_NO_THROW Size SLANG_MCALL ShaderObjectImpl::getSize()
+{
+    return (Size)m_data.getCount();
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ShaderObjectImpl::setData(ShaderOffset const& offset, void const* data, Size size)
+{
+    Size temp = m_data.getCount() - (Size)offset.uniformOffset;
+    size = Math::Min(size, temp);
+    SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+        (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ShaderObjectImpl::setResource(ShaderOffset const& offset, IResourceView* resourceView)
+{
+    if (!resourceView)
+        return SLANG_OK;
+
+    auto layout = getLayout();
+
+    auto bindingRangeIndex = offset.bindingRangeIndex;
+    SLANG_ASSERT(bindingRangeIndex >= 0);
+    SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+    auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+
+    auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
+    auto cudaView = static_cast<ResourceViewImpl*>(resourceView);
+
+    resources[viewIndex] = cudaView;
+
+    if (cudaView->textureResource)
+    {
+        if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess)
+        {
+            auto handle = cudaView->textureResource->m_cudaSurfObj;
+            setData(offset, &handle, sizeof(uint64_t));
+        }
+        else
+        {
+            auto handle = cudaView->textureResource->getBindlessHandle();
+            setData(offset, &handle, sizeof(uint64_t));
+        }
+    }
+    else if (cudaView->memoryResource)
+    {
+        auto handle = cudaView->memoryResource->getBindlessHandle();
+        setData(offset, &handle, sizeof(handle));
+        auto sizeOffset = offset;
+        sizeOffset.uniformOffset += sizeof(handle);
+        auto& desc = *cudaView->memoryResource->getDesc();
+        size_t size = desc.sizeInBytes;
+        if (desc.elementSize > 1)
+            size /= desc.elementSize;
+        setData(sizeOffset, &size, sizeof(size));
+    }
+    else if (cudaView->proxyBuffer)
+    {
+        auto handle = cudaView->proxyBuffer;
+        setData(offset, &handle, sizeof(handle));
+        auto sizeOffset = offset;
+        sizeOffset.uniformOffset += sizeof(handle);
+        auto& desc = *cudaView->memoryResource->getDesc();
+        size_t size = desc.sizeInBytes;
+        if (desc.elementSize > 1)
+            size /= desc.elementSize;
+        setData(sizeOffset, &size, sizeof(size));
+    }
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ShaderObjectImpl::setObject(ShaderOffset const& offset, IShaderObject* object)
+{
+    SLANG_RETURN_ON_FAIL(Super::setObject(offset, object));
+
+    auto bindingRangeIndex = offset.bindingRangeIndex;
+    auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex];
+
+    ShaderObjectImpl* subObject = static_cast<ShaderObjectImpl*>(object);
+    switch (bindingRange.bindingType)
+    {
+    default:
+    {
+        void* subObjectDataBuffer = subObject->getBuffer();
+        SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*)));
+    }
+    break;
+    case slang::BindingType::ExistentialValue:
+    case slang::BindingType::RawBuffer:
+    case slang::BindingType::MutableRawBuffer:
+        break;
+    }
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    ShaderObjectImpl::setSampler(ShaderOffset const& offset, ISamplerState* sampler)
+{
+    SLANG_UNUSED(sampler);
+    SLANG_UNUSED(offset);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL ShaderObjectImpl::setCombinedTextureSampler(
+    ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler)
+{
+    SLANG_UNUSED(sampler);
+    setResource(offset, textureView);
+    return SLANG_OK;
+}
+
+EntryPointShaderObjectImpl::EntryPointShaderObjectImpl()
+{
+    m_data.isHostOnly = true;
+}
+
+SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::addRef()
+{
+    return 1;
+}
+
+SLANG_NO_THROW uint32_t SLANG_MCALL RootShaderObjectImpl::release()
+{
+    return 1;
+}
+
+SlangResult RootShaderObjectImpl::init(IDevice* device, ShaderObjectLayoutImpl* typeLayout)
+{
+    SLANG_RETURN_ON_FAIL(ShaderObjectImpl::init(device, typeLayout));
+    auto programLayout = dynamic_cast<RootShaderObjectLayoutImpl*>(typeLayout);
+    for (auto& entryPoint : programLayout->entryPointLayouts)
+    {
+        RefPtr<EntryPointShaderObjectImpl> object = new EntryPointShaderObjectImpl();
+        SLANG_RETURN_ON_FAIL(object->init(device, entryPoint));
+        entryPointObjects.add(object);
+    }
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW GfxCount SLANG_MCALL RootShaderObjectImpl::getEntryPointCount()
+{
+    return (GfxCount)entryPointObjects.getCount();
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    RootShaderObjectImpl::getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint)
+{
+    returnComPtr(outEntryPoint, entryPointObjects[index]);
+    return SLANG_OK;
+}
+
+Result RootShaderObjectImpl::collectSpecializationArgs(ExtendedShaderObjectTypeList& args)
+{
+    SLANG_RETURN_ON_FAIL(ShaderObjectImpl::collectSpecializationArgs(args));
+    for (auto& entryPoint : entryPointObjects)
+    {
+        SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args));
+    }
+    return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-object.h b/tools/gfx/cuda/cuda-shader-object.h
new file mode 100644
index 000000000..f564f5eb5
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-object.h
@@ -0,0 +1,93 @@
+// cuda-shader-object.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-buffer.h"
+#include "cuda-resource-views.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ShaderObjectData
+{
+public:
+    bool isHostOnly = false;
+    Slang::RefPtr<BufferResourceImpl> m_bufferResource;
+    Slang::RefPtr<ResourceViewImpl> m_bufferView;
+    Slang::List<uint8_t> m_cpuBuffer;
+
+    Result setCount(Index count);
+    Slang::Index getCount();
+    void* getBuffer();
+
+    /// Returns a resource view for GPU access into the buffer content.
+    ResourceViewBase* getResourceView(
+        RendererBase* device,
+        slang::TypeLayoutReflection* elementLayout,
+        slang::BindingType bindingType);
+};
+
+class ShaderObjectImpl
+    : public ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData>
+{
+    typedef ShaderObjectBaseImpl<ShaderObjectImpl, ShaderObjectLayoutImpl, ShaderObjectData>
+        Super;
+
+public:
+    List<RefPtr<ResourceViewImpl>> resources;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        init(IDevice* device, ShaderObjectLayoutImpl* typeLayout);
+
+    virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override;
+
+    virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override;
+
+    virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setData(ShaderOffset const& offset, void const* data, Size size) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setResource(ShaderOffset const& offset, IResourceView* resourceView) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setObject(ShaderOffset const& offset, IShaderObject* object) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setSampler(ShaderOffset const& offset, ISamplerState* sampler) override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
+        ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override;
+};
+
+class MutableShaderObjectImpl : public MutableShaderObject< MutableShaderObjectImpl, ShaderObjectLayoutImpl>
+{};
+
+class EntryPointShaderObjectImpl : public ShaderObjectImpl
+{
+public:
+    EntryPointShaderObjectImpl();
+};
+
+class RootShaderObjectImpl : public ShaderObjectImpl
+{
+public:
+    virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override;
+    virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override;
+public:
+    List<RefPtr<EntryPointShaderObjectImpl>> entryPointObjects;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        init(IDevice* device, ShaderObjectLayoutImpl* typeLayout) override;
+    virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override;
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override;
+    virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-program.cpp b/tools/gfx/cuda/cuda-shader-program.cpp
new file mode 100644
index 000000000..73e0c3c19
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-program.cpp
@@ -0,0 +1,20 @@
+// cuda-shader-program.cpp
+#include "cuda-shader-program.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+ShaderProgramImpl::~ShaderProgramImpl()
+{
+    if (cudaModule)
+        cuModuleUnload(cudaModule);
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-shader-program.h b/tools/gfx/cuda/cuda-shader-program.h
new file mode 100644
index 000000000..b0961bfc3
--- /dev/null
+++ b/tools/gfx/cuda/cuda-shader-program.h
@@ -0,0 +1,29 @@
+// cuda-shader-program.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-context.h"
+#include "cuda-shader-object-layout.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class ShaderProgramImpl : public ShaderProgramBase
+{
+public:
+    CUmodule cudaModule = nullptr;
+    CUfunction cudaKernel;
+    String kernelName;
+    RefPtr<RootShaderObjectLayoutImpl> layout;
+    RefPtr<CUDAContext> cudaContext;
+    ~ShaderProgramImpl();
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-texture.cpp b/tools/gfx/cuda/cuda-texture.cpp
new file mode 100644
index 000000000..6e6e42db5
--- /dev/null
+++ b/tools/gfx/cuda/cuda-texture.cpp
@@ -0,0 +1,48 @@
+// cuda-texture.cpp
+#include "cuda-texture.h"
+
+#include "cuda-helper-functions.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+TextureResourceImpl::~TextureResourceImpl()
+{
+    if (m_cudaSurfObj)
+    {
+        SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
+    }
+    if (m_cudaTexObj)
+    {
+        SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
+    }
+    if (m_cudaArray)
+    {
+        SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
+    }
+    if (m_cudaMipMappedArray)
+    {
+        SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
+    }
+}
+
+uint64_t TextureResourceImpl::getBindlessHandle()
+{
+    return (uint64_t)m_cudaTexObj;
+}
+
+Result TextureResourceImpl::getNativeResourceHandle(InteropHandle* outHandle)
+{
+    outHandle->handleValue = getBindlessHandle();
+    outHandle->api = InteropHandleAPI::CUDA;
+    return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/cuda-texture.h b/tools/gfx/cuda/cuda-texture.h
new file mode 100644
index 000000000..c9cc8569a
--- /dev/null
+++ b/tools/gfx/cuda/cuda-texture.h
@@ -0,0 +1,44 @@
+// cuda-texture.h
+#pragma once
+#include "cuda-base.h"
+
+#include "cuda-context.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+class TextureResourceImpl : public TextureResource
+{
+public:
+    TextureResourceImpl(const TextureResource::Desc& desc)
+        : TextureResource(desc)
+    {}
+    ~TextureResourceImpl();
+
+    uint64_t getBindlessHandle();
+
+    // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
+    // a long long
+    CUtexObject m_cudaTexObj = CUtexObject();
+
+    // The surfObj is for reading/writing 'texture like' things, but not for sampling.
+    CUsurfObject m_cudaSurfObj = CUsurfObject();
+
+    CUarray m_cudaArray = CUarray();
+    CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
+
+    void* m_cudaExternalMemory = nullptr;
+
+    RefPtr<CUDAContext> m_cudaContext;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override;
+};
+
+} // namespace cuda
+#endif
+} // namespace gfx
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
deleted file mode 100644
index 114751a3d..000000000
--- a/tools/gfx/cuda/render-cuda.cpp
+++ /dev/null
@@ -1,2513 +0,0 @@
-#include "render-cuda.h"
-
-#ifdef GFX_ENABLE_CUDA
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include "core/slang-basic.h"
-#include "core/slang-blob.h"
-#include "core/slang-std-writers.h"
-
-#include "slang.h"
-#include "slang-com-ptr.h"
-#include "slang-com-helper.h"
-#include "../command-writer.h"
-#include "../renderer-shared.h"
-#include "../mutable-shader-object.h"
-#include "../simple-transient-resource-heap.h"
-#include "../slang-context.h"
-#include "../command-encoder-com-forward.h"
-
-#   ifdef RENDER_TEST_OPTIX
-
-// The `optix_stubs.h` header produces warnings when compiled with MSVC
-#       ifdef _MSC_VER
-#           pragma warning(disable: 4996)
-#       endif
-
-#       include <optix.h>
-#       include <optix_function_table_definition.h>
-#       include <optix_stubs.h>
-#   endif
-
-#endif
-
-namespace gfx
-{
-#ifdef GFX_ENABLE_CUDA
-using namespace Slang;
-
-SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
-
-// A enum used to control if errors are reported on failure of CUDA call.
-enum class CUDAReportStyle
-{
-    Normal,
-    Silent,
-};
-
-struct CUDAErrorInfo
-{
-    CUDAErrorInfo(
-        const char* filePath,
-        int lineNo,
-        const char* errorName = nullptr,
-        const char* errorString = nullptr)
-        : m_filePath(filePath)
-        , m_lineNo(lineNo)
-        , m_errorName(errorName)
-        , m_errorString(errorString)
-    {}
-    SlangResult handle() const
-    {
-        StringBuilder builder;
-        builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
-
-        if (m_errorName)
-        {
-            builder << m_errorName << " : ";
-        }
-        if (m_errorString)
-        {
-            builder << m_errorString;
-        }
-
-        getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver,
-            builder.getUnownedSlice().begin());
-
-        // Slang::signalUnexpectedError(builder.getBuffer());
-        return SLANG_FAIL;
-    }
-
-    const char* m_filePath;
-    int m_lineNo;
-    const char* m_errorName;
-    const char* m_errorString;
-};
-
-// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
-
-static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
-{
-    CUDAErrorInfo info(file, line);
-    cuGetErrorString(cuResult, &info.m_errorString);
-    cuGetErrorName(cuResult, &info.m_errorName);
-    return info.handle();
-}
-
-static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
-    return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
-#    define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
-
-#    define SLANG_CUDA_RETURN_ON_FAIL(x)              \
-        {                                             \
-            auto _res = x;                            \
-            if (_isError(_res))                       \
-                return SLANG_CUDA_HANDLE_ERROR(_res); \
-        }
-#    define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r)                               \
-        {                                                                             \
-            auto _res = x;                                                            \
-            if (_isError(_res))                                                       \
-            {                                                                         \
-                return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
-                                                      : SLANG_FAIL;                   \
-            }                                                                         \
-        }
-
-#    define SLANG_CUDA_ASSERT_ON_FAIL(x)           \
-        {                                          \
-            auto _res = x;                         \
-            if (_isError(_res))                    \
-            {                                      \
-                SLANG_ASSERT(!"Failed CUDA call"); \
-            };                                     \
-        }
-
-#    ifdef RENDER_TEST_OPTIX
-
-static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }
-
-#        if 1
-static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
-{
-    fprintf(
-        stderr,
-        "%s(%d): optix: %s (%s)\n",
-        file,
-        line,
-        optixGetErrorString(result),
-        optixGetErrorName(result));
-    return SLANG_FAIL;
-}
-#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
-#        else
-#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
-#        endif
-
-#        define SLANG_OPTIX_RETURN_ON_FAIL(EXPR)           \
-            do                                             \
-            {                                              \
-                auto _res = EXPR;                          \
-                if (_isError(_res))                        \
-                    return SLANG_OPTIX_HANDLE_ERROR(_res); \
-            } while (0)
-
-void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
-{
-    fprintf(stderr, "optix: %s (%s)\n", message, tag);
-}
-
-#    endif
-
-class CUDAContext : public RefObject
-{
-public:
-    CUcontext m_context = nullptr;
-    ~CUDAContext() { cuCtxDestroy(m_context); }
-};
-
-class MemoryCUDAResource : public BufferResource
-{
-public:
-    MemoryCUDAResource(const Desc& _desc)
-        : BufferResource(_desc)
-    {}
-
-    ~MemoryCUDAResource()
-    {
-        if (m_cudaMemory)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
-        }
-    }
-
-    uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; }
-
-    void* m_cudaExternalMemory = nullptr;
-    void* m_cudaMemory = nullptr;
-
-    RefPtr<CUDAContext> m_cudaContext;
-
-    virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override
-    {
-        return (DeviceAddress)m_cudaMemory;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
-    {
-        outHandle->handleValue = getBindlessHandle();
-        outHandle->api = InteropHandleAPI::CUDA;
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        map(MemoryRange* rangeToRead, void** outPointer) override
-    {
-        SLANG_UNUSED(rangeToRead);
-        SLANG_UNUSED(outPointer);
-        return SLANG_FAIL;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override
-    {
-        SLANG_UNUSED(writtenRange);
-        return SLANG_FAIL;
-    }
-};
-
-class TextureCUDAResource : public TextureResource
-{
-public:
-    TextureCUDAResource(const TextureResource::Desc& desc)
-        : TextureResource(desc)
-    {}
-    ~TextureCUDAResource()
-    {
-        if (m_cudaSurfObj)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
-        }
-        if (m_cudaTexObj)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
-        }
-        if (m_cudaArray)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
-        }
-        if (m_cudaMipMappedArray)
-        {
-            SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
-        }
-    }
-
-    uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; }
-
-    // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
-    // a long long
-    CUtexObject m_cudaTexObj = CUtexObject();
-
-    // The surfObj is for reading/writing 'texture like' things, but not for sampling.
-    CUsurfObject m_cudaSurfObj = CUsurfObject();
-
-    CUarray m_cudaArray = CUarray();
-    CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
-
-    void* m_cudaExternalMemory = nullptr;
-
-    RefPtr<CUDAContext> m_cudaContext;
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
-    {
-        outHandle->handleValue = getBindlessHandle();
-        outHandle->api = InteropHandleAPI::CUDA;
-        return SLANG_OK;
-    }
-};
-
-class CUDAResourceView : public ResourceViewBase
-{
-public:
-    RefPtr<MemoryCUDAResource> memoryResource = nullptr;
-    RefPtr<TextureCUDAResource> textureResource = nullptr;
-    void* proxyBuffer = nullptr;
-};
-
-class CUDAShaderObjectLayout : public ShaderObjectLayoutBase
-{
-public:
-    struct BindingRangeInfo
-    {
-        slang::BindingType bindingType;
-        Index count;
-        Index baseIndex; // Flat index for sub-ojects
-        Index subObjectIndex;
-
-        // TODO: The `uniformOffset` field should be removed,
-        // since it cannot be supported by the Slang reflection
-        // API once we fix some design issues.
-        //
-        // It is only being used today for pre-allocation of sub-objects
-        // for constant buffers and parameter blocks (which should be
-        // deprecated/removed anyway).
-        //
-        // Note: We would need to bring this field back, plus
-        // a lot of other complexity, if we ever want to support
-        // setting of resources/buffers directly by a binding
-        // range index and array index.
-        //
-        Index uniformOffset; // Uniform offset for a resource typed field.
-    };
-
-    struct SubObjectRangeInfo
-    {
-        RefPtr<CUDAShaderObjectLayout> layout;
-        Index bindingRangeIndex;
-    };
-
-    List<SubObjectRangeInfo> subObjectRanges;
-    List<BindingRangeInfo> m_bindingRanges;
-
-    Index m_subObjectCount = 0;
-    Index m_resourceCount = 0;
-
-    CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout)
-    {
-        m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType);
-
-        initBase(renderer, m_elementTypeLayout);
-
-        // Compute the binding ranges that are used to store
-        // the logical contents of the object in memory. These will relate
-        // to the descriptor ranges in the various sets, but not always
-        // in a one-to-one fashion.
-
-        SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount();
-        for (SlangInt r = 0; r < bindingRangeCount; ++r)
-        {
-            slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r);
-            SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r);
-            slang::TypeLayoutReflection* slangLeafTypeLayout =
-                m_elementTypeLayout->getBindingRangeLeafTypeLayout(r);
-
-            SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r);
-            SlangInt rangeIndexInDescriptorSet =
-                m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
-
-            // TODO: This logic assumes that for any binding range that might consume
-            // multiple kinds of resources, the descriptor range for its uniform
-            // usage will be the first one in the range.
-            //
-            // We need to decide whether that assumption is one we intend to support
-            // applications making, or whether they should be forced to perform a
-            // linear search over the descriptor ranges for a specific binding range.
-            //
-            auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
-                descriptorSetIndex, rangeIndexInDescriptorSet);
-
-            Index baseIndex = 0;
-            Index subObjectIndex = 0;
-            switch (slangBindingType)
-            {
-            case slang::BindingType::ConstantBuffer:
-            case slang::BindingType::ParameterBlock:
-            case slang::BindingType::ExistentialValue:
-                baseIndex = m_subObjectCount;
-                subObjectIndex = baseIndex;
-                m_subObjectCount += count;
-                break;
-            case slang::BindingType::RawBuffer:
-            case slang::BindingType::MutableRawBuffer:
-                if (slangLeafTypeLayout->getType()->getElementType() != nullptr)
-                {
-                    // A structured buffer occupies both a resource slot and
-                    // a sub-object slot.
-                    subObjectIndex = m_subObjectCount;
-                    m_subObjectCount += count;
-                }
-                baseIndex = m_resourceCount;
-                m_resourceCount += count;
-                break;
-            default:
-                baseIndex = m_resourceCount;
-                m_resourceCount += count;
-                break;
-            }
-
-            BindingRangeInfo bindingRangeInfo;
-            bindingRangeInfo.bindingType = slangBindingType;
-            bindingRangeInfo.count = count;
-            bindingRangeInfo.baseIndex = baseIndex;
-            bindingRangeInfo.uniformOffset = uniformOffset;
-            bindingRangeInfo.subObjectIndex = subObjectIndex;
-            m_bindingRanges.add(bindingRangeInfo);
-        }
-
-        SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
-        for (SlangInt r = 0; r < subObjectRangeCount; ++r)
-        {
-            SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r);
-            auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
-            slang::TypeLayoutReflection* slangLeafTypeLayout =
-                m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
-
-            // A sub-object range can either represent a sub-object of a known
-            // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
-            // (in which case we can pre-compute a layout to use, based on
-            // the type `Foo`) *or* it can represent a sub-object of some
-            // existential type (e.g., `IBar`) in which case we cannot
-            // know the appropraite type/layout of sub-object to allocate.
-            //
-            RefPtr<CUDAShaderObjectLayout> subObjectLayout;
-            if (slangBindingType != slang::BindingType::ExistentialValue)
-            {
-                subObjectLayout =
-                    new CUDAShaderObjectLayout(renderer, slangLeafTypeLayout->getElementTypeLayout());
-            }
-
-            SubObjectRangeInfo subObjectRange;
-            subObjectRange.bindingRangeIndex = bindingRangeIndex;
-            subObjectRange.layout = subObjectLayout;
-            subObjectRanges.add(subObjectRange);
-        }
-    }
-
-    Index getResourceCount() const { return m_resourceCount; }
-    Index getSubObjectCount() const { return m_subObjectCount; }
-    List<SubObjectRangeInfo>& getSubObjectRanges() { return subObjectRanges; }
-    BindingRangeInfo getBindingRange(Index index) { return m_bindingRanges[index]; }
-    Index getBindingRangeCount() const { return m_bindingRanges.getCount(); }
-};
-
-class CUDAProgramLayout : public CUDAShaderObjectLayout
-{
-public:
-    slang::ProgramLayout* programLayout = nullptr;
-    List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts;
-    CUDAProgramLayout(RendererBase* renderer, slang::ProgramLayout* inProgramLayout)
-        : CUDAShaderObjectLayout(renderer, inProgramLayout->getGlobalParamsTypeLayout())
-        , programLayout(inProgramLayout)
-    {
-        for (UInt i =0; i< programLayout->getEntryPointCount(); i++)
-        {
-            entryPointLayouts.add(new CUDAShaderObjectLayout(
-                renderer,
-                programLayout->getEntryPointByIndex(i)->getTypeLayout()));
-        }
-
-    }
-
-    int getKernelIndex(UnownedStringSlice kernelName)
-    {
-        for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
-        {
-            auto entryPoint = programLayout->getEntryPointByIndex(i);
-            if (kernelName == entryPoint->getName())
-            {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
-    {
-        auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
-        entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
-    }
-};
-
-class CUDAShaderObjectData
-{
-public:
-    bool isHostOnly = false;
-    Slang::RefPtr<MemoryCUDAResource> m_bufferResource;
-    Slang::RefPtr<CUDAResourceView> m_bufferView;
-    Slang::List<uint8_t> m_cpuBuffer;
-    Result setCount(Index count)
-    {
-        if (isHostOnly)
-        {
-            m_cpuBuffer.setCount(count);
-            if (!m_bufferView)
-            {
-                IResourceView::Desc viewDesc = {};
-                viewDesc.type = IResourceView::Type::UnorderedAccess;
-                m_bufferView = new CUDAResourceView();
-                m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer();
-                m_bufferView->m_desc = viewDesc;
-            }
-            return SLANG_OK;
-        }
-
-        if (!m_bufferResource)
-        {
-            IBufferResource::Desc desc;
-            desc.type = IResource::Type::Buffer;
-            desc.sizeInBytes = count;
-            m_bufferResource = new MemoryCUDAResource(desc);
-            if (count)
-            {
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
-            }
-            IResourceView::Desc viewDesc = {};
-            viewDesc.type = IResourceView::Type::UnorderedAccess;
-            m_bufferView = new CUDAResourceView();
-            m_bufferView->memoryResource = m_bufferResource;
-            m_bufferView->m_desc = viewDesc;
-        }
-        auto oldSize = m_bufferResource->getDesc()->sizeInBytes;
-        if ((size_t)count != oldSize)
-        {
-            void* newMemory = nullptr;
-            if (count)
-            {
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
-            }
-            if (oldSize)
-            {
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-                    newMemory,
-                    m_bufferResource->m_cudaMemory,
-                    Math::Min((size_t)count, oldSize),
-                    cudaMemcpyDefault));
-            }
-            cudaFree(m_bufferResource->m_cudaMemory);
-            m_bufferResource->m_cudaMemory = newMemory;
-            m_bufferResource->getDesc()->sizeInBytes = count;
-        }
-        return SLANG_OK;
-    }
-
-    Slang::Index getCount()
-    {
-        if (isHostOnly)
-            return m_cpuBuffer.getCount();
-        if (m_bufferResource)
-            return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes);
-        else
-            return 0;
-    }
-
-    void* getBuffer()
-    {
-        if (isHostOnly)
-            return m_cpuBuffer.getBuffer();
-
-        if (m_bufferResource)
-            return m_bufferResource->m_cudaMemory;
-        return nullptr;
-    }
-
-    /// Returns a resource view for GPU access into the buffer content.
-    ResourceViewBase* getResourceView(
-        RendererBase* device,
-        slang::TypeLayoutReflection* elementLayout,
-        slang::BindingType bindingType)
-    {
-        SLANG_UNUSED(device);
-        m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize();
-        return m_bufferView.Ptr();
-    }
-};
-
-class CUDAShaderObject
-    : public ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
-{
-    typedef ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
-        Super;
-
-public:
-    List<RefPtr<CUDAResourceView>> resources;
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        init(IDevice* device, CUDAShaderObjectLayout* typeLayout);
-
-    virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return 0; }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
-    {
-        *outEntryPoint = nullptr;
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override
-    {
-        return m_data.getBuffer();
-    }
-
-    virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override
-    {
-        return (Size)m_data.getCount();
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        setData(ShaderOffset const& offset, void const* data, Size size) override
-    {
-        size = Math::Min(size, (Size)m_data.getCount() - offset.uniformOffset);
-        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-            (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
-        return SLANG_OK;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        setResource(ShaderOffset const& offset, IResourceView* resourceView) override
-    {
-        if (!resourceView)
-            return SLANG_OK;
-
-        auto layout = getLayout();
-
-        auto bindingRangeIndex = offset.bindingRangeIndex;
-        SLANG_ASSERT(bindingRangeIndex >= 0);
-        SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
-
-        auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
-
-        auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
-        auto cudaView = static_cast<CUDAResourceView*>(resourceView);
-
-        resources[viewIndex] = cudaView;
-
-        if (cudaView->textureResource)
-        {
-            if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess)
-            {
-                auto handle = cudaView->textureResource->m_cudaSurfObj;
-                setData(offset, &handle, sizeof(uint64_t));
-            }
-            else
-            {
-                auto handle = cudaView->textureResource->getBindlessHandle();
-                setData(offset, &handle, sizeof(uint64_t));
-            }
-        }
-        else if (cudaView->memoryResource)
-        {
-            auto handle = cudaView->memoryResource->getBindlessHandle();
-            setData(offset, &handle, sizeof(handle));
-            auto sizeOffset = offset;
-            sizeOffset.uniformOffset += sizeof(handle);
-            auto& desc = *cudaView->memoryResource->getDesc();
-            size_t size = desc.sizeInBytes;
-            if (desc.elementSize > 1)
-                size /= desc.elementSize;
-            setData(sizeOffset, &size, sizeof(size));
-        }
-        else if (cudaView->proxyBuffer)
-        {
-            auto handle = cudaView->proxyBuffer;
-            setData(offset, &handle, sizeof(handle));
-            auto sizeOffset = offset;
-            sizeOffset.uniformOffset += sizeof(handle);
-            auto& desc = *cudaView->memoryResource->getDesc();
-            size_t size = desc.sizeInBytes;
-            if (desc.elementSize > 1)
-                size /= desc.elementSize;
-            setData(sizeOffset, &size, sizeof(size));
-        }
-        return SLANG_OK;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        setObject(ShaderOffset const& offset, IShaderObject* object) override
-    {
-        SLANG_RETURN_ON_FAIL(Super::setObject(offset, object));
-
-        auto bindingRangeIndex = offset.bindingRangeIndex;
-        auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex];
-
-        CUDAShaderObject* subObject = static_cast<CUDAShaderObject*>(object);
-        switch (bindingRange.bindingType)
-        {
-        default:
-            {
-                void* subObjectDataBuffer = subObject->getBuffer();
-                SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*)));
-            }
-            break;
-        case slang::BindingType::ExistentialValue:
-        case slang::BindingType::RawBuffer:
-        case slang::BindingType::MutableRawBuffer:
-            break;
-        }
-        return SLANG_OK;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        setSampler(ShaderOffset const& offset, ISamplerState* sampler) override
-    {
-        SLANG_UNUSED(sampler);
-        SLANG_UNUSED(offset);
-        return SLANG_OK;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
-        ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override
-    {
-        SLANG_UNUSED(sampler);
-        setResource(offset, textureView);
-        return SLANG_OK;
-    }
-};
-
-class CUDAMutableShaderObject : public MutableShaderObject< CUDAMutableShaderObject, CUDAShaderObjectLayout>
-{};
-
-class CUDAEntryPointShaderObject : public CUDAShaderObject
-{
-public:
-    CUDAEntryPointShaderObject() { m_data.isHostOnly = true; }
-};
-
-class CUDARootShaderObject : public CUDAShaderObject
-{
-public:
-    virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override { return 1; }
-    virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override { return 1; }
-public:
-    List<RefPtr<CUDAEntryPointShaderObject>> entryPointObjects;
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        init(IDevice* device, CUDAShaderObjectLayout* typeLayout) override;
-    virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return (GfxCount)entryPointObjects.getCount(); }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
-    {
-        returnComPtr(outEntryPoint, entryPointObjects[index]);
-        return SLANG_OK;
-    }
-    virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override
-    {
-        SLANG_RETURN_ON_FAIL(CUDAShaderObject::collectSpecializationArgs(args));
-        for (auto& entryPoint : entryPointObjects)
-        {
-            SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args));
-        }
-        return SLANG_OK;
-    }
-};
-
-class CUDAShaderProgram : public ShaderProgramBase
-{
-public:
-    CUmodule cudaModule = nullptr;
-    CUfunction cudaKernel;
-    String kernelName;
-    RefPtr<CUDAProgramLayout> layout;
-    RefPtr<CUDAContext> cudaContext;
-    ~CUDAShaderProgram()
-    {
-        if (cudaModule)
-            cuModuleUnload(cudaModule);
-    }
-};
-
-class CUDAPipelineState : public PipelineStateBase
-{
-public:
-    RefPtr<CUDAShaderProgram> shaderProgram;
-    void init(const ComputePipelineStateDesc& inDesc)
-    {
-        PipelineStateDesc pipelineDesc;
-        pipelineDesc.type = PipelineType::Compute;
-        pipelineDesc.compute = inDesc;
-        initializeBase(pipelineDesc);
-    }
-};
-
-class CUDAQueryPool : public QueryPoolBase
-{
-public:
-    // The event object for each query. Owned by the pool.
-    List<CUevent> m_events;
-
-    // The event that marks the starting point.
-    CUevent m_startEvent;
-
-    Result init(const IQueryPool::Desc& desc)
-    {
-        cuEventCreate(&m_startEvent, 0);
-        cuEventRecord(m_startEvent, 0);
-        m_events.setCount(desc.count);
-        for (SlangInt i = 0; i < m_events.getCount(); i++)
-        {
-            cuEventCreate(&m_events[i], 0);
-        }
-        return SLANG_OK;
-    }
-
-    ~CUDAQueryPool()
-    {
-        for (auto& e : m_events)
-        {
-            cuEventDestroy(e);
-        }
-        cuEventDestroy(m_startEvent);
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL getResult(
-        GfxIndex queryIndex, GfxCount count, uint64_t* data) override
-    {
-        for (GfxIndex i = 0; i < count; i++)
-        {
-            float time = 0.0f;
-            cuEventSynchronize(m_events[i + queryIndex]);
-            cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]);
-            data[i] = (uint64_t)((double)time * 1000.0f);
-        }
-        return SLANG_OK;
-    }
-};
-
-class CUDADevice : public RendererBase
-{
-private:
-    static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
-    static int _calcSMCountPerMultiProcessor(int major, int minor)
-    {
-        // Defines for GPU Architecture types (using the SM version to determine
-        // the # of cores per SM
-        struct SMInfo
-        {
-            int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
-            int coreCount;
-        };
-
-        static const SMInfo infos[] = {
-            {0x30, 192},
-            {0x32, 192},
-            {0x35, 192},
-            {0x37, 192},
-            {0x50, 128},
-            {0x52, 128},
-            {0x53, 128},
-            {0x60, 64},
-            {0x61, 128},
-            {0x62, 128},
-            {0x70, 64},
-            {0x72, 64},
-            {0x75, 64}};
-
-        const int sm = ((major << 4) + minor);
-        for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
-        {
-            if (infos[i].sm == sm)
-            {
-                return infos[i].coreCount;
-            }
-        }
-
-        const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
-
-        // It must be newer presumably
-        SLANG_ASSERT(sm > last.sm);
-
-        // Default to the last entry
-        return last.coreCount;
-    }
-
-    static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
-    {
-        int smPerMultiproc = 0;
-        int maxPerfDevice = -1;
-        int deviceCount = 0;
-        int devicesProhibited = 0;
-
-        uint64_t maxComputePerf = 0;
-        SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
-
-        // Find the best CUDA capable GPU device
-        for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
-        {
-            int computeMode = -1, major = 0, minor = 0;
-            SLANG_CUDA_RETURN_ON_FAIL(
-                cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
-            SLANG_CUDA_RETURN_ON_FAIL(
-                cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
-            SLANG_CUDA_RETURN_ON_FAIL(
-                cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
-
-            // If this GPU is not running on Compute Mode prohibited,
-            // then we can add it to the list
-            if (computeMode != cudaComputeModeProhibited)
-            {
-                if (major == 9999 && minor == 9999)
-                {
-                    smPerMultiproc = 1;
-                }
-                else
-                {
-                    smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
-                }
-
-                int multiProcessorCount = 0, clockRate = 0;
-                SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
-                    &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
-                SLANG_CUDA_RETURN_ON_FAIL(
-                    cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
-                uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
-
-                if (compute_perf > maxComputePerf)
-                {
-                    maxComputePerf = compute_perf;
-                    maxPerfDevice = currentDevice;
-                }
-            }
-            else
-            {
-                devicesProhibited++;
-            }
-        }
-
-        if (maxPerfDevice < 0)
-        {
-            return SLANG_FAIL;
-        }
-
-        *outDeviceIndex = maxPerfDevice;
-        return SLANG_OK;
-    }
-
-    static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
-    {
-        static CUresult res = cuInit(0);
-        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
-        return SLANG_OK;
-    }
-
-private:
-    int m_deviceIndex = -1;
-    CUdevice m_device = 0;
-    RefPtr<CUDAContext> m_context;
-    DeviceInfo m_info;
-    String m_adapterName;
-
-public:
-    virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override
-    {
-        outHandles->handles[0].handleValue = (uint64_t)m_device;
-        outHandles->handles[0].api = InteropHandleAPI::CUDA;
-        return SLANG_OK;
-    }
-
-    class CommandQueueImpl;
-
-    class CommandBufferImpl
-        : public ICommandBuffer
-        , public CommandWriter
-        , public ComObject
-    {
-    public:
-        SLANG_COM_OBJECT_IUNKNOWN_ALL
-        ICommandBuffer* getInterface(const Guid& guid)
-        {
-            if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer)
-                return static_cast<ICommandBuffer*>(this);
-            return nullptr;
-        }
-    public:
-        CUDADevice* m_device;
-        TransientResourceHeapBase* m_transientHeap;
-
-        void init(CUDADevice* device, TransientResourceHeapBase* transientHeap)
-        {
-            m_device = device;
-            m_transientHeap = transientHeap;
-        }
-
-        virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
-            IRenderPassLayout* renderPass,
-            IFramebuffer* framebuffer,
-            IRenderCommandEncoder** outEncoder) override
-        {
-            SLANG_UNUSED(renderPass);
-            SLANG_UNUSED(framebuffer);
-            *outEncoder = nullptr;
-        }
-
-        class ResourceCommandEncoderImpl : public IResourceCommandEncoder
-        {
-        public:
-            CommandWriter* m_writer;
-
-            void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
-            virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
-                IBufferResource* dst,
-                Offset dstOffset,
-                IBufferResource* src,
-                Offset srcOffset,
-                Size size) override
-            {
-                m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier(
-                GfxCount count,
-                ITextureResource* const* textures,
-                ResourceState src,
-                ResourceState dst) override
-            {}
-
-            virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier(
-                GfxCount count,
-                IBufferResource* const* buffers,
-                ResourceState src,
-                ResourceState dst) override
-            {}
-
-            virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData(
-                IBufferResource* dst, Offset offset, Size size, void* data) override
-            {
-                m_writer->uploadBufferData(dst, offset, size, data);
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL
-                writeTimestamp(IQueryPool* pool, GfxIndex index) override
-            {
-                m_writer->writeTimestamp(pool, index);
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL copyTexture(
-                ITextureResource* dst,
-                ResourceState dstState,
-                SubresourceRange dstSubresource,
-                ITextureResource::Offset3D dstOffset,
-                ITextureResource* src,
-                ResourceState srcState,
-                SubresourceRange srcSubresource,
-                ITextureResource::Offset3D srcOffset,
-                ITextureResource::Extents extent) override
-            {
-                SLANG_UNUSED(dst);
-                SLANG_UNUSED(dstState);
-                SLANG_UNUSED(dstSubresource);
-                SLANG_UNUSED(dstOffset);
-                SLANG_UNUSED(src);
-                SLANG_UNUSED(srcState);
-                SLANG_UNUSED(srcSubresource);
-                SLANG_UNUSED(srcOffset);
-                SLANG_UNUSED(extent);
-                SLANG_UNIMPLEMENTED_X("copyTexture");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData(
-                ITextureResource* dst,
-                SubresourceRange subResourceRange,
-                ITextureResource::Offset3D offset,
-                ITextureResource::Extents extent,
-                ITextureResource::SubresourceData* subResourceData,
-                GfxCount subResourceDataCount) override
-            {
-                SLANG_UNUSED(dst);
-                SLANG_UNUSED(subResourceRange);
-                SLANG_UNUSED(offset);
-                SLANG_UNUSED(extent);
-                SLANG_UNUSED(subResourceData);
-                SLANG_UNUSED(subResourceDataCount);
-                SLANG_UNIMPLEMENTED_X("uploadTextureData");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView(
-                IResourceView* view,
-                ClearValue* clearValue,
-                ClearResourceViewFlags::Enum flags) override
-            {
-                SLANG_UNUSED(view);
-                SLANG_UNUSED(clearValue);
-                SLANG_UNUSED(flags);
-                SLANG_UNIMPLEMENTED_X("clearResourceView");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL resolveResource(
-                ITextureResource* source,
-                ResourceState sourceState,
-                SubresourceRange sourceRange,
-                ITextureResource* dest,
-                ResourceState destState,
-                SubresourceRange destRange) override
-            {
-                SLANG_UNUSED(source);
-                SLANG_UNUSED(sourceState);
-                SLANG_UNUSED(sourceRange);
-                SLANG_UNUSED(dest);
-                SLANG_UNUSED(destState);
-                SLANG_UNUSED(destRange);
-                SLANG_UNIMPLEMENTED_X("resolveResource");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery(
-                IQueryPool* queryPool,
-                GfxIndex index,
-                GfxCount count,
-                IBufferResource* buffer,
-                Offset offset) override
-            {
-                SLANG_UNUSED(queryPool);
-                SLANG_UNUSED(index);
-                SLANG_UNUSED(count);
-                SLANG_UNUSED(buffer);
-                SLANG_UNUSED(offset);
-                SLANG_UNIMPLEMENTED_X("resolveQuery");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer(
-                IBufferResource* dst,
-                Offset dstOffset,
-                Size dstSize,
-                Size dstRowStride,
-                ITextureResource* src,
-                ResourceState srcState,
-                SubresourceRange srcSubresource,
-                ITextureResource::Offset3D srcOffset,
-                ITextureResource::Extents extent) override
-            {
-                SLANG_UNUSED(dst);
-                SLANG_UNUSED(dstOffset);
-                SLANG_UNUSED(dstSize);
-                SLANG_UNUSED(dstRowStride);
-                SLANG_UNUSED(src);
-                SLANG_UNUSED(srcState);
-                SLANG_UNUSED(srcSubresource);
-                SLANG_UNUSED(srcOffset);
-                SLANG_UNUSED(extent);
-                SLANG_UNIMPLEMENTED_X("copyTextureToBuffer");
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier(
-                ITextureResource* texture,
-                SubresourceRange subresourceRange,
-                ResourceState src,
-                ResourceState dst) override
-            {
-                SLANG_UNUSED(texture);
-                SLANG_UNUSED(subresourceRange);
-                SLANG_UNUSED(src);
-                SLANG_UNUSED(dst);
-                SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier");
-            }
-            virtual SLANG_NO_THROW void SLANG_MCALL
-                beginDebugEvent(const char* name, float rgbColor[3]) override
-            {
-                SLANG_UNUSED(name);
-                SLANG_UNUSED(rgbColor);
-            }
-            virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {}
-        };
-
-        ResourceCommandEncoderImpl m_resourceCommandEncoder;
-
-        virtual SLANG_NO_THROW void SLANG_MCALL
-            encodeResourceCommands(IResourceCommandEncoder** outEncoder) override
-        {
-            m_resourceCommandEncoder.init(this);
-            *outEncoder = &m_resourceCommandEncoder;
-        }
-
-        class ComputeCommandEncoderImpl
-            : public IComputeCommandEncoder
-            , public ResourceCommandEncoderImpl
-        {
-        public:
-            SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl)
-        public:
-            CommandWriter* m_writer;
-            CommandBufferImpl* m_commandBuffer;
-            RefPtr<ShaderObjectBase> m_rootObject;
-            virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
-            void init(CommandBufferImpl* cmdBuffer)
-            {
-                m_writer = cmdBuffer;
-                m_commandBuffer = cmdBuffer;
-            }
-
-            virtual SLANG_NO_THROW Result SLANG_MCALL
-                bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override
-            {
-                m_writer->setPipelineState(state);
-                PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
-                SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
-                    pipelineImpl->m_program, m_rootObject.writeRef()));
-                returnComPtr(outRootObject, m_rootObject);
-                return SLANG_OK;
-            }
-
-            virtual SLANG_NO_THROW Result SLANG_MCALL
-                bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override
-            {
-                m_writer->setPipelineState(state);
-                PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
-                SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
-                    pipelineImpl->m_program, m_rootObject.writeRef()));
-                m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap);
-                return SLANG_OK;
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override
-            {
-                m_writer->bindRootShaderObject(m_rootObject);
-                m_writer->dispatchCompute(x, y, z);
-            }
-
-            virtual SLANG_NO_THROW void SLANG_MCALL
-                dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override
-            {
-                SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect");
-            }
-        };
-
-        ComputeCommandEncoderImpl m_computeCommandEncoder;
-        virtual SLANG_NO_THROW void SLANG_MCALL
-            encodeComputeCommands(IComputeCommandEncoder** outEncoder) override
-        {
-            m_computeCommandEncoder.init(this);
-            *outEncoder = &m_computeCommandEncoder;
-        }
-
-        virtual SLANG_NO_THROW void SLANG_MCALL
-            encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override
-        {
-            *outEncoder = nullptr;
-        }
-
-        virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
-
-        virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
-        {
-            return SLANG_FAIL;
-        }
-    };
-
-    class CommandQueueImpl
-        : public ICommandQueue
-        , public ComObject
-    {
-    public:
-        SLANG_COM_OBJECT_IUNKNOWN_ALL
-        ICommandQueue* getInterface(const Guid& guid)
-        {
-            if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
-                return static_cast<ICommandQueue*>(this);
-            return nullptr;
-        }
-
-    public:
-        RefPtr<CUDAPipelineState> currentPipeline;
-        RefPtr<CUDARootShaderObject> currentRootObject;
-        RefPtr<CUDADevice> renderer;
-        CUstream stream;
-        Desc m_desc;
-    public:
-        void init(CUDADevice* inRenderer)
-        {
-            renderer = inRenderer;
-            m_desc.type = ICommandQueue::QueueType::Graphics;
-            cuStreamCreate(&stream, 0);
-        }
-        ~CommandQueueImpl()
-        {
-            cuStreamSynchronize(stream);
-            cuStreamDestroy(stream);
-            currentPipeline = nullptr;
-            currentRootObject = nullptr;
-        }
-
-    public:
-        virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override
-        {
-            return m_desc;
-        }
-
-        virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers(
-            GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override
-        {
-            SLANG_UNUSED(valueToSignal);
-            // TODO: implement fence.
-            assert(fence == nullptr);
-            for (GfxIndex i = 0; i < count; i++)
-            {
-                execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
-            }
-        }
-
-        virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override
-        {
-            auto resultCode = cuStreamSynchronize(stream);
-            if (resultCode != cudaSuccess)
-                SLANG_CUDA_HANDLE_ERROR(resultCode);
-        }
-
-        virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice(
-            GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override
-        {
-            return SLANG_FAIL;
-        }
-
-        virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
-        {
-            return SLANG_FAIL;
-        }
-
-    public:
-        void setPipelineState(IPipelineState* state)
-        {
-            currentPipeline = dynamic_cast<CUDAPipelineState*>(state);
-        }
-
-        Result bindRootShaderObject(IShaderObject* object)
-        {
-            currentRootObject = dynamic_cast<CUDARootShaderObject*>(object);
-            if (currentRootObject)
-                return SLANG_OK;
-            return SLANG_E_INVALID_ARG;
-        }
-
-        void dispatchCompute(int x, int y, int z)
-        {
-            // Specialize the compute kernel based on the shader object bindings.
-            RefPtr<PipelineStateBase> newPipeline;
-            renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
-            currentPipeline = static_cast<CUDAPipelineState*>(newPipeline.Ptr());
-
-            // Find out thread group size from program reflection.
-            auto& kernelName = currentPipeline->shaderProgram->kernelName;
-            auto programLayout = static_cast<CUDAProgramLayout*>(currentRootObject->getLayout());
-            int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
-            SLANG_ASSERT(kernelId != -1);
-            UInt threadGroupSize[3];
-            programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
-
-            int sharedSizeInBytes;
-            cuFuncGetAttribute(
-                &sharedSizeInBytes,
-                CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                currentPipeline->shaderProgram->cudaKernel);
-
-            // Copy global parameter data to the `SLANG_globalParams` symbol.
-            {
-                CUdeviceptr globalParamsSymbol = 0;
-                size_t globalParamsSymbolSize = 0;
-                cuModuleGetGlobal(
-                    &globalParamsSymbol,
-                    &globalParamsSymbolSize,
-                    currentPipeline->shaderProgram->cudaModule,
-                    "SLANG_globalParams");
-
-                CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
-                cudaMemcpyAsync(
-                    (void*)globalParamsSymbol,
-                    (void*)globalParamsCUDAData,
-                    globalParamsSymbolSize,
-                    cudaMemcpyDefault,
-                    0);
-            }
-            //
-            // The argument data for the entry-point parameters are already
-            // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
-            //
-            auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
-            auto entryPointDataSize =
-                currentRootObject->entryPointObjects[kernelId]->getBufferSize();
-
-            void* extraOptions[] = {
-                CU_LAUNCH_PARAM_BUFFER_POINTER,
-                entryPointBuffer,
-                CU_LAUNCH_PARAM_BUFFER_SIZE,
-                &entryPointDataSize,
-                CU_LAUNCH_PARAM_END,
-            };
-
-            // Once we have all the decessary data extracted and/or
-            // set up, we can launch the kernel and see what happens.
-            //
-            auto cudaLaunchResult = cuLaunchKernel(
-                currentPipeline->shaderProgram->cudaKernel,
-                x,
-                y,
-                z,
-                int(threadGroupSize[0]),
-                int(threadGroupSize[1]),
-                int(threadGroupSize[2]),
-                sharedSizeInBytes,
-                stream,
-                nullptr,
-                extraOptions);
-
-            SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
-        }
-
-        void copyBuffer(
-            IBufferResource* dst,
-            size_t dstOffset,
-            IBufferResource* src,
-            size_t srcOffset,
-            size_t size)
-        {
-            auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
-            auto srcImpl = static_cast<MemoryCUDAResource*>(src);
-            cudaMemcpy(
-                (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
-                (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
-                size,
-                cudaMemcpyDefault);
-        }
-
-        void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
-        {
-            auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
-            cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
-        }
-
-        void writeTimestamp(IQueryPool* pool, SlangInt index)
-        {
-            auto poolImpl = static_cast<CUDAQueryPool*>(pool);
-            cuEventRecord(poolImpl->m_events[index], stream);
-        }
-
-        void execute(CommandBufferImpl* commandBuffer)
-        {
-            for (auto& cmd : commandBuffer->m_commands)
-            {
-                switch (cmd.name)
-                {
-                case CommandName::SetPipelineState:
-                    setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
-                    break;
-                case CommandName::BindRootShaderObject:
-                    bindRootShaderObject(
-                        commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
-                    break;
-                case CommandName::DispatchCompute:
-                    dispatchCompute(
-                        int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
-                    break;
-                case CommandName::CopyBuffer:
-                    copyBuffer(
-                        commandBuffer->getObject<BufferResource>(cmd.operands[0]),
-                        cmd.operands[1],
-                        commandBuffer->getObject<BufferResource>(cmd.operands[2]),
-                        cmd.operands[3],
-                        cmd.operands[4]);
-                    break;
-                case CommandName::UploadBufferData:
-                    uploadBufferData(
-                        commandBuffer->getObject<BufferResource>(cmd.operands[0]),
-                        cmd.operands[1],
-                        cmd.operands[2],
-                        commandBuffer->getData<uint8_t>(cmd.operands[3]));
-                    break;
-                case CommandName::WriteTimestamp:
-                    writeTimestamp(
-                        commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
-                        (SlangInt)cmd.operands[1]);
-                }
-            }
-        }
-    };
-
-    using TransientResourceHeapImpl = SimpleTransientResourceHeap<CUDADevice, CommandBufferImpl>;
-
-public:
-    virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override
-    {
-        SLANG_RETURN_ON_FAIL(slangContext.initialize(
-            desc.slang,
-            SLANG_PTX,
-            "sm_5_1",
-            makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
-
-        SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
-
-        SLANG_RETURN_ON_FAIL(_initCuda(reportType));
-
-        SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
-        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
-        m_context = new CUDAContext();
-
-        int count = -1;
-        cuDeviceGetCount(&count);
-        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
-
-        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
-            cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
-
-        // Not clear how to detect half support on CUDA. For now we'll assume we have it
-        {
-            m_features.add("half");
-        }
-
-        // Initialize DeviceInfo
-        {
-            m_info.deviceType = DeviceType::CUDA;
-            m_info.bindingStyle = BindingStyle::CUDA;
-            m_info.projectionStyle = ProjectionStyle::DirectX;
-            m_info.apiName = "CUDA";
-            static const float kIdentity[] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
-            ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
-            cudaDeviceProp deviceProperties;
-            cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
-            m_adapterName = deviceProperties.name;
-            m_info.adapterName = m_adapterName.begin();
-            m_info.timestampFrequency = 1000000;
-        }
-
-        return SLANG_OK;
-    }
-
-    Result getCUDAFormat(Format format, CUarray_format* outFormat)
-    {
-        // TODO: Expand to cover all available formats that can be supported in CUDA
-        switch (format)
-        {
-        case Format::R32G32B32A32_FLOAT:
-        case Format::R32G32B32_FLOAT:
-        case Format::R32G32_FLOAT:
-        case Format::R32_FLOAT:
-        case Format::D32_FLOAT:
-            *outFormat = CU_AD_FORMAT_FLOAT;
-            return SLANG_OK;
-        case Format::R16G16B16A16_FLOAT:
-        case Format::R16G16_FLOAT:
-        case Format::R16_FLOAT:
-            *outFormat = CU_AD_FORMAT_HALF;
-            return SLANG_OK;
-        case Format::R32G32B32A32_UINT:
-        case Format::R32G32B32_UINT:
-        case Format::R32G32_UINT:
-        case Format::R32_UINT:
-            *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
-            return SLANG_OK;
-        case Format::R16G16B16A16_UINT:
-        case Format::R16G16_UINT:
-        case Format::R16_UINT:
-            *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
-            return SLANG_OK;
-        case Format::R8G8B8A8_UINT:
-        case Format::R8G8_UINT:
-        case Format::R8_UINT:
-        case Format::R8G8B8A8_UNORM:
-            *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
-            return SLANG_OK;
-        case Format::R32G32B32A32_SINT:
-        case Format::R32G32B32_SINT:
-        case Format::R32G32_SINT:
-        case Format::R32_SINT:
-            *outFormat = CU_AD_FORMAT_SIGNED_INT32;
-            return SLANG_OK;
-        case Format::R16G16B16A16_SINT:
-        case Format::R16G16_SINT:
-        case Format::R16_SINT:
-            *outFormat = CU_AD_FORMAT_SIGNED_INT16;
-            return SLANG_OK;
-        case Format::R8G8B8A8_SINT:
-        case Format::R8G8_SINT:
-        case Format::R8_SINT:
-            *outFormat = CU_AD_FORMAT_SIGNED_INT8;
-            return SLANG_OK;
-        default:
-            SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
-            return SLANG_FAIL;
-        }
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
-        const ITextureResource::Desc& desc,
-        const ITextureResource::SubresourceData* initData,
-        ITextureResource** outResource) override
-    {
-        TextureResource::Desc srcDesc = fixupTextureDesc(desc);
-
-        RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(srcDesc);
-        tex->m_cudaContext = m_context;
-
-        CUresourcetype resourceType;
-
-        // The size of the element/texel in bytes
-        size_t elementSize = 0;
-
-        // Our `ITextureResource::Desc` uses an enumeration to specify
-        // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
-        // `cuMipmappedArrayCreate` seemingly relies on a policy where
-        // the extents of the array in dimenions above the rank are
-        // specified as zero (e.g., a 1D texture requires `height==0`).
-        //
-        // We will start by massaging the extents as specified by the
-        // user into a form that CUDA wants/expects, based on the
-        // texture shape as specified in the `desc`.
-        //
-        int width = desc.size.width;
-        int height = desc.size.height;
-        int depth = desc.size.depth;
-        switch (desc.type)
-        {
-        case IResource::Type::Texture1D:
-            height = 0;
-            depth = 0;
-            break;
-
-        case IResource::Type::Texture2D:
-            depth = 0;
-            break;
-
-        case IResource::Type::Texture3D:
-            break;
-
-        case IResource::Type::TextureCube:
-            depth = 1;
-            break;
-        }
-
-        {
-            CUarray_format format = CU_AD_FORMAT_FLOAT;
-            int numChannels = 0;
-
-            SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
-            FormatInfo info;
-            gfxGetFormatInfo(desc.format, &info);
-            numChannels = info.channelCount;
-
-            switch (format)
-            {
-            case CU_AD_FORMAT_FLOAT:
-            {
-                elementSize = sizeof(float) * numChannels;
-                break;
-            }
-            case CU_AD_FORMAT_HALF:
-            {
-                elementSize = sizeof(uint16_t) * numChannels;
-                break;
-            }
-            case CU_AD_FORMAT_UNSIGNED_INT8:
-            {
-                elementSize = sizeof(uint8_t) * numChannels;
-                break;
-            }
-            default:
-            {
-                SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
-                return SLANG_FAIL;
-            }
-            }
-
-            if (desc.numMipLevels > 1)
-            {
-                resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
-
-                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-                memset(&arrayDesc, 0, sizeof(arrayDesc));
-
-                arrayDesc.Width = width;
-                arrayDesc.Height = height;
-                arrayDesc.Depth = depth;
-                arrayDesc.Format = format;
-                arrayDesc.NumChannels = numChannels;
-                arrayDesc.Flags = 0;
-
-                if (desc.arraySize > 1)
-                {
-                    if (desc.type == IResource::Type::Texture1D ||
-                        desc.type == IResource::Type::Texture2D ||
-                        desc.type == IResource::Type::TextureCube)
-                    {
-                        arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
-                        arrayDesc.Depth = desc.arraySize;
-                    }
-                    else
-                    {
-                        SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
-                        return SLANG_FAIL;
-                    }
-                }
-
-                if (desc.type == IResource::Type::TextureCube)
-                {
-                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
-                    arrayDesc.Depth *= 6;
-                }
-
-                SLANG_CUDA_RETURN_ON_FAIL(
-                    cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
-            }
-            else
-            {
-                resourceType = CU_RESOURCE_TYPE_ARRAY;
-
-                if (desc.arraySize > 1)
-                {
-                    if (desc.type == IResource::Type::Texture1D ||
-                        desc.type == IResource::Type::Texture2D ||
-                        desc.type == IResource::Type::TextureCube)
-                    {
-                        SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
-                        return SLANG_FAIL;
-                    }
-
-                    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-                    memset(&arrayDesc, 0, sizeof(arrayDesc));
-
-                    // Set the depth as the array length
-                    arrayDesc.Depth = desc.arraySize;
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        arrayDesc.Depth *= 6;
-                    }
-
-                    arrayDesc.Height = height;
-                    arrayDesc.Width = width;
-                    arrayDesc.Format = format;
-                    arrayDesc.NumChannels = numChannels;
-
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
-                    }
-
-                    SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
-                }
-                else if (desc.type == IResource::Type::Texture3D ||
-                    desc.type == IResource::Type::TextureCube)
-                {
-                    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-                    memset(&arrayDesc, 0, sizeof(arrayDesc));
-
-                    arrayDesc.Depth = depth;
-                    arrayDesc.Height = height;
-                    arrayDesc.Width = width;
-                    arrayDesc.Format = format;
-                    arrayDesc.NumChannels = numChannels;
-
-                    arrayDesc.Flags = 0;
-
-                    // Handle cube texture
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        arrayDesc.Depth = 6;
-                        arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
-                    }
-
-                    SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
-                }
-                else
-                {
-                    CUDA_ARRAY_DESCRIPTOR arrayDesc;
-                    memset(&arrayDesc, 0, sizeof(arrayDesc));
-
-                    arrayDesc.Height = height;
-                    arrayDesc.Width = width;
-                    arrayDesc.Format = format;
-                    arrayDesc.NumChannels = numChannels;
-
-                    // Allocate the array, will work for 1D or 2D case
-                    SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
-                }
-            }
-        }
-
-        // Work space for holding data for uploading if it needs to be rearranged
-        if (initData)
-        {
-            List<uint8_t> workspace;
-            for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
-            {
-                int mipWidth = width >> mipLevel;
-                int mipHeight = height >> mipLevel;
-                int mipDepth = depth >> mipLevel;
-
-                mipWidth = (mipWidth == 0) ? 1 : mipWidth;
-                mipHeight = (mipHeight == 0) ? 1 : mipHeight;
-                mipDepth = (mipDepth == 0) ? 1 : mipDepth;
-
-                // If it's a cubemap then the depth is always 6
-                if (desc.type == IResource::Type::TextureCube)
-                {
-                    mipDepth = 6;
-                }
-
-                auto dstArray = tex->m_cudaArray;
-                if (tex->m_cudaMipMappedArray)
-                {
-                    // Get the array for the mip level
-                    SLANG_CUDA_RETURN_ON_FAIL(
-                        cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
-                }
-                SLANG_ASSERT(dstArray);
-
-                // Check using the desc to see if it's plausible
-                {
-                    CUDA_ARRAY_DESCRIPTOR arrayDesc;
-                    SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
-
-                    SLANG_ASSERT(mipWidth == arrayDesc.Width);
-                    SLANG_ASSERT(
-                        mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
-                }
-
-                const void* srcDataPtr = nullptr;
-
-                if (desc.arraySize > 1)
-                {
-                    SLANG_ASSERT(
-                        desc.type == IResource::Type::Texture1D ||
-                        desc.type == IResource::Type::Texture2D ||
-                        desc.type == IResource::Type::TextureCube);
-
-                    // TODO(JS): Here I assume that arrays are just held contiguously within a
-                    // 'face' This seems reasonable and works with the Copy3D.
-                    const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
-                    Index faceCount = desc.arraySize;
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        faceCount *= 6;
-                    }
-
-                    const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
-                    workspace.setCount(mipSizeInBytes);
-
-                    // We need to add the face data from each mip
-                    // We iterate over face count so we copy all of the cubemap faces
-                    for (Index j = 0; j < faceCount; j++)
-                    {
-                        const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
-                        // Copy over to the workspace to make contiguous
-                        ::memcpy(
-                            workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
-                    }
-
-                    srcDataPtr = workspace.getBuffer();
-                }
-                else
-                {
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
-                        workspace.setCount(faceSizeInBytes * 6);
-                        // Copy the data over to make contiguous
-                        for (Index j = 0; j < 6; j++)
-                        {
-                            const auto srcData =
-                                initData[mipLevel + j * desc.numMipLevels].data;
-                            ::memcpy(
-                                workspace.getBuffer() + faceSizeInBytes * j,
-                                srcData,
-                                faceSizeInBytes);
-                        }
-                        srcDataPtr = workspace.getBuffer();
-                    }
-                    else
-                    {
-                        const auto srcData = initData[mipLevel].data;
-                        srcDataPtr = srcData;
-                    }
-                }
-
-                if (desc.arraySize > 1)
-                {
-                    SLANG_ASSERT(
-                        desc.type == IResource::Type::Texture1D ||
-                        desc.type == IResource::Type::Texture2D ||
-                        desc.type == IResource::Type::TextureCube);
-
-                    CUDA_MEMCPY3D copyParam;
-                    memset(&copyParam, 0, sizeof(copyParam));
-
-                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                    copyParam.dstArray = dstArray;
-
-                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
-                    copyParam.srcHost = srcDataPtr;
-                    copyParam.srcPitch = mipWidth * elementSize;
-                    copyParam.WidthInBytes = copyParam.srcPitch;
-                    copyParam.Height = mipHeight;
-                    // Set the depth to the array length
-                    copyParam.Depth = desc.arraySize;
-
-                    if (desc.type == IResource::Type::TextureCube)
-                    {
-                        copyParam.Depth *= 6;
-                    }
-
-                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
-                }
-                else
-                {
-                    switch (desc.type)
-                    {
-                    case IResource::Type::Texture1D:
-                    case IResource::Type::Texture2D:
-                        {
-                            CUDA_MEMCPY2D copyParam;
-                            memset(&copyParam, 0, sizeof(copyParam));
-                            copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                            copyParam.dstArray = dstArray;
-                            copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
-                            copyParam.srcHost = srcDataPtr;
-                            copyParam.srcPitch = mipWidth * elementSize;
-                            copyParam.WidthInBytes = copyParam.srcPitch;
-                            copyParam.Height = mipHeight;
-                            SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
-                            break;
-                        }
-                    case IResource::Type::Texture3D:
-                    case IResource::Type::TextureCube:
-                        {
-                            CUDA_MEMCPY3D copyParam;
-                            memset(&copyParam, 0, sizeof(copyParam));
-
-                            copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                            copyParam.dstArray = dstArray;
-
-                            copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
-                            copyParam.srcHost = srcDataPtr;
-                            copyParam.srcPitch = mipWidth * elementSize;
-                            copyParam.WidthInBytes = copyParam.srcPitch;
-                            copyParam.Height = mipHeight;
-                            copyParam.Depth = mipDepth;
-
-                            SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
-                            break;
-                        }
-
-                    default:
-                        {
-                            SLANG_ASSERT(!"Not implemented");
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-        // Set up texture sampling parameters, and create final texture obj
-
-        {
-            CUDA_RESOURCE_DESC resDesc;
-            memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
-            resDesc.resType = resourceType;
-
-            if (tex->m_cudaArray)
-            {
-                resDesc.res.array.hArray = tex->m_cudaArray;
-            }
-            if (tex->m_cudaMipMappedArray)
-            {
-                resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
-            }
-
-            // If the texture might be used as a UAV, then we need to allocate
-            // a CUDA "surface" for it.
-            //
-            // Note: We cannot do this unconditionally, because it will fail
-            // on surfaces that are not usable as UAVs (e.g., those with
-            // mipmaps).
-            //
-            // TODO: We should really only be allocating the array at the
-            // time we create a resource, and then allocate the surface or
-            // texture objects as part of view creation.
-            //
-            if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
-            {
-                // On CUDA surfaces only support a single MIP map
-                SLANG_ASSERT(desc.numMipLevels == 1);
-
-                SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
-            }
-
-            
-            // Create handle for sampling.
-            CUDA_TEXTURE_DESC texDesc;
-            memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
-            texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
-            texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
-            texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
-            texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
-            texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-            SLANG_CUDA_RETURN_ON_FAIL(
-                cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
-        }
-
-        returnComPtr(outResource, tex);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
-        const IBufferResource::Desc& descIn,
-        const void* initData,
-        IBufferResource** outResource) override
-    {
-        auto desc = fixupBufferDesc(descIn);
-        RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
-        resource->m_cudaContext = m_context;
-        SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
-        if (initData)
-        {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
-        }
-        returnComPtr(outResource, resource);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle(
-        InteropHandle handle,
-        const IBufferResource::Desc& desc,
-        IBufferResource** outResource) override
-    {
-        if (handle.handleValue == 0)
-        {
-            *outResource = nullptr;
-            return SLANG_OK;
-        }
-
-        RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
-        resource->m_cudaContext = m_context;
-
-        // CUDA manages sharing of buffers through the idea of an
-        // "external memory" object, which represents the relationship
-        // with another API's objects. In order to create this external
-        // memory association, we first need to fill in a descriptor struct.
-        cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
-        memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
-        switch (handle.api)
-        {
-        case InteropHandleAPI::D3D12:
-            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
-            break;
-        case InteropHandleAPI::Vulkan:
-            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
-            break;
-        default:
-            return SLANG_FAIL;
-        }
-        externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
-        externalMemoryHandleDesc.size = desc.sizeInBytes;
-        externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
-        // Once we have filled in the descriptor, we can request
-        // that CUDA create the required association between the
-        // external buffer and its own memory.
-        cudaExternalMemory_t externalMemory;
-        SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
-        resource->m_cudaExternalMemory = externalMemory;
-
-        // The CUDA "external memory" handle is not itself a device
-        // pointer, so we need to query for a suitable device address
-        // for the buffer with another call.
-        //
-        // Just as for the external memory, we fill in a descriptor
-        // structure (although in this case we only need to specify
-        // the size).
-        cudaExternalMemoryBufferDesc bufferDesc;
-        memset(&bufferDesc, 0, sizeof(bufferDesc));
-        bufferDesc.size = desc.sizeInBytes;
-
-        // Finally, we can "map" the buffer to get a device address.
-        void* deviceAddress;
-        SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
-        resource->m_cudaMemory = deviceAddress;
-
-        returnComPtr(outResource, resource);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle(
-        InteropHandle handle,
-        const ITextureResource::Desc& desc,
-        const size_t size,
-        ITextureResource** outResource) override
-    {
-        if (handle.handleValue == 0)
-        {
-            *outResource = nullptr;
-            return SLANG_OK;
-        }
-
-        RefPtr<TextureCUDAResource> resource = new TextureCUDAResource(desc);
-        resource->m_cudaContext = m_context;
-
-        // CUDA manages sharing of buffers through the idea of an
-        // "external memory" object, which represents the relationship
-        // with another API's objects. In order to create this external
-        // memory association, we first need to fill in a descriptor struct.
-        CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
-        memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
-        switch (handle.api)
-        {
-        case InteropHandleAPI::D3D12:
-            externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
-            break;
-        case InteropHandleAPI::Vulkan:
-            externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
-            break;
-        default:
-            return SLANG_FAIL;
-        }
-        externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
-        externalMemoryHandleDesc.size = size;
-        externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
-        CUexternalMemory externalMemory;
-        SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
-        resource->m_cudaExternalMemory = externalMemory;
-
-        FormatInfo formatInfo;
-        SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
-        CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-        arrayDesc.Depth = desc.size.depth;
-        arrayDesc.Height = desc.size.height;
-        arrayDesc.Width = desc.size.width;
-        arrayDesc.NumChannels = formatInfo.channelCount;
-        getCUDAFormat(desc.format, &arrayDesc.Format);
-        arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
-
-        CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
-        memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
-        externalMemoryMipDesc.offset = 0;
-        externalMemoryMipDesc.arrayDesc = arrayDesc;
-        externalMemoryMipDesc.numLevels = desc.numMipLevels;
-
-        CUmipmappedArray mipArray;
-        SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
-        resource->m_cudaMipMappedArray = mipArray;
-
-        CUarray cuArray;
-        SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
-        resource->m_cudaArray = cuArray;
-
-        CUDA_RESOURCE_DESC surfDesc;
-        memset(&surfDesc, 0, sizeof(surfDesc));
-        surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-        surfDesc.res.array.hArray = cuArray;
-
-        CUsurfObject surface;
-        SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
-        resource->m_cudaSurfObj = surface;
-
-        returnComPtr(outResource, resource);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
-        ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override
-    {
-        RefPtr<CUDAResourceView> view = new CUDAResourceView();
-        view->m_desc = desc;
-        view->textureResource = dynamic_cast<TextureCUDAResource*>(texture);
-        returnComPtr(outView, view);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
-        IBufferResource* buffer,
-        IBufferResource* counterBuffer,
-        IResourceView::Desc const& desc,
-        IResourceView** outView) override
-    {
-        RefPtr<CUDAResourceView> view = new CUDAResourceView();
-        view->m_desc = desc;
-        view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer);
-        returnComPtr(outView, view);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool(
-        const IQueryPool::Desc& desc,
-        IQueryPool** outPool) override
-    {
-        RefPtr<CUDAQueryPool> pool = new CUDAQueryPool();
-        SLANG_RETURN_ON_FAIL(pool->init(desc));
-        returnComPtr(outPool, pool);
-        return SLANG_OK;
-    }
-
-    virtual Result createShaderObjectLayout(
-        slang::TypeLayoutReflection*    typeLayout,
-        ShaderObjectLayoutBase**        outLayout) override
-    {
-        RefPtr<CUDAShaderObjectLayout> cudaLayout;
-        cudaLayout = new CUDAShaderObjectLayout(this, typeLayout);
-        returnRefPtrMove(outLayout, cudaLayout);
-        return SLANG_OK;
-    }
-
-    virtual Result createShaderObject(
-        ShaderObjectLayoutBase* layout,
-        IShaderObject**         outObject) override
-    {
-        RefPtr<CUDAShaderObject> result = new CUDAShaderObject();
-        SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
-        returnComPtr(outObject, result);
-        return SLANG_OK;
-    }
-
-    virtual Result createMutableShaderObject(
-        ShaderObjectLayoutBase* layout,
-        IShaderObject** outObject) override
-    {
-        RefPtr<CUDAMutableShaderObject> result = new CUDAMutableShaderObject();
-        SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
-        returnComPtr(outObject, result);
-        return SLANG_OK;
-    }
-
-    Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
-    {
-        auto cudaProgram = dynamic_cast<CUDAShaderProgram*>(program);
-        auto cudaLayout = cudaProgram->layout;
-
-        RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject();
-        SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
-        returnRefPtrMove(outObject, result);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createProgram(
-        const IShaderProgram::Desc& desc,
-        IShaderProgram** outProgram,
-        ISlangBlob** outDiagnosticBlob) override
-    {
-        // If this is a specializable program, we just keep a reference to the slang program and
-        // don't actually create any kernels. This program will be specialized later when we know
-        // the shader object bindings.
-        RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram();
-        cudaProgram->init(desc);
-        cudaProgram->cudaContext = m_context;
-        if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
-        {
-            cudaProgram->layout = new CUDAProgramLayout(this, desc.slangGlobalScope->getLayout());
-            returnComPtr(outProgram, cudaProgram);
-            return SLANG_OK;
-        }
-
-        ComPtr<ISlangBlob> kernelCode;
-        ComPtr<ISlangBlob> diagnostics;
-        auto compileResult = desc.slangGlobalScope->getEntryPointCode(
-            (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
-        if (diagnostics)
-        {
-            getDebugCallback()->handleMessage(
-                compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
-                DebugMessageSource::Slang,
-                (char*)diagnostics->getBufferPointer());
-            if (outDiagnosticBlob)
-                returnComPtr(outDiagnosticBlob, diagnostics);
-        }
-        SLANG_RETURN_ON_FAIL(compileResult);
-        
-        SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
-        cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
-        SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
-            &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
-
-        auto slangGlobalScope = desc.slangGlobalScope;
-        if( slangGlobalScope )
-        {
-            cudaProgram->slangGlobalScope = slangGlobalScope;
-
-            auto slangProgramLayout = slangGlobalScope->getLayout();
-            if(!slangProgramLayout)
-                return SLANG_FAIL;
-
-            RefPtr<CUDAProgramLayout> cudaLayout;
-            cudaLayout = new CUDAProgramLayout(this, slangProgramLayout);
-            cudaLayout->programLayout = slangProgramLayout;
-            cudaProgram->layout = cudaLayout;
-        }
-
-        returnComPtr(outProgram, cudaProgram);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
-        const ComputePipelineStateDesc& desc, IPipelineState** outState) override
-    {
-        RefPtr<CUDAPipelineState> state = new CUDAPipelineState();
-        state->shaderProgram = static_cast<CUDAShaderProgram*>(desc.program);
-        state->init(desc);
-        returnComPtr(outState, state);
-        return Result();
-    }
-
-    void* map(IBufferResource* buffer)
-    {
-        return static_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory;
-    }
-
-    void unmap(IBufferResource* buffer)
-    {
-        SLANG_UNUSED(buffer);
-    }
-
-    virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override
-    {
-        return m_info;
-    }
-
-public:
-    virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap(
-        const ITransientResourceHeap::Desc& desc,
-        ITransientResourceHeap** outHeap) override
-    {
-        RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
-        SLANG_RETURN_ON_FAIL(result->init(this, desc));
-        returnComPtr(outHeap, result);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override
-    {
-        RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
-        queue->init(this);
-        returnComPtr(outQueue, queue);
-        return SLANG_OK;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
-        const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(window);
-        SLANG_UNUSED(outSwapchain);
-        return SLANG_FAIL;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
-        const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(outLayout);
-        return SLANG_FAIL;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(outFramebuffer);
-        return SLANG_FAIL;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
-        const IRenderPassLayout::Desc& desc,
-        IRenderPassLayout** outRenderPassLayout) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(outRenderPassLayout);
-        return SLANG_FAIL;
-    }
-    virtual SLANG_NO_THROW Result SLANG_MCALL
-        createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override
-    {
-        SLANG_UNUSED(desc);
-        *outSampler = nullptr;
-        return SLANG_OK;
-    }
-    
-    virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
-        IInputLayout::Desc const& desc,
-        IInputLayout** outLayout) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(outLayout);
-        return SLANG_E_NOT_AVAILABLE;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
-        const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override
-    {
-        SLANG_UNUSED(desc);
-        SLANG_UNUSED(outState);
-        return SLANG_E_NOT_AVAILABLE;
-    }
-
-    virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
-        ITextureResource* texture,
-        ResourceState state,
-        ISlangBlob** outBlob,
-        size_t* outRowPitch,
-        size_t* outPixelSize) override
-    {
-        auto textureImpl = static_cast<TextureCUDAResource*>(texture);
-        RefPtr<ListBlob> blob = new ListBlob();
-
-        auto desc = textureImpl->getDesc();
-        auto width = desc->size.width;
-        auto height = desc->size.height;
-        FormatInfo sizeInfo;
-        SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
-        size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
-        size_t rowPitch = width * pixelSize;
-        size_t size = height * rowPitch;
-        blob->m_data.setCount((Index)size);
-
-        CUDA_MEMCPY2D copyParam;
-        memset(&copyParam, 0, sizeof(copyParam));
-
-        copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
-        copyParam.srcArray = textureImpl->m_cudaArray;
-
-        copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
-        copyParam.dstHost = blob->m_data.getBuffer();
-        copyParam.dstPitch = rowPitch;
-        copyParam.WidthInBytes = copyParam.dstPitch;
-        copyParam.Height = height;
-        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
-
-        *outRowPitch = rowPitch;
-        *outPixelSize = pixelSize;
-        returnComPtr(outBlob, blob);
-        return SLANG_OK;
-    }
-
-    virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
-        IBufferResource* buffer,
-        size_t offset,
-        size_t size,
-        ISlangBlob** outBlob) override
-    {
-        auto bufferImpl = static_cast<MemoryCUDAResource*>(buffer);
-        RefPtr<ListBlob> blob = new ListBlob();
-        blob->m_data.setCount((Index)size);
-        cudaMemcpy(
-            blob->m_data.getBuffer(),
-            (uint8_t*)bufferImpl->m_cudaMemory + offset,
-            size,
-            cudaMemcpyDefault);
-        returnComPtr(outBlob, blob);
-        return SLANG_OK;
-    }
-};
-
-SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
-    m_layout = typeLayout;
-
-    // If the layout tells us that there is any uniform data,
-    // then we need to allocate a constant buffer to hold that data.
-    //
-    // TODO: Do we need to allocate a shadow copy for use from
-    // the CPU?
-    //
-    // TODO: When/where do we bind this constant buffer into
-    // a descriptor set for later use?
-    //
-    auto slangLayout = getLayout()->getElementTypeLayout();
-    size_t uniformSize = slangLayout->getSize();
-    if (uniformSize)
-    {
-        m_data.setCount((Index)uniformSize);
-    }
-
-    // If the layout specifies that we have any resources or sub-objects,
-    // then we need to size the appropriate arrays to account for them.
-    //
-    // Note: the counts here are the *total* number of resources/sub-objects
-    // and not just the number of resource/sub-object ranges.
-    //
-    resources.setCount(typeLayout->getResourceCount());
-    m_objects.setCount(typeLayout->getSubObjectCount());
-
-    for (auto subObjectRange : getLayout()->subObjectRanges)
-    {
-        RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout;
-
-        // In the case where the sub-object range represents an
-        // existential-type leaf field (e.g., an `IBar`), we
-        // cannot pre-allocate the object(s) to go into that
-        // range, since we can't possibly know what to allocate
-        // at this point.
-        //
-        if (!subObjectLayout)
-            continue;
-        //
-        // Otherwise, we will allocate a sub-object to fill
-        // in each entry in this range, based on the layout
-        // information we already have.
-
-        auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
-        for (Index i = 0; i < bindingRangeInfo.count; ++i)
-        {
-            RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject();
-            SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
-
-            ShaderOffset offset;
-            offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
-            offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex;
-            offset.bindingArrayIndex = (GfxIndex)i;
-
-            SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
-        }
-    }
-    return SLANG_OK;
-}
-
-SlangResult CUDARootShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
-    SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(device, typeLayout));
-    auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout);
-    for (auto& entryPoint : programLayout->entryPointLayouts)
-    {
-        RefPtr<CUDAEntryPointShaderObject> object = new CUDAEntryPointShaderObject();
-        SLANG_RETURN_ON_FAIL(object->init(device, entryPoint));
-        entryPointObjects.add(object);
-    }
-    return SLANG_OK;
-}
-
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
-    RefPtr<CUDADevice> result = new CUDADevice();
-    SLANG_RETURN_ON_FAIL(result->initialize(*desc));
-    returnComPtr(outDevice, result);
-    return SLANG_OK;
-}
-#else
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
-    SLANG_UNUSED(desc);
-    *outDevice = nullptr;
-    return SLANG_FAIL;
-}
-#endif
-
-}
diff --git a/tools/gfx/cuda/render-cuda.h b/tools/gfx/cuda/render-cuda.h
deleted file mode 100644
index 5c477f513..000000000
--- a/tools/gfx/cuda/render-cuda.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-#include "../renderer-shared.h"
-
-namespace gfx
-{
-
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
-}
diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp
index ef4edb341..880e61c61 100644
--- a/tools/gfx/render.cpp
+++ b/tools/gfx/render.cpp
@@ -2,7 +2,6 @@
 #include "renderer-shared.h"
 #include "../../source/core/slang-math.h"
 #include "open-gl/render-gl.h"
-#include "cuda/render-cuda.h"
 #include "cpu/render-cpu.h"
 #include "debug-layer.h"
 
@@ -14,6 +13,7 @@ using namespace Slang;
 Result SLANG_MCALL createD3D11Device(const IDevice::Desc* desc, IDevice** outDevice);
 Result SLANG_MCALL createD3D12Device(const IDevice::Desc* desc, IDevice** outDevice);
 Result SLANG_MCALL createVKDevice(const IDevice::Desc* desc, IDevice** outDevice);
+Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice);
 
 static bool debugLayerEnabled = false;