From 856d7d3705cedabcc2e9389a3f0ac730b0e33476 Mon Sep 17 00:00:00 2001
From: Yong He <yonghe@outlook.com>
Date: Fri, 11 Dec 2020 09:42:23 -0800
Subject: Implements CUDA renderer in gfx. (#1637)

* Implements CUDA renderer in gfx.

* Revert unnecessary change.

* Revert unnecessary changes.

Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
---
 build/visual-studio/gfx/gfx.vcxproj            |    2 +
 build/visual-studio/gfx/gfx.vcxproj.filters    |    6 +
 premake5.lua                                   |   20 +-
 tests/compute/array-param.slang                |    1 +
 tests/compute/entry-point-uniform-params.slang |    3 +-
 tools/gfx/cuda/render-cuda.cpp                 | 1506 ++++++++++++++++++++++++
 tools/gfx/cuda/render-cuda.h                   |    8 +
 tools/gfx/render.cpp                           |    5 +
 tools/render-test/render-test-main.cpp         |    2 +-
 9 files changed, 1547 insertions(+), 6 deletions(-)
 create mode 100644 tools/gfx/cuda/render-cuda.cpp
 create mode 100644 tools/gfx/cuda/render-cuda.h
diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj
index d04ac9c2a..841a11c03 100644
--- a/build/visual-studio/gfx/gfx.vcxproj
+++ b/build/visual-studio/gfx/gfx.vcxproj
@@ -171,6 +171,7 @@
     </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d11\render-d3d11.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d12\circular-resource-heap-d3d12.h" />
@@ -197,6 +198,7 @@
     <ClInclude Include="..\..\..\tools\gfx\window.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d11\render-d3d11.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d12\circular-resource-heap-d3d12.cpp" />
diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters
index ada84b968..d55d0d4bb 100644
--- a/build/visual-studio/gfx/gfx.vcxproj.filters
+++ b/build/visual-studio/gfx/gfx.vcxproj.filters
@@ -9,6 +9,9 @@
     </Filter>
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -83,6 +86,9 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/premake5.lua b/premake5.lua
index 801847cde..588caaf79 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -723,8 +723,6 @@ toolSharedLibrary "render-test"
             includedirs { optixPath .. "include/" }
         end
         
-        links { "cuda", "cudart" }   
-        
         filter { "platforms:x86" }
             libdirs { cudaPath .. "/lib/Win32/" }
            
@@ -767,9 +765,23 @@ tool "gfx"
         addSourceDir "tools/gfx/d3d" 
         addSourceDir "tools/gfx/d3d11"
         addSourceDir "tools/gfx/d3d12"
-        
+        addSourceDir "tools/gfx/cuda"
         addSourceDir "tools/gfx/windows"
-        
+
+        if type(cudaPath) == "string" then
+            defines { "GFX_ENABLE_CUDA" }
+            includedirs { cudaPath .. "/include" }
+            includedirs { cudaPath .. "/include", cudaPath .. "/common/inc" }
+            if optixPath then
+                defines { "GFX_OPTIX" }
+                includedirs { optixPath .. "include/" }
+            end
+            links { "cuda", "cudart" }   
+            filter { "platforms:x86" }
+                libdirs { cudaPath .. "/lib/Win32/" }
+            filter { "platforms:x64" }
+                libdirs { cudaPath .. "/lib/x64/" }
+        end
     elseif targetDetail == "mingw" or targetDetail == "cygwin" then
         -- Don't support any render techs...
     elseif os.target() == "macosx" then
diff --git a/tests/compute/array-param.slang b/tests/compute/array-param.slang
index ceac34e78..5f5528a10 100644
--- a/tests/compute/array-param.slang
+++ b/tests/compute/array-param.slang
@@ -1,3 +1,4 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -compile-arg -O3
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
diff --git a/tests/compute/entry-point-uniform-params.slang b/tests/compute/entry-point-uniform-params.slang
index 736a4c05a..db43f4ab7 100644
--- a/tests/compute/entry-point-uniform-params.slang
+++ b/tests/compute/entry-point-uniform-params.slang
@@ -3,7 +3,8 @@
 // Confirm that `uniform` parameters on
 // entry points are allowed, and work as expected.
 
-//DISABLE_TEST:CPU_REFLECTION: -profile cs_5_0 -entry computeMain -target cpp 
+//DISABLE_TEST:CPU_REFLECTION: -profile cs_5_0 -entry computeMain -target cpp
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cpu
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
new file mode 100644
index 000000000..fcb27fe0b
--- /dev/null
+++ b/tools/gfx/cuda/render-cuda.cpp
@@ -0,0 +1,1506 @@
+#include "render-cuda.h"
+
+#ifdef GFX_ENABLE_CUDA
+#include "../render.h"
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "../../source/core/slang-std-writers.h"
+#include "slang.h"
+#endif
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
+SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
+
+// A enum used to control if errors are reported on failure of CUDA call.
+enum class CUDAReportStyle
+{
+    Normal,
+    Silent,
+};
+
+struct CUDAErrorInfo
+{
+    CUDAErrorInfo(
+        const char* filePath,
+        int lineNo,
+        const char* errorName = nullptr,
+        const char* errorString = nullptr)
+        : m_filePath(filePath)
+        , m_lineNo(lineNo)
+        , m_errorName(errorName)
+        , m_errorString(errorString)
+    {}
+    SlangResult handle() const
+    {
+        StringBuilder builder;
+        builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
+
+        if (m_errorName)
+        {
+            builder << m_errorName << " : ";
+        }
+        if (m_errorString)
+        {
+            builder << m_errorString;
+        }
+
+        StdWriters::getError().put(builder.getUnownedSlice());
+
+        // Slang::signalUnexpectedError(builder.getBuffer());
+        return SLANG_FAIL;
+    }
+
+    const char* m_filePath;
+    int m_lineNo;
+    const char* m_errorName;
+    const char* m_errorString;
+};
+
+#    if 1
+// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
+
+static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
+{
+    CUDAErrorInfo info(file, line);
+    cuGetErrorString(cuResult, &info.m_errorString);
+    cuGetErrorName(cuResult, &info.m_errorName);
+    return info.handle();
+}
+
+static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
+{
+    return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
+}
+
+#        define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res, __FILE__, __LINE__)
+
+#    else
+// If this code path is enabled, errors are not reported, but can have an assert enabled
+
+static SlangResult _handleCUDAError(CUresult cuResult)
+{
+    SLANG_UNUSED(cuResult);
+    // SLANG_ASSERT(!"Failed CUDA call");
+    return SLANG_FAIL;
+}
+
+static SlangResult _handleCUDAError(cudaError_t error)
+{
+    SLANG_UNUSED(error);
+    // SLANG_ASSERT(!"Failed CUDA call");
+    return SLANG_FAIL;
+}
+
+#        define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res)
+#    endif
+
+#    define SLANG_CUDA_RETURN_ON_FAIL(x)              \
+        {                                             \
+            auto _res = x;                            \
+            if (_isError(_res))                       \
+                return SLANG_CUDA_HANDLE_ERROR(_res); \
+        }
+#    define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r)                               \
+        {                                                                             \
+            auto _res = x;                                                            \
+            if (_isError(_res))                                                       \
+            {                                                                         \
+                return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
+                                                      : SLANG_FAIL;                   \
+            }                                                                         \
+        }
+
+#    define SLANG_CUDA_ASSERT_ON_FAIL(x)           \
+        {                                          \
+            auto _res = x;                         \
+            if (_isError(_res))                    \
+            {                                      \
+                SLANG_ASSERT(!"Failed CUDA call"); \
+            };                                     \
+        }
+
+#    ifdef RENDER_TEST_OPTIX
+
+static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }
+
+#        if 1
+static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
+{
+    fprintf(
+        stderr,
+        "%s(%d): optix: %s (%s)\n",
+        file,
+        line,
+        optixGetErrorString(result),
+        optixGetErrorName(result));
+    return SLANG_FAIL;
+}
+#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
+#        else
+#            define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
+#        endif
+
+#        define SLANG_OPTIX_RETURN_ON_FAIL(EXPR)           \
+            do                                             \
+            {                                              \
+                auto _res = EXPR;                          \
+                if (_isError(_res))                        \
+                    return SLANG_OPTIX_HANDLE_ERROR(_res); \
+            } while (0)
+
+void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
+{
+    fprintf(stderr, "optix: %s (%s)\n", message, tag);
+}
+
+#    endif
+
+class MemoryCUDAResource : public BufferResource
+{
+public:
+    MemoryCUDAResource(const Desc& _desc)
+        : BufferResource(_desc)
+    {}
+
+    ~MemoryCUDAResource()
+    {
+        if (m_cudaMemory)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+        }
+    }
+
+    uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; }
+
+    void* m_cudaMemory = nullptr;
+};
+
+class TextureCUDAResource : public TextureResource
+{
+public:
+    TextureCUDAResource(const TextureResource::Desc& desc)
+        : TextureResource(desc)
+    {}
+    ~TextureCUDAResource()
+    {
+        if (m_cudaSurfObj)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
+        }
+        if (m_cudaTexObj)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
+        }
+        if (m_cudaArray)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
+        }
+        if (m_cudaMipMappedArray)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
+        }
+    }
+
+    uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; }
+
+    // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
+    // a long long
+    CUtexObject m_cudaTexObj = CUtexObject();
+
+    // The surfObj is for reading/writing 'texture like' things, but not for sampling.
+    CUsurfObject m_cudaSurfObj = CUsurfObject();
+
+    CUarray m_cudaArray = CUarray();
+    CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
+};
+
+class CUDAResourceView : public ResourceView
+{
+public:
+    Desc desc;
+    RefPtr<MemoryCUDAResource> memoryResource = nullptr;
+    RefPtr<TextureCUDAResource> textureResource = nullptr;
+};
+
+class CUDAShaderProgram : public ShaderProgram
+{
+public:
+    CUmodule cudaModule = nullptr;
+    CUfunction cudaKernel;
+    String kernelName;
+    ~CUDAShaderProgram()
+    {
+        if (cudaModule)
+            cuModuleUnload(cudaModule);
+    }
+};
+
+class CUDAPipelineState : public PipelineState
+{
+public:
+    RefPtr<CUDAShaderProgram> shaderProgram;
+};
+
+class CUDAShaderObjectLayout : public ShaderObjectLayout
+{
+public:
+    slang::TypeLayoutReflection* typeLayout = nullptr;
+
+    struct BindingRangeInfo
+    {
+        slang::BindingType bindingType;
+        Index count;
+        Index baseIndex;
+        Index descriptorSetIndex;
+        Index rangeIndexInDescriptorSet;
+        Index uniformOffset;
+        //        Index   subObjectRangeIndex = -1;
+    };
+
+    struct SubObjectRangeInfo
+    {
+        RefPtr<CUDAShaderObjectLayout> layout;
+        Index bindingRangeIndex;
+    };
+
+    List<SubObjectRangeInfo> subObjectRanges;
+    List<BindingRangeInfo> m_bindingRanges;
+
+    Index m_resourceViewCount = 0;
+    Index m_samplerCount = 0;
+    Index m_combinedTextureSamplerCount = 0;
+    Index m_subObjectCount = 0;
+    Index m_varyingInputCount = 0;
+    Index m_varyingOutputCount = 0;
+
+    slang::TypeLayoutReflection* unwrapParameterGroups(slang::TypeLayoutReflection* typeLayout)
+    {
+        for (;;)
+        {
+            if (!typeLayout->getType())
+            {
+                if (auto elementTypeLayout = typeLayout->getElementTypeLayout())
+                    typeLayout = elementTypeLayout;
+            }
+
+            switch (typeLayout->getKind())
+            {
+            default:
+                return typeLayout;
+
+            case slang::TypeReflection::Kind::ConstantBuffer:
+            case slang::TypeReflection::Kind::ParameterBlock:
+                typeLayout = typeLayout->getElementTypeLayout();
+                continue;
+            }
+        }
+    }
+
+    CUDAShaderObjectLayout(slang::TypeLayoutReflection* layout)
+    {
+        typeLayout = unwrapParameterGroups(layout);
+
+        // Compute the binding ranges that are used to store
+        // the logical contents of the object in memory. These will relate
+        // to the descriptor ranges in the various sets, but not always
+        // in a one-to-one fashion.
+
+        SlangInt bindingRangeCount = typeLayout->getBindingRangeCount();
+        for (SlangInt r = 0; r < bindingRangeCount; ++r)
+        {
+            slang::BindingType slangBindingType = typeLayout->getBindingRangeType(r);
+            SlangInt count = typeLayout->getBindingRangeBindingCount(r);
+            slang::TypeLayoutReflection* slangLeafTypeLayout =
+                typeLayout->getBindingRangeLeafTypeLayout(r);
+
+            SlangInt descriptorSetIndex = typeLayout->getBindingRangeDescriptorSetIndex(r);
+            SlangInt rangeIndexInDescriptorSet =
+                typeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
+
+            auto uniformOffset = typeLayout->getDescriptorSetDescriptorRangeIndexOffset(
+                descriptorSetIndex, rangeIndexInDescriptorSet);
+
+            Index baseIndex = 0;
+            switch (slangBindingType)
+            {
+            case slang::BindingType::ConstantBuffer:
+            case slang::BindingType::ParameterBlock:
+            case slang::BindingType::ExistentialValue:
+                baseIndex = m_subObjectCount;
+                m_subObjectCount += count;
+                break;
+
+            case slang::BindingType::Sampler:
+                baseIndex = m_samplerCount;
+                m_samplerCount += count;
+                break;
+
+            case slang::BindingType::CombinedTextureSampler:
+                baseIndex = m_combinedTextureSamplerCount;
+                m_combinedTextureSamplerCount += count;
+                break;
+
+            case slang::BindingType::VaryingInput:
+                baseIndex = m_varyingInputCount;
+                m_varyingInputCount += count;
+                break;
+
+            case slang::BindingType::VaryingOutput:
+                baseIndex = m_varyingOutputCount;
+                m_varyingOutputCount += count;
+                break;
+
+            default:
+                baseIndex = m_resourceViewCount;
+                m_resourceViewCount += count;
+                break;
+            }
+
+            BindingRangeInfo bindingRangeInfo;
+            bindingRangeInfo.bindingType = slangBindingType;
+            bindingRangeInfo.count = count;
+            //                bindingRangeInfo.descriptorSetIndex = descriptorSetIndex;
+            //                bindingRangeInfo.rangeIndexInDescriptorSet = slotRangeIndex;
+            //                bindingRangeInfo.subObjectRangeIndex = subObjectRangeIndex;
+            bindingRangeInfo.baseIndex = baseIndex;
+            bindingRangeInfo.descriptorSetIndex = descriptorSetIndex;
+            bindingRangeInfo.rangeIndexInDescriptorSet = rangeIndexInDescriptorSet;
+            bindingRangeInfo.uniformOffset = uniformOffset;
+            m_bindingRanges.add(bindingRangeInfo);
+        }
+
+        SlangInt subObjectRangeCount = typeLayout->getSubObjectRangeCount();
+        for (SlangInt r = 0; r < subObjectRangeCount; ++r)
+        {
+            SlangInt bindingRangeIndex = typeLayout->getSubObjectRangeBindingRangeIndex(r);
+            auto slangBindingType = typeLayout->getBindingRangeType(bindingRangeIndex);
+            slang::TypeLayoutReflection* slangLeafTypeLayout =
+                typeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
+
+            // A sub-object range can either represent a sub-object of a known
+            // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
+            // (in which case we can pre-compute a layout to use, based on
+            // the type `Foo`) *or* it can represent a sub-object of some
+            // existential type (e.g., `IBar`) in which case we cannot
+            // know the appropraite type/layout of sub-object to allocate.
+            //
+            RefPtr<CUDAShaderObjectLayout> subObjectLayout;
+            if (slangBindingType != slang::BindingType::ExistentialValue)
+            {
+                subObjectLayout =
+                    new CUDAShaderObjectLayout(slangLeafTypeLayout->getElementTypeLayout());
+            }
+
+            SubObjectRangeInfo subObjectRange;
+            subObjectRange.bindingRangeIndex = bindingRangeIndex;
+            subObjectRange.layout = subObjectLayout;
+            subObjectRanges.add(subObjectRange);
+        }
+    }
+};
+
+class CUDAProgramLayout : public CUDAShaderObjectLayout
+{
+public:
+    slang::ProgramLayout* programLayout = nullptr;
+    List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts;
+    CUDAProgramLayout(slang::ProgramLayout* inProgramLayout)
+        : CUDAShaderObjectLayout(inProgramLayout->getGlobalParamsTypeLayout())
+        , programLayout(inProgramLayout)
+    {
+        for (UInt i =0; i< programLayout->getEntryPointCount(); i++)
+        {
+            entryPointLayouts.add(new CUDAShaderObjectLayout(
+                programLayout->getEntryPointByIndex(i)->getTypeLayout()));
+        }
+
+    }
+
+    int getKernelIndex(UnownedStringSlice kernelName)
+    {
+        for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
+        {
+            auto entryPoint = programLayout->getEntryPointByIndex(i);
+            if (kernelName == entryPoint->getName())
+            {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
+    {
+        auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
+        entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
+    }
+};
+
+class CUDAShaderObject : public ShaderObject
+{
+public:
+    RefPtr<MemoryCUDAResource> bufferResource;
+    RefPtr<CUDAShaderObjectLayout> layout;
+    List<RefPtr<CUDAShaderObject>> objects;
+    List<RefPtr<CUDAResourceView>> resources;
+
+    virtual SlangResult init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout);
+
+    virtual slang::TypeLayoutReflection* getElementTypeLayout() override
+    {
+        return layout->typeLayout;
+    }
+    virtual Slang::Index getEntryPointCount() override { return 0; }
+    virtual ShaderObject* getEntryPoint(Slang::Index index) override { return nullptr; }
+    virtual SlangResult setData(ShaderOffset const& offset, void const* data, size_t size)
+    {
+        size = Math::Min(size, bufferResource->getDesc().sizeInBytes - offset.uniformOffset);
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+            (uint8_t*)bufferResource->m_cudaMemory + offset.uniformOffset,
+            data,
+            size,
+            cudaMemcpyHostToDevice));
+        return SLANG_OK;
+    }
+    virtual SlangResult getObject(ShaderOffset const& offset, ShaderObject** object)
+    {
+        auto subObjectIndex =
+            layout->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex;
+        if (subObjectIndex >= objects.getCount())
+        {
+            *object = nullptr;
+            return SLANG_OK;
+        }
+        *object = objects[subObjectIndex].Ptr();
+        return SLANG_OK;
+    }
+    virtual SlangResult setObject(ShaderOffset const& offset, ShaderObject* object)
+    {
+        auto subObjectIndex =
+            layout->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex;
+        SLANG_ASSERT(
+            offset.uniformOffset ==
+            layout->m_bindingRanges[offset.bindingRangeIndex].uniformOffset +
+                offset.bindingArrayIndex * sizeof(void*));
+        auto cudaObject = dynamic_cast<CUDAShaderObject*>(object);
+        if (subObjectIndex >= objects.getCount())
+            objects.setCount(subObjectIndex + 1);
+        objects[subObjectIndex] = cudaObject;
+        return setData(offset, &cudaObject->bufferResource->m_cudaMemory, sizeof(void*));
+    }
+    virtual SlangResult setResource(ShaderOffset const& offset, ResourceView* resourceView)
+    {
+        auto cudaView = dynamic_cast<CUDAResourceView*>(resourceView);
+        if (offset.bindingRangeIndex >= resources.getCount())
+            resources.setCount(offset.bindingRangeIndex + 1);
+        resources[offset.bindingRangeIndex] = cudaView;
+        if (cudaView->textureResource)
+        {
+            if (cudaView->desc.type == ResourceView::Type::UnorderedAccess)
+            {
+                auto handle = cudaView->textureResource->getBindlessHandle();
+                setData(offset, &handle, sizeof(uint64_t));
+            }
+            else
+            {
+                auto handle = cudaView->textureResource->m_cudaSurfObj;
+                setData(offset, &handle, sizeof(uint64_t));
+            }
+        }
+        else
+        {
+            auto handle = cudaView->memoryResource->getBindlessHandle();
+            setData(offset, &handle, sizeof(handle));
+            auto sizeOffset = offset;
+            sizeOffset.uniformOffset += sizeof(handle);
+            auto& desc = cudaView->memoryResource->getDesc();
+            size_t size = desc.sizeInBytes;
+            if (desc.elementSize > 1)
+                size /= desc.elementSize;
+            setData(sizeOffset, &size, sizeof(size));
+
+        }
+        return SLANG_OK;
+    }
+    virtual SlangResult setSampler(ShaderOffset const& offset, SamplerState* sampler)
+    {
+        SLANG_UNUSED(sampler);
+        SLANG_UNUSED(offset);
+        return SLANG_OK;
+    }
+    virtual SlangResult setCombinedTextureSampler(
+        ShaderOffset const& offset, ResourceView* textureView, SamplerState* sampler)
+    {
+        SLANG_UNUSED(sampler);
+        setResource(offset, textureView);
+        return SLANG_OK;
+    }
+};
+
+class CUDARootShaderObject : public CUDAShaderObject
+{
+public:
+    List<RefPtr<CUDAShaderObject>> entryPointObjects;
+    virtual SlangResult init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout) override;
+    virtual Slang::Index getEntryPointCount() override { return entryPointObjects.getCount(); }
+    virtual ShaderObject* getEntryPoint(Slang::Index index) override { return entryPointObjects[index].Ptr(); }
+};
+
+class CUDARenderer : public Renderer
+{
+private:
+    static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
+    static int _calcSMCountPerMultiProcessor(int major, int minor)
+    {
+        // Defines for GPU Architecture types (using the SM version to determine
+        // the # of cores per SM
+        struct SMInfo
+        {
+            int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+            int coreCount;
+        };
+
+        static const SMInfo infos[] = {
+            {0x30, 192},
+            {0x32, 192},
+            {0x35, 192},
+            {0x37, 192},
+            {0x50, 128},
+            {0x52, 128},
+            {0x53, 128},
+            {0x60, 64},
+            {0x61, 128},
+            {0x62, 128},
+            {0x70, 64},
+            {0x72, 64},
+            {0x75, 64}};
+
+        const int sm = ((major << 4) + minor);
+        for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+        {
+            if (infos[i].sm == sm)
+            {
+                return infos[i].coreCount;
+            }
+        }
+
+        const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+        // It must be newer presumably
+        SLANG_ASSERT(sm > last.sm);
+
+        // Default to the last entry
+        return last.coreCount;
+    }
+
+    static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
+    {
+        int smPerMultiproc = 0;
+        int maxPerfDevice = -1;
+        int deviceCount = 0;
+        int devicesProhibited = 0;
+
+        uint64_t maxComputePerf = 0;
+        SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+        // Find the best CUDA capable GPU device
+        for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+        {
+            int computeMode = -1, major = 0, minor = 0;
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+            // If this GPU is not running on Compute Mode prohibited,
+            // then we can add it to the list
+            if (computeMode != cudaComputeModeProhibited)
+            {
+                if (major == 9999 && minor == 9999)
+                {
+                    smPerMultiproc = 1;
+                }
+                else
+                {
+                    smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+                }
+
+                int multiProcessorCount = 0, clockRate = 0;
+                SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
+                    &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+                SLANG_CUDA_RETURN_ON_FAIL(
+                    cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+                uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+                if (compute_perf > maxComputePerf)
+                {
+                    maxComputePerf = compute_perf;
+                    maxPerfDevice = currentDevice;
+                }
+            }
+            else
+            {
+                devicesProhibited++;
+            }
+        }
+
+        if (maxPerfDevice < 0)
+        {
+            return SLANG_FAIL;
+        }
+
+        *outDeviceIndex = maxPerfDevice;
+        return SLANG_OK;
+    }
+
+    static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
+    {
+        static CUresult res = cuInit(0);
+        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
+        return SLANG_OK;
+    }
+
+private:
+    int m_deviceIndex = -1;
+    CUdevice m_device = 0;
+    CUcontext m_context = nullptr;
+    CUDAPipelineState* currentPipeline = nullptr;
+    CUDARootShaderObject* currentRootObject = nullptr;
+ public:
+    ~CUDARenderer()
+    {
+        if (m_context)
+        {
+            cuCtxDestroy(m_context);
+        }
+    }
+    virtual SlangResult initialize(const Desc& desc, void* inWindowHandle) override
+    {
+        SLANG_RETURN_ON_FAIL(_initCuda(reportType));
+
+        SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
+        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
+
+        if (m_context)
+        {
+            cuCtxDestroy(m_context);
+            m_context = nullptr;
+        }
+
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
+
+        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, 0, m_device), reportType);
+        return SLANG_OK;
+    }
+
+    virtual Result createTextureResource(
+        Resource::Usage initialUsage,
+        const TextureResource::Desc& desc,
+        const TextureResource::Data* initData,
+        TextureResource** outResource) override
+    {
+        RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(desc);
+        CUresourcetype resourceType;
+        size_t elementSize = 0;
+
+        {
+            CUarray_format format = CU_AD_FORMAT_FLOAT;
+            int numChannels = 0;
+
+            switch (desc.format)
+            {
+            case Format::R_Float32:
+                {
+                    format = CU_AD_FORMAT_FLOAT;
+                    numChannels = 1;
+                    elementSize = sizeof(float);
+                    break;
+                }
+            case Format::RGBA_Unorm_UInt8:
+                {
+                    format = CU_AD_FORMAT_UNSIGNED_INT8;
+                    numChannels = 4;
+                    elementSize = sizeof(uint32_t);
+                    break;
+                }
+            default:
+                {
+                    SLANG_ASSERT(!"Only support R_Float32/RGBA_Unorm_UInt8 formats for now");
+                    return SLANG_FAIL;
+                }
+            }
+
+            if (desc.numMipLevels > 1)
+            {
+                resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                arrayDesc.Width = desc.size.width;
+                arrayDesc.Height = desc.size.height;
+                arrayDesc.Depth = desc.size.depth;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+                arrayDesc.Flags = 0;
+
+                if (desc.arraySize > 1)
+                {
+                    if (desc.type == Resource::Type::Texture1D ||
+                        desc.type == Resource::Type::Texture2D ||
+                        desc.type == Resource::Type::TextureCube)
+                    {
+                        arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
+                        arrayDesc.Depth = desc.arraySize;
+                    }
+                    else
+                    {
+                        SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
+                        return SLANG_FAIL;
+                    }
+                }
+
+                if (desc.type == Resource::Type::TextureCube)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                    arrayDesc.Depth *= 6;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(
+                    cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
+            }
+            else
+            {
+                resourceType = CU_RESOURCE_TYPE_ARRAY;
+
+                if (desc.arraySize > 1)
+                {
+                    if (desc.type == Resource::Type::Texture1D ||
+                        desc.type == Resource::Type::Texture2D ||
+                        desc.type == Resource::Type::TextureCube)
+                    {
+                        SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
+                        return SLANG_FAIL;
+                    }
+
+                    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                    memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                    // Set the depth as the array length
+                    arrayDesc.Depth = desc.arraySize;
+                    if (desc.type == Resource::Type::TextureCube)
+                    {
+                        arrayDesc.Depth *= 6;
+                    }
+
+                    arrayDesc.Height = desc.size.height;
+                    arrayDesc.Width = desc.size.width;
+                    arrayDesc.Format = format;
+                    arrayDesc.NumChannels = numChannels;
+
+                    if (desc.type == Resource::Type::TextureCube)
+                    {
+                        arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                    }
+
+                    SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+                }
+                else if (desc.type == Resource::Type::Texture3D ||
+                    desc.type == Resource::Type::TextureCube)
+                {
+                    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                    memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                    arrayDesc.Depth = desc.size.depth;
+                    arrayDesc.Height = desc.size.height;
+                    arrayDesc.Width = desc.size.width;
+                    arrayDesc.Format = format;
+                    arrayDesc.NumChannels = numChannels;
+
+                    arrayDesc.Flags = 0;
+
+                    // Handle cube texture
+                    if (desc.type == Resource::Type::TextureCube)
+                    {
+                        arrayDesc.Depth = 6;
+                        arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                    }
+
+                    SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+                }
+                else
+                {
+                    CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                    memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                    arrayDesc.Height = desc.size.height;
+                    arrayDesc.Width = desc.size.width;
+                    arrayDesc.Format = format;
+                    arrayDesc.NumChannels = numChannels;
+
+                    // Allocate the array, will work for 1D or 2D case
+                    SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
+                }
+            }
+        }
+
+        // Work space for holding data for uploading if it needs to be rearranged
+        List<uint8_t> workspace;
+        auto width = desc.size.width;
+        auto height = desc.size.height;
+        auto depth = desc.size.depth;
+        for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
+        {
+            int mipWidth = width >> mipLevel;
+            int mipHeight = height >> mipLevel;
+            int mipDepth = depth >> mipLevel;
+
+            mipWidth = (mipWidth == 0) ? 1 : mipWidth;
+            mipHeight = (mipHeight == 0) ? 1 : mipHeight;
+            mipDepth = (mipDepth == 0) ? 1 : mipDepth;
+
+            // If it's a cubemap then the depth is always 6
+            if (desc.type == Resource::Type::TextureCube)
+            {
+                mipDepth = 6;
+            }
+
+            auto dstArray = tex->m_cudaArray;
+            if (tex->m_cudaMipMappedArray)
+            {
+                // Get the array for the mip level
+                SLANG_CUDA_RETURN_ON_FAIL(
+                    cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
+            }
+            SLANG_ASSERT(dstArray);
+
+            // Check using the desc to see if it's plausible
+            {
+                CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
+
+                SLANG_ASSERT(mipWidth == arrayDesc.Width);
+                SLANG_ASSERT(
+                    mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
+            }
+
+            const void* srcDataPtr = nullptr;
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == Resource::Type::Texture1D ||
+                    desc.type == Resource::Type::Texture2D ||
+                    desc.type == Resource::Type::TextureCube);
+
+                // TODO(JS): Here I assume that arrays are just held contiguously within a 'face'
+                // This seems reasonable and works with the Copy3D.
+                const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                Index faceCount = desc.arraySize;
+                if (desc.type == Resource::Type::TextureCube)
+                {
+                    faceCount *= 6;
+                }
+
+                const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
+                workspace.setCount(mipSizeInBytes);
+
+                // We need to add the face data from each mip
+                // We iterate over face count so we copy all of the cubemap faces
+                if (initData)
+                {
+                    for (Index j = 0; j < faceCount; j++)
+                    {
+                        const auto srcData = initData->subResources[mipLevel + j * desc.numMipLevels];
+                        // Copy over to the workspace to make contiguous
+                        ::memcpy(
+                            workspace.begin() + faceSizeInBytes * j, srcData,
+                            faceSizeInBytes);
+                    }
+                }
+
+                srcDataPtr = workspace.getBuffer();
+            }
+            else
+            {
+                if (desc.type == Resource::Type::TextureCube)
+                {
+                    size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                    workspace.setCount(faceSizeInBytes * 6);
+
+                    // Copy the data over to make contiguous
+                    for (Index j = 0; j < 6; j++)
+                    {
+                        const auto srcData =
+                            initData->subResources[mipLevel + j * desc.numMipLevels];
+                        ::memcpy(
+                            workspace.getBuffer() + faceSizeInBytes * j, srcData,
+                            faceSizeInBytes);
+                    }
+
+                    srcDataPtr = workspace.getBuffer();
+                }
+                else
+                {
+                    const auto srcData = initData->subResources[mipLevel];
+                    srcDataPtr = srcData;
+                }
+            }
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == Resource::Type::Texture1D ||
+                    desc.type == Resource::Type::Texture2D ||
+                    desc.type == Resource::Type::TextureCube);
+
+                CUDA_MEMCPY3D copyParam;
+                memset(&copyParam, 0, sizeof(copyParam));
+
+                copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                copyParam.dstArray = dstArray;
+
+                copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                copyParam.srcHost = srcDataPtr;
+                copyParam.srcPitch = mipWidth * elementSize;
+                copyParam.WidthInBytes = copyParam.srcPitch;
+                copyParam.Height = mipHeight;
+                // Set the depth to the array length
+                copyParam.Depth = desc.arraySize;
+
+                if (desc.type == Resource::Type::TextureCube)
+                {
+                    copyParam.Depth *= 6;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+            }
+            else
+            {
+                switch (desc.type)
+                {
+                case Resource::Type::Texture1D:
+                case Resource::Type::Texture2D:
+                    {
+                        CUDA_MEMCPY2D copyParam;
+                        memset(&copyParam, 0, sizeof(copyParam));
+                        copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                        copyParam.dstArray = dstArray;
+                        copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                        copyParam.srcHost = srcDataPtr;
+                        copyParam.srcPitch = mipWidth * elementSize;
+                        copyParam.WidthInBytes = copyParam.srcPitch;
+                        copyParam.Height = mipHeight;
+                        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+                        break;
+                    }
+                case Resource::Type::Texture3D:
+                case Resource::Type::TextureCube:
+                    {
+                        CUDA_MEMCPY3D copyParam;
+                        memset(&copyParam, 0, sizeof(copyParam));
+
+                        copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                        copyParam.dstArray = dstArray;
+
+                        copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                        copyParam.srcHost = srcDataPtr;
+                        copyParam.srcPitch = mipWidth * elementSize;
+                        copyParam.WidthInBytes = copyParam.srcPitch;
+                        copyParam.Height = mipHeight;
+                        copyParam.Depth = mipDepth;
+
+                        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+                        break;
+                    }
+
+                default:
+                    {
+                        SLANG_ASSERT(!"Not implemented");
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Set up texture sampling parameters, and create final texture obj
+
+        {
+            CUDA_RESOURCE_DESC resDesc;
+            memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+            resDesc.resType = resourceType;
+
+            if (tex->m_cudaArray)
+            {
+                resDesc.res.array.hArray = tex->m_cudaArray;
+            }
+            if (tex->m_cudaMipMappedArray)
+            {
+                resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
+            }
+
+            // Create handle for uav.
+            SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+            
+            // Create handle for sampling.
+            CUDA_TEXTURE_DESC texDesc;
+            memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+            texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+            texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+            texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+        }
+
+        *outResource = tex.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createBufferResource(
+        Resource::Usage initialUsage,
+        const BufferResource::Desc& desc,
+        const void* initData,
+        BufferResource** outResource) override
+    {
+        RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+        if (initData)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyHostToDevice));
+        }
+        *outResource = resource.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createTextureView(
+        TextureResource* texture, ResourceView::Desc const& desc, ResourceView** outView) override
+    {
+        RefPtr<CUDAResourceView> view = new CUDAResourceView();
+        view->desc = desc;
+        view->textureResource = dynamic_cast<TextureCUDAResource*>(texture);
+        *outView = view.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createBufferView(
+        BufferResource* buffer, ResourceView::Desc const& desc, ResourceView** outView) override
+    {
+        RefPtr<CUDAResourceView> view = new CUDAResourceView();
+        view->desc = desc;
+        view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer);
+        *outView = view.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createShaderObjectLayout(
+        slang::TypeLayoutReflection* typeLayout, ShaderObjectLayout** outLayout) override
+    {
+        RefPtr<CUDAShaderObjectLayout> cudaLayout;
+        cudaLayout = new CUDAShaderObjectLayout(typeLayout);
+        *outLayout = cudaLayout.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createRootShaderObjectLayout(
+        slang::ProgramLayout* layout, ShaderObjectLayout** outLayout) override
+    {
+        RefPtr<CUDAProgramLayout> cudaLayout;
+        cudaLayout = new CUDAProgramLayout(layout);
+        cudaLayout->programLayout = layout;
+        *outLayout = cudaLayout.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createShaderObject(ShaderObjectLayout* layout, ShaderObject** outObject) override
+    {
+        RefPtr<CUDAShaderObject> result = new CUDAShaderObject();
+        SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
+        *outObject = result.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createRootShaderObject(ShaderObjectLayout* layout, ShaderObject** outObject) override
+    {
+        RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject();
+        SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
+        *outObject = result.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result bindRootShaderObject(PipelineType pipelineType, ShaderObject* object) override
+    {
+        currentRootObject = dynamic_cast<CUDARootShaderObject*>(object);
+        if (currentRootObject)
+            return SLANG_OK;
+        return SLANG_E_INVALID_ARG;
+    }
+
+    virtual Result createProgram(const ShaderProgram::Desc& desc, ShaderProgram** outProgram) override
+    {
+        if (desc.kernelCount != 1)
+            return SLANG_E_INVALID_ARG;
+        RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram();
+        SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, desc.kernels[0].codeBegin));
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cuModuleGetFunction(&cudaProgram->cudaKernel, cudaProgram->cudaModule, desc.kernels[0].entryPointName));
+        cudaProgram->kernelName = desc.kernels[0].entryPointName;
+        *outProgram = cudaProgram.detach();
+        return SLANG_OK;
+    }
+
+    virtual Result createComputePipelineState(const ComputePipelineStateDesc& desc, PipelineState** outState) override
+    {
+        RefPtr<CUDAPipelineState> state = new CUDAPipelineState();
+        state->shaderProgram = dynamic_cast<CUDAShaderProgram*>(desc.program);
+        *outState = state.detach();
+        return Result();
+    }
+
+    virtual void* map(BufferResource* buffer, MapFlavor flavor) override
+    {
+        return dynamic_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory;
+    }
+
+    virtual void unmap(BufferResource* buffer) override
+    {
+        SLANG_UNUSED(buffer);
+    }
+
+    virtual void setPipelineState(PipelineType pipelineType, PipelineState* state) override
+    {
+        SLANG_ASSERT(pipelineType == PipelineType::Compute);
+        currentPipeline = dynamic_cast<CUDAPipelineState*>(state);
+    }
+
+    virtual void dispatchCompute(int x, int y, int z) override
+    {
+        // Find out thread group size from program reflection.
+        auto& kernelName = currentPipeline->shaderProgram->kernelName;
+        auto programLayout = dynamic_cast<CUDAProgramLayout*>(currentRootObject->layout.Ptr());
+        int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
+        SLANG_ASSERT(kernelId != -1);
+        UInt threadGroupSize[3];
+        programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
+
+        int sharedSizeInBytes;
+        cuFuncGetAttribute(
+            &sharedSizeInBytes,
+            CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+            currentPipeline->shaderProgram->cudaKernel);
+
+        // Copy global parameter data to the `SLANG_globalParams` symbol.
+        {
+            CUdeviceptr globalParamsSymbol = 0;
+            size_t globalParamsSymbolSize = 0;
+            cuModuleGetGlobal(
+                &globalParamsSymbol,
+                &globalParamsSymbolSize,
+                currentPipeline->shaderProgram->cudaModule,
+                "SLANG_globalParams");
+
+            CUdeviceptr globalParamsCUDAData =
+                currentRootObject->bufferResource
+                    ? currentRootObject->bufferResource->getBindlessHandle()
+                    : 0;
+            cudaMemcpyAsync(
+                (void*)globalParamsSymbol,
+                (void*)globalParamsCUDAData,
+                globalParamsSymbolSize,
+                cudaMemcpyDeviceToDevice,
+                0);
+        }
+        //
+        // In the case of the entry-point parameters, we have to deal with
+        // two different wrinkles.
+        //
+        // First, the `bindRoot` will have the entry-point argument data
+        // stored in a GPU-memory buffer, but we actually need it to be
+        // in host CPU memory. We handle that for now by allocating a
+        // temporary host memory buffer (if needed) and copying the data
+        // from device to host.
+        //
+        auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->bufferResource.Ptr();
+        size_t entryPointDataSize = entryPointBuffer ? entryPointBuffer->getDesc().sizeInBytes : 0;
+        void* entryPointHostData = nullptr;
+        if (entryPointDataSize)
+        {
+            entryPointHostData = alloca(entryPointDataSize);
+            cudaMemcpy(
+                entryPointHostData,
+                (void*)entryPointBuffer->getBindlessHandle(),
+                entryPointDataSize,
+                cudaMemcpyDeviceToHost);
+        }
+        //
+        // Second, the argument data for the entry-point parameters has
+        // been allocated and filled in as a single buffer, but `cuLaunchKernel`
+        // defaults to taking pointers to each of the kernel arguments.
+        //
+        // We could loop over the entry-point parameters using the refleciton
+        // information, and set up a pointer to each using the offset stored
+        // for it in the reflection data. Such an approach would require
+        // us to create and fill in a dynamically-sized array here.
+        //
+        // Instead, we take advantage of a documented but seldom-used feature
+        // of `cuLaunchKernel` that allows the argument data for all of the
+        // kernel "launch parameters" to be specified as a single buffer.
+        //
+        
+        void* extraOptions[] = {
+            CU_LAUNCH_PARAM_BUFFER_POINTER,
+            (void*)entryPointHostData,
+            CU_LAUNCH_PARAM_BUFFER_SIZE,
+            &entryPointDataSize,
+            CU_LAUNCH_PARAM_END,
+        };
+
+        // Once we have all the decessary data extracted and/or
+        // set up, we can launch the kernel and see what happens.
+        //
+        auto cudaLaunchResult = cuLaunchKernel(
+            currentPipeline->shaderProgram->cudaKernel,
+            x,
+            y,
+            z,
+            int(threadGroupSize[0]),
+            int(threadGroupSize[1]),
+            int(threadGroupSize[2]),
+            sharedSizeInBytes,
+            0,
+            nullptr,
+            extraOptions);
+
+        SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
+    }
+
+    virtual void submitGpuWork() override {}
+
+    virtual void waitForGpu() override
+    {
+        auto result = cudaDeviceSynchronize();
+        SLANG_ASSERT(result == CUDA_SUCCESS);
+    }
+
+    virtual RendererType getRendererType() const override { return RendererType::CUDA; }
+
+public:
+    // Unused public interfaces. These functions are not supported on CUDA.
+    virtual const Slang::List<Slang::String>& getFeatures() override
+    {
+        static Slang::List<Slang::String> featureSet;
+        return featureSet;
+    }
+    virtual void setClearColor(const float color[4]) override
+    {
+        SLANG_UNUSED(color);
+    }
+    virtual void clearFrame() override {}
+    virtual void presentFrame() override {}
+    virtual TextureResource::Desc getSwapChainTextureDesc() override
+    {
+        return TextureResource::Desc();
+    }
+    
+    virtual Result createSamplerState(SamplerState::Desc const& desc, SamplerState** outSampler) override
+    {
+        SLANG_UNUSED(desc);
+        *outSampler = nullptr;
+        return SLANG_OK;
+    }
+    
+    virtual Result createInputLayout(
+        const InputElementDesc* inputElements,
+        UInt inputElementCount,
+        InputLayout** outLayout) override
+    {
+        SLANG_UNUSED(inputElements);
+        SLANG_UNUSED(inputElementCount);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual Result createDescriptorSetLayout(
+        const DescriptorSetLayout::Desc& desc, DescriptorSetLayout** outLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual Result createPipelineLayout(const PipelineLayout::Desc& desc, PipelineLayout** outLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual Result createDescriptorSet(DescriptorSetLayout* layout, DescriptorSet** outDescriptorSet) override
+    {
+        SLANG_UNUSED(layout);
+        SLANG_UNUSED(outDescriptorSet);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual Result createGraphicsPipelineState(const GraphicsPipelineStateDesc& desc, PipelineState** outState) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outState);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual SlangResult captureScreenSurface(Surface& surfaceOut) override
+    {
+        SLANG_UNUSED(surfaceOut);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    virtual void setPrimitiveTopology(PrimitiveTopology topology) override
+    {
+        SLANG_UNUSED(topology);
+    }
+    virtual void setDescriptorSet(
+        PipelineType pipelineType,
+        PipelineLayout* layout,
+        UInt index,
+        DescriptorSet* descriptorSet) override
+    {
+        SLANG_UNUSED(pipelineType);
+        SLANG_UNUSED(layout);
+        SLANG_UNUSED(index);
+        SLANG_UNUSED(descriptorSet);
+    }
+    virtual void setVertexBuffers(
+        UInt startSlot,
+        UInt slotCount,
+        BufferResource* const* buffers,
+        const UInt* strides,
+        const UInt* offsets) override
+    {
+        SLANG_UNUSED(startSlot);
+        SLANG_UNUSED(slotCount);
+        SLANG_UNUSED(buffers);
+        SLANG_UNUSED(strides);
+        SLANG_UNUSED(offsets);
+    }
+    virtual void setIndexBuffer(BufferResource* buffer, Format indexFormat, UInt offset = 0) override
+    {
+        SLANG_UNUSED(buffer);
+        SLANG_UNUSED(indexFormat);
+        SLANG_UNUSED(offset);
+    }
+    virtual void setDepthStencilTarget(ResourceView* depthStencilView) override
+    {
+        SLANG_UNUSED(depthStencilView);
+    }
+    virtual void setViewports(UInt count, Viewport const* viewports) override
+    {
+        SLANG_UNUSED(count);
+        SLANG_UNUSED(viewports);
+    }
+    virtual void setScissorRects(UInt count, ScissorRect const* rects) override
+    {
+        SLANG_UNUSED(count);
+        SLANG_UNUSED(rects);
+    }
+    virtual void draw(UInt vertexCount, UInt startVertex) override
+    {
+        SLANG_UNUSED(vertexCount);
+        SLANG_UNUSED(startVertex);
+    }
+    virtual void drawIndexed(UInt indexCount, UInt startIndex, UInt baseVertex) override
+    {
+        SLANG_UNUSED(indexCount);
+        SLANG_UNUSED(startIndex);
+        SLANG_UNUSED(baseVertex);
+    }
+};
+
+SlangResult CUDAShaderObject::init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout)
+{
+    this->layout = typeLayout;
+    
+    // If the layout tells us that there is any uniform data,
+    // then we need to allocate a constant buffer to hold that data.
+    //
+    // TODO: Do we need to allocate a shadow copy for use from
+    // the CPU?
+    //
+    // TODO: When/where do we bind this constant buffer into
+    // a descriptor set for later use?
+    //
+    auto slangLayout = layout->typeLayout;
+    size_t uniformSize = layout->typeLayout->getSize();
+    if (uniformSize)
+    {
+        BufferResource::Desc bufferDesc;
+        bufferDesc.init(uniformSize);
+        bufferDesc.cpuAccessFlags |= Resource::AccessFlag::Write;
+        RefPtr<BufferResource> constantBuffer;
+        SLANG_RETURN_ON_FAIL(renderer->createBufferResource(
+            Resource::Usage::ConstantBuffer, bufferDesc, nullptr, constantBuffer.writeRef()));
+        bufferResource = dynamic_cast<MemoryCUDAResource*>(constantBuffer.Ptr());
+    }
+
+    // If the layout specifies that we have any sub-objects, then
+    // we need to size the array to account for them.
+    //
+    Index subObjectCount = slangLayout->getSubObjectRangeCount();
+    objects.setCount(subObjectCount);
+
+    for (auto subObjectRange : layout->subObjectRanges)
+    {
+        RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout;
+
+        // In the case where the sub-object range represents an
+        // existential-type leaf field (e.g., an `IBar`), we
+        // cannot pre-allocate the object(s) to go into that
+        // range, since we can't possibly know what to allocate
+        // at this point.
+        //
+        if (!subObjectLayout)
+            continue;
+        //
+        // Otherwise, we will allocate a sub-object to fill
+        // in each entry in this range, based on the layout
+        // information we already have.
+
+        auto& bindingRangeInfo = layout->m_bindingRanges[subObjectRange.bindingRangeIndex];
+        for (Index i = 0; i < bindingRangeInfo.count; ++i)
+        {
+            RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject();
+            SLANG_RETURN_ON_FAIL(subObject->init(renderer, subObjectLayout));
+            objects[bindingRangeInfo.baseIndex + i] = subObject;
+            ShaderOffset offset;
+            offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
+            if (subObject->bufferResource)
+                SLANG_RETURN_ON_FAIL(setData(offset, &subObject->bufferResource->m_cudaMemory, sizeof(void*)));
+        }
+    }
+    return SLANG_OK;
+}
+
+SlangResult CUDARootShaderObject::init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout)
+{
+    SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(renderer, typeLayout));
+    auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout);
+    for (auto& entryPoint : programLayout->entryPointLayouts)
+    {
+        RefPtr<CUDAShaderObject> object = new CUDAShaderObject();
+        SLANG_RETURN_ON_FAIL(object->init(renderer, entryPoint));
+        entryPointObjects.add(object);
+    }
+    return SLANG_OK;
+}
+
+Renderer* createCUDARenderer() { return new CUDARenderer(); }
+#else
+Renderer* createCUDARenderer() { return nullptr; }
+#endif
+
+}
diff --git a/tools/gfx/cuda/render-cuda.h b/tools/gfx/cuda/render-cuda.h
new file mode 100644
index 000000000..cf63113c8
--- /dev/null
+++ b/tools/gfx/cuda/render-cuda.h
@@ -0,0 +1,8 @@
+#pragma once
+
+namespace gfx
+{
+class Renderer;
+
+Renderer* createCUDARenderer();
+}
diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp
index 43a255817..cf3a0576c 100644
--- a/tools/gfx/render.cpp
+++ b/tools/gfx/render.cpp
@@ -7,6 +7,7 @@
 #include "d3d12/render-d3d12.h"
 #include "open-gl/render-gl.h"
 #include "vulkan/render-vk.h"
+#include "cuda/render-cuda.h"
 
 namespace gfx {
 using namespace Slang;
@@ -432,6 +433,10 @@ ProjectionStyle RendererUtil::getProjectionStyle(RendererType type)
         {
             return &createVKRenderer;
         }
+        case RendererType::CUDA:
+        {
+            return &createCUDARenderer;
+        }
 #endif
 
         default: return nullptr;
diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp
index 5903c49f7..1c8dc1d72 100644
--- a/tools/render-test/render-test-main.cpp
+++ b/tools/render-test/render-test-main.cpp
@@ -1171,7 +1171,7 @@ static SlangResult _innerMain(Slang::StdWriters* stdWriters, SlangSession* sessi
         return SLANG_OK;
     }
 
-    if (options.rendererType == RendererType::CUDA)
+    if (options.rendererType == RendererType::CUDA && !options.useShaderObjects)
     {        
 #if RENDER_TEST_CUDA
         // Check we have all the required features
-- 
cgit v1.2.3