1 files changed, 1115 insertions, 0 deletions
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
new file mode 100644
index 000000000..1a4a142d0
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -0,0 +1,1115 @@
+// cuda-device.cpp
+#include "cuda-device.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-queue.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-query.h"
+#include "cuda-shader-object.h"
+#include "cuda-shader-object-layout.h"
+#include "cuda-shader-program.h"
+#include "cuda-resource-views.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+int DeviceImpl::_calcSMCountPerMultiProcessor(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM
+    struct SMInfo
+    {
+        int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int coreCount;
+    };
+
+    static const SMInfo infos[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60, 64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70, 64},
+        {0x72, 64},
+        {0x75, 64} };
+
+    const int sm = ((major << 4) + minor);
+    for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+    {
+        if (infos[i].sm == sm)
+        {
+            return infos[i].coreCount;
+        }
+    }
+
+    const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+    // It must be newer presumably
+    SLANG_ASSERT(sm > last.sm);
+
+    // Default to the last entry
+    return last.coreCount;
+}
+
+SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
+{
+    int smPerMultiproc = 0;
+    int maxPerfDevice = -1;
+    int deviceCount = 0;
+    int devicesProhibited = 0;
+
+    uint64_t maxComputePerf = 0;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+    // Find the best CUDA capable GPU device
+    for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+    {
+        int computeMode = -1, major = 0, minor = 0;
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+        // If this GPU is not running on Compute Mode prohibited,
+        // then we can add it to the list
+        if (computeMode != cudaComputeModeProhibited)
+        {
+            if (major == 9999 && minor == 9999)
+            {
+                smPerMultiproc = 1;
+            }
+            else
+            {
+                smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+            }
+
+            int multiProcessorCount = 0, clockRate = 0;
+            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
+                &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+            uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+            if (compute_perf > maxComputePerf)
+            {
+                maxComputePerf = compute_perf;
+                maxPerfDevice = currentDevice;
+            }
+        }
+        else
+        {
+            devicesProhibited++;
+        }
+    }
+
+    if (maxPerfDevice < 0)
+    {
+        return SLANG_FAIL;
+    }
+
+    *outDeviceIndex = maxPerfDevice;
+    return SLANG_OK;
+}
+
+SlangResult DeviceImpl::_initCuda(CUDAReportStyle reportType)
+{
+    static CUresult res = cuInit(0);
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::getNativeDeviceHandles(InteropHandles* outHandles)
+{
+    outHandles->handles[0].handleValue = (uint64_t)m_device;
+    outHandles->handles[0].api = InteropHandleAPI::CUDA;
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
+{
+    SLANG_RETURN_ON_FAIL(slangContext.initialize(
+        desc.slang,
+        SLANG_PTX,
+        "sm_5_1",
+        makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
+
+    SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
+
+    SLANG_RETURN_ON_FAIL(_initCuda(reportType));
+
+    SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
+
+    m_context = new CUDAContext();
+
+    int count = -1;
+    cuDeviceGetCount(&count);
+    SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
+
+    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
+        cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
+
+    // Not clear how to detect half support on CUDA. For now we'll assume we have it
+    {
+        m_features.add("half");
+    }
+
+    // Initialize DeviceInfo
+    {
+        m_info.deviceType = DeviceType::CUDA;
+        m_info.bindingStyle = BindingStyle::CUDA;
+        m_info.projectionStyle = ProjectionStyle::DirectX;
+        m_info.apiName = "CUDA";
+        static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
+        ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
+        cudaDeviceProp deviceProperties;
+        cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
+        m_adapterName = deviceProperties.name;
+        m_info.adapterName = m_adapterName.begin();
+        m_info.timestampFrequency = 1000000;
+    }
+
+    return SLANG_OK;
+}
+
+Result DeviceImpl::getCUDAFormat(Format format, CUarray_format* outFormat)
+{
+    // TODO: Expand to cover all available formats that can be supported in CUDA
+    switch (format)
+    {
+    case Format::R32G32B32A32_FLOAT:
+    case Format::R32G32B32_FLOAT:
+    case Format::R32G32_FLOAT:
+    case Format::R32_FLOAT:
+    case Format::D32_FLOAT:
+        *outFormat = CU_AD_FORMAT_FLOAT;
+        return SLANG_OK;
+    case Format::R16G16B16A16_FLOAT:
+    case Format::R16G16_FLOAT:
+    case Format::R16_FLOAT:
+        *outFormat = CU_AD_FORMAT_HALF;
+        return SLANG_OK;
+    case Format::R32G32B32A32_UINT:
+    case Format::R32G32B32_UINT:
+    case Format::R32G32_UINT:
+    case Format::R32_UINT:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
+        return SLANG_OK;
+    case Format::R16G16B16A16_UINT:
+    case Format::R16G16_UINT:
+    case Format::R16_UINT:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
+        return SLANG_OK;
+    case Format::R8G8B8A8_UINT:
+    case Format::R8G8_UINT:
+    case Format::R8_UINT:
+    case Format::R8G8B8A8_UNORM:
+        *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+        return SLANG_OK;
+    case Format::R32G32B32A32_SINT:
+    case Format::R32G32B32_SINT:
+    case Format::R32G32_SINT:
+    case Format::R32_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT32;
+        return SLANG_OK;
+    case Format::R16G16B16A16_SINT:
+    case Format::R16G16_SINT:
+    case Format::R16_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT16;
+        return SLANG_OK;
+    case Format::R8G8B8A8_SINT:
+    case Format::R8G8_SINT:
+    case Format::R8_SINT:
+        *outFormat = CU_AD_FORMAT_SIGNED_INT8;
+        return SLANG_OK;
+    default:
+        SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+        return SLANG_FAIL;
+    }
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureResource(
+    const ITextureResource::Desc& desc,
+    const ITextureResource::SubresourceData* initData,
+    ITextureResource** outResource)
+{
+    TextureResource::Desc srcDesc = fixupTextureDesc(desc);
+
+    RefPtr<TextureResourceImpl> tex = new TextureResourceImpl(srcDesc);
+    tex->m_cudaContext = m_context;
+
+    CUresourcetype resourceType;
+
+    // The size of the element/texel in bytes
+    size_t elementSize = 0;
+
+    // Our `ITextureResource::Desc` uses an enumeration to specify
+    // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
+    // `cuMipmappedArrayCreate` seemingly relies on a policy where
+    // the extents of the array in dimenions above the rank are
+    // specified as zero (e.g., a 1D texture requires `height==0`).
+    //
+    // We will start by massaging the extents as specified by the
+    // user into a form that CUDA wants/expects, based on the
+    // texture shape as specified in the `desc`.
+    //
+    int width = desc.size.width;
+    int height = desc.size.height;
+    int depth = desc.size.depth;
+    switch (desc.type)
+    {
+    case IResource::Type::Texture1D:
+        height = 0;
+        depth = 0;
+        break;
+
+    case IResource::Type::Texture2D:
+        depth = 0;
+        break;
+
+    case IResource::Type::Texture3D:
+        break;
+
+    case IResource::Type::TextureCube:
+        depth = 1;
+        break;
+    }
+
+    {
+        CUarray_format format = CU_AD_FORMAT_FLOAT;
+        int numChannels = 0;
+
+        SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
+        FormatInfo info;
+        gfxGetFormatInfo(desc.format, &info);
+        numChannels = info.channelCount;
+
+        switch (format)
+        {
+        case CU_AD_FORMAT_FLOAT:
+        {
+            elementSize = sizeof(float) * numChannels;
+            break;
+        }
+        case CU_AD_FORMAT_HALF:
+        {
+            elementSize = sizeof(uint16_t) * numChannels;
+            break;
+        }
+        case CU_AD_FORMAT_UNSIGNED_INT8:
+        {
+            elementSize = sizeof(uint8_t) * numChannels;
+            break;
+        }
+        default:
+        {
+            SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+            return SLANG_FAIL;
+        }
+        }
+
+        if (desc.numMipLevels > 1)
+        {
+            resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+            memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+            arrayDesc.Width = width;
+            arrayDesc.Height = height;
+            arrayDesc.Depth = depth;
+            arrayDesc.Format = format;
+            arrayDesc.NumChannels = numChannels;
+            arrayDesc.Flags = 0;
+
+            if (desc.arraySize > 1)
+            {
+                if (desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
+                    arrayDesc.Depth = desc.arraySize;
+                }
+                else
+                {
+                    SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
+                    return SLANG_FAIL;
+                }
+            }
+
+            if (desc.type == IResource::Type::TextureCube)
+            {
+                arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                arrayDesc.Depth *= 6;
+            }
+
+            SLANG_CUDA_RETURN_ON_FAIL(
+                cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
+        }
+        else
+        {
+            resourceType = CU_RESOURCE_TYPE_ARRAY;
+
+            if (desc.arraySize > 1)
+            {
+                if (desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube)
+                {
+                    SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
+                    return SLANG_FAIL;
+                }
+
+                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                // Set the depth as the array length
+                arrayDesc.Depth = desc.arraySize;
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Depth *= 6;
+                }
+
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+            else if (desc.type == IResource::Type::Texture3D ||
+                desc.type == IResource::Type::TextureCube)
+            {
+                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                arrayDesc.Depth = depth;
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                arrayDesc.Flags = 0;
+
+                // Handle cube texture
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    arrayDesc.Depth = 6;
+                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+            else
+            {
+                CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+                arrayDesc.Height = height;
+                arrayDesc.Width = width;
+                arrayDesc.Format = format;
+                arrayDesc.NumChannels = numChannels;
+
+                // Allocate the array, will work for 1D or 2D case
+                SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
+            }
+        }
+    }
+
+    // Work space for holding data for uploading if it needs to be rearranged
+    if (initData)
+    {
+        List<uint8_t> workspace;
+        for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
+        {
+            int mipWidth = width >> mipLevel;
+            int mipHeight = height >> mipLevel;
+            int mipDepth = depth >> mipLevel;
+
+            mipWidth = (mipWidth == 0) ? 1 : mipWidth;
+            mipHeight = (mipHeight == 0) ? 1 : mipHeight;
+            mipDepth = (mipDepth == 0) ? 1 : mipDepth;
+
+            // If it's a cubemap then the depth is always 6
+            if (desc.type == IResource::Type::TextureCube)
+            {
+                mipDepth = 6;
+            }
+
+            auto dstArray = tex->m_cudaArray;
+            if (tex->m_cudaMipMappedArray)
+            {
+                // Get the array for the mip level
+                SLANG_CUDA_RETURN_ON_FAIL(
+                    cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
+            }
+            SLANG_ASSERT(dstArray);
+
+            // Check using the desc to see if it's plausible
+            {
+                CUDA_ARRAY_DESCRIPTOR arrayDesc;
+                SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
+
+                SLANG_ASSERT(mipWidth == arrayDesc.Width);
+                SLANG_ASSERT(
+                    mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
+            }
+
+            const void* srcDataPtr = nullptr;
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube);
+
+                // TODO(JS): Here I assume that arrays are just held contiguously within a
+                // 'face' This seems reasonable and works with the Copy3D.
+                const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                Index faceCount = desc.arraySize;
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    faceCount *= 6;
+                }
+
+                const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
+                workspace.setCount(mipSizeInBytes);
+
+                // We need to add the face data from each mip
+                // We iterate over face count so we copy all of the cubemap faces
+                for (Index j = 0; j < faceCount; j++)
+                {
+                    const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
+                    // Copy over to the workspace to make contiguous
+                    ::memcpy(
+                        workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
+                }
+
+                srcDataPtr = workspace.getBuffer();
+            }
+            else
+            {
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+                    workspace.setCount(faceSizeInBytes * 6);
+                    // Copy the data over to make contiguous
+                    for (Index j = 0; j < 6; j++)
+                    {
+                        const auto srcData =
+                            initData[mipLevel + j * desc.numMipLevels].data;
+                        ::memcpy(
+                            workspace.getBuffer() + faceSizeInBytes * j,
+                            srcData,
+                            faceSizeInBytes);
+                    }
+                    srcDataPtr = workspace.getBuffer();
+                }
+                else
+                {
+                    const auto srcData = initData[mipLevel].data;
+                    srcDataPtr = srcData;
+                }
+            }
+
+            if (desc.arraySize > 1)
+            {
+                SLANG_ASSERT(
+                    desc.type == IResource::Type::Texture1D ||
+                    desc.type == IResource::Type::Texture2D ||
+                    desc.type == IResource::Type::TextureCube);
+
+                CUDA_MEMCPY3D copyParam;
+                memset(&copyParam, 0, sizeof(copyParam));
+
+                copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                copyParam.dstArray = dstArray;
+
+                copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                copyParam.srcHost = srcDataPtr;
+                copyParam.srcPitch = mipWidth * elementSize;
+                copyParam.WidthInBytes = copyParam.srcPitch;
+                copyParam.Height = mipHeight;
+                // Set the depth to the array length
+                copyParam.Depth = desc.arraySize;
+
+                if (desc.type == IResource::Type::TextureCube)
+                {
+                    copyParam.Depth *= 6;
+                }
+
+                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+            }
+            else
+            {
+                switch (desc.type)
+                {
+                case IResource::Type::Texture1D:
+                case IResource::Type::Texture2D:
+                {
+                    CUDA_MEMCPY2D copyParam;
+                    memset(&copyParam, 0, sizeof(copyParam));
+                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                    copyParam.dstArray = dstArray;
+                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                    copyParam.srcHost = srcDataPtr;
+                    copyParam.srcPitch = mipWidth * elementSize;
+                    copyParam.WidthInBytes = copyParam.srcPitch;
+                    copyParam.Height = mipHeight;
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+                    break;
+                }
+                case IResource::Type::Texture3D:
+                case IResource::Type::TextureCube:
+                {
+                    CUDA_MEMCPY3D copyParam;
+                    memset(&copyParam, 0, sizeof(copyParam));
+
+                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                    copyParam.dstArray = dstArray;
+
+                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+                    copyParam.srcHost = srcDataPtr;
+                    copyParam.srcPitch = mipWidth * elementSize;
+                    copyParam.WidthInBytes = copyParam.srcPitch;
+                    copyParam.Height = mipHeight;
+                    copyParam.Depth = mipDepth;
+
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+                    break;
+                }
+
+                default:
+                {
+                    SLANG_ASSERT(!"Not implemented");
+                    break;
+                }
+                }
+            }
+        }
+    }
+    // Set up texture sampling parameters, and create final texture obj
+
+    {
+        CUDA_RESOURCE_DESC resDesc;
+        memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+        resDesc.resType = resourceType;
+
+        if (tex->m_cudaArray)
+        {
+            resDesc.res.array.hArray = tex->m_cudaArray;
+        }
+        if (tex->m_cudaMipMappedArray)
+        {
+            resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
+        }
+
+        // If the texture might be used as a UAV, then we need to allocate
+        // a CUDA "surface" for it.
+        //
+        // Note: We cannot do this unconditionally, because it will fail
+        // on surfaces that are not usable as UAVs (e.g., those with
+        // mipmaps).
+        //
+        // TODO: We should really only be allocating the array at the
+        // time we create a resource, and then allocate the surface or
+        // texture objects as part of view creation.
+        //
+        if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
+        {
+            // On CUDA surfaces only support a single MIP map
+            SLANG_ASSERT(desc.numMipLevels == 1);
+
+            SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+        }
+
+
+        // Create handle for sampling.
+        CUDA_TEXTURE_DESC texDesc;
+        memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+        texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+        texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+        texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+        SLANG_CUDA_RETURN_ON_FAIL(
+            cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+    }
+
+    returnComPtr(outResource, tex);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
+    const IBufferResource::Desc& descIn,
+    const void* initData,
+    IBufferResource** outResource)
+{
+    auto desc = fixupBufferDesc(descIn);
+    RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+    if (initData)
+    {
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+    }
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
+    InteropHandle handle,
+    const IBufferResource::Desc& desc,
+    IBufferResource** outResource)
+{
+    if (handle.handleValue == 0)
+    {
+        *outResource = nullptr;
+        return SLANG_OK;
+    }
+
+    RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+
+    // CUDA manages sharing of buffers through the idea of an
+    // "external memory" object, which represents the relationship
+    // with another API's objects. In order to create this external
+    // memory association, we first need to fill in a descriptor struct.
+    cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+    switch (handle.api)
+    {
+    case InteropHandleAPI::D3D12:
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+        break;
+    case InteropHandleAPI::Vulkan:
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+        break;
+    default:
+        return SLANG_FAIL;
+    }
+    externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+    externalMemoryHandleDesc.size = desc.sizeInBytes;
+    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+    // Once we have filled in the descriptor, we can request
+    // that CUDA create the required association between the
+    // external buffer and its own memory.
+    cudaExternalMemory_t externalMemory;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+    resource->m_cudaExternalMemory = externalMemory;
+
+    // The CUDA "external memory" handle is not itself a device
+    // pointer, so we need to query for a suitable device address
+    // for the buffer with another call.
+    //
+    // Just as for the external memory, we fill in a descriptor
+    // structure (although in this case we only need to specify
+    // the size).
+    cudaExternalMemoryBufferDesc bufferDesc;
+    memset(&bufferDesc, 0, sizeof(bufferDesc));
+    bufferDesc.size = desc.sizeInBytes;
+
+    // Finally, we can "map" the buffer to get a device address.
+    void* deviceAddress;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+    resource->m_cudaMemory = deviceAddress;
+
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
+    InteropHandle handle,
+    const ITextureResource::Desc& desc,
+    const size_t size,
+    ITextureResource** outResource)
+{
+    if (handle.handleValue == 0)
+    {
+        *outResource = nullptr;
+        return SLANG_OK;
+    }
+
+    RefPtr<TextureResourceImpl> resource = new TextureResourceImpl(desc);
+    resource->m_cudaContext = m_context;
+
+    // CUDA manages sharing of buffers through the idea of an
+    // "external memory" object, which represents the relationship
+    // with another API's objects. In order to create this external
+    // memory association, we first need to fill in a descriptor struct.
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
+    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+    switch (handle.api)
+    {
+    case InteropHandleAPI::D3D12:
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
+        break;
+    case InteropHandleAPI::Vulkan:
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
+        break;
+    default:
+        return SLANG_FAIL;
+    }
+    externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+    externalMemoryHandleDesc.size = size;
+    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+    CUexternalMemory externalMemory;
+    SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+    resource->m_cudaExternalMemory = externalMemory;
+
+    FormatInfo formatInfo;
+    SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
+    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+    arrayDesc.Depth = desc.size.depth;
+    arrayDesc.Height = desc.size.height;
+    arrayDesc.Width = desc.size.width;
+    arrayDesc.NumChannels = formatInfo.channelCount;
+    getCUDAFormat(desc.format, &arrayDesc.Format);
+    arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
+
+    CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
+    memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
+    externalMemoryMipDesc.offset = 0;
+    externalMemoryMipDesc.arrayDesc = arrayDesc;
+    externalMemoryMipDesc.numLevels = desc.numMipLevels;
+
+    CUmipmappedArray mipArray;
+    SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
+    resource->m_cudaMipMappedArray = mipArray;
+
+    CUarray cuArray;
+    SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
+    resource->m_cudaArray = cuArray;
+
+    CUDA_RESOURCE_DESC surfDesc;
+    memset(&surfDesc, 0, sizeof(surfDesc));
+    surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+    surfDesc.res.array.hArray = cuArray;
+
+    CUsurfObject surface;
+    SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
+    resource->m_cudaSurfObj = surface;
+
+    returnComPtr(outResource, resource);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureView(
+    ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView)
+{
+    RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+    view->m_desc = desc;
+    view->textureResource = dynamic_cast<TextureResourceImpl*>(texture);
+    returnComPtr(outView, view);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferView(
+    IBufferResource* buffer,
+    IBufferResource* counterBuffer,
+    IResourceView::Desc const& desc,
+    IResourceView** outView)
+{
+    RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+    view->m_desc = desc;
+    view->memoryResource = dynamic_cast<BufferResourceImpl*>(buffer);
+    returnComPtr(outView, view);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createQueryPool(
+    const IQueryPool::Desc& desc,
+    IQueryPool** outPool)
+{
+    RefPtr<QueryPoolImpl> pool = new QueryPoolImpl();
+    SLANG_RETURN_ON_FAIL(pool->init(desc));
+    returnComPtr(outPool, pool);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObjectLayout(
+    slang::TypeLayoutReflection* typeLayout,
+    ShaderObjectLayoutBase** outLayout)
+{
+    RefPtr<ShaderObjectLayoutImpl> cudaLayout;
+    cudaLayout = new ShaderObjectLayoutImpl(this, typeLayout);
+    returnRefPtrMove(outLayout, cudaLayout);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObject(
+    ShaderObjectLayoutBase* layout,
+    IShaderObject** outObject)
+{
+    RefPtr<ShaderObjectImpl> result = new ShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+    returnComPtr(outObject, result);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createMutableShaderObject(
+    ShaderObjectLayoutBase* layout,
+    IShaderObject** outObject)
+{
+    RefPtr<MutableShaderObjectImpl> result = new MutableShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+    returnComPtr(outObject, result);
+    return SLANG_OK;
+}
+
+Result DeviceImpl::createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
+{
+    auto cudaProgram = dynamic_cast<ShaderProgramImpl*>(program);
+    auto cudaLayout = cudaProgram->layout;
+
+    RefPtr<RootShaderObjectImpl> result = new RootShaderObjectImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
+    returnRefPtrMove(outObject, result);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+    const IShaderProgram::Desc& desc,
+    IShaderProgram** outProgram,
+    ISlangBlob** outDiagnosticBlob)
+{
+    // If this is a specializable program, we just keep a reference to the slang program and
+    // don't actually create any kernels. This program will be specialized later when we know
+    // the shader object bindings.
+    RefPtr<ShaderProgramImpl> cudaProgram = new ShaderProgramImpl();
+    cudaProgram->init(desc);
+    cudaProgram->cudaContext = m_context;
+    if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
+    {
+        cudaProgram->layout = new RootShaderObjectLayoutImpl(this, desc.slangGlobalScope->getLayout());
+        returnComPtr(outProgram, cudaProgram);
+        return SLANG_OK;
+    }
+
+    ComPtr<ISlangBlob> kernelCode;
+    ComPtr<ISlangBlob> diagnostics;
+    auto compileResult = desc.slangGlobalScope->getEntryPointCode(
+        (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
+    if (diagnostics)
+    {
+        getDebugCallback()->handleMessage(
+            compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
+            DebugMessageSource::Slang,
+            (char*)diagnostics->getBufferPointer());
+        if (outDiagnosticBlob)
+            returnComPtr(outDiagnosticBlob, diagnostics);
+    }
+    SLANG_RETURN_ON_FAIL(compileResult);
+
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
+    cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
+        &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
+
+    auto slangGlobalScope = desc.slangGlobalScope;
+    if (slangGlobalScope)
+    {
+        cudaProgram->slangGlobalScope = slangGlobalScope;
+
+        auto slangProgramLayout = slangGlobalScope->getLayout();
+        if (!slangProgramLayout)
+            return SLANG_FAIL;
+
+        RefPtr<RootShaderObjectLayoutImpl> cudaLayout;
+        cudaLayout = new RootShaderObjectLayoutImpl(this, slangProgramLayout);
+        cudaLayout->programLayout = slangProgramLayout;
+        cudaProgram->layout = cudaLayout;
+    }
+
+    returnComPtr(outProgram, cudaProgram);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createComputePipelineState(
+    const ComputePipelineStateDesc& desc, IPipelineState** outState)
+{
+    RefPtr<ComputePipelineStateImpl> state = new ComputePipelineStateImpl();
+    state->shaderProgram = static_cast<ShaderProgramImpl*>(desc.program);
+    state->init(desc);
+    returnComPtr(outState, state);
+    return Result();
+}
+
+void* DeviceImpl::map(IBufferResource* buffer)
+{
+    return static_cast<BufferResourceImpl*>(buffer)->m_cudaMemory;
+}
+
+void DeviceImpl::unmap(IBufferResource* buffer)
+{
+    SLANG_UNUSED(buffer);
+}
+
+SLANG_NO_THROW const DeviceInfo& SLANG_MCALL DeviceImpl::getDeviceInfo() const
+{
+    return m_info;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTransientResourceHeap(
+    const ITransientResourceHeap::Desc& desc,
+    ITransientResourceHeap** outHeap)
+{
+    RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
+    SLANG_RETURN_ON_FAIL(result->init(this, desc));
+    returnComPtr(outHeap, result);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue)
+{
+    RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
+    queue->init(this);
+    returnComPtr(outQueue, queue);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createSwapchain(
+    const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(window);
+    SLANG_UNUSED(outSwapchain);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createFramebufferLayout(
+    const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outLayout);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outFramebuffer);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createRenderPassLayout(
+    const IRenderPassLayout::Desc& desc,
+    IRenderPassLayout** outRenderPassLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outRenderPassLayout);
+    return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+    DeviceImpl::createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler)
+{
+    SLANG_UNUSED(desc);
+    *outSampler = nullptr;
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createInputLayout(
+    IInputLayout::Desc const& desc,
+    IInputLayout** outLayout)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outLayout);
+    return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createGraphicsPipelineState(
+    const GraphicsPipelineStateDesc& desc, IPipelineState** outState)
+{
+    SLANG_UNUSED(desc);
+    SLANG_UNUSED(outState);
+    return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::readTextureResource(
+    ITextureResource* texture,
+    ResourceState state,
+    ISlangBlob** outBlob,
+    size_t* outRowPitch,
+    size_t* outPixelSize)
+{
+    auto textureImpl = static_cast<TextureResourceImpl*>(texture);
+    RefPtr<ListBlob> blob = new ListBlob();
+
+    auto desc = textureImpl->getDesc();
+    auto width = desc->size.width;
+    auto height = desc->size.height;
+    FormatInfo sizeInfo;
+    SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
+    size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
+    size_t rowPitch = width * pixelSize;
+    size_t size = height * rowPitch;
+    blob->m_data.setCount((Index)size);
+
+    CUDA_MEMCPY2D copyParam;
+    memset(&copyParam, 0, sizeof(copyParam));
+
+    copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copyParam.srcArray = textureImpl->m_cudaArray;
+
+    copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
+    copyParam.dstHost = blob->m_data.getBuffer();
+    copyParam.dstPitch = rowPitch;
+    copyParam.WidthInBytes = copyParam.dstPitch;
+    copyParam.Height = height;
+    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+
+    *outRowPitch = rowPitch;
+    *outPixelSize = pixelSize;
+    returnComPtr(outBlob, blob);
+    return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
+    IBufferResource* buffer,
+    size_t offset,
+    size_t size,
+    ISlangBlob** outBlob)
+{
+    auto bufferImpl = static_cast<BufferResourceImpl*>(buffer);
+    RefPtr<ListBlob> blob = new ListBlob();
+    blob->m_data.setCount((Index)size);
+    cudaMemcpy(
+        blob->m_data.getBuffer(),
+        (uint8_t*)bufferImpl->m_cudaMemory + offset,
+        size,
+        cudaMemcpyDefault);
+    returnComPtr(outBlob, blob);
+    return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx