#include "cuda-compute-util.h"

#include "../../slang-com-helper.h"

#include "../../source/core/slang-std-writers.h"
#include "../../source/core/slang-token-reader.h"
#include "../../source/core/slang-semantic-version.h"

#include "../bind-location.h"

#include <cuda.h>

#include <cuda_runtime_api.h>

// TODO: should conditionalize this on OptiX being present
#ifdef RENDER_TEST_OPTIX

// The `optix_stubs.h` header produces warnings when compiled with MSVC
#ifdef _MSC_VER
#pragma warning(disable: 4996)
#endif
#include <optix.h>
#include <optix_function_table_definition.h>
#include <optix_stubs.h>
#endif

namespace renderer_test {
using namespace Slang;

SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }

// A enum used to control if errors are reported on failure of CUDA call.
enum class CUDAReportStyle
{
    Normal,
    Silent,
};

struct CUDAErrorInfo
{
    CUDAErrorInfo(const char* filePath, int lineNo, const char* errorName = nullptr, const char* errorString = nullptr):
        m_filePath(filePath),
        m_lineNo(lineNo),
        m_errorName(errorName),
        m_errorString(errorString)
    {
    }
    SlangResult handle() const
    {
        StringBuilder builder;
        builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";

        if (m_errorName)
        {
            builder << m_errorName << " : ";
        }
        if (m_errorString)
        {
            builder << m_errorString;
        }

        StdWriters::getError().put(builder.getUnownedSlice());

        //Slang::signalUnexpectedError(builder.getBuffer());
        return SLANG_FAIL;
    }

    const char* m_filePath;
    int m_lineNo;
    const char* m_errorName;
    const char* m_errorString;
};

#if 1
// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.

static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
{
    CUDAErrorInfo info(file, line);
    cuGetErrorString(cuResult, &info.m_errorString);
    cuGetErrorName(cuResult, &info.m_errorName);
    return info.handle();
}

static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
{
    return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
}

#define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res, __FILE__, __LINE__)

#else
// If this code path is enabled, errors are not reported, but can have an assert enabled

static SlangResult _handleCUDAError(CUresult cuResult)
{
    SLANG_UNUSED(cuResult);
    //SLANG_ASSERT(!"Failed CUDA call");
    return SLANG_FAIL;
}

static SlangResult _handleCUDAError(cudaError_t error)
{
    SLANG_UNUSED(error);
    //SLANG_ASSERT(!"Failed CUDA call");
    return SLANG_FAIL;
}

#define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res)
#endif

#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_CUDA_HANDLE_ERROR(_res); }
#define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \
    { \
        auto _res = x; \
        if (_isError(_res)) \
        { \
            return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) : SLANG_FAIL; \
        } \
    } \

#define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; }

#ifdef RENDER_TEST_OPTIX

static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }

#if 1
static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
{
    fprintf(stderr, "%s(%d): optix: %s (%s)\n",
        file,
        line,
        optixGetErrorString(result),
        optixGetErrorName(result));
    return SLANG_FAIL;
}
#define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
#else
#define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
#endif

#define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) do { auto _res = EXPR; if(_isError(_res)) return SLANG_OPTIX_HANDLE_ERROR(_res); } while(0)

void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
{
    fprintf(stderr, "optix: %s (%s)\n",
        message,
        tag);
}

#endif

class MemoryCUDAResource : public CUDAResource
{
public:
    typedef CUDAResource Super;

        /// Dtor
    ~MemoryCUDAResource()
    {
        if (m_cudaMemory)
        {
            SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
        }
    }

    static MemoryCUDAResource* asResource(BindSet::Value* value)
    {
        return value ? dynamic_cast<MemoryCUDAResource*>(value->m_target.Ptr()) : nullptr;
    }
        /// Helper function to get the CUDA memory pointer when given a value
    static CUdeviceptr getCUDAData(BindSet::Value* value)
    {
        auto resource = asResource(value);
        return resource ? resource->m_cudaMemory : CUdeviceptr();
    }

    CUdeviceptr m_cudaMemory = CUdeviceptr();
};

class TextureCUDAResource : public CUDAResource
{
public:
    typedef CUDAResource Super;

    ~TextureCUDAResource()
    {
        if (m_cudaSurfObj)
        {
            SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
        }
        if (m_cudaTexObj)
        {
            SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
        }
        if (m_cudaArray)
        {
            SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
        }
        if (m_cudaMipMappedArray)
        {
            SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
        }
    }

    static TextureCUDAResource* asResource(BindSet::Value* value)
    {
        return value ? dynamic_cast<TextureCUDAResource*>(value->m_target.Ptr()) : nullptr;
    }

    static CUtexObject getTexObject(BindSet::Value* value)
    {
        auto resource = asResource(value);
        // It's an assumption here that 0 is okay for null. Seems to work...
        return resource ? resource->m_cudaTexObj : CUtexObject(0);
    }

    static CUsurfObject getSurfObject(BindSet::Value* value)
    {
        auto resource = asResource(value);
        return resource ? resource->m_cudaSurfObj : CUsurfObject(0);
    }

    // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by a long long
    CUtexObject m_cudaTexObj = CUtexObject();

    // The surfObj is for reading/writing 'texture like' things, but not for sampling.
    CUsurfObject m_cudaSurfObj = CUsurfObject();

    CUarray m_cudaArray = CUarray();
    CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
};

class ScopeCUDAModule
{
public:

    operator CUmodule () const { return m_module; }

    ScopeCUDAModule(): m_module(nullptr) {}
    SlangResult load(const void* image)
    {
        release(); 
        SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&m_module, image));
        return SLANG_OK;
    }
    void release()
    {
        if (m_module)
        {
            cuModuleUnload(m_module);
            m_module = nullptr;
        }
    }

    ~ScopeCUDAModule() { release(); }

    CUmodule m_module;
};

class ScopeCUDAStream
{
public:

    SlangResult init(unsigned int flags)
    {
        release();
        SLANG_ASSERT(m_stream == nullptr);
        SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags));
        return SLANG_OK;
    }

    SlangResult sync()
    {
        if (m_stream)
        {
            SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream));
        }
        else
        {
            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize());
        }
        return SLANG_OK;
    }

    void release()
    {
        if (m_stream)
        {
            sync();
            SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream));
            m_stream = nullptr;
        }
    }

    ScopeCUDAStream():m_stream(nullptr) {}

    ~ScopeCUDAStream() { release(); }

    operator CUstream () const { return m_stream; }

    CUstream m_stream;
};

static int _calcSMCountPerMultiProcessor(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine
    // the # of cores per SM
    struct SMInfo
    {
        int sm;  // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int coreCount;
    };

    static const SMInfo infos[] =
    {
        {0x30, 192},
        {0x32, 192},
        {0x35, 192},
        {0x37, 192},
        {0x50, 128},
        {0x52, 128},
        {0x53, 128},
        {0x60,  64},
        {0x61, 128},
        {0x62, 128},
        {0x70,  64},
        {0x72,  64},
        {0x75,  64}
    };

    const int sm = ((major << 4) + minor);
    for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
    {
        if (infos[i].sm == sm)
        {
            return infos[i].coreCount;
        }
    }

    const auto& last = infos[SLANG_COUNT_OF(infos) - 1];

    // It must be newer presumably
    SLANG_ASSERT(sm > last.sm);

    // Default to the last entry
    return last.coreCount;
}

static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
{
    int smPerMultiproc = 0;
    int maxPerfDevice = -1;
    int deviceCount = 0;
    int devicesProhibited = 0;

    uint64_t maxComputePerf = 0;
    SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));

    // Find the best CUDA capable GPU device
    for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
    {
        int computeMode = -1, major = 0, minor = 0;
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));

        // If this GPU is not running on Compute Mode prohibited,
        // then we can add it to the list
        if (computeMode != cudaComputeModeProhibited)
        {
            if (major == 9999 && minor == 9999)
            {
                smPerMultiproc = 1;
            }
            else
            {
                smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
            }

            int multiProcessorCount = 0, clockRate = 0;
            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
            uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;

            if (compute_perf > maxComputePerf)
            {
                maxComputePerf = compute_perf;
                maxPerfDevice = currentDevice;
            }
        }
        else
        {
            devicesProhibited++;
        }
    }

    if (maxPerfDevice < 0)
    {
        return SLANG_FAIL;
    }

    *outDeviceIndex = maxPerfDevice;
    return SLANG_OK;
}

static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
{
    static CUresult res = cuInit(0);
    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
    return SLANG_OK;
}

class ScopeCUDAContext
{
public:
    ScopeCUDAContext() :
        m_context(nullptr),
        m_device(-1),
        m_deviceIndex(-1)
    {}

    SlangResult init(unsigned int flags, int deviceIndex, CUDAReportStyle reportType = CUDAReportStyle::Normal)
    {
        SLANG_RETURN_ON_FAIL(_initCuda(reportType));

        if (m_context)
        {
            cuCtxDestroy(m_context);
            m_context = nullptr;
        }

        m_deviceIndex = deviceIndex;
        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, deviceIndex));

        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, flags, m_device), reportType);
        return SLANG_OK;
    }

    SlangResult init(unsigned int flags, CUDAReportStyle reportType = CUDAReportStyle::Normal)
    {
        SLANG_RETURN_ON_FAIL(_initCuda(reportType));

        SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);

        if (m_context)
        {
            cuCtxDestroy(m_context);
            m_context = nullptr;
        }

        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));

        SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, flags, m_device), reportType);
        return SLANG_OK;
    }

    ~ScopeCUDAContext()
    {
        if (m_context)
        {
            cuCtxDestroy(m_context);
        }
    }
    SLANG_FORCE_INLINE operator CUcontext () const { return m_context; }

    int m_deviceIndex;
    CUdevice m_device;
    CUcontext m_context;
};

/* static */SlangResult CUDAComputeUtil::parseFeature(const Slang::UnownedStringSlice& feature, bool& outResult)
{
    outResult = false;

    if (feature.startsWith("cuda_sm_"))
    {
        const UnownedStringSlice versionSlice = UnownedStringSlice(feature.begin() + 8, feature.end());
        SemanticVersion requiredVersion;
        SLANG_RETURN_ON_FAIL(SemanticVersion::parse(versionSlice, '_', requiredVersion));

        // Need to get the version from the cuda device
        ScopeCUDAContext context;
        SLANG_RETURN_ON_FAIL(context.init(0, CUDAReportStyle::Silent));

        const int deviceIndex = context.m_deviceIndex;

        int computeMode = -1;
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, deviceIndex));

        // If we don't have compute mode availability, we can't execute
        if (computeMode == cudaComputeModeProhibited)
        {
            return SLANG_FAIL;
        }

        int major, minor;
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major,  cudaDevAttrComputeCapabilityMajor, deviceIndex));
        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex));

        SemanticVersion actualVersion;
        actualVersion.set(major, minor);

        outResult = actualVersion >= requiredVersion;

        return SLANG_OK;
    }

    return SLANG_FAIL;
}

/* static */bool CUDAComputeUtil::hasFeature(const Slang::UnownedStringSlice& feature)
{
    bool res;
    return SLANG_SUCCEEDED(parseFeature(feature, res)) ? res : false;
}

/* static */bool CUDAComputeUtil::canCreateDevice()
{
    ScopeCUDAContext context;
    return SLANG_SUCCEEDED(context.init(0, CUDAReportStyle::Silent));
}

static bool _hasReadAccess(SlangResourceAccess access)
{
    return access = SLANG_RESOURCE_ACCESS_READ || access == SLANG_RESOURCE_ACCESS_READ_WRITE;
}

static bool _hasWriteAccess(SlangResourceAccess access)
{
    return access == SLANG_RESOURCE_ACCESS_READ_WRITE;
}

/* static */SlangResult CUDAComputeUtil::createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<CUDAResource>& outResource)
{
    auto type = typeLayout->getType();
    auto shape = type->getResourceShape();

    auto access = type->getResourceAccess();

    if (!(access == SLANG_RESOURCE_ACCESS_READ ||
        access == SLANG_RESOURCE_ACCESS_READ_WRITE))
    {
        SLANG_ASSERT(!"Only read or read write currently supported");
        return SLANG_FAIL;
    }

    CUresourcetype resourceType = CU_RESOURCE_TYPE_ARRAY;
    auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;

    slang::TypeReflection* typeReflection = typeLayout->getResourceResultType();

    InputTextureDesc textureDesc = srcEntry.textureDesc;

    if (_hasWriteAccess(access))
    {
        textureDesc.mipMapCount = 1;
    }
    
    // CUDA wants the unused dimensions to be 0.
    // Might need to specially handle elsewhere
    int width = textureDesc.size;
    int height = 0;
    int depth = 0;

    switch (baseShape)
    {
        case SLANG_TEXTURE_1D:
        {
            break;
        }
        case SLANG_TEXTURE_2D:
        {
            height = textureDesc.size;
            break;
        }
        case SLANG_TEXTURE_3D:
        {
            height = textureDesc.size;
            depth = textureDesc.size;
            break;
        }
        case SLANG_TEXTURE_CUBE:
        {
            height = width;
            depth = 1;
            break;
        }
        default:
        {
            SLANG_ASSERT(!"Type not supported");
            return SLANG_FAIL;
        }
    }
    
    TextureData texData;
    generateTextureData(texData, textureDesc);

    auto mipLevels = texData.mipLevels;

    RefPtr<TextureCUDAResource> tex = new TextureCUDAResource;

    size_t elementSize = 0;

    {
        CUarray_format format = CU_AD_FORMAT_FLOAT;
        int numChannels = 0;

        switch (textureDesc.format)
        {
            case Format::R_Float32:
            {
                format = CU_AD_FORMAT_FLOAT;
                numChannels = 1;
                elementSize = sizeof(float);
                break;
            }
            case Format::RGBA_Unorm_UInt8:
            {
                format = CU_AD_FORMAT_UNSIGNED_INT8;
                numChannels = 4;
                elementSize = sizeof(uint32_t);
                break;
            }
            default:
            {
                SLANG_ASSERT(!"Only support R_Float32/RGBA_Unorm_UInt8 formats for now");
                return SLANG_FAIL;
            }
        }

        if (mipLevels > 1)
        {
            resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;

            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
            memset(&arrayDesc, 0, sizeof(arrayDesc));

            arrayDesc.Width = width;
            arrayDesc.Height = height;
            arrayDesc.Depth = depth;
            arrayDesc.Format = format;
            arrayDesc.NumChannels = numChannels;
            arrayDesc.Flags = 0;

            if (textureDesc.arrayLength > 1)
            {
                if (baseShape == SLANG_TEXTURE_1D ||
                    baseShape == SLANG_TEXTURE_2D ||
                    baseShape == SLANG_TEXTURE_CUBE)
                {
                    arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
                    arrayDesc.Depth = textureDesc.arrayLength;
                }
                else
                {
                    SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
                    return SLANG_FAIL;
                }
            }
            
            if (baseShape == SLANG_TEXTURE_CUBE)
            {
                arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
                arrayDesc.Depth *= 6;
            }

            SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, mipLevels));
        }
        else
        {
            resourceType = CU_RESOURCE_TYPE_ARRAY;

            if (textureDesc.arrayLength > 1)
            {
                if (baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE)
                {
                    SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
                    return SLANG_FAIL;
                }

                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
                memset(&arrayDesc, 0, sizeof(arrayDesc));

                // Set the depth as the array length
                arrayDesc.Depth = textureDesc.arrayLength;
                if (baseShape == SLANG_TEXTURE_CUBE)
                {
                    arrayDesc.Depth *= 6;
                }
                
                arrayDesc.Height = height;
                arrayDesc.Width = width;
                arrayDesc.Format = format;
                arrayDesc.NumChannels = numChannels;

                if (baseShape == SLANG_TEXTURE_CUBE)
                {
                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
                }

                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
            }
            else if (baseShape == SLANG_TEXTURE_3D || baseShape == SLANG_TEXTURE_CUBE)
            {
                CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
                memset(&arrayDesc, 0, sizeof(arrayDesc));

                arrayDesc.Depth = depth;
                arrayDesc.Height = height;
                arrayDesc.Width = width;
                arrayDesc.Format = format;
                arrayDesc.NumChannels = numChannels;

                arrayDesc.Flags = 0;

                // Handle cube texture
                if (baseShape == SLANG_TEXTURE_CUBE)
                {
                    arrayDesc.Depth = 6;
                    arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
                }

                SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
            }
            else
            {
                CUDA_ARRAY_DESCRIPTOR arrayDesc;
                memset(&arrayDesc, 0, sizeof(arrayDesc));

                arrayDesc.Width = width;
                arrayDesc.Height = height;
                arrayDesc.Format = format;
                arrayDesc.NumChannels = numChannels;

                // Allocate the array, will work for 1D or 2D case
                SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
            }
        }
    }

    // Work space for holding data for uploading if it needs to be rearranged
    List<uint8_t> workspace;

    for (int mipLevel = 0; mipLevel < mipLevels; ++mipLevel)
    {
        int mipWidth = width >> mipLevel;
        int mipHeight = height >> mipLevel;
        int mipDepth = depth >> mipLevel;

        mipWidth = (mipWidth == 0) ? 1 : mipWidth;
        mipHeight = (mipHeight == 0) ? 1 : mipHeight;
        mipDepth = (mipDepth == 0) ? 1 : mipDepth;

        // If it's a cubemap then the depth is always 6
        if (baseShape == SLANG_TEXTURE_CUBE)
        {
            mipDepth = 6;
        }

        auto dstArray = tex->m_cudaArray;
        if (tex->m_cudaMipMappedArray)
        {
            // Get the array for the mip level
            SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
        }
        SLANG_ASSERT(dstArray);

        // Check using the desc to see if it's plausible
        {
            CUDA_ARRAY_DESCRIPTOR arrayDesc;
            SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));

            SLANG_ASSERT(mipWidth == arrayDesc.Width);
            SLANG_ASSERT(mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
        }

        const void* srcDataPtr = nullptr;

        if (textureDesc.arrayLength > 1)
        {
            SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE);

            // TODO(JS): Here I assume that arrays are just held contiguously within a 'face'
            // This seems reasonable and works with the Copy3D.
            const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;

            Index faceCount = textureDesc.arrayLength;
            if (baseShape == SLANG_TEXTURE_CUBE)
            {
                faceCount *= 6;
            }

            const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
            workspace.setCount(mipSizeInBytes);

            // We need to add the face data from each mip
            // We iterate over face count so we copy all of the cubemap faces
            for (Index j = 0; j < faceCount; j++)
            {
                const auto& srcData = texData.dataBuffer[mipLevel + j * mipLevels];
                // Copy over to the workspace to make contiguous
                ::memcpy(workspace.begin() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes);
            }

            srcDataPtr = workspace.getBuffer();
        }
        else
        {
            if (baseShape == SLANG_TEXTURE_CUBE)
            {
                size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;

                workspace.setCount(faceSizeInBytes * 6);

                // Copy the data over to make contiguous
                for (Index j = 0; j < 6; j++)
                {
                    const auto& srcData = texData.dataBuffer[mipLevels * j + mipLevel];
                    SLANG_ASSERT(mipWidth * mipHeight == srcData.getCount());

                    ::memcpy(workspace.getBuffer() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes);
                }

                srcDataPtr = workspace.getBuffer();
            }
            else
            {
                const auto& srcData = texData.dataBuffer[mipLevel];
                SLANG_ASSERT(mipWidth * mipHeight * mipDepth == srcData.getCount());

                srcDataPtr = srcData.getBuffer();
            }
        }

        if (textureDesc.arrayLength > 1)
        {
            SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE);

            CUDA_MEMCPY3D copyParam;
            memset(&copyParam, 0, sizeof(copyParam));

            copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
            copyParam.dstArray = dstArray;

            copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
            copyParam.srcHost = srcDataPtr;
            copyParam.srcPitch = mipWidth * elementSize;
            copyParam.WidthInBytes = copyParam.srcPitch;
            copyParam.Height = mipHeight;
            // Set the depth to the array length
            copyParam.Depth = textureDesc.arrayLength;

            if (baseShape == SLANG_TEXTURE_CUBE)
            {
                copyParam.Depth *= 6;
            }

            SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
        }
        else
        {
            switch (baseShape)
            {
                case SLANG_TEXTURE_1D:
                case SLANG_TEXTURE_2D:
                {
                    CUDA_MEMCPY2D copyParam;
                    memset(&copyParam, 0, sizeof(copyParam));
                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                    copyParam.dstArray = dstArray;
                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
                    copyParam.srcHost = srcDataPtr;
                    copyParam.srcPitch = mipWidth * elementSize;
                    copyParam.WidthInBytes = copyParam.srcPitch;
                    copyParam.Height = mipHeight;
                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
                    break;
                }
                case SLANG_TEXTURE_3D:
                case SLANG_TEXTURE_CUBE:
                {
                    CUDA_MEMCPY3D copyParam;
                    memset(&copyParam, 0, sizeof(copyParam));

                    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                    copyParam.dstArray = dstArray;

                    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
                    copyParam.srcHost = srcDataPtr;
                    copyParam.srcPitch = mipWidth * elementSize;
                    copyParam.WidthInBytes = copyParam.srcPitch;
                    copyParam.Height = mipHeight;
                    copyParam.Depth = mipDepth;

                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
                    break;
                }

                default:
                {
                    SLANG_ASSERT(!"Not implemented");
                    break;
                }
            }
        }
    }

    // Set up texture sampling parameters, and create final texture obj

    {
        CUDA_RESOURCE_DESC resDesc;
        memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
        resDesc.resType = resourceType;

        if (tex->m_cudaArray)
        {
            resDesc.res.array.hArray = tex->m_cudaArray;
        }
        if (tex->m_cudaMipMappedArray)
        {
            resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
        }

        if (_hasWriteAccess(access))
        {
            // If has write access it's effectively UAV, and so doesn't have sampling available
            SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
        }
        else
        {
            // If read only it's a SRV and can sample, but cannot write
            CUDA_TEXTURE_DESC texDesc;
            memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
            texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
            texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
            texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
            texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
            texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

            SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
        }

    }

    outResource = tex;
    return SLANG_OK;
}

    /// Load kernel code and invoke a compute program
    ///
    /// Assumes that data for binding the kernel parameters is already
    /// set up in `outContext.`
    ///
static SlangResult _invokeComputeProgram(
    CUcontext cudaContext,
    ScopeCUDAStream& cudaStream,
    ScopeCUDAModule& cudaModule,
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    const uint32_t dispatchSize[3],
    CUDAComputeUtil::Context& outContext)
{
    auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);

    auto& bindSet = outContext.m_bindSet;
    auto& bindRoot = outContext.m_bindRoot;

    // The global-scope shader parameters in the input Slang program
    // will be collected into a single `__constant__` global variable
    // in the output CUDA module.
    //
    // We need to query the address of the `__constant__` variable
    // so that we can copy parameter data into it when invoking
    // a kernel.
    //
    // The Slang compiler always names this symbol `SLANG_globalParams`
    // so that it is easy to look up independent of the module or
    // entry point in question.
    //
    CUdeviceptr globalParamsSymbol = 0;
    size_t globalParamsSymbolSize = 0;
    cuModuleGetGlobal(&globalParamsSymbol, &globalParamsSymbolSize, cudaModule, "SLANG_globalParams");

    slang::EntryPointReflection* entryPoint = nullptr;
    auto entryPointCount = reflection->getEntryPointCount();
    SLANG_ASSERT(entryPointCount == 1);

    entryPoint = reflection->getEntryPointByIndex(0);

    const char* entryPointName = entryPoint->getName();

    // Get the entry point
    CUfunction cudaEntryPoint;
    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&cudaEntryPoint, cudaModule, entryPointName));

    // Get the max threads per block for this function

    int maxTheadsPerBlock;
    SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cudaEntryPoint));

    int sharedSizeInBytes;
    SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));

    // A single CUDA kernel can be invoked with thread groups
    // of different shapes/sizes, but an HLSL/Slang compute
    // kernel always has a fixed thread group shape baked in.
    // We use reflection to query the thread-group size that
    // the kernel expects, so that we can use the right size
    // when invoking the kernel.
    //
    SlangUInt numThreadsPerAxis[3];
    entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);

    // The argument data for the kernel has been set up in `bindRoot`,
    // which encapsulates global buffers for both the global and
    // entry-point parameter data.
    //
    // In the case of global parameters, we just need to extract the
    // device address of the parameter data, so we can copy it into
    // the `SLANG_globalParams` symbol.
    //
    {
        CUdeviceptr globalParamsCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
        cudaMemcpyAsync(
            (void*) globalParamsSymbol,
            (void*) globalParamsCUDAData,
            globalParamsSymbolSize,
            cudaMemcpyDeviceToDevice,
            cudaStream);
    }
    //
    // In the case of the entry-point parameters, we have to deal with
    // two different wrinkles.
    //
    // First, the `bindRoot` will have the entry-point argument data
    // stored in a GPU-memory buffer, but we actually need it to be
    // in host CPU memory. We handle that for now by allocating a
    // temporary host memory buffer (if needed) and copying the data
    // from device to host.
    //
    auto entryPointBindValue = bindRoot.getEntryPointValue();
    CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(entryPointBindValue);
    size_t entryPointDataSize = entryPointBindValue ? entryPointBindValue->m_sizeInBytes : 0;
    void* entryPointHostData = nullptr;
    if(entryPointDataSize)
    {
        entryPointHostData = alloca(entryPointDataSize);
        cudaMemcpy(entryPointHostData, (void*)entryPointCUDAData, entryPointDataSize, cudaMemcpyDeviceToHost);
    }
    //
    // Second, the argument data for the entry-point parameters has
    // been allocated and filled in as a single buffer, but `cuLaunchKernel`
    // defaults to taking pointers to each of the kernel arguments.
    //
    // We could loop over the entry-point parameters using the refleciton
    // information, and set up a pointer to each using the offset stored
    // for it in the reflection data. Such an approach would require
    // us to create and fill in a dynamically-sized array here.
    //
    // Instead, we take advantage of a documented but seldom-used feature
    // of `cuLaunchKernel` that allows the argument data for all of the
    // kernel "launch parameters" to be specified as a single buffer.
    //
    void* extraOptions[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, (void*) entryPointHostData,
        CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize,
        CU_LAUNCH_PARAM_END,
    };

    // Once we have all the decessary data extracted and/or
    // set up, we can launch the kernel and see what happens.
    //
    auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
        dispatchSize[0], dispatchSize[1], dispatchSize[2], 
        int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
        0,              // Shared memory size
        cudaStream,     // Stream. 0 is no stream.
        nullptr,        // Not using traditional argument passing
        extraOptions);  // Instead passing kernel arguments via "extra" options
    SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);

    // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
    SLANG_RETURN_ON_FAIL(cudaStream.sync());

    return SLANG_OK;
}

#ifdef RENDER_TEST_OPTIX
    /// Load kernel code and invoke a ray-tracing program
    ///
    /// Assumes that data for binding the kernel parameters is already
    /// set up in `outContext.`
    ///
    /// Currently only works for programs that have a single
    /// ray generation shader and no other entry points.
    ///
static SlangResult _loadAndInvokeRayTracingProgram(
    CUcontext cudaContext,
    ScopeCUDAStream& cudaStream,
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    const uint32_t dispatchSize[3],
    CUDAComputeUtil::Context& outContext)
{
    SLANG_OPTIX_RETURN_ON_FAIL(optixInit());

    OptixDeviceContextOptions optixOptions = {};

#if _DEBUG
    optixOptions.logCallbackFunction = &_optixLogCallback;
    optixOptions.logCallbackLevel = 4;
#endif

    OptixDeviceContext optixContext = nullptr;
    SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));

    enum { kOptixLogSize = 2*1024 };
    char log[kOptixLogSize];
    size_t logSize = sizeof(log);

    OptixPipelineCompileOptions optixPipelineCompileOptions = {};
    optixPipelineCompileOptions.pipelineLaunchParamsVariableName = "SLANG_globalParams";

    // We need to load modules from the PTX code available to us,
    // and then also create program groups from the kernels
    // in those modules.
    //
    // For now we will only support program groups with a single
    // kernel in them, and will create one per entry point.
    //
    Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
    List<OptixProgramGroup> optixProgramGroups;
    List<String> names;

    OptixShaderBindingTable optixSBT = {};

    for( Index ee = 0; ee < entryPointCount; ++ee )
    {
        auto& kernel = outputAndLayout.output.kernelDescs[ee];

        // TODO: The logic here assumes that each kernel will
        // come from its own independent module, and has no
        // provisiion for loading modules that might contain
        // multiple entry points.
        //
        OptixModuleCompileOptions optixModuleCompileOptions = {};
        OptixModule optixModule;
        SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
            optixContext,
            &optixModuleCompileOptions,
            &optixPipelineCompileOptions,
            (char const*) kernel.codeBegin,
            kernel.getCodeSize(),
            log,
            &logSize,
            &optixModule));

        // TODO: The logic here only handles ray-generation entry points.
        //
        // It would seem simple to extend this to handle other entry
        // point types, by inspecting the stage of the entry points
        // being loaded, and this is indeed true for the subset of
        // stages that map one-to-one with OptiX "program groups."
        //
        // The sticking point is "hit groups" which require a collection
        // of entry points to be specified together (insersection,
        // any hit, and closest hit). A hit group can comprise between
        // zero and three entry points.
        //
        // The catch for us is how to determine which entry points
        // should be grouped to form hit groups. Should this be
        // implied in the input code (either by naming convention
        // or by new Slang language features)? Should this be set
        // up via command-line arguments or something akin to
        // `//TEST_INPUT` lines?

        OptixProgramGroupOptions optixProgramGroupOptions = {};

        OptixProgramGroupDesc optixProgramGroupDesc = {};
        optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
        optixProgramGroupDesc.raygen.module = optixModule;

        String name = String("__raygen__") + kernel.entryPointName;
        names.add(name);
        optixProgramGroupDesc.raygen.entryFunctionName = name.begin();

        OptixProgramGroup optixProgramGroup = nullptr;
        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
            optixContext,
            &optixProgramGroupDesc,
            1,
            &optixProgramGroupOptions,
            log,
            &logSize,
            &optixProgramGroup));

        optixProgramGroups.add(optixProgramGroup);

        {
            CUdeviceptr rayGenRecordPtr;
            size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;

            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));

            struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
            SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));

            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
                (void*) rayGenRecordPtr,
                &rayGenRecordData,
                rayGenRecordSize,
                cudaMemcpyHostToDevice));

            optixSBT.raygenRecord = rayGenRecordPtr;
        }
    }

    OptixPipeline optixPipeline = nullptr;

    OptixPipelineLinkOptions optixPipelineLinkOptions = {};
    optixPipelineLinkOptions.maxTraceDepth = 5;
    optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
    optixPipelineLinkOptions.overrideUsesMotionBlur = false;
    SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
        optixContext,
        &optixPipelineCompileOptions,
        &optixPipelineLinkOptions,
        optixProgramGroups.getBuffer(),
        (unsigned int)optixProgramGroups.getCount(),
        log,
        &logSize,
        &optixPipeline));


    {
        // The OptiX API complains if we don't fill in a miss record
        // in the SBT, so we will create a dummy one here to represent
        // the lack of any miss shaders.
        //
        OptixProgramGroupOptions optixProgramGroupOptions = {};
        OptixProgramGroupDesc missGroupDesc = {};
        missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
        OptixProgramGroup missProgramGroup;
        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
            optixContext,
            &missGroupDesc,
            1,
            &optixProgramGroupOptions,
            log,
            &logSize,
            &missProgramGroup));


        CUdeviceptr missRecordPtr;
        size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;

        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));

        struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
        SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));

        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
            (void*) missRecordPtr,
            &missRecordData,
            missRecordSize,
            cudaMemcpyHostToDevice));

        optixSBT.missRecordBase = missRecordPtr;
        optixSBT.missRecordCount = 1;
        optixSBT.missRecordStrideInBytes = (unsigned int)missRecordSize;
    }
    {
        // Okay, we also need a dummy hit group.

        OptixProgramGroupOptions optixProgramGroupOptions = {};
        OptixProgramGroupDesc hitGroupDesc = {};
        hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
        OptixProgramGroup programGroup;
        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
            optixContext,
            &hitGroupDesc,
            1,
            &optixProgramGroupOptions,
            log,
            &logSize,
            &programGroup));


        CUdeviceptr recordPtr;
        size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;

        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));

        struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
        SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));

        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
            (void*) recordPtr,
            &recordData,
            recordSize,
            cudaMemcpyHostToDevice));

        optixSBT.hitgroupRecordBase = recordPtr;
        optixSBT.hitgroupRecordCount = 1;
        optixSBT.hitgroupRecordStrideInBytes = (unsigned int)recordSize;
    }

    // Work out the args

    auto& bindRoot = outContext.m_bindRoot;

    CUdeviceptr globalParams = 0;
    size_t globalParamsSize;

    if( auto globalArg = bindRoot.getRootValue() )
    {
        globalParams = MemoryCUDAResource::getCUDAData(globalArg);
        globalParamsSize = globalArg->m_sizeInBytes;
    }

    // TODO: The data for entry point parameters needs to be stored
    // into the SBT.
    //
    // The simplest solution here would be to copy data from the `bindRoot`
    // into the SBT at the point where we are setting up the SBT, but
    // a more optimized approach (more similar to what a real applicaiton
    // would do) would be to allocate the SBT first and then have the
    // binding logic write directly into its entries.
    //
    // One big complication here is that there need not necessarily be
    // a one-to-one relationship between the entry points (or entry-point
    // groups) in a compiled ray-tracing pipeline and the entries in
    // the SBT. Each SBT entry is conceptually an "instance" of one
    // of the entry-point groups in the program, and there can be
    // zero, one, or many instances of a given group.
    //
    // Modelling this more completely in `render-test` requires that
    // we start having a model for the "scene" that is being rendered,
    // and how entry point groups are associated with the objects in
    // that scene.
    //
    CUdeviceptr entryPointParams = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());

    SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
        optixPipeline,
        cudaStream,
        globalParams,
        globalParamsSize,
        &optixSBT,
        dispatchSize[0],
        dispatchSize[1],
        dispatchSize[2]));

    SLANG_RETURN_ON_FAIL(cudaStream.sync());

    return SLANG_OK;
}
#endif

    // Fill in RTTI pointers values in input buffers.
static SlangResult _populateRTTIEntries(
    const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout,
    ScopeCUDAModule& cudaModule)
{
    Slang::ComPtr<slang::ISession> linkage;
    spCompileRequest_getSession(compilationAndLayout.output.request, linkage.writeRef());
    auto& inputLayout = compilationAndLayout.layout;
    for (auto& entry : inputLayout.entries)
    {
        for (auto& rtti : entry.rttiEntries)
        {
            CUdeviceptr ptrValue = 0;
            switch (rtti.type)
            {
            case RTTIDataEntryType::RTTIObject:
                {
                    auto reflection =
                        slang::ShaderReflection::get(compilationAndLayout.output.request);
                    auto concreteType = reflection->findTypeByName(rtti.typeName.getBuffer());
                    ComPtr<ISlangBlob> outName;
                    linkage->getTypeRTTIMangledName(concreteType, outName.writeRef());
                    if (!outName)
                        return SLANG_FAIL;
                    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetGlobal(
                        &ptrValue,
                        nullptr,
                        cudaModule.m_module,
                        (char*)outName->getBufferPointer()));
                }
                break;
            case RTTIDataEntryType::WitnessTable:
                {
                    auto reflection =
                        slang::ShaderReflection::get(compilationAndLayout.output.request);
                    auto concreteType = reflection->findTypeByName(rtti.typeName.getBuffer());
                    if (!concreteType)
                        return SLANG_FAIL;
                    auto interfaceType = reflection->findTypeByName(rtti.interfaceName.getBuffer());
                    if (!interfaceType)
                        return SLANG_FAIL;
                    ComPtr<ISlangBlob> outName;
                    linkage->getTypeConformanceWitnessMangledName(
                        concreteType, interfaceType, outName.writeRef());
                    if (!outName)
                        return SLANG_FAIL;
                    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetGlobal(
                        &ptrValue,
                        nullptr,
                        cudaModule.m_module,
                        (char*)outName->getBufferPointer()));
                    break;
                }
            default:
                break;
            }
            if (!ptrValue)
                return SLANG_FAIL;
            if (rtti.offset >= 0 &&
                rtti.offset + sizeof(ptrValue) <=
                    entry.bufferData.getCount() * sizeof(decltype(entry.bufferData[0])))
            {
                memcpy(
                    ((char*)entry.bufferData.getBuffer()) + rtti.offset,
                    &ptrValue,
                    sizeof(ptrValue));
            }
            else
            {
                return SLANG_FAIL;
            }
        }
    }
    return SLANG_OK;
}

    /// Fill in the binding information for arguments of a CUDA program.
static SlangResult _setUpArguments(
    CUcontext cudaContext,
    ScopeCUDAStream& cudaStream,
    ScopeCUDAModule& cudaModule,
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    const uint32_t dispatchSize[3],
    CUDAComputeUtil::Context& outContext)
{
    auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);

    auto& bindSet = outContext.m_bindSet;
    auto& bindRoot = outContext.m_bindRoot;

    // Okay now we need to set up binding
    bindRoot.init(&bindSet, reflection, 0);

    // Will set up any root buffers
    bindRoot.addDefaultValues();

    // Now set up the Values from the test

    auto outStream = StdWriters::getOut();

    // Fill in RTTI pointers in input buffers before copying it to GPU memory.
    // TODO: enable this for Optix path after it is refactored so that context
    // creation and module loading happens before _setUpArguments.
    if (outputAndLayout.output.desc.pipelineType == PipelineType::Compute)
    {
        SLANG_RETURN_ON_FAIL(_populateRTTIEntries(outputAndLayout, cudaModule));
    }
    SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));

    ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);

    // First create all of the resources for the values

    {
        const auto& values = bindSet.getValues();
        const auto& entries = outputAndLayout.layout.entries;

        for (BindSet::Value* value : values)
        {
            auto typeLayout = value->m_type;
               
            // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
            slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
               
            switch (kind)
            {
                case slang::TypeReflection::Kind::ConstantBuffer:
                case slang::TypeReflection::Kind::ParameterBlock:
                {
                    // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
                    RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
                    SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
                    value->m_target = resource;
                    break;
                }
                case slang::TypeReflection::Kind::Resource:
                {
                    auto type = typeLayout->getType();
                    auto shape = type->getResourceShape();

                    auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;

                    switch (baseShape)
                    {
                        case SLANG_TEXTURE_1D: 
                        case SLANG_TEXTURE_2D:
                        case SLANG_TEXTURE_3D:
                        case SLANG_TEXTURE_CUBE:
                        {
                            RefPtr<CUDAResource> resource;
                            SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
                            value->m_target = resource;
                            break;
                        }
                        case SLANG_TEXTURE_BUFFER:
                        {
                            // Need a CUDA impl for these...
                            // For now we can just leave as target will just be nullptr
                            break;
                        }

                        case SLANG_BYTE_ADDRESS_BUFFER:
                        case SLANG_STRUCTURED_BUFFER:
                        {
                            // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
                            RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
                            SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
                            value->m_target = resource;
                            break;
                        }
                    }
                }
                default: break;
            }
        }
    }
    
    // Now we need to go through all of the bindings and set the appropriate data

    {
        List<BindLocation> locations;
        List<BindSet::Value*> values;
        bindSet.getBindings(locations, values);

        for (Index i = 0; i < locations.getCount(); ++i)
        {
            const auto& location = locations[i];
            BindSet::Value* value = values[i];

            // Okay now we need to set up the actual handles that CPU will follow.
            auto typeLayout = location.getTypeLayout();

            const auto kind = typeLayout->getKind();
            switch (kind)
            {
                case slang::TypeReflection::Kind::Array:
                {
                    auto elementCount = int(typeLayout->getElementCount());
                    if (elementCount == 0)
                    {
                        CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
                        auto resource = MemoryCUDAResource::asResource(value);
                        if (resource)
                        {
                            array.data = resource->m_cudaMemory;
                            array.count = value->m_elementCount;
                        }

                        location.setUniform(&array, sizeof(array));
                    }
                    break;
                }
                case slang::TypeReflection::Kind::ConstantBuffer:
                case slang::TypeReflection::Kind::ParameterBlock:
                {
                    // These map down to just pointers
                    *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
                    break;
                }
                case slang::TypeReflection::Kind::Resource:
                {
                    auto type = typeLayout->getType();
                    auto shape = type->getResourceShape();

                    auto access = type->getResourceAccess();

                    const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;

                    switch (baseShape)
                    {
                        case SLANG_STRUCTURED_BUFFER:
                        {
                            CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
                            auto resource = MemoryCUDAResource::asResource(value);
                            if (resource)
                            {
                                buffer.data = resource->m_cudaMemory;
                                buffer.count = value->m_elementCount;
                            }

                            location.setUniform(&buffer, sizeof(buffer));
                            break;
                        }
                        case SLANG_BYTE_ADDRESS_BUFFER:
                        {
                            CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };

                            auto resource = MemoryCUDAResource::asResource(value);
                            if (resource)
                            {
                                buffer.data = resource->m_cudaMemory;
                                buffer.sizeInBytes = value->m_sizeInBytes;
                            }

                            location.setUniform(&buffer, sizeof(buffer));
                            break;
                        }
                        case SLANG_TEXTURE_1D:
                        case SLANG_TEXTURE_2D:
                        case SLANG_TEXTURE_3D:
                        case SLANG_TEXTURE_CUBE:
                        {
                            if (_hasWriteAccess(access))
                            {
                                *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
                            }
                            else
                            {
                                *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
                            }
                            break;
                        }

                    }
                    break;
                }
                default: break;
            }
        }
    }

    // Okay now the memory is all set up, we can copy everything over
    {
        const auto& values = bindSet.getValues();
        for (BindSet::Value* value : values)
        {
            CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
            if (value && value->m_data && cudaMem)
            {
                // Okay copy the data over...
                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
            }
        }
    }

    return SLANG_OK;
}

    /// Read back any output arguments from a CUDA program.
static SlangResult _readBackOutputs(
    CUcontext cudaContext,
    ScopeCUDAStream& cudaStream,
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    const uint32_t dispatchSize[3],
    CUDAComputeUtil::Context& outContext)
{
    const auto& entries = outputAndLayout.layout.entries;

    for (Index i = 0; i < entries.getCount(); ++i)
    {
        const auto& entry = entries[i];
        BindSet::Value* value = outContext.m_buffers[i];

        if (entry.isOutput)
        {
            // Copy back to CPU memory
            CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
            if (value && value->m_data && cudaMem)
            {
                // Okay copy the data back...
                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
            }
        }
    }

    return SLANG_OK;
}

SlangResult _loadCUDAModule(
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    ScopeCUDAModule& outModule)
{
    const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
    if (index < 0)
    {
        return SLANG_FAIL;
    }
    const auto& kernelDesc = outputAndLayout.output.kernelDescs[index];
    SLANG_RETURN_ON_FAIL(outModule.load(kernelDesc.codeBegin));
    return SLANG_OK;
}

    /// Load and invoke a CUDA program (either compute or ray-tracing)
SlangResult _loadAndInvokeKernel(
    CUcontext cudaContext,
    ScopeCUDAStream& cudaStream,
    ScopeCUDAModule& cudaModule,
    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
    const uint32_t dispatchSize[3],
    CUDAComputeUtil::Context& outContext)
{
    switch( outputAndLayout.output.desc.pipelineType )
    {
    case PipelineType::Compute:
        return _invokeComputeProgram(
            cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext);

    case PipelineType::RayTracing:
#ifdef RENDER_TEST_OPTIX
        return _loadAndInvokeRayTracingProgram(
            cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
#endif
        break;
    
    default: break;
    }

    return SLANG_FAIL;
}

    /// Execute a CUDA program (either compute or ray-tracing)
    ///
    /// This function handles loading code and argument data,
    /// invoking the kernel(s), and reading back results.
    ///
/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
{
    ScopeCUDAContext cudaContext;
    SLANG_RETURN_ON_FAIL(cudaContext.init(0));

    // A default stream, will act as a global stream. Calling sync will globally sync
    ScopeCUDAStream cudaStream;
    //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));

    ScopeCUDAModule cudaModule;

    auto& bindSet = outContext.m_bindSet;
    auto& bindRoot = outContext.m_bindRoot;

    auto request = outputAndLayout.output.request;
    auto reflection = (slang::ShaderReflection*) spGetReflection(request);

    // Load cuda module first so its symbols may be queried and filled into argument buffers.
    // TODO: refactor optix path to also front-load its context creation and module loading here.
    // For now just front-load compute kernels.
    if (outputAndLayout.output.desc.pipelineType == PipelineType::Compute)
    {
        SLANG_RETURN_ON_FAIL(_loadCUDAModule(outputAndLayout, cudaModule));
    }

    SLANG_RETURN_ON_FAIL(_setUpArguments(
        cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext));

    SLANG_RETURN_ON_FAIL(_loadAndInvokeKernel(
        cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext));

        // Finally we need to copy the data back
    SLANG_RETURN_ON_FAIL(_readBackOutputs(
        cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));

    // Release all othe CUDA resource/allocations
    bindSet.releaseValueTargets();

    return SLANG_OK;
}


} // renderer_test