diff options
Diffstat (limited to 'tools/gfx/cuda/render-cuda.cpp')
| -rw-r--r-- | tools/gfx/cuda/render-cuda.cpp | 2513 |
1 files changed, 0 insertions, 2513 deletions
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp deleted file mode 100644 index 114751a3d..000000000 --- a/tools/gfx/cuda/render-cuda.cpp +++ /dev/null @@ -1,2513 +0,0 @@ -#include "render-cuda.h" - -#ifdef GFX_ENABLE_CUDA -#include <cuda.h> -#include <cuda_runtime_api.h> -#include "core/slang-basic.h" -#include "core/slang-blob.h" -#include "core/slang-std-writers.h" - -#include "slang.h" -#include "slang-com-ptr.h" -#include "slang-com-helper.h" -#include "../command-writer.h" -#include "../renderer-shared.h" -#include "../mutable-shader-object.h" -#include "../simple-transient-resource-heap.h" -#include "../slang-context.h" -#include "../command-encoder-com-forward.h" - -# ifdef RENDER_TEST_OPTIX - -// The `optix_stubs.h` header produces warnings when compiled with MSVC -# ifdef _MSC_VER -# pragma warning(disable: 4996) -# endif - -# include <optix.h> -# include <optix_function_table_definition.h> -# include <optix_stubs.h> -# endif - -#endif - -namespace gfx -{ -#ifdef GFX_ENABLE_CUDA -using namespace Slang; - -SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } -SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } - -// A enum used to control if errors are reported on failure of CUDA call. -enum class CUDAReportStyle -{ - Normal, - Silent, -}; - -struct CUDAErrorInfo -{ - CUDAErrorInfo( - const char* filePath, - int lineNo, - const char* errorName = nullptr, - const char* errorString = nullptr) - : m_filePath(filePath) - , m_lineNo(lineNo) - , m_errorName(errorName) - , m_errorString(errorString) - {} - SlangResult handle() const - { - StringBuilder builder; - builder << "Error: " << m_filePath << " (" << m_lineNo << ") :"; - - if (m_errorName) - { - builder << m_errorName << " : "; - } - if (m_errorString) - { - builder << m_errorString; - } - - getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver, - builder.getUnownedSlice().begin()); - - // Slang::signalUnexpectedError(builder.getBuffer()); - return SLANG_FAIL; - } - - const char* m_filePath; - int m_lineNo; - const char* m_errorName; - const char* m_errorString; -}; - -// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream. - -static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line) -{ - CUDAErrorInfo info(file, line); - cuGetErrorString(cuResult, &info.m_errorString); - cuGetErrorName(cuResult, &info.m_errorName); - return info.handle(); -} - -static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line) -{ - return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle(); -} - -# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__) - -# define SLANG_CUDA_RETURN_ON_FAIL(x) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - return SLANG_CUDA_HANDLE_ERROR(_res); \ - } -# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - { \ - return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \ - : SLANG_FAIL; \ - } \ - } - -# define SLANG_CUDA_ASSERT_ON_FAIL(x) \ - { \ - auto _res = x; \ - if (_isError(_res)) \ - { \ - SLANG_ASSERT(!"Failed CUDA call"); \ - }; \ - } - -# ifdef RENDER_TEST_OPTIX - -static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; } - -# if 1 -static SlangResult _handleOptixError(OptixResult result, char const* file, int line) -{ - fprintf( - stderr, - "%s(%d): optix: %s (%s)\n", - file, - line, - optixGetErrorString(result), - optixGetErrorName(result)); - return SLANG_FAIL; -} -# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__) -# else -# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL -# endif - -# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \ - do \ - { \ - auto _res = EXPR; \ - if (_isError(_res)) \ - return SLANG_OPTIX_HANDLE_ERROR(_res); \ - } while (0) - -void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData) -{ - fprintf(stderr, "optix: %s (%s)\n", message, tag); -} - -# endif - -class CUDAContext : public RefObject -{ -public: - CUcontext m_context = nullptr; - ~CUDAContext() { cuCtxDestroy(m_context); } -}; - -class MemoryCUDAResource : public BufferResource -{ -public: - MemoryCUDAResource(const Desc& _desc) - : BufferResource(_desc) - {} - - ~MemoryCUDAResource() - { - if (m_cudaMemory) - { - SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); - } - } - - uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; } - - void* m_cudaExternalMemory = nullptr; - void* m_cudaMemory = nullptr; - - RefPtr<CUDAContext> m_cudaContext; - - virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override - { - return (DeviceAddress)m_cudaMemory; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override - { - outHandle->handleValue = getBindlessHandle(); - outHandle->api = InteropHandleAPI::CUDA; - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - map(MemoryRange* rangeToRead, void** outPointer) override - { - SLANG_UNUSED(rangeToRead); - SLANG_UNUSED(outPointer); - return SLANG_FAIL; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override - { - SLANG_UNUSED(writtenRange); - return SLANG_FAIL; - } -}; - -class TextureCUDAResource : public TextureResource -{ -public: - TextureCUDAResource(const TextureResource::Desc& desc) - : TextureResource(desc) - {} - ~TextureCUDAResource() - { - if (m_cudaSurfObj) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj)); - } - if (m_cudaTexObj) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); - } - if (m_cudaArray) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); - } - if (m_cudaMipMappedArray) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray)); - } - } - - uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; } - - // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by - // a long long - CUtexObject m_cudaTexObj = CUtexObject(); - - // The surfObj is for reading/writing 'texture like' things, but not for sampling. - CUsurfObject m_cudaSurfObj = CUsurfObject(); - - CUarray m_cudaArray = CUarray(); - CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray(); - - void* m_cudaExternalMemory = nullptr; - - RefPtr<CUDAContext> m_cudaContext; - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override - { - outHandle->handleValue = getBindlessHandle(); - outHandle->api = InteropHandleAPI::CUDA; - return SLANG_OK; - } -}; - -class CUDAResourceView : public ResourceViewBase -{ -public: - RefPtr<MemoryCUDAResource> memoryResource = nullptr; - RefPtr<TextureCUDAResource> textureResource = nullptr; - void* proxyBuffer = nullptr; -}; - -class CUDAShaderObjectLayout : public ShaderObjectLayoutBase -{ -public: - struct BindingRangeInfo - { - slang::BindingType bindingType; - Index count; - Index baseIndex; // Flat index for sub-ojects - Index subObjectIndex; - - // TODO: The `uniformOffset` field should be removed, - // since it cannot be supported by the Slang reflection - // API once we fix some design issues. - // - // It is only being used today for pre-allocation of sub-objects - // for constant buffers and parameter blocks (which should be - // deprecated/removed anyway). - // - // Note: We would need to bring this field back, plus - // a lot of other complexity, if we ever want to support - // setting of resources/buffers directly by a binding - // range index and array index. - // - Index uniformOffset; // Uniform offset for a resource typed field. - }; - - struct SubObjectRangeInfo - { - RefPtr<CUDAShaderObjectLayout> layout; - Index bindingRangeIndex; - }; - - List<SubObjectRangeInfo> subObjectRanges; - List<BindingRangeInfo> m_bindingRanges; - - Index m_subObjectCount = 0; - Index m_resourceCount = 0; - - CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout) - { - m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType); - - initBase(renderer, m_elementTypeLayout); - - // Compute the binding ranges that are used to store - // the logical contents of the object in memory. These will relate - // to the descriptor ranges in the various sets, but not always - // in a one-to-one fashion. - - SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount(); - for (SlangInt r = 0; r < bindingRangeCount; ++r) - { - slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r); - SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r); - slang::TypeLayoutReflection* slangLeafTypeLayout = - m_elementTypeLayout->getBindingRangeLeafTypeLayout(r); - - SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r); - SlangInt rangeIndexInDescriptorSet = - m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r); - - // TODO: This logic assumes that for any binding range that might consume - // multiple kinds of resources, the descriptor range for its uniform - // usage will be the first one in the range. - // - // We need to decide whether that assumption is one we intend to support - // applications making, or whether they should be forced to perform a - // linear search over the descriptor ranges for a specific binding range. - // - auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset( - descriptorSetIndex, rangeIndexInDescriptorSet); - - Index baseIndex = 0; - Index subObjectIndex = 0; - switch (slangBindingType) - { - case slang::BindingType::ConstantBuffer: - case slang::BindingType::ParameterBlock: - case slang::BindingType::ExistentialValue: - baseIndex = m_subObjectCount; - subObjectIndex = baseIndex; - m_subObjectCount += count; - break; - case slang::BindingType::RawBuffer: - case slang::BindingType::MutableRawBuffer: - if (slangLeafTypeLayout->getType()->getElementType() != nullptr) - { - // A structured buffer occupies both a resource slot and - // a sub-object slot. - subObjectIndex = m_subObjectCount; - m_subObjectCount += count; - } - baseIndex = m_resourceCount; - m_resourceCount += count; - break; - default: - baseIndex = m_resourceCount; - m_resourceCount += count; - break; - } - - BindingRangeInfo bindingRangeInfo; - bindingRangeInfo.bindingType = slangBindingType; - bindingRangeInfo.count = count; - bindingRangeInfo.baseIndex = baseIndex; - bindingRangeInfo.uniformOffset = uniformOffset; - bindingRangeInfo.subObjectIndex = subObjectIndex; - m_bindingRanges.add(bindingRangeInfo); - } - - SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount(); - for (SlangInt r = 0; r < subObjectRangeCount; ++r) - { - SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r); - auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex); - slang::TypeLayoutReflection* slangLeafTypeLayout = - m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex); - - // A sub-object range can either represent a sub-object of a known - // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>` - // (in which case we can pre-compute a layout to use, based on - // the type `Foo`) *or* it can represent a sub-object of some - // existential type (e.g., `IBar`) in which case we cannot - // know the appropraite type/layout of sub-object to allocate. - // - RefPtr<CUDAShaderObjectLayout> subObjectLayout; - if (slangBindingType != slang::BindingType::ExistentialValue) - { - subObjectLayout = - new CUDAShaderObjectLayout(renderer, slangLeafTypeLayout->getElementTypeLayout()); - } - - SubObjectRangeInfo subObjectRange; - subObjectRange.bindingRangeIndex = bindingRangeIndex; - subObjectRange.layout = subObjectLayout; - subObjectRanges.add(subObjectRange); - } - } - - Index getResourceCount() const { return m_resourceCount; } - Index getSubObjectCount() const { return m_subObjectCount; } - List<SubObjectRangeInfo>& getSubObjectRanges() { return subObjectRanges; } - BindingRangeInfo getBindingRange(Index index) { return m_bindingRanges[index]; } - Index getBindingRangeCount() const { return m_bindingRanges.getCount(); } -}; - -class CUDAProgramLayout : public CUDAShaderObjectLayout -{ -public: - slang::ProgramLayout* programLayout = nullptr; - List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts; - CUDAProgramLayout(RendererBase* renderer, slang::ProgramLayout* inProgramLayout) - : CUDAShaderObjectLayout(renderer, inProgramLayout->getGlobalParamsTypeLayout()) - , programLayout(inProgramLayout) - { - for (UInt i =0; i< programLayout->getEntryPointCount(); i++) - { - entryPointLayouts.add(new CUDAShaderObjectLayout( - renderer, - programLayout->getEntryPointByIndex(i)->getTypeLayout())); - } - - } - - int getKernelIndex(UnownedStringSlice kernelName) - { - for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++) - { - auto entryPoint = programLayout->getEntryPointByIndex(i); - if (kernelName == entryPoint->getName()) - { - return i; - } - } - return -1; - } - - void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes) - { - auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex); - entryPoint->getComputeThreadGroupSize(3, threadGroupSizes); - } -}; - -class CUDAShaderObjectData -{ -public: - bool isHostOnly = false; - Slang::RefPtr<MemoryCUDAResource> m_bufferResource; - Slang::RefPtr<CUDAResourceView> m_bufferView; - Slang::List<uint8_t> m_cpuBuffer; - Result setCount(Index count) - { - if (isHostOnly) - { - m_cpuBuffer.setCount(count); - if (!m_bufferView) - { - IResourceView::Desc viewDesc = {}; - viewDesc.type = IResourceView::Type::UnorderedAccess; - m_bufferView = new CUDAResourceView(); - m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer(); - m_bufferView->m_desc = viewDesc; - } - return SLANG_OK; - } - - if (!m_bufferResource) - { - IBufferResource::Desc desc; - desc.type = IResource::Type::Buffer; - desc.sizeInBytes = count; - m_bufferResource = new MemoryCUDAResource(desc); - if (count) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count)); - } - IResourceView::Desc viewDesc = {}; - viewDesc.type = IResourceView::Type::UnorderedAccess; - m_bufferView = new CUDAResourceView(); - m_bufferView->memoryResource = m_bufferResource; - m_bufferView->m_desc = viewDesc; - } - auto oldSize = m_bufferResource->getDesc()->sizeInBytes; - if ((size_t)count != oldSize) - { - void* newMemory = nullptr; - if (count) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count)); - } - if (oldSize) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - newMemory, - m_bufferResource->m_cudaMemory, - Math::Min((size_t)count, oldSize), - cudaMemcpyDefault)); - } - cudaFree(m_bufferResource->m_cudaMemory); - m_bufferResource->m_cudaMemory = newMemory; - m_bufferResource->getDesc()->sizeInBytes = count; - } - return SLANG_OK; - } - - Slang::Index getCount() - { - if (isHostOnly) - return m_cpuBuffer.getCount(); - if (m_bufferResource) - return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes); - else - return 0; - } - - void* getBuffer() - { - if (isHostOnly) - return m_cpuBuffer.getBuffer(); - - if (m_bufferResource) - return m_bufferResource->m_cudaMemory; - return nullptr; - } - - /// Returns a resource view for GPU access into the buffer content. - ResourceViewBase* getResourceView( - RendererBase* device, - slang::TypeLayoutReflection* elementLayout, - slang::BindingType bindingType) - { - SLANG_UNUSED(device); - m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize(); - return m_bufferView.Ptr(); - } -}; - -class CUDAShaderObject - : public ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData> -{ - typedef ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData> - Super; - -public: - List<RefPtr<CUDAResourceView>> resources; - - virtual SLANG_NO_THROW Result SLANG_MCALL - init(IDevice* device, CUDAShaderObjectLayout* typeLayout); - - virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return 0; } - virtual SLANG_NO_THROW Result SLANG_MCALL - getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override - { - *outEntryPoint = nullptr; - return SLANG_OK; - } - - virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override - { - return m_data.getBuffer(); - } - - virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override - { - return (Size)m_data.getCount(); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - setData(ShaderOffset const& offset, void const* data, Size size) override - { - size = Math::Min(size, (Size)m_data.getCount() - offset.uniformOffset); - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault)); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setResource(ShaderOffset const& offset, IResourceView* resourceView) override - { - if (!resourceView) - return SLANG_OK; - - auto layout = getLayout(); - - auto bindingRangeIndex = offset.bindingRangeIndex; - SLANG_ASSERT(bindingRangeIndex >= 0); - SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount()); - - auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex]; - - auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex; - auto cudaView = static_cast<CUDAResourceView*>(resourceView); - - resources[viewIndex] = cudaView; - - if (cudaView->textureResource) - { - if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess) - { - auto handle = cudaView->textureResource->m_cudaSurfObj; - setData(offset, &handle, sizeof(uint64_t)); - } - else - { - auto handle = cudaView->textureResource->getBindlessHandle(); - setData(offset, &handle, sizeof(uint64_t)); - } - } - else if (cudaView->memoryResource) - { - auto handle = cudaView->memoryResource->getBindlessHandle(); - setData(offset, &handle, sizeof(handle)); - auto sizeOffset = offset; - sizeOffset.uniformOffset += sizeof(handle); - auto& desc = *cudaView->memoryResource->getDesc(); - size_t size = desc.sizeInBytes; - if (desc.elementSize > 1) - size /= desc.elementSize; - setData(sizeOffset, &size, sizeof(size)); - } - else if (cudaView->proxyBuffer) - { - auto handle = cudaView->proxyBuffer; - setData(offset, &handle, sizeof(handle)); - auto sizeOffset = offset; - sizeOffset.uniformOffset += sizeof(handle); - auto& desc = *cudaView->memoryResource->getDesc(); - size_t size = desc.sizeInBytes; - if (desc.elementSize > 1) - size /= desc.elementSize; - setData(sizeOffset, &size, sizeof(size)); - } - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setObject(ShaderOffset const& offset, IShaderObject* object) override - { - SLANG_RETURN_ON_FAIL(Super::setObject(offset, object)); - - auto bindingRangeIndex = offset.bindingRangeIndex; - auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex]; - - CUDAShaderObject* subObject = static_cast<CUDAShaderObject*>(object); - switch (bindingRange.bindingType) - { - default: - { - void* subObjectDataBuffer = subObject->getBuffer(); - SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*))); - } - break; - case slang::BindingType::ExistentialValue: - case slang::BindingType::RawBuffer: - case slang::BindingType::MutableRawBuffer: - break; - } - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - setSampler(ShaderOffset const& offset, ISamplerState* sampler) override - { - SLANG_UNUSED(sampler); - SLANG_UNUSED(offset); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler( - ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override - { - SLANG_UNUSED(sampler); - setResource(offset, textureView); - return SLANG_OK; - } -}; - -class CUDAMutableShaderObject : public MutableShaderObject< CUDAMutableShaderObject, CUDAShaderObjectLayout> -{}; - -class CUDAEntryPointShaderObject : public CUDAShaderObject -{ -public: - CUDAEntryPointShaderObject() { m_data.isHostOnly = true; } -}; - -class CUDARootShaderObject : public CUDAShaderObject -{ -public: - virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override { return 1; } - virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override { return 1; } -public: - List<RefPtr<CUDAEntryPointShaderObject>> entryPointObjects; - virtual SLANG_NO_THROW Result SLANG_MCALL - init(IDevice* device, CUDAShaderObjectLayout* typeLayout) override; - virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return (GfxCount)entryPointObjects.getCount(); } - virtual SLANG_NO_THROW Result SLANG_MCALL - getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override - { - returnComPtr(outEntryPoint, entryPointObjects[index]); - return SLANG_OK; - } - virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override - { - SLANG_RETURN_ON_FAIL(CUDAShaderObject::collectSpecializationArgs(args)); - for (auto& entryPoint : entryPointObjects) - { - SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args)); - } - return SLANG_OK; - } -}; - -class CUDAShaderProgram : public ShaderProgramBase -{ -public: - CUmodule cudaModule = nullptr; - CUfunction cudaKernel; - String kernelName; - RefPtr<CUDAProgramLayout> layout; - RefPtr<CUDAContext> cudaContext; - ~CUDAShaderProgram() - { - if (cudaModule) - cuModuleUnload(cudaModule); - } -}; - -class CUDAPipelineState : public PipelineStateBase -{ -public: - RefPtr<CUDAShaderProgram> shaderProgram; - void init(const ComputePipelineStateDesc& inDesc) - { - PipelineStateDesc pipelineDesc; - pipelineDesc.type = PipelineType::Compute; - pipelineDesc.compute = inDesc; - initializeBase(pipelineDesc); - } -}; - -class CUDAQueryPool : public QueryPoolBase -{ -public: - // The event object for each query. Owned by the pool. - List<CUevent> m_events; - - // The event that marks the starting point. - CUevent m_startEvent; - - Result init(const IQueryPool::Desc& desc) - { - cuEventCreate(&m_startEvent, 0); - cuEventRecord(m_startEvent, 0); - m_events.setCount(desc.count); - for (SlangInt i = 0; i < m_events.getCount(); i++) - { - cuEventCreate(&m_events[i], 0); - } - return SLANG_OK; - } - - ~CUDAQueryPool() - { - for (auto& e : m_events) - { - cuEventDestroy(e); - } - cuEventDestroy(m_startEvent); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getResult( - GfxIndex queryIndex, GfxCount count, uint64_t* data) override - { - for (GfxIndex i = 0; i < count; i++) - { - float time = 0.0f; - cuEventSynchronize(m_events[i + queryIndex]); - cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]); - data[i] = (uint64_t)((double)time * 1000.0f); - } - return SLANG_OK; - } -}; - -class CUDADevice : public RendererBase -{ -private: - static const CUDAReportStyle reportType = CUDAReportStyle::Normal; - static int _calcSMCountPerMultiProcessor(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine - // the # of cores per SM - struct SMInfo - { - int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version - int coreCount; - }; - - static const SMInfo infos[] = { - {0x30, 192}, - {0x32, 192}, - {0x35, 192}, - {0x37, 192}, - {0x50, 128}, - {0x52, 128}, - {0x53, 128}, - {0x60, 64}, - {0x61, 128}, - {0x62, 128}, - {0x70, 64}, - {0x72, 64}, - {0x75, 64}}; - - const int sm = ((major << 4) + minor); - for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i) - { - if (infos[i].sm == sm) - { - return infos[i].coreCount; - } - } - - const auto& last = infos[SLANG_COUNT_OF(infos) - 1]; - - // It must be newer presumably - SLANG_ASSERT(sm > last.sm); - - // Default to the last entry - return last.coreCount; - } - - static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex) - { - int smPerMultiproc = 0; - int maxPerfDevice = -1; - int deviceCount = 0; - int devicesProhibited = 0; - - uint64_t maxComputePerf = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); - - // Find the best CUDA capable GPU device - for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) - { - int computeMode = -1, major = 0, minor = 0; - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); - - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - if (computeMode != cudaComputeModeProhibited) - { - if (major == 9999 && minor == 9999) - { - smPerMultiproc = 1; - } - else - { - smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor); - } - - int multiProcessorCount = 0, clockRate = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute( - &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); - uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; - - if (compute_perf > maxComputePerf) - { - maxComputePerf = compute_perf; - maxPerfDevice = currentDevice; - } - } - else - { - devicesProhibited++; - } - } - - if (maxPerfDevice < 0) - { - return SLANG_FAIL; - } - - *outDeviceIndex = maxPerfDevice; - return SLANG_OK; - } - - static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal) - { - static CUresult res = cuInit(0); - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType); - return SLANG_OK; - } - -private: - int m_deviceIndex = -1; - CUdevice m_device = 0; - RefPtr<CUDAContext> m_context; - DeviceInfo m_info; - String m_adapterName; - -public: - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override - { - outHandles->handles[0].handleValue = (uint64_t)m_device; - outHandles->handles[0].api = InteropHandleAPI::CUDA; - return SLANG_OK; - } - - class CommandQueueImpl; - - class CommandBufferImpl - : public ICommandBuffer - , public CommandWriter - , public ComObject - { - public: - SLANG_COM_OBJECT_IUNKNOWN_ALL - ICommandBuffer* getInterface(const Guid& guid) - { - if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer) - return static_cast<ICommandBuffer*>(this); - return nullptr; - } - public: - CUDADevice* m_device; - TransientResourceHeapBase* m_transientHeap; - - void init(CUDADevice* device, TransientResourceHeapBase* transientHeap) - { - m_device = device; - m_transientHeap = transientHeap; - } - - virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands( - IRenderPassLayout* renderPass, - IFramebuffer* framebuffer, - IRenderCommandEncoder** outEncoder) override - { - SLANG_UNUSED(renderPass); - SLANG_UNUSED(framebuffer); - *outEncoder = nullptr; - } - - class ResourceCommandEncoderImpl : public IResourceCommandEncoder - { - public: - CommandWriter* m_writer; - - void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; } - - virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} - virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer( - IBufferResource* dst, - Offset dstOffset, - IBufferResource* src, - Offset srcOffset, - Size size) override - { - m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size); - } - - virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier( - GfxCount count, - ITextureResource* const* textures, - ResourceState src, - ResourceState dst) override - {} - - virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier( - GfxCount count, - IBufferResource* const* buffers, - ResourceState src, - ResourceState dst) override - {} - - virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData( - IBufferResource* dst, Offset offset, Size size, void* data) override - { - m_writer->uploadBufferData(dst, offset, size, data); - } - - virtual SLANG_NO_THROW void SLANG_MCALL - writeTimestamp(IQueryPool* pool, GfxIndex index) override - { - m_writer->writeTimestamp(pool, index); - } - - virtual SLANG_NO_THROW void SLANG_MCALL copyTexture( - ITextureResource* dst, - ResourceState dstState, - SubresourceRange dstSubresource, - ITextureResource::Offset3D dstOffset, - ITextureResource* src, - ResourceState srcState, - SubresourceRange srcSubresource, - ITextureResource::Offset3D srcOffset, - ITextureResource::Extents extent) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(dstState); - SLANG_UNUSED(dstSubresource); - SLANG_UNUSED(dstOffset); - SLANG_UNUSED(src); - SLANG_UNUSED(srcState); - SLANG_UNUSED(srcSubresource); - SLANG_UNUSED(srcOffset); - SLANG_UNUSED(extent); - SLANG_UNIMPLEMENTED_X("copyTexture"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData( - ITextureResource* dst, - SubresourceRange subResourceRange, - ITextureResource::Offset3D offset, - ITextureResource::Extents extent, - ITextureResource::SubresourceData* subResourceData, - GfxCount subResourceDataCount) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(subResourceRange); - SLANG_UNUSED(offset); - SLANG_UNUSED(extent); - SLANG_UNUSED(subResourceData); - SLANG_UNUSED(subResourceDataCount); - SLANG_UNIMPLEMENTED_X("uploadTextureData"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView( - IResourceView* view, - ClearValue* clearValue, - ClearResourceViewFlags::Enum flags) override - { - SLANG_UNUSED(view); - SLANG_UNUSED(clearValue); - SLANG_UNUSED(flags); - SLANG_UNIMPLEMENTED_X("clearResourceView"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL resolveResource( - ITextureResource* source, - ResourceState sourceState, - SubresourceRange sourceRange, - ITextureResource* dest, - ResourceState destState, - SubresourceRange destRange) override - { - SLANG_UNUSED(source); - SLANG_UNUSED(sourceState); - SLANG_UNUSED(sourceRange); - SLANG_UNUSED(dest); - SLANG_UNUSED(destState); - SLANG_UNUSED(destRange); - SLANG_UNIMPLEMENTED_X("resolveResource"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery( - IQueryPool* queryPool, - GfxIndex index, - GfxCount count, - IBufferResource* buffer, - Offset offset) override - { - SLANG_UNUSED(queryPool); - SLANG_UNUSED(index); - SLANG_UNUSED(count); - SLANG_UNUSED(buffer); - SLANG_UNUSED(offset); - SLANG_UNIMPLEMENTED_X("resolveQuery"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer( - IBufferResource* dst, - Offset dstOffset, - Size dstSize, - Size dstRowStride, - ITextureResource* src, - ResourceState srcState, - SubresourceRange srcSubresource, - ITextureResource::Offset3D srcOffset, - ITextureResource::Extents extent) override - { - SLANG_UNUSED(dst); - SLANG_UNUSED(dstOffset); - SLANG_UNUSED(dstSize); - SLANG_UNUSED(dstRowStride); - SLANG_UNUSED(src); - SLANG_UNUSED(srcState); - SLANG_UNUSED(srcSubresource); - SLANG_UNUSED(srcOffset); - SLANG_UNUSED(extent); - SLANG_UNIMPLEMENTED_X("copyTextureToBuffer"); - } - - virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier( - ITextureResource* texture, - SubresourceRange subresourceRange, - ResourceState src, - ResourceState dst) override - { - SLANG_UNUSED(texture); - SLANG_UNUSED(subresourceRange); - SLANG_UNUSED(src); - SLANG_UNUSED(dst); - SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier"); - } - virtual SLANG_NO_THROW void SLANG_MCALL - beginDebugEvent(const char* name, float rgbColor[3]) override - { - SLANG_UNUSED(name); - SLANG_UNUSED(rgbColor); - } - virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {} - }; - - ResourceCommandEncoderImpl m_resourceCommandEncoder; - - virtual SLANG_NO_THROW void SLANG_MCALL - encodeResourceCommands(IResourceCommandEncoder** outEncoder) override - { - m_resourceCommandEncoder.init(this); - *outEncoder = &m_resourceCommandEncoder; - } - - class ComputeCommandEncoderImpl - : public IComputeCommandEncoder - , public ResourceCommandEncoderImpl - { - public: - SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl) - public: - CommandWriter* m_writer; - CommandBufferImpl* m_commandBuffer; - RefPtr<ShaderObjectBase> m_rootObject; - virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {} - void init(CommandBufferImpl* cmdBuffer) - { - m_writer = cmdBuffer; - m_commandBuffer = cmdBuffer; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override - { - m_writer->setPipelineState(state); - PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); - SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( - pipelineImpl->m_program, m_rootObject.writeRef())); - returnComPtr(outRootObject, m_rootObject); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override - { - m_writer->setPipelineState(state); - PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state); - SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject( - pipelineImpl->m_program, m_rootObject.writeRef())); - m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap); - return SLANG_OK; - } - - virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override - { - m_writer->bindRootShaderObject(m_rootObject); - m_writer->dispatchCompute(x, y, z); - } - - virtual SLANG_NO_THROW void SLANG_MCALL - dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override - { - SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect"); - } - }; - - ComputeCommandEncoderImpl m_computeCommandEncoder; - virtual SLANG_NO_THROW void SLANG_MCALL - encodeComputeCommands(IComputeCommandEncoder** outEncoder) override - { - m_computeCommandEncoder.init(this); - *outEncoder = &m_computeCommandEncoder; - } - - virtual SLANG_NO_THROW void SLANG_MCALL - encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override - { - *outEncoder = nullptr; - } - - virtual SLANG_NO_THROW void SLANG_MCALL close() override {} - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override - { - return SLANG_FAIL; - } - }; - - class CommandQueueImpl - : public ICommandQueue - , public ComObject - { - public: - SLANG_COM_OBJECT_IUNKNOWN_ALL - ICommandQueue* getInterface(const Guid& guid) - { - if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue) - return static_cast<ICommandQueue*>(this); - return nullptr; - } - - public: - RefPtr<CUDAPipelineState> currentPipeline; - RefPtr<CUDARootShaderObject> currentRootObject; - RefPtr<CUDADevice> renderer; - CUstream stream; - Desc m_desc; - public: - void init(CUDADevice* inRenderer) - { - renderer = inRenderer; - m_desc.type = ICommandQueue::QueueType::Graphics; - cuStreamCreate(&stream, 0); - } - ~CommandQueueImpl() - { - cuStreamSynchronize(stream); - cuStreamDestroy(stream); - currentPipeline = nullptr; - currentRootObject = nullptr; - } - - public: - virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override - { - return m_desc; - } - - virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers( - GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override - { - SLANG_UNUSED(valueToSignal); - // TODO: implement fence. - assert(fence == nullptr); - for (GfxIndex i = 0; i < count; i++) - { - execute(static_cast<CommandBufferImpl*>(commandBuffers[i])); - } - } - - virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override - { - auto resultCode = cuStreamSynchronize(stream); - if (resultCode != cudaSuccess) - SLANG_CUDA_HANDLE_ERROR(resultCode); - } - - virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice( - GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override - { - return SLANG_FAIL; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override - { - return SLANG_FAIL; - } - - public: - void setPipelineState(IPipelineState* state) - { - currentPipeline = dynamic_cast<CUDAPipelineState*>(state); - } - - Result bindRootShaderObject(IShaderObject* object) - { - currentRootObject = dynamic_cast<CUDARootShaderObject*>(object); - if (currentRootObject) - return SLANG_OK; - return SLANG_E_INVALID_ARG; - } - - void dispatchCompute(int x, int y, int z) - { - // Specialize the compute kernel based on the shader object bindings. - RefPtr<PipelineStateBase> newPipeline; - renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline); - currentPipeline = static_cast<CUDAPipelineState*>(newPipeline.Ptr()); - - // Find out thread group size from program reflection. - auto& kernelName = currentPipeline->shaderProgram->kernelName; - auto programLayout = static_cast<CUDAProgramLayout*>(currentRootObject->getLayout()); - int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); - SLANG_ASSERT(kernelId != -1); - UInt threadGroupSize[3]; - programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); - - int sharedSizeInBytes; - cuFuncGetAttribute( - &sharedSizeInBytes, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - currentPipeline->shaderProgram->cudaKernel); - - // Copy global parameter data to the `SLANG_globalParams` symbol. - { - CUdeviceptr globalParamsSymbol = 0; - size_t globalParamsSymbolSize = 0; - cuModuleGetGlobal( - &globalParamsSymbol, - &globalParamsSymbolSize, - currentPipeline->shaderProgram->cudaModule, - "SLANG_globalParams"); - - CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); - cudaMemcpyAsync( - (void*)globalParamsSymbol, - (void*)globalParamsCUDAData, - globalParamsSymbolSize, - cudaMemcpyDefault, - 0); - } - // - // The argument data for the entry-point parameters are already - // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel. - // - auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer(); - auto entryPointDataSize = - currentRootObject->entryPointObjects[kernelId]->getBufferSize(); - - void* extraOptions[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, - entryPointBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, - &entryPointDataSize, - CU_LAUNCH_PARAM_END, - }; - - // Once we have all the decessary data extracted and/or - // set up, we can launch the kernel and see what happens. - // - auto cudaLaunchResult = cuLaunchKernel( - currentPipeline->shaderProgram->cudaKernel, - x, - y, - z, - int(threadGroupSize[0]), - int(threadGroupSize[1]), - int(threadGroupSize[2]), - sharedSizeInBytes, - stream, - nullptr, - extraOptions); - - SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); - } - - void copyBuffer( - IBufferResource* dst, - size_t dstOffset, - IBufferResource* src, - size_t srcOffset, - size_t size) - { - auto dstImpl = static_cast<MemoryCUDAResource*>(dst); - auto srcImpl = static_cast<MemoryCUDAResource*>(src); - cudaMemcpy( - (uint8_t*)dstImpl->m_cudaMemory + dstOffset, - (uint8_t*)srcImpl->m_cudaMemory + srcOffset, - size, - cudaMemcpyDefault); - } - - void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) - { - auto dstImpl = static_cast<MemoryCUDAResource*>(dst); - cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); - } - - void writeTimestamp(IQueryPool* pool, SlangInt index) - { - auto poolImpl = static_cast<CUDAQueryPool*>(pool); - cuEventRecord(poolImpl->m_events[index], stream); - } - - void execute(CommandBufferImpl* commandBuffer) - { - for (auto& cmd : commandBuffer->m_commands) - { - switch (cmd.name) - { - case CommandName::SetPipelineState: - setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0])); - break; - case CommandName::BindRootShaderObject: - bindRootShaderObject( - commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0])); - break; - case CommandName::DispatchCompute: - dispatchCompute( - int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2])); - break; - case CommandName::CopyBuffer: - copyBuffer( - commandBuffer->getObject<BufferResource>(cmd.operands[0]), - cmd.operands[1], - commandBuffer->getObject<BufferResource>(cmd.operands[2]), - cmd.operands[3], - cmd.operands[4]); - break; - case CommandName::UploadBufferData: - uploadBufferData( - commandBuffer->getObject<BufferResource>(cmd.operands[0]), - cmd.operands[1], - cmd.operands[2], - commandBuffer->getData<uint8_t>(cmd.operands[3])); - break; - case CommandName::WriteTimestamp: - writeTimestamp( - commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]), - (SlangInt)cmd.operands[1]); - } - } - } - }; - - using TransientResourceHeapImpl = SimpleTransientResourceHeap<CUDADevice, CommandBufferImpl>; - -public: - virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override - { - SLANG_RETURN_ON_FAIL(slangContext.initialize( - desc.slang, - SLANG_PTX, - "sm_5_1", - makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView())); - - SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc)); - - SLANG_RETURN_ON_FAIL(_initCuda(reportType)); - - SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex)); - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType); - - m_context = new CUDAContext(); - - int count = -1; - cuDeviceGetCount(&count); - SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex)); - - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL( - cuCtxCreate(&m_context->m_context, 0, m_device), reportType); - - // Not clear how to detect half support on CUDA. For now we'll assume we have it - { - m_features.add("half"); - } - - // Initialize DeviceInfo - { - m_info.deviceType = DeviceType::CUDA; - m_info.bindingStyle = BindingStyle::CUDA; - m_info.projectionStyle = ProjectionStyle::DirectX; - m_info.apiName = "CUDA"; - static const float kIdentity[] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; - ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity)); - cudaDeviceProp deviceProperties; - cudaGetDeviceProperties(&deviceProperties, m_deviceIndex); - m_adapterName = deviceProperties.name; - m_info.adapterName = m_adapterName.begin(); - m_info.timestampFrequency = 1000000; - } - - return SLANG_OK; - } - - Result getCUDAFormat(Format format, CUarray_format* outFormat) - { - // TODO: Expand to cover all available formats that can be supported in CUDA - switch (format) - { - case Format::R32G32B32A32_FLOAT: - case Format::R32G32B32_FLOAT: - case Format::R32G32_FLOAT: - case Format::R32_FLOAT: - case Format::D32_FLOAT: - *outFormat = CU_AD_FORMAT_FLOAT; - return SLANG_OK; - case Format::R16G16B16A16_FLOAT: - case Format::R16G16_FLOAT: - case Format::R16_FLOAT: - *outFormat = CU_AD_FORMAT_HALF; - return SLANG_OK; - case Format::R32G32B32A32_UINT: - case Format::R32G32B32_UINT: - case Format::R32G32_UINT: - case Format::R32_UINT: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT32; - return SLANG_OK; - case Format::R16G16B16A16_UINT: - case Format::R16G16_UINT: - case Format::R16_UINT: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT16; - return SLANG_OK; - case Format::R8G8B8A8_UINT: - case Format::R8G8_UINT: - case Format::R8_UINT: - case Format::R8G8B8A8_UNORM: - *outFormat = CU_AD_FORMAT_UNSIGNED_INT8; - return SLANG_OK; - case Format::R32G32B32A32_SINT: - case Format::R32G32B32_SINT: - case Format::R32G32_SINT: - case Format::R32_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT32; - return SLANG_OK; - case Format::R16G16B16A16_SINT: - case Format::R16G16_SINT: - case Format::R16_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT16; - return SLANG_OK; - case Format::R8G8B8A8_SINT: - case Format::R8G8_SINT: - case Format::R8_SINT: - *outFormat = CU_AD_FORMAT_SIGNED_INT8; - return SLANG_OK; - default: - SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); - return SLANG_FAIL; - } - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource( - const ITextureResource::Desc& desc, - const ITextureResource::SubresourceData* initData, - ITextureResource** outResource) override - { - TextureResource::Desc srcDesc = fixupTextureDesc(desc); - - RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(srcDesc); - tex->m_cudaContext = m_context; - - CUresourcetype resourceType; - - // The size of the element/texel in bytes - size_t elementSize = 0; - - // Our `ITextureResource::Desc` uses an enumeration to specify - // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's - // `cuMipmappedArrayCreate` seemingly relies on a policy where - // the extents of the array in dimenions above the rank are - // specified as zero (e.g., a 1D texture requires `height==0`). - // - // We will start by massaging the extents as specified by the - // user into a form that CUDA wants/expects, based on the - // texture shape as specified in the `desc`. - // - int width = desc.size.width; - int height = desc.size.height; - int depth = desc.size.depth; - switch (desc.type) - { - case IResource::Type::Texture1D: - height = 0; - depth = 0; - break; - - case IResource::Type::Texture2D: - depth = 0; - break; - - case IResource::Type::Texture3D: - break; - - case IResource::Type::TextureCube: - depth = 1; - break; - } - - { - CUarray_format format = CU_AD_FORMAT_FLOAT; - int numChannels = 0; - - SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format)); - FormatInfo info; - gfxGetFormatInfo(desc.format, &info); - numChannels = info.channelCount; - - switch (format) - { - case CU_AD_FORMAT_FLOAT: - { - elementSize = sizeof(float) * numChannels; - break; - } - case CU_AD_FORMAT_HALF: - { - elementSize = sizeof(uint16_t) * numChannels; - break; - } - case CU_AD_FORMAT_UNSIGNED_INT8: - { - elementSize = sizeof(uint8_t) * numChannels; - break; - } - default: - { - SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now"); - return SLANG_FAIL; - } - } - - if (desc.numMipLevels > 1) - { - resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; - - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Width = width; - arrayDesc.Height = height; - arrayDesc.Depth = depth; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - arrayDesc.Flags = 0; - - if (desc.arraySize > 1) - { - if (desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED; - arrayDesc.Depth = desc.arraySize; - } - else - { - SLANG_ASSERT(!"Arrays only supported for 1D and 2D"); - return SLANG_FAIL; - } - } - - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - arrayDesc.Depth *= 6; - } - - SLANG_CUDA_RETURN_ON_FAIL( - cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels)); - } - else - { - resourceType = CU_RESOURCE_TYPE_ARRAY; - - if (desc.arraySize > 1) - { - if (desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube) - { - SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported"); - return SLANG_FAIL; - } - - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - // Set the depth as the array length - arrayDesc.Depth = desc.arraySize; - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Depth *= 6; - } - - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); - } - else if (desc.type == IResource::Type::Texture3D || - desc.type == IResource::Type::TextureCube) - { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Depth = depth; - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - arrayDesc.Flags = 0; - - // Handle cube texture - if (desc.type == IResource::Type::TextureCube) - { - arrayDesc.Depth = 6; - arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); - } - else - { - CUDA_ARRAY_DESCRIPTOR arrayDesc; - memset(&arrayDesc, 0, sizeof(arrayDesc)); - - arrayDesc.Height = height; - arrayDesc.Width = width; - arrayDesc.Format = format; - arrayDesc.NumChannels = numChannels; - - // Allocate the array, will work for 1D or 2D case - SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); - } - } - } - - // Work space for holding data for uploading if it needs to be rearranged - if (initData) - { - List<uint8_t> workspace; - for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel) - { - int mipWidth = width >> mipLevel; - int mipHeight = height >> mipLevel; - int mipDepth = depth >> mipLevel; - - mipWidth = (mipWidth == 0) ? 1 : mipWidth; - mipHeight = (mipHeight == 0) ? 1 : mipHeight; - mipDepth = (mipDepth == 0) ? 1 : mipDepth; - - // If it's a cubemap then the depth is always 6 - if (desc.type == IResource::Type::TextureCube) - { - mipDepth = 6; - } - - auto dstArray = tex->m_cudaArray; - if (tex->m_cudaMipMappedArray) - { - // Get the array for the mip level - SLANG_CUDA_RETURN_ON_FAIL( - cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel)); - } - SLANG_ASSERT(dstArray); - - // Check using the desc to see if it's plausible - { - CUDA_ARRAY_DESCRIPTOR arrayDesc; - SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray)); - - SLANG_ASSERT(mipWidth == arrayDesc.Width); - SLANG_ASSERT( - mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0)); - } - - const void* srcDataPtr = nullptr; - - if (desc.arraySize > 1) - { - SLANG_ASSERT( - desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube); - - // TODO(JS): Here I assume that arrays are just held contiguously within a - // 'face' This seems reasonable and works with the Copy3D. - const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; - - Index faceCount = desc.arraySize; - if (desc.type == IResource::Type::TextureCube) - { - faceCount *= 6; - } - - const size_t mipSizeInBytes = faceSizeInBytes * faceCount; - workspace.setCount(mipSizeInBytes); - - // We need to add the face data from each mip - // We iterate over face count so we copy all of the cubemap faces - for (Index j = 0; j < faceCount; j++) - { - const auto srcData = initData[mipLevel + j * desc.numMipLevels].data; - // Copy over to the workspace to make contiguous - ::memcpy( - workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes); - } - - srcDataPtr = workspace.getBuffer(); - } - else - { - if (desc.type == IResource::Type::TextureCube) - { - size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; - - workspace.setCount(faceSizeInBytes * 6); - // Copy the data over to make contiguous - for (Index j = 0; j < 6; j++) - { - const auto srcData = - initData[mipLevel + j * desc.numMipLevels].data; - ::memcpy( - workspace.getBuffer() + faceSizeInBytes * j, - srcData, - faceSizeInBytes); - } - srcDataPtr = workspace.getBuffer(); - } - else - { - const auto srcData = initData[mipLevel].data; - srcDataPtr = srcData; - } - } - - if (desc.arraySize > 1) - { - SLANG_ASSERT( - desc.type == IResource::Type::Texture1D || - desc.type == IResource::Type::Texture2D || - desc.type == IResource::Type::TextureCube); - - CUDA_MEMCPY3D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - // Set the depth to the array length - copyParam.Depth = desc.arraySize; - - if (desc.type == IResource::Type::TextureCube) - { - copyParam.Depth *= 6; - } - - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); - } - else - { - switch (desc.type) - { - case IResource::Type::Texture1D: - case IResource::Type::Texture2D: - { - CUDA_MEMCPY2D copyParam; - memset(©Param, 0, sizeof(copyParam)); - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); - break; - } - case IResource::Type::Texture3D: - case IResource::Type::TextureCube: - { - CUDA_MEMCPY3D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = dstArray; - - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = srcDataPtr; - copyParam.srcPitch = mipWidth * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = mipHeight; - copyParam.Depth = mipDepth; - - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); - break; - } - - default: - { - SLANG_ASSERT(!"Not implemented"); - break; - } - } - } - } - } - // Set up texture sampling parameters, and create final texture obj - - { - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); - resDesc.resType = resourceType; - - if (tex->m_cudaArray) - { - resDesc.res.array.hArray = tex->m_cudaArray; - } - if (tex->m_cudaMipMappedArray) - { - resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray; - } - - // If the texture might be used as a UAV, then we need to allocate - // a CUDA "surface" for it. - // - // Note: We cannot do this unconditionally, because it will fail - // on surfaces that are not usable as UAVs (e.g., those with - // mipmaps). - // - // TODO: We should really only be allocating the array at the - // time we create a resource, and then allocate the surface or - // texture objects as part of view creation. - // - if (desc.allowedStates.contains(ResourceState::UnorderedAccess)) - { - // On CUDA surfaces only support a single MIP map - SLANG_ASSERT(desc.numMipLevels == 1); - - SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc)); - } - - - // Create handle for sampling. - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); - texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; - texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - SLANG_CUDA_RETURN_ON_FAIL( - cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); - } - - returnComPtr(outResource, tex); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource( - const IBufferResource::Desc& descIn, - const void* initData, - IBufferResource** outResource) override - { - auto desc = fixupBufferDesc(descIn); - RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc); - resource->m_cudaContext = m_context; - SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes)); - if (initData) - { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault)); - } - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle( - InteropHandle handle, - const IBufferResource::Desc& desc, - IBufferResource** outResource) override - { - if (handle.handleValue == 0) - { - *outResource = nullptr; - return SLANG_OK; - } - - RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc); - resource->m_cudaContext = m_context; - - // CUDA manages sharing of buffers through the idea of an - // "external memory" object, which represents the relationship - // with another API's objects. In order to create this external - // memory association, we first need to fill in a descriptor struct. - cudaExternalMemoryHandleDesc externalMemoryHandleDesc; - memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - switch (handle.api) - { - case InteropHandleAPI::D3D12: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; - break; - case InteropHandleAPI::Vulkan: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; - break; - default: - return SLANG_FAIL; - } - externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; - externalMemoryHandleDesc.size = desc.sizeInBytes; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; - - // Once we have filled in the descriptor, we can request - // that CUDA create the required association between the - // external buffer and its own memory. - cudaExternalMemory_t externalMemory; - SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); - resource->m_cudaExternalMemory = externalMemory; - - // The CUDA "external memory" handle is not itself a device - // pointer, so we need to query for a suitable device address - // for the buffer with another call. - // - // Just as for the external memory, we fill in a descriptor - // structure (although in this case we only need to specify - // the size). - cudaExternalMemoryBufferDesc bufferDesc; - memset(&bufferDesc, 0, sizeof(bufferDesc)); - bufferDesc.size = desc.sizeInBytes; - - // Finally, we can "map" the buffer to get a device address. - void* deviceAddress; - SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc)); - resource->m_cudaMemory = deviceAddress; - - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle( - InteropHandle handle, - const ITextureResource::Desc& desc, - const size_t size, - ITextureResource** outResource) override - { - if (handle.handleValue == 0) - { - *outResource = nullptr; - return SLANG_OK; - } - - RefPtr<TextureCUDAResource> resource = new TextureCUDAResource(desc); - resource->m_cudaContext = m_context; - - // CUDA manages sharing of buffers through the idea of an - // "external memory" object, which represents the relationship - // with another API's objects. In order to create this external - // memory association, we first need to fill in a descriptor struct. - CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc; - memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - switch (handle.api) - { - case InteropHandleAPI::D3D12: - externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE; - break; - case InteropHandleAPI::Vulkan: - externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; - break; - default: - return SLANG_FAIL; - } - externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; - externalMemoryHandleDesc.size = size; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; - - CUexternalMemory externalMemory; - SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); - resource->m_cudaExternalMemory = externalMemory; - - FormatInfo formatInfo; - SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo)); - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - arrayDesc.Depth = desc.size.depth; - arrayDesc.Height = desc.size.height; - arrayDesc.Width = desc.size.width; - arrayDesc.NumChannels = formatInfo.channelCount; - getCUDAFormat(desc.format, &arrayDesc.Format); - arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER - - CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc; - memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc)); - externalMemoryMipDesc.offset = 0; - externalMemoryMipDesc.arrayDesc = arrayDesc; - externalMemoryMipDesc.numLevels = desc.numMipLevels; - - CUmipmappedArray mipArray; - SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc)); - resource->m_cudaMipMappedArray = mipArray; - - CUarray cuArray; - SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0)); - resource->m_cudaArray = cuArray; - - CUDA_RESOURCE_DESC surfDesc; - memset(&surfDesc, 0, sizeof(surfDesc)); - surfDesc.resType = CU_RESOURCE_TYPE_ARRAY; - surfDesc.res.array.hArray = cuArray; - - CUsurfObject surface; - SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc)); - resource->m_cudaSurfObj = surface; - - returnComPtr(outResource, resource); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView( - ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override - { - RefPtr<CUDAResourceView> view = new CUDAResourceView(); - view->m_desc = desc; - view->textureResource = dynamic_cast<TextureCUDAResource*>(texture); - returnComPtr(outView, view); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView( - IBufferResource* buffer, - IBufferResource* counterBuffer, - IResourceView::Desc const& desc, - IResourceView** outView) override - { - RefPtr<CUDAResourceView> view = new CUDAResourceView(); - view->m_desc = desc; - view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer); - returnComPtr(outView, view); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool( - const IQueryPool::Desc& desc, - IQueryPool** outPool) override - { - RefPtr<CUDAQueryPool> pool = new CUDAQueryPool(); - SLANG_RETURN_ON_FAIL(pool->init(desc)); - returnComPtr(outPool, pool); - return SLANG_OK; - } - - virtual Result createShaderObjectLayout( - slang::TypeLayoutReflection* typeLayout, - ShaderObjectLayoutBase** outLayout) override - { - RefPtr<CUDAShaderObjectLayout> cudaLayout; - cudaLayout = new CUDAShaderObjectLayout(this, typeLayout); - returnRefPtrMove(outLayout, cudaLayout); - return SLANG_OK; - } - - virtual Result createShaderObject( - ShaderObjectLayoutBase* layout, - IShaderObject** outObject) override - { - RefPtr<CUDAShaderObject> result = new CUDAShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout))); - returnComPtr(outObject, result); - return SLANG_OK; - } - - virtual Result createMutableShaderObject( - ShaderObjectLayoutBase* layout, - IShaderObject** outObject) override - { - RefPtr<CUDAMutableShaderObject> result = new CUDAMutableShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout))); - returnComPtr(outObject, result); - return SLANG_OK; - } - - Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject) - { - auto cudaProgram = dynamic_cast<CUDAShaderProgram*>(program); - auto cudaLayout = cudaProgram->layout; - - RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject(); - SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout)); - returnRefPtrMove(outObject, result); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createProgram( - const IShaderProgram::Desc& desc, - IShaderProgram** outProgram, - ISlangBlob** outDiagnosticBlob) override - { - // If this is a specializable program, we just keep a reference to the slang program and - // don't actually create any kernels. This program will be specialized later when we know - // the shader object bindings. - RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram(); - cudaProgram->init(desc); - cudaProgram->cudaContext = m_context; - if (desc.slangGlobalScope->getSpecializationParamCount() != 0) - { - cudaProgram->layout = new CUDAProgramLayout(this, desc.slangGlobalScope->getLayout()); - returnComPtr(outProgram, cudaProgram); - return SLANG_OK; - } - - ComPtr<ISlangBlob> kernelCode; - ComPtr<ISlangBlob> diagnostics; - auto compileResult = desc.slangGlobalScope->getEntryPointCode( - (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef()); - if (diagnostics) - { - getDebugCallback()->handleMessage( - compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error, - DebugMessageSource::Slang, - (char*)diagnostics->getBufferPointer()); - if (outDiagnosticBlob) - returnComPtr(outDiagnosticBlob, diagnostics); - } - SLANG_RETURN_ON_FAIL(compileResult); - - SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer())); - cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName(); - SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction( - &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer())); - - auto slangGlobalScope = desc.slangGlobalScope; - if( slangGlobalScope ) - { - cudaProgram->slangGlobalScope = slangGlobalScope; - - auto slangProgramLayout = slangGlobalScope->getLayout(); - if(!slangProgramLayout) - return SLANG_FAIL; - - RefPtr<CUDAProgramLayout> cudaLayout; - cudaLayout = new CUDAProgramLayout(this, slangProgramLayout); - cudaLayout->programLayout = slangProgramLayout; - cudaProgram->layout = cudaLayout; - } - - returnComPtr(outProgram, cudaProgram); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState( - const ComputePipelineStateDesc& desc, IPipelineState** outState) override - { - RefPtr<CUDAPipelineState> state = new CUDAPipelineState(); - state->shaderProgram = static_cast<CUDAShaderProgram*>(desc.program); - state->init(desc); - returnComPtr(outState, state); - return Result(); - } - - void* map(IBufferResource* buffer) - { - return static_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory; - } - - void unmap(IBufferResource* buffer) - { - SLANG_UNUSED(buffer); - } - - virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override - { - return m_info; - } - -public: - virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap( - const ITransientResourceHeap::Desc& desc, - ITransientResourceHeap** outHeap) override - { - RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl(); - SLANG_RETURN_ON_FAIL(result->init(this, desc)); - returnComPtr(outHeap, result); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL - createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override - { - RefPtr<CommandQueueImpl> queue = new CommandQueueImpl(); - queue->init(this); - returnComPtr(outQueue, queue); - return SLANG_OK; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain( - const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(window); - SLANG_UNUSED(outSwapchain); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout( - const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outLayout); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outFramebuffer); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout( - const IRenderPassLayout::Desc& desc, - IRenderPassLayout** outRenderPassLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outRenderPassLayout); - return SLANG_FAIL; - } - virtual SLANG_NO_THROW Result SLANG_MCALL - createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override - { - SLANG_UNUSED(desc); - *outSampler = nullptr; - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout( - IInputLayout::Desc const& desc, - IInputLayout** outLayout) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outLayout); - return SLANG_E_NOT_AVAILABLE; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState( - const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override - { - SLANG_UNUSED(desc); - SLANG_UNUSED(outState); - return SLANG_E_NOT_AVAILABLE; - } - - virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource( - ITextureResource* texture, - ResourceState state, - ISlangBlob** outBlob, - size_t* outRowPitch, - size_t* outPixelSize) override - { - auto textureImpl = static_cast<TextureCUDAResource*>(texture); - RefPtr<ListBlob> blob = new ListBlob(); - - auto desc = textureImpl->getDesc(); - auto width = desc->size.width; - auto height = desc->size.height; - FormatInfo sizeInfo; - SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo)); - size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock; - size_t rowPitch = width * pixelSize; - size_t size = height * rowPitch; - blob->m_data.setCount((Index)size); - - CUDA_MEMCPY2D copyParam; - memset(©Param, 0, sizeof(copyParam)); - - copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.srcArray = textureImpl->m_cudaArray; - - copyParam.dstMemoryType = CU_MEMORYTYPE_HOST; - copyParam.dstHost = blob->m_data.getBuffer(); - copyParam.dstPitch = rowPitch; - copyParam.WidthInBytes = copyParam.dstPitch; - copyParam.Height = height; - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); - - *outRowPitch = rowPitch; - *outPixelSize = pixelSize; - returnComPtr(outBlob, blob); - return SLANG_OK; - } - - virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource( - IBufferResource* buffer, - size_t offset, - size_t size, - ISlangBlob** outBlob) override - { - auto bufferImpl = static_cast<MemoryCUDAResource*>(buffer); - RefPtr<ListBlob> blob = new ListBlob(); - blob->m_data.setCount((Index)size); - cudaMemcpy( - blob->m_data.getBuffer(), - (uint8_t*)bufferImpl->m_cudaMemory + offset, - size, - cudaMemcpyDefault); - returnComPtr(outBlob, blob); - return SLANG_OK; - } -}; - -SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout) -{ - m_layout = typeLayout; - - // If the layout tells us that there is any uniform data, - // then we need to allocate a constant buffer to hold that data. - // - // TODO: Do we need to allocate a shadow copy for use from - // the CPU? - // - // TODO: When/where do we bind this constant buffer into - // a descriptor set for later use? - // - auto slangLayout = getLayout()->getElementTypeLayout(); - size_t uniformSize = slangLayout->getSize(); - if (uniformSize) - { - m_data.setCount((Index)uniformSize); - } - - // If the layout specifies that we have any resources or sub-objects, - // then we need to size the appropriate arrays to account for them. - // - // Note: the counts here are the *total* number of resources/sub-objects - // and not just the number of resource/sub-object ranges. - // - resources.setCount(typeLayout->getResourceCount()); - m_objects.setCount(typeLayout->getSubObjectCount()); - - for (auto subObjectRange : getLayout()->subObjectRanges) - { - RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout; - - // In the case where the sub-object range represents an - // existential-type leaf field (e.g., an `IBar`), we - // cannot pre-allocate the object(s) to go into that - // range, since we can't possibly know what to allocate - // at this point. - // - if (!subObjectLayout) - continue; - // - // Otherwise, we will allocate a sub-object to fill - // in each entry in this range, based on the layout - // information we already have. - - auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex]; - for (Index i = 0; i < bindingRangeInfo.count; ++i) - { - RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject(); - SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout)); - - ShaderOffset offset; - offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i; - offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex; - offset.bindingArrayIndex = (GfxIndex)i; - - SLANG_RETURN_ON_FAIL(setObject(offset, subObject)); - } - } - return SLANG_OK; -} - -SlangResult CUDARootShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout) -{ - SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(device, typeLayout)); - auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout); - for (auto& entryPoint : programLayout->entryPointLayouts) - { - RefPtr<CUDAEntryPointShaderObject> object = new CUDAEntryPointShaderObject(); - SLANG_RETURN_ON_FAIL(object->init(device, entryPoint)); - entryPointObjects.add(object); - } - return SLANG_OK; -} - -SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) -{ - RefPtr<CUDADevice> result = new CUDADevice(); - SLANG_RETURN_ON_FAIL(result->initialize(*desc)); - returnComPtr(outDevice, result); - return SLANG_OK; -} -#else -SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) -{ - SLANG_UNUSED(desc); - *outDevice = nullptr; - return SLANG_FAIL; -} -#endif - -} |
