summaryrefslogtreecommitdiffstats
path: root/tools/gfx/cuda/render-cuda.cpp
diff options
context:
space:
mode:
authorlucy96chen <47800040+lucy96chen@users.noreply.github.com>2022-07-25 13:36:43 -0700
committerGitHub <noreply@github.com>2022-07-25 13:36:43 -0700
commitce6e946f6f4882aba8a62392ae791c948633e2e3 (patch)
treefde46847900ef80ee111a6b612f7e0931b15fdf2 /tools/gfx/cuda/render-cuda.cpp
parent129294a58d2a51308af78ad5d8d436c026863259 (diff)
Split render-cuda.cpp into smaller files (#2334)
* render-cuda split, compile errors galore due to missing includes etc. * render-cuda split and fully compiles * Ran premake.bat to disable cuda; Added all new files * Removed render-cuda files * CI fixes * Rerun CI
Diffstat (limited to 'tools/gfx/cuda/render-cuda.cpp')
-rw-r--r--tools/gfx/cuda/render-cuda.cpp2513
1 files changed, 0 insertions, 2513 deletions
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
deleted file mode 100644
index 114751a3d..000000000
--- a/tools/gfx/cuda/render-cuda.cpp
+++ /dev/null
@@ -1,2513 +0,0 @@
-#include "render-cuda.h"
-
-#ifdef GFX_ENABLE_CUDA
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include "core/slang-basic.h"
-#include "core/slang-blob.h"
-#include "core/slang-std-writers.h"
-
-#include "slang.h"
-#include "slang-com-ptr.h"
-#include "slang-com-helper.h"
-#include "../command-writer.h"
-#include "../renderer-shared.h"
-#include "../mutable-shader-object.h"
-#include "../simple-transient-resource-heap.h"
-#include "../slang-context.h"
-#include "../command-encoder-com-forward.h"
-
-# ifdef RENDER_TEST_OPTIX
-
-// The `optix_stubs.h` header produces warnings when compiled with MSVC
-# ifdef _MSC_VER
-# pragma warning(disable: 4996)
-# endif
-
-# include <optix.h>
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-# endif
-
-#endif
-
-namespace gfx
-{
-#ifdef GFX_ENABLE_CUDA
-using namespace Slang;
-
-SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
-
-// A enum used to control if errors are reported on failure of CUDA call.
-enum class CUDAReportStyle
-{
- Normal,
- Silent,
-};
-
-struct CUDAErrorInfo
-{
- CUDAErrorInfo(
- const char* filePath,
- int lineNo,
- const char* errorName = nullptr,
- const char* errorString = nullptr)
- : m_filePath(filePath)
- , m_lineNo(lineNo)
- , m_errorName(errorName)
- , m_errorString(errorString)
- {}
- SlangResult handle() const
- {
- StringBuilder builder;
- builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
-
- if (m_errorName)
- {
- builder << m_errorName << " : ";
- }
- if (m_errorString)
- {
- builder << m_errorString;
- }
-
- getDebugCallback()->handleMessage(DebugMessageType::Error, DebugMessageSource::Driver,
- builder.getUnownedSlice().begin());
-
- // Slang::signalUnexpectedError(builder.getBuffer());
- return SLANG_FAIL;
- }
-
- const char* m_filePath;
- int m_lineNo;
- const char* m_errorName;
- const char* m_errorString;
-};
-
-// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
-
-static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
-{
- CUDAErrorInfo info(file, line);
- cuGetErrorString(cuResult, &info.m_errorString);
- cuGetErrorName(cuResult, &info.m_errorName);
- return info.handle();
-}
-
-static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
- return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
-# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
-
-# define SLANG_CUDA_RETURN_ON_FAIL(x) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- return SLANG_CUDA_HANDLE_ERROR(_res); \
- }
-# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- { \
- return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \
- : SLANG_FAIL; \
- } \
- }
-
-# define SLANG_CUDA_ASSERT_ON_FAIL(x) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- { \
- SLANG_ASSERT(!"Failed CUDA call"); \
- }; \
- }
-
-# ifdef RENDER_TEST_OPTIX
-
-static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }
-
-# if 1
-static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
-{
- fprintf(
- stderr,
- "%s(%d): optix: %s (%s)\n",
- file,
- line,
- optixGetErrorString(result),
- optixGetErrorName(result));
- return SLANG_FAIL;
-}
-# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
-# else
-# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
-# endif
-
-# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \
- do \
- { \
- auto _res = EXPR; \
- if (_isError(_res)) \
- return SLANG_OPTIX_HANDLE_ERROR(_res); \
- } while (0)
-
-void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
-{
- fprintf(stderr, "optix: %s (%s)\n", message, tag);
-}
-
-# endif
-
-class CUDAContext : public RefObject
-{
-public:
- CUcontext m_context = nullptr;
- ~CUDAContext() { cuCtxDestroy(m_context); }
-};
-
-class MemoryCUDAResource : public BufferResource
-{
-public:
- MemoryCUDAResource(const Desc& _desc)
- : BufferResource(_desc)
- {}
-
- ~MemoryCUDAResource()
- {
- if (m_cudaMemory)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
- }
- }
-
- uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; }
-
- void* m_cudaExternalMemory = nullptr;
- void* m_cudaMemory = nullptr;
-
- RefPtr<CUDAContext> m_cudaContext;
-
- virtual SLANG_NO_THROW DeviceAddress SLANG_MCALL getDeviceAddress() override
- {
- return (DeviceAddress)m_cudaMemory;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
- {
- outHandle->handleValue = getBindlessHandle();
- outHandle->api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- map(MemoryRange* rangeToRead, void** outPointer) override
- {
- SLANG_UNUSED(rangeToRead);
- SLANG_UNUSED(outPointer);
- return SLANG_FAIL;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL unmap(MemoryRange* writtenRange) override
- {
- SLANG_UNUSED(writtenRange);
- return SLANG_FAIL;
- }
-};
-
-class TextureCUDAResource : public TextureResource
-{
-public:
- TextureCUDAResource(const TextureResource::Desc& desc)
- : TextureResource(desc)
- {}
- ~TextureCUDAResource()
- {
- if (m_cudaSurfObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
- }
- if (m_cudaTexObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
- }
- if (m_cudaArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
- }
- if (m_cudaMipMappedArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
- }
- }
-
- uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; }
-
- // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by
- // a long long
- CUtexObject m_cudaTexObj = CUtexObject();
-
- // The surfObj is for reading/writing 'texture like' things, but not for sampling.
- CUsurfObject m_cudaSurfObj = CUsurfObject();
-
- CUarray m_cudaArray = CUarray();
- CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
-
- void* m_cudaExternalMemory = nullptr;
-
- RefPtr<CUDAContext> m_cudaContext;
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeResourceHandle(InteropHandle* outHandle) override
- {
- outHandle->handleValue = getBindlessHandle();
- outHandle->api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-};
-
-class CUDAResourceView : public ResourceViewBase
-{
-public:
- RefPtr<MemoryCUDAResource> memoryResource = nullptr;
- RefPtr<TextureCUDAResource> textureResource = nullptr;
- void* proxyBuffer = nullptr;
-};
-
-class CUDAShaderObjectLayout : public ShaderObjectLayoutBase
-{
-public:
- struct BindingRangeInfo
- {
- slang::BindingType bindingType;
- Index count;
- Index baseIndex; // Flat index for sub-ojects
- Index subObjectIndex;
-
- // TODO: The `uniformOffset` field should be removed,
- // since it cannot be supported by the Slang reflection
- // API once we fix some design issues.
- //
- // It is only being used today for pre-allocation of sub-objects
- // for constant buffers and parameter blocks (which should be
- // deprecated/removed anyway).
- //
- // Note: We would need to bring this field back, plus
- // a lot of other complexity, if we ever want to support
- // setting of resources/buffers directly by a binding
- // range index and array index.
- //
- Index uniformOffset; // Uniform offset for a resource typed field.
- };
-
- struct SubObjectRangeInfo
- {
- RefPtr<CUDAShaderObjectLayout> layout;
- Index bindingRangeIndex;
- };
-
- List<SubObjectRangeInfo> subObjectRanges;
- List<BindingRangeInfo> m_bindingRanges;
-
- Index m_subObjectCount = 0;
- Index m_resourceCount = 0;
-
- CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout)
- {
- m_elementTypeLayout = _unwrapParameterGroups(layout, m_containerType);
-
- initBase(renderer, m_elementTypeLayout);
-
- // Compute the binding ranges that are used to store
- // the logical contents of the object in memory. These will relate
- // to the descriptor ranges in the various sets, but not always
- // in a one-to-one fashion.
-
- SlangInt bindingRangeCount = m_elementTypeLayout->getBindingRangeCount();
- for (SlangInt r = 0; r < bindingRangeCount; ++r)
- {
- slang::BindingType slangBindingType = m_elementTypeLayout->getBindingRangeType(r);
- SlangInt count = m_elementTypeLayout->getBindingRangeBindingCount(r);
- slang::TypeLayoutReflection* slangLeafTypeLayout =
- m_elementTypeLayout->getBindingRangeLeafTypeLayout(r);
-
- SlangInt descriptorSetIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(r);
- SlangInt rangeIndexInDescriptorSet =
- m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
-
- // TODO: This logic assumes that for any binding range that might consume
- // multiple kinds of resources, the descriptor range for its uniform
- // usage will be the first one in the range.
- //
- // We need to decide whether that assumption is one we intend to support
- // applications making, or whether they should be forced to perform a
- // linear search over the descriptor ranges for a specific binding range.
- //
- auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
- descriptorSetIndex, rangeIndexInDescriptorSet);
-
- Index baseIndex = 0;
- Index subObjectIndex = 0;
- switch (slangBindingType)
- {
- case slang::BindingType::ConstantBuffer:
- case slang::BindingType::ParameterBlock:
- case slang::BindingType::ExistentialValue:
- baseIndex = m_subObjectCount;
- subObjectIndex = baseIndex;
- m_subObjectCount += count;
- break;
- case slang::BindingType::RawBuffer:
- case slang::BindingType::MutableRawBuffer:
- if (slangLeafTypeLayout->getType()->getElementType() != nullptr)
- {
- // A structured buffer occupies both a resource slot and
- // a sub-object slot.
- subObjectIndex = m_subObjectCount;
- m_subObjectCount += count;
- }
- baseIndex = m_resourceCount;
- m_resourceCount += count;
- break;
- default:
- baseIndex = m_resourceCount;
- m_resourceCount += count;
- break;
- }
-
- BindingRangeInfo bindingRangeInfo;
- bindingRangeInfo.bindingType = slangBindingType;
- bindingRangeInfo.count = count;
- bindingRangeInfo.baseIndex = baseIndex;
- bindingRangeInfo.uniformOffset = uniformOffset;
- bindingRangeInfo.subObjectIndex = subObjectIndex;
- m_bindingRanges.add(bindingRangeInfo);
- }
-
- SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
- for (SlangInt r = 0; r < subObjectRangeCount; ++r)
- {
- SlangInt bindingRangeIndex = m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(r);
- auto slangBindingType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
- slang::TypeLayoutReflection* slangLeafTypeLayout =
- m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
-
- // A sub-object range can either represent a sub-object of a known
- // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
- // (in which case we can pre-compute a layout to use, based on
- // the type `Foo`) *or* it can represent a sub-object of some
- // existential type (e.g., `IBar`) in which case we cannot
- // know the appropraite type/layout of sub-object to allocate.
- //
- RefPtr<CUDAShaderObjectLayout> subObjectLayout;
- if (slangBindingType != slang::BindingType::ExistentialValue)
- {
- subObjectLayout =
- new CUDAShaderObjectLayout(renderer, slangLeafTypeLayout->getElementTypeLayout());
- }
-
- SubObjectRangeInfo subObjectRange;
- subObjectRange.bindingRangeIndex = bindingRangeIndex;
- subObjectRange.layout = subObjectLayout;
- subObjectRanges.add(subObjectRange);
- }
- }
-
- Index getResourceCount() const { return m_resourceCount; }
- Index getSubObjectCount() const { return m_subObjectCount; }
- List<SubObjectRangeInfo>& getSubObjectRanges() { return subObjectRanges; }
- BindingRangeInfo getBindingRange(Index index) { return m_bindingRanges[index]; }
- Index getBindingRangeCount() const { return m_bindingRanges.getCount(); }
-};
-
-class CUDAProgramLayout : public CUDAShaderObjectLayout
-{
-public:
- slang::ProgramLayout* programLayout = nullptr;
- List<RefPtr<CUDAShaderObjectLayout>> entryPointLayouts;
- CUDAProgramLayout(RendererBase* renderer, slang::ProgramLayout* inProgramLayout)
- : CUDAShaderObjectLayout(renderer, inProgramLayout->getGlobalParamsTypeLayout())
- , programLayout(inProgramLayout)
- {
- for (UInt i =0; i< programLayout->getEntryPointCount(); i++)
- {
- entryPointLayouts.add(new CUDAShaderObjectLayout(
- renderer,
- programLayout->getEntryPointByIndex(i)->getTypeLayout()));
- }
-
- }
-
- int getKernelIndex(UnownedStringSlice kernelName)
- {
- for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++)
- {
- auto entryPoint = programLayout->getEntryPointByIndex(i);
- if (kernelName == entryPoint->getName())
- {
- return i;
- }
- }
- return -1;
- }
-
- void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
- {
- auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex);
- entryPoint->getComputeThreadGroupSize(3, threadGroupSizes);
- }
-};
-
-class CUDAShaderObjectData
-{
-public:
- bool isHostOnly = false;
- Slang::RefPtr<MemoryCUDAResource> m_bufferResource;
- Slang::RefPtr<CUDAResourceView> m_bufferView;
- Slang::List<uint8_t> m_cpuBuffer;
- Result setCount(Index count)
- {
- if (isHostOnly)
- {
- m_cpuBuffer.setCount(count);
- if (!m_bufferView)
- {
- IResourceView::Desc viewDesc = {};
- viewDesc.type = IResourceView::Type::UnorderedAccess;
- m_bufferView = new CUDAResourceView();
- m_bufferView->proxyBuffer = m_cpuBuffer.getBuffer();
- m_bufferView->m_desc = viewDesc;
- }
- return SLANG_OK;
- }
-
- if (!m_bufferResource)
- {
- IBufferResource::Desc desc;
- desc.type = IResource::Type::Buffer;
- desc.sizeInBytes = count;
- m_bufferResource = new MemoryCUDAResource(desc);
- if (count)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
- }
- IResourceView::Desc viewDesc = {};
- viewDesc.type = IResourceView::Type::UnorderedAccess;
- m_bufferView = new CUDAResourceView();
- m_bufferView->memoryResource = m_bufferResource;
- m_bufferView->m_desc = viewDesc;
- }
- auto oldSize = m_bufferResource->getDesc()->sizeInBytes;
- if ((size_t)count != oldSize)
- {
- void* newMemory = nullptr;
- if (count)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
- }
- if (oldSize)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- newMemory,
- m_bufferResource->m_cudaMemory,
- Math::Min((size_t)count, oldSize),
- cudaMemcpyDefault));
- }
- cudaFree(m_bufferResource->m_cudaMemory);
- m_bufferResource->m_cudaMemory = newMemory;
- m_bufferResource->getDesc()->sizeInBytes = count;
- }
- return SLANG_OK;
- }
-
- Slang::Index getCount()
- {
- if (isHostOnly)
- return m_cpuBuffer.getCount();
- if (m_bufferResource)
- return (Slang::Index)(m_bufferResource->getDesc()->sizeInBytes);
- else
- return 0;
- }
-
- void* getBuffer()
- {
- if (isHostOnly)
- return m_cpuBuffer.getBuffer();
-
- if (m_bufferResource)
- return m_bufferResource->m_cudaMemory;
- return nullptr;
- }
-
- /// Returns a resource view for GPU access into the buffer content.
- ResourceViewBase* getResourceView(
- RendererBase* device,
- slang::TypeLayoutReflection* elementLayout,
- slang::BindingType bindingType)
- {
- SLANG_UNUSED(device);
- m_bufferResource->getDesc()->elementSize = (int)elementLayout->getSize();
- return m_bufferView.Ptr();
- }
-};
-
-class CUDAShaderObject
- : public ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
-{
- typedef ShaderObjectBaseImpl<CUDAShaderObject, CUDAShaderObjectLayout, CUDAShaderObjectData>
- Super;
-
-public:
- List<RefPtr<CUDAResourceView>> resources;
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- init(IDevice* device, CUDAShaderObjectLayout* typeLayout);
-
- virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return 0; }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
- {
- *outEntryPoint = nullptr;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW const void* SLANG_MCALL getRawData() override
- {
- return m_data.getBuffer();
- }
-
- virtual SLANG_NO_THROW Size SLANG_MCALL getSize() override
- {
- return (Size)m_data.getCount();
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setData(ShaderOffset const& offset, void const* data, Size size) override
- {
- size = Math::Min(size, (Size)m_data.getCount() - offset.uniformOffset);
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setResource(ShaderOffset const& offset, IResourceView* resourceView) override
- {
- if (!resourceView)
- return SLANG_OK;
-
- auto layout = getLayout();
-
- auto bindingRangeIndex = offset.bindingRangeIndex;
- SLANG_ASSERT(bindingRangeIndex >= 0);
- SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
-
- auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
-
- auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
- auto cudaView = static_cast<CUDAResourceView*>(resourceView);
-
- resources[viewIndex] = cudaView;
-
- if (cudaView->textureResource)
- {
- if (cudaView->m_desc.type == IResourceView::Type::UnorderedAccess)
- {
- auto handle = cudaView->textureResource->m_cudaSurfObj;
- setData(offset, &handle, sizeof(uint64_t));
- }
- else
- {
- auto handle = cudaView->textureResource->getBindlessHandle();
- setData(offset, &handle, sizeof(uint64_t));
- }
- }
- else if (cudaView->memoryResource)
- {
- auto handle = cudaView->memoryResource->getBindlessHandle();
- setData(offset, &handle, sizeof(handle));
- auto sizeOffset = offset;
- sizeOffset.uniformOffset += sizeof(handle);
- auto& desc = *cudaView->memoryResource->getDesc();
- size_t size = desc.sizeInBytes;
- if (desc.elementSize > 1)
- size /= desc.elementSize;
- setData(sizeOffset, &size, sizeof(size));
- }
- else if (cudaView->proxyBuffer)
- {
- auto handle = cudaView->proxyBuffer;
- setData(offset, &handle, sizeof(handle));
- auto sizeOffset = offset;
- sizeOffset.uniformOffset += sizeof(handle);
- auto& desc = *cudaView->memoryResource->getDesc();
- size_t size = desc.sizeInBytes;
- if (desc.elementSize > 1)
- size /= desc.elementSize;
- setData(sizeOffset, &size, sizeof(size));
- }
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setObject(ShaderOffset const& offset, IShaderObject* object) override
- {
- SLANG_RETURN_ON_FAIL(Super::setObject(offset, object));
-
- auto bindingRangeIndex = offset.bindingRangeIndex;
- auto& bindingRange = getLayout()->m_bindingRanges[bindingRangeIndex];
-
- CUDAShaderObject* subObject = static_cast<CUDAShaderObject*>(object);
- switch (bindingRange.bindingType)
- {
- default:
- {
- void* subObjectDataBuffer = subObject->getBuffer();
- SLANG_RETURN_ON_FAIL(setData(offset, &subObjectDataBuffer, sizeof(void*)));
- }
- break;
- case slang::BindingType::ExistentialValue:
- case slang::BindingType::RawBuffer:
- case slang::BindingType::MutableRawBuffer:
- break;
- }
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- setSampler(ShaderOffset const& offset, ISamplerState* sampler) override
- {
- SLANG_UNUSED(sampler);
- SLANG_UNUSED(offset);
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
- ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler) override
- {
- SLANG_UNUSED(sampler);
- setResource(offset, textureView);
- return SLANG_OK;
- }
-};
-
-class CUDAMutableShaderObject : public MutableShaderObject< CUDAMutableShaderObject, CUDAShaderObjectLayout>
-{};
-
-class CUDAEntryPointShaderObject : public CUDAShaderObject
-{
-public:
- CUDAEntryPointShaderObject() { m_data.isHostOnly = true; }
-};
-
-class CUDARootShaderObject : public CUDAShaderObject
-{
-public:
- virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() override { return 1; }
- virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() override { return 1; }
-public:
- List<RefPtr<CUDAEntryPointShaderObject>> entryPointObjects;
- virtual SLANG_NO_THROW Result SLANG_MCALL
- init(IDevice* device, CUDAShaderObjectLayout* typeLayout) override;
- virtual SLANG_NO_THROW GfxCount SLANG_MCALL getEntryPointCount() override { return (GfxCount)entryPointObjects.getCount(); }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- getEntryPoint(GfxIndex index, IShaderObject** outEntryPoint) override
- {
- returnComPtr(outEntryPoint, entryPointObjects[index]);
- return SLANG_OK;
- }
- virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override
- {
- SLANG_RETURN_ON_FAIL(CUDAShaderObject::collectSpecializationArgs(args));
- for (auto& entryPoint : entryPointObjects)
- {
- SLANG_RETURN_ON_FAIL(entryPoint->collectSpecializationArgs(args));
- }
- return SLANG_OK;
- }
-};
-
-class CUDAShaderProgram : public ShaderProgramBase
-{
-public:
- CUmodule cudaModule = nullptr;
- CUfunction cudaKernel;
- String kernelName;
- RefPtr<CUDAProgramLayout> layout;
- RefPtr<CUDAContext> cudaContext;
- ~CUDAShaderProgram()
- {
- if (cudaModule)
- cuModuleUnload(cudaModule);
- }
-};
-
-class CUDAPipelineState : public PipelineStateBase
-{
-public:
- RefPtr<CUDAShaderProgram> shaderProgram;
- void init(const ComputePipelineStateDesc& inDesc)
- {
- PipelineStateDesc pipelineDesc;
- pipelineDesc.type = PipelineType::Compute;
- pipelineDesc.compute = inDesc;
- initializeBase(pipelineDesc);
- }
-};
-
-class CUDAQueryPool : public QueryPoolBase
-{
-public:
- // The event object for each query. Owned by the pool.
- List<CUevent> m_events;
-
- // The event that marks the starting point.
- CUevent m_startEvent;
-
- Result init(const IQueryPool::Desc& desc)
- {
- cuEventCreate(&m_startEvent, 0);
- cuEventRecord(m_startEvent, 0);
- m_events.setCount(desc.count);
- for (SlangInt i = 0; i < m_events.getCount(); i++)
- {
- cuEventCreate(&m_events[i], 0);
- }
- return SLANG_OK;
- }
-
- ~CUDAQueryPool()
- {
- for (auto& e : m_events)
- {
- cuEventDestroy(e);
- }
- cuEventDestroy(m_startEvent);
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getResult(
- GfxIndex queryIndex, GfxCount count, uint64_t* data) override
- {
- for (GfxIndex i = 0; i < count; i++)
- {
- float time = 0.0f;
- cuEventSynchronize(m_events[i + queryIndex]);
- cuEventElapsedTime(&time, m_startEvent, m_events[i + queryIndex]);
- data[i] = (uint64_t)((double)time * 1000.0f);
- }
- return SLANG_OK;
- }
-};
-
-class CUDADevice : public RendererBase
-{
-private:
- static const CUDAReportStyle reportType = CUDAReportStyle::Normal;
- static int _calcSMCountPerMultiProcessor(int major, int minor)
- {
- // Defines for GPU Architecture types (using the SM version to determine
- // the # of cores per SM
- struct SMInfo
- {
- int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
- int coreCount;
- };
-
- static const SMInfo infos[] = {
- {0x30, 192},
- {0x32, 192},
- {0x35, 192},
- {0x37, 192},
- {0x50, 128},
- {0x52, 128},
- {0x53, 128},
- {0x60, 64},
- {0x61, 128},
- {0x62, 128},
- {0x70, 64},
- {0x72, 64},
- {0x75, 64}};
-
- const int sm = ((major << 4) + minor);
- for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
- {
- if (infos[i].sm == sm)
- {
- return infos[i].coreCount;
- }
- }
-
- const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
-
- // It must be newer presumably
- SLANG_ASSERT(sm > last.sm);
-
- // Default to the last entry
- return last.coreCount;
- }
-
- static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
- {
- int smPerMultiproc = 0;
- int maxPerfDevice = -1;
- int deviceCount = 0;
- int devicesProhibited = 0;
-
- uint64_t maxComputePerf = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
-
- // Find the best CUDA capable GPU device
- for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
- {
- int computeMode = -1, major = 0, minor = 0;
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
-
- // If this GPU is not running on Compute Mode prohibited,
- // then we can add it to the list
- if (computeMode != cudaComputeModeProhibited)
- {
- if (major == 9999 && minor == 9999)
- {
- smPerMultiproc = 1;
- }
- else
- {
- smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
- }
-
- int multiProcessorCount = 0, clockRate = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
- &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
- uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
-
- if (compute_perf > maxComputePerf)
- {
- maxComputePerf = compute_perf;
- maxPerfDevice = currentDevice;
- }
- }
- else
- {
- devicesProhibited++;
- }
- }
-
- if (maxPerfDevice < 0)
- {
- return SLANG_FAIL;
- }
-
- *outDeviceIndex = maxPerfDevice;
- return SLANG_OK;
- }
-
- static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
- {
- static CUresult res = cuInit(0);
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
- return SLANG_OK;
- }
-
-private:
- int m_deviceIndex = -1;
- CUdevice m_device = 0;
- RefPtr<CUDAContext> m_context;
- DeviceInfo m_info;
- String m_adapterName;
-
-public:
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeDeviceHandles(InteropHandles* outHandles) override
- {
- outHandles->handles[0].handleValue = (uint64_t)m_device;
- outHandles->handles[0].api = InteropHandleAPI::CUDA;
- return SLANG_OK;
- }
-
- class CommandQueueImpl;
-
- class CommandBufferImpl
- : public ICommandBuffer
- , public CommandWriter
- , public ComObject
- {
- public:
- SLANG_COM_OBJECT_IUNKNOWN_ALL
- ICommandBuffer* getInterface(const Guid& guid)
- {
- if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer)
- return static_cast<ICommandBuffer*>(this);
- return nullptr;
- }
- public:
- CUDADevice* m_device;
- TransientResourceHeapBase* m_transientHeap;
-
- void init(CUDADevice* device, TransientResourceHeapBase* transientHeap)
- {
- m_device = device;
- m_transientHeap = transientHeap;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
- IRenderPassLayout* renderPass,
- IFramebuffer* framebuffer,
- IRenderCommandEncoder** outEncoder) override
- {
- SLANG_UNUSED(renderPass);
- SLANG_UNUSED(framebuffer);
- *outEncoder = nullptr;
- }
-
- class ResourceCommandEncoderImpl : public IResourceCommandEncoder
- {
- public:
- CommandWriter* m_writer;
-
- void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; }
-
- virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
- virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
- IBufferResource* dst,
- Offset dstOffset,
- IBufferResource* src,
- Offset srcOffset,
- Size size) override
- {
- m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL textureBarrier(
- GfxCount count,
- ITextureResource* const* textures,
- ResourceState src,
- ResourceState dst) override
- {}
-
- virtual SLANG_NO_THROW void SLANG_MCALL bufferBarrier(
- GfxCount count,
- IBufferResource* const* buffers,
- ResourceState src,
- ResourceState dst) override
- {}
-
- virtual SLANG_NO_THROW void SLANG_MCALL uploadBufferData(
- IBufferResource* dst, Offset offset, Size size, void* data) override
- {
- m_writer->uploadBufferData(dst, offset, size, data);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- writeTimestamp(IQueryPool* pool, GfxIndex index) override
- {
- m_writer->writeTimestamp(pool, index);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL copyTexture(
- ITextureResource* dst,
- ResourceState dstState,
- SubresourceRange dstSubresource,
- ITextureResource::Offset3D dstOffset,
- ITextureResource* src,
- ResourceState srcState,
- SubresourceRange srcSubresource,
- ITextureResource::Offset3D srcOffset,
- ITextureResource::Extents extent) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(dstState);
- SLANG_UNUSED(dstSubresource);
- SLANG_UNUSED(dstOffset);
- SLANG_UNUSED(src);
- SLANG_UNUSED(srcState);
- SLANG_UNUSED(srcSubresource);
- SLANG_UNUSED(srcOffset);
- SLANG_UNUSED(extent);
- SLANG_UNIMPLEMENTED_X("copyTexture");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL uploadTextureData(
- ITextureResource* dst,
- SubresourceRange subResourceRange,
- ITextureResource::Offset3D offset,
- ITextureResource::Extents extent,
- ITextureResource::SubresourceData* subResourceData,
- GfxCount subResourceDataCount) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(subResourceRange);
- SLANG_UNUSED(offset);
- SLANG_UNUSED(extent);
- SLANG_UNUSED(subResourceData);
- SLANG_UNUSED(subResourceDataCount);
- SLANG_UNIMPLEMENTED_X("uploadTextureData");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL clearResourceView(
- IResourceView* view,
- ClearValue* clearValue,
- ClearResourceViewFlags::Enum flags) override
- {
- SLANG_UNUSED(view);
- SLANG_UNUSED(clearValue);
- SLANG_UNUSED(flags);
- SLANG_UNIMPLEMENTED_X("clearResourceView");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL resolveResource(
- ITextureResource* source,
- ResourceState sourceState,
- SubresourceRange sourceRange,
- ITextureResource* dest,
- ResourceState destState,
- SubresourceRange destRange) override
- {
- SLANG_UNUSED(source);
- SLANG_UNUSED(sourceState);
- SLANG_UNUSED(sourceRange);
- SLANG_UNUSED(dest);
- SLANG_UNUSED(destState);
- SLANG_UNUSED(destRange);
- SLANG_UNIMPLEMENTED_X("resolveResource");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL resolveQuery(
- IQueryPool* queryPool,
- GfxIndex index,
- GfxCount count,
- IBufferResource* buffer,
- Offset offset) override
- {
- SLANG_UNUSED(queryPool);
- SLANG_UNUSED(index);
- SLANG_UNUSED(count);
- SLANG_UNUSED(buffer);
- SLANG_UNUSED(offset);
- SLANG_UNIMPLEMENTED_X("resolveQuery");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL copyTextureToBuffer(
- IBufferResource* dst,
- Offset dstOffset,
- Size dstSize,
- Size dstRowStride,
- ITextureResource* src,
- ResourceState srcState,
- SubresourceRange srcSubresource,
- ITextureResource::Offset3D srcOffset,
- ITextureResource::Extents extent) override
- {
- SLANG_UNUSED(dst);
- SLANG_UNUSED(dstOffset);
- SLANG_UNUSED(dstSize);
- SLANG_UNUSED(dstRowStride);
- SLANG_UNUSED(src);
- SLANG_UNUSED(srcState);
- SLANG_UNUSED(srcSubresource);
- SLANG_UNUSED(srcOffset);
- SLANG_UNUSED(extent);
- SLANG_UNIMPLEMENTED_X("copyTextureToBuffer");
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL textureSubresourceBarrier(
- ITextureResource* texture,
- SubresourceRange subresourceRange,
- ResourceState src,
- ResourceState dst) override
- {
- SLANG_UNUSED(texture);
- SLANG_UNUSED(subresourceRange);
- SLANG_UNUSED(src);
- SLANG_UNUSED(dst);
- SLANG_UNIMPLEMENTED_X("textureSubresourceBarrier");
- }
- virtual SLANG_NO_THROW void SLANG_MCALL
- beginDebugEvent(const char* name, float rgbColor[3]) override
- {
- SLANG_UNUSED(name);
- SLANG_UNUSED(rgbColor);
- }
- virtual SLANG_NO_THROW void SLANG_MCALL endDebugEvent() override {}
- };
-
- ResourceCommandEncoderImpl m_resourceCommandEncoder;
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeResourceCommands(IResourceCommandEncoder** outEncoder) override
- {
- m_resourceCommandEncoder.init(this);
- *outEncoder = &m_resourceCommandEncoder;
- }
-
- class ComputeCommandEncoderImpl
- : public IComputeCommandEncoder
- , public ResourceCommandEncoderImpl
- {
- public:
- SLANG_GFX_FORWARD_RESOURCE_COMMAND_ENCODER_IMPL(ResourceCommandEncoderImpl)
- public:
- CommandWriter* m_writer;
- CommandBufferImpl* m_commandBuffer;
- RefPtr<ShaderObjectBase> m_rootObject;
- virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
- void init(CommandBufferImpl* cmdBuffer)
- {
- m_writer = cmdBuffer;
- m_commandBuffer = cmdBuffer;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- bindPipeline(IPipelineState* state, IShaderObject** outRootObject) override
- {
- m_writer->setPipelineState(state);
- PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
- SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
- pipelineImpl->m_program, m_rootObject.writeRef()));
- returnComPtr(outRootObject, m_rootObject);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- bindPipelineWithRootObject(IPipelineState* state, IShaderObject* rootObject) override
- {
- m_writer->setPipelineState(state);
- PipelineStateBase* pipelineImpl = static_cast<PipelineStateBase*>(state);
- SLANG_RETURN_ON_FAIL(m_commandBuffer->m_device->createRootShaderObject(
- pipelineImpl->m_program, m_rootObject.writeRef()));
- m_rootObject->copyFrom(rootObject, m_commandBuffer->m_transientHeap);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override
- {
- m_writer->bindRootShaderObject(m_rootObject);
- m_writer->dispatchCompute(x, y, z);
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- dispatchComputeIndirect(IBufferResource* argBuffer, Offset offset) override
- {
- SLANG_UNIMPLEMENTED_X("dispatchComputeIndirect");
- }
- };
-
- ComputeCommandEncoderImpl m_computeCommandEncoder;
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeComputeCommands(IComputeCommandEncoder** outEncoder) override
- {
- m_computeCommandEncoder.init(this);
- *outEncoder = &m_computeCommandEncoder;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL
- encodeRayTracingCommands(IRayTracingCommandEncoder** outEncoder) override
- {
- *outEncoder = nullptr;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
- {
- return SLANG_FAIL;
- }
- };
-
- class CommandQueueImpl
- : public ICommandQueue
- , public ComObject
- {
- public:
- SLANG_COM_OBJECT_IUNKNOWN_ALL
- ICommandQueue* getInterface(const Guid& guid)
- {
- if (guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue)
- return static_cast<ICommandQueue*>(this);
- return nullptr;
- }
-
- public:
- RefPtr<CUDAPipelineState> currentPipeline;
- RefPtr<CUDARootShaderObject> currentRootObject;
- RefPtr<CUDADevice> renderer;
- CUstream stream;
- Desc m_desc;
- public:
- void init(CUDADevice* inRenderer)
- {
- renderer = inRenderer;
- m_desc.type = ICommandQueue::QueueType::Graphics;
- cuStreamCreate(&stream, 0);
- }
- ~CommandQueueImpl()
- {
- cuStreamSynchronize(stream);
- cuStreamDestroy(stream);
- currentPipeline = nullptr;
- currentRootObject = nullptr;
- }
-
- public:
- virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override
- {
- return m_desc;
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL executeCommandBuffers(
- GfxCount count, ICommandBuffer* const* commandBuffers, IFence* fence, uint64_t valueToSignal) override
- {
- SLANG_UNUSED(valueToSignal);
- // TODO: implement fence.
- assert(fence == nullptr);
- for (GfxIndex i = 0; i < count; i++)
- {
- execute(static_cast<CommandBufferImpl*>(commandBuffers[i]));
- }
- }
-
- virtual SLANG_NO_THROW void SLANG_MCALL waitOnHost() override
- {
- auto resultCode = cuStreamSynchronize(stream);
- if (resultCode != cudaSuccess)
- SLANG_CUDA_HANDLE_ERROR(resultCode);
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL waitForFenceValuesOnDevice(
- GfxCount fenceCount, IFence** fences, uint64_t* waitValues) override
- {
- return SLANG_FAIL;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL getNativeHandle(InteropHandle* outHandle) override
- {
- return SLANG_FAIL;
- }
-
- public:
- void setPipelineState(IPipelineState* state)
- {
- currentPipeline = dynamic_cast<CUDAPipelineState*>(state);
- }
-
- Result bindRootShaderObject(IShaderObject* object)
- {
- currentRootObject = dynamic_cast<CUDARootShaderObject*>(object);
- if (currentRootObject)
- return SLANG_OK;
- return SLANG_E_INVALID_ARG;
- }
-
- void dispatchCompute(int x, int y, int z)
- {
- // Specialize the compute kernel based on the shader object bindings.
- RefPtr<PipelineStateBase> newPipeline;
- renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
- currentPipeline = static_cast<CUDAPipelineState*>(newPipeline.Ptr());
-
- // Find out thread group size from program reflection.
- auto& kernelName = currentPipeline->shaderProgram->kernelName;
- auto programLayout = static_cast<CUDAProgramLayout*>(currentRootObject->getLayout());
- int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice());
- SLANG_ASSERT(kernelId != -1);
- UInt threadGroupSize[3];
- programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize);
-
- int sharedSizeInBytes;
- cuFuncGetAttribute(
- &sharedSizeInBytes,
- CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
- currentPipeline->shaderProgram->cudaKernel);
-
- // Copy global parameter data to the `SLANG_globalParams` symbol.
- {
- CUdeviceptr globalParamsSymbol = 0;
- size_t globalParamsSymbolSize = 0;
- cuModuleGetGlobal(
- &globalParamsSymbol,
- &globalParamsSymbolSize,
- currentPipeline->shaderProgram->cudaModule,
- "SLANG_globalParams");
-
- CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
- cudaMemcpyAsync(
- (void*)globalParamsSymbol,
- (void*)globalParamsCUDAData,
- globalParamsSymbolSize,
- cudaMemcpyDefault,
- 0);
- }
- //
- // The argument data for the entry-point parameters are already
- // stored in host memory in a CUDAEntryPointShaderObject, as expected by cuLaunchKernel.
- //
- auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->getBuffer();
- auto entryPointDataSize =
- currentRootObject->entryPointObjects[kernelId]->getBufferSize();
-
- void* extraOptions[] = {
- CU_LAUNCH_PARAM_BUFFER_POINTER,
- entryPointBuffer,
- CU_LAUNCH_PARAM_BUFFER_SIZE,
- &entryPointDataSize,
- CU_LAUNCH_PARAM_END,
- };
-
- // Once we have all the decessary data extracted and/or
- // set up, we can launch the kernel and see what happens.
- //
- auto cudaLaunchResult = cuLaunchKernel(
- currentPipeline->shaderProgram->cudaKernel,
- x,
- y,
- z,
- int(threadGroupSize[0]),
- int(threadGroupSize[1]),
- int(threadGroupSize[2]),
- sharedSizeInBytes,
- stream,
- nullptr,
- extraOptions);
-
- SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS);
- }
-
- void copyBuffer(
- IBufferResource* dst,
- size_t dstOffset,
- IBufferResource* src,
- size_t srcOffset,
- size_t size)
- {
- auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
- auto srcImpl = static_cast<MemoryCUDAResource*>(src);
- cudaMemcpy(
- (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
- (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
- size,
- cudaMemcpyDefault);
- }
-
- void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
- {
- auto dstImpl = static_cast<MemoryCUDAResource*>(dst);
- cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
- }
-
- void writeTimestamp(IQueryPool* pool, SlangInt index)
- {
- auto poolImpl = static_cast<CUDAQueryPool*>(pool);
- cuEventRecord(poolImpl->m_events[index], stream);
- }
-
- void execute(CommandBufferImpl* commandBuffer)
- {
- for (auto& cmd : commandBuffer->m_commands)
- {
- switch (cmd.name)
- {
- case CommandName::SetPipelineState:
- setPipelineState(commandBuffer->getObject<PipelineStateBase>(cmd.operands[0]));
- break;
- case CommandName::BindRootShaderObject:
- bindRootShaderObject(
- commandBuffer->getObject<ShaderObjectBase>(cmd.operands[0]));
- break;
- case CommandName::DispatchCompute:
- dispatchCompute(
- int(cmd.operands[0]), int(cmd.operands[1]), int(cmd.operands[2]));
- break;
- case CommandName::CopyBuffer:
- copyBuffer(
- commandBuffer->getObject<BufferResource>(cmd.operands[0]),
- cmd.operands[1],
- commandBuffer->getObject<BufferResource>(cmd.operands[2]),
- cmd.operands[3],
- cmd.operands[4]);
- break;
- case CommandName::UploadBufferData:
- uploadBufferData(
- commandBuffer->getObject<BufferResource>(cmd.operands[0]),
- cmd.operands[1],
- cmd.operands[2],
- commandBuffer->getData<uint8_t>(cmd.operands[3]));
- break;
- case CommandName::WriteTimestamp:
- writeTimestamp(
- commandBuffer->getObject<QueryPoolBase>(cmd.operands[0]),
- (SlangInt)cmd.operands[1]);
- }
- }
- }
- };
-
- using TransientResourceHeapImpl = SimpleTransientResourceHeap<CUDADevice, CommandBufferImpl>;
-
-public:
- virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override
- {
- SLANG_RETURN_ON_FAIL(slangContext.initialize(
- desc.slang,
- SLANG_PTX,
- "sm_5_1",
- makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
-
- SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
-
- SLANG_RETURN_ON_FAIL(_initCuda(reportType));
-
- SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
- m_context = new CUDAContext();
-
- int count = -1;
- cuDeviceGetCount(&count);
- SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
-
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
- cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
-
- // Not clear how to detect half support on CUDA. For now we'll assume we have it
- {
- m_features.add("half");
- }
-
- // Initialize DeviceInfo
- {
- m_info.deviceType = DeviceType::CUDA;
- m_info.bindingStyle = BindingStyle::CUDA;
- m_info.projectionStyle = ProjectionStyle::DirectX;
- m_info.apiName = "CUDA";
- static const float kIdentity[] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
- ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
- cudaDeviceProp deviceProperties;
- cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
- m_adapterName = deviceProperties.name;
- m_info.adapterName = m_adapterName.begin();
- m_info.timestampFrequency = 1000000;
- }
-
- return SLANG_OK;
- }
-
- Result getCUDAFormat(Format format, CUarray_format* outFormat)
- {
- // TODO: Expand to cover all available formats that can be supported in CUDA
- switch (format)
- {
- case Format::R32G32B32A32_FLOAT:
- case Format::R32G32B32_FLOAT:
- case Format::R32G32_FLOAT:
- case Format::R32_FLOAT:
- case Format::D32_FLOAT:
- *outFormat = CU_AD_FORMAT_FLOAT;
- return SLANG_OK;
- case Format::R16G16B16A16_FLOAT:
- case Format::R16G16_FLOAT:
- case Format::R16_FLOAT:
- *outFormat = CU_AD_FORMAT_HALF;
- return SLANG_OK;
- case Format::R32G32B32A32_UINT:
- case Format::R32G32B32_UINT:
- case Format::R32G32_UINT:
- case Format::R32_UINT:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
- return SLANG_OK;
- case Format::R16G16B16A16_UINT:
- case Format::R16G16_UINT:
- case Format::R16_UINT:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
- return SLANG_OK;
- case Format::R8G8B8A8_UINT:
- case Format::R8G8_UINT:
- case Format::R8_UINT:
- case Format::R8G8B8A8_UNORM:
- *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
- return SLANG_OK;
- case Format::R32G32B32A32_SINT:
- case Format::R32G32B32_SINT:
- case Format::R32G32_SINT:
- case Format::R32_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT32;
- return SLANG_OK;
- case Format::R16G16B16A16_SINT:
- case Format::R16G16_SINT:
- case Format::R16_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT16;
- return SLANG_OK;
- case Format::R8G8B8A8_SINT:
- case Format::R8G8_SINT:
- case Format::R8_SINT:
- *outFormat = CU_AD_FORMAT_SIGNED_INT8;
- return SLANG_OK;
- default:
- SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
- return SLANG_FAIL;
- }
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
- const ITextureResource::Desc& desc,
- const ITextureResource::SubresourceData* initData,
- ITextureResource** outResource) override
- {
- TextureResource::Desc srcDesc = fixupTextureDesc(desc);
-
- RefPtr<TextureCUDAResource> tex = new TextureCUDAResource(srcDesc);
- tex->m_cudaContext = m_context;
-
- CUresourcetype resourceType;
-
- // The size of the element/texel in bytes
- size_t elementSize = 0;
-
- // Our `ITextureResource::Desc` uses an enumeration to specify
- // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
- // `cuMipmappedArrayCreate` seemingly relies on a policy where
- // the extents of the array in dimenions above the rank are
- // specified as zero (e.g., a 1D texture requires `height==0`).
- //
- // We will start by massaging the extents as specified by the
- // user into a form that CUDA wants/expects, based on the
- // texture shape as specified in the `desc`.
- //
- int width = desc.size.width;
- int height = desc.size.height;
- int depth = desc.size.depth;
- switch (desc.type)
- {
- case IResource::Type::Texture1D:
- height = 0;
- depth = 0;
- break;
-
- case IResource::Type::Texture2D:
- depth = 0;
- break;
-
- case IResource::Type::Texture3D:
- break;
-
- case IResource::Type::TextureCube:
- depth = 1;
- break;
- }
-
- {
- CUarray_format format = CU_AD_FORMAT_FLOAT;
- int numChannels = 0;
-
- SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
- FormatInfo info;
- gfxGetFormatInfo(desc.format, &info);
- numChannels = info.channelCount;
-
- switch (format)
- {
- case CU_AD_FORMAT_FLOAT:
- {
- elementSize = sizeof(float) * numChannels;
- break;
- }
- case CU_AD_FORMAT_HALF:
- {
- elementSize = sizeof(uint16_t) * numChannels;
- break;
- }
- case CU_AD_FORMAT_UNSIGNED_INT8:
- {
- elementSize = sizeof(uint8_t) * numChannels;
- break;
- }
- default:
- {
- SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
- return SLANG_FAIL;
- }
- }
-
- if (desc.numMipLevels > 1)
- {
- resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Width = width;
- arrayDesc.Height = height;
- arrayDesc.Depth = depth;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
- arrayDesc.Flags = 0;
-
- if (desc.arraySize > 1)
- {
- if (desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
- arrayDesc.Depth = desc.arraySize;
- }
- else
- {
- SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
- return SLANG_FAIL;
- }
- }
-
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- arrayDesc.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(
- cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
- }
- else
- {
- resourceType = CU_RESOURCE_TYPE_ARRAY;
-
- if (desc.arraySize > 1)
- {
- if (desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube)
- {
- SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
- return SLANG_FAIL;
- }
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- // Set the depth as the array length
- arrayDesc.Depth = desc.arraySize;
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Depth *= 6;
- }
-
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else if (desc.type == IResource::Type::Texture3D ||
- desc.type == IResource::Type::TextureCube)
- {
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Depth = depth;
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- arrayDesc.Flags = 0;
-
- // Handle cube texture
- if (desc.type == IResource::Type::TextureCube)
- {
- arrayDesc.Depth = 6;
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- // Allocate the array, will work for 1D or 2D case
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
- }
- }
- }
-
- // Work space for holding data for uploading if it needs to be rearranged
- if (initData)
- {
- List<uint8_t> workspace;
- for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
- {
- int mipWidth = width >> mipLevel;
- int mipHeight = height >> mipLevel;
- int mipDepth = depth >> mipLevel;
-
- mipWidth = (mipWidth == 0) ? 1 : mipWidth;
- mipHeight = (mipHeight == 0) ? 1 : mipHeight;
- mipDepth = (mipDepth == 0) ? 1 : mipDepth;
-
- // If it's a cubemap then the depth is always 6
- if (desc.type == IResource::Type::TextureCube)
- {
- mipDepth = 6;
- }
-
- auto dstArray = tex->m_cudaArray;
- if (tex->m_cudaMipMappedArray)
- {
- // Get the array for the mip level
- SLANG_CUDA_RETURN_ON_FAIL(
- cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
- }
- SLANG_ASSERT(dstArray);
-
- // Check using the desc to see if it's plausible
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
-
- SLANG_ASSERT(mipWidth == arrayDesc.Width);
- SLANG_ASSERT(
- mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
- }
-
- const void* srcDataPtr = nullptr;
-
- if (desc.arraySize > 1)
- {
- SLANG_ASSERT(
- desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube);
-
- // TODO(JS): Here I assume that arrays are just held contiguously within a
- // 'face' This seems reasonable and works with the Copy3D.
- const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- Index faceCount = desc.arraySize;
- if (desc.type == IResource::Type::TextureCube)
- {
- faceCount *= 6;
- }
-
- const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
- workspace.setCount(mipSizeInBytes);
-
- // We need to add the face data from each mip
- // We iterate over face count so we copy all of the cubemap faces
- for (Index j = 0; j < faceCount; j++)
- {
- const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
- // Copy over to the workspace to make contiguous
- ::memcpy(
- workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
- }
-
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- if (desc.type == IResource::Type::TextureCube)
- {
- size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- workspace.setCount(faceSizeInBytes * 6);
- // Copy the data over to make contiguous
- for (Index j = 0; j < 6; j++)
- {
- const auto srcData =
- initData[mipLevel + j * desc.numMipLevels].data;
- ::memcpy(
- workspace.getBuffer() + faceSizeInBytes * j,
- srcData,
- faceSizeInBytes);
- }
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- const auto srcData = initData[mipLevel].data;
- srcDataPtr = srcData;
- }
- }
-
- if (desc.arraySize > 1)
- {
- SLANG_ASSERT(
- desc.type == IResource::Type::Texture1D ||
- desc.type == IResource::Type::Texture2D ||
- desc.type == IResource::Type::TextureCube);
-
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- // Set the depth to the array length
- copyParam.Depth = desc.arraySize;
-
- if (desc.type == IResource::Type::TextureCube)
- {
- copyParam.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- }
- else
- {
- switch (desc.type)
- {
- case IResource::Type::Texture1D:
- case IResource::Type::Texture2D:
- {
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
- break;
- }
- case IResource::Type::Texture3D:
- case IResource::Type::TextureCube:
- {
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- copyParam.Depth = mipDepth;
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- break;
- }
-
- default:
- {
- SLANG_ASSERT(!"Not implemented");
- break;
- }
- }
- }
- }
- }
- // Set up texture sampling parameters, and create final texture obj
-
- {
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
- resDesc.resType = resourceType;
-
- if (tex->m_cudaArray)
- {
- resDesc.res.array.hArray = tex->m_cudaArray;
- }
- if (tex->m_cudaMipMappedArray)
- {
- resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
- }
-
- // If the texture might be used as a UAV, then we need to allocate
- // a CUDA "surface" for it.
- //
- // Note: We cannot do this unconditionally, because it will fail
- // on surfaces that are not usable as UAVs (e.g., those with
- // mipmaps).
- //
- // TODO: We should really only be allocating the array at the
- // time we create a resource, and then allocate the surface or
- // texture objects as part of view creation.
- //
- if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
- {
- // On CUDA surfaces only support a single MIP map
- SLANG_ASSERT(desc.numMipLevels == 1);
-
- SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
- }
-
-
- // Create handle for sampling.
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
- texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- SLANG_CUDA_RETURN_ON_FAIL(
- cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
- }
-
- returnComPtr(outResource, tex);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
- const IBufferResource::Desc& descIn,
- const void* initData,
- IBufferResource** outResource) override
- {
- auto desc = fixupBufferDesc(descIn);
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
- resource->m_cudaContext = m_context;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
- if (initData)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
- }
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferFromSharedHandle(
- InteropHandle handle,
- const IBufferResource::Desc& desc,
- IBufferResource** outResource) override
- {
- if (handle.handleValue == 0)
- {
- *outResource = nullptr;
- return SLANG_OK;
- }
-
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource(desc);
- resource->m_cudaContext = m_context;
-
- // CUDA manages sharing of buffers through the idea of an
- // "external memory" object, which represents the relationship
- // with another API's objects. In order to create this external
- // memory association, we first need to fill in a descriptor struct.
- cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
- memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
- switch (handle.api)
- {
- case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
- break;
- case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
- break;
- default:
- return SLANG_FAIL;
- }
- externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
- externalMemoryHandleDesc.size = desc.sizeInBytes;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
- // Once we have filled in the descriptor, we can request
- // that CUDA create the required association between the
- // external buffer and its own memory.
- cudaExternalMemory_t externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
- resource->m_cudaExternalMemory = externalMemory;
-
- // The CUDA "external memory" handle is not itself a device
- // pointer, so we need to query for a suitable device address
- // for the buffer with another call.
- //
- // Just as for the external memory, we fill in a descriptor
- // structure (although in this case we only need to specify
- // the size).
- cudaExternalMemoryBufferDesc bufferDesc;
- memset(&bufferDesc, 0, sizeof(bufferDesc));
- bufferDesc.size = desc.sizeInBytes;
-
- // Finally, we can "map" the buffer to get a device address.
- void* deviceAddress;
- SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
- resource->m_cudaMemory = deviceAddress;
-
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureFromSharedHandle(
- InteropHandle handle,
- const ITextureResource::Desc& desc,
- const size_t size,
- ITextureResource** outResource) override
- {
- if (handle.handleValue == 0)
- {
- *outResource = nullptr;
- return SLANG_OK;
- }
-
- RefPtr<TextureCUDAResource> resource = new TextureCUDAResource(desc);
- resource->m_cudaContext = m_context;
-
- // CUDA manages sharing of buffers through the idea of an
- // "external memory" object, which represents the relationship
- // with another API's objects. In order to create this external
- // memory association, we first need to fill in a descriptor struct.
- CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
- memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
- switch (handle.api)
- {
- case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
- break;
- case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
- break;
- default:
- return SLANG_FAIL;
- }
- externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
- externalMemoryHandleDesc.size = size;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
-
- CUexternalMemory externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
- resource->m_cudaExternalMemory = externalMemory;
-
- FormatInfo formatInfo;
- SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- arrayDesc.Depth = desc.size.depth;
- arrayDesc.Height = desc.size.height;
- arrayDesc.Width = desc.size.width;
- arrayDesc.NumChannels = formatInfo.channelCount;
- getCUDAFormat(desc.format, &arrayDesc.Format);
- arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
-
- CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
- memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
- externalMemoryMipDesc.offset = 0;
- externalMemoryMipDesc.arrayDesc = arrayDesc;
- externalMemoryMipDesc.numLevels = desc.numMipLevels;
-
- CUmipmappedArray mipArray;
- SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
- resource->m_cudaMipMappedArray = mipArray;
-
- CUarray cuArray;
- SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
- resource->m_cudaArray = cuArray;
-
- CUDA_RESOURCE_DESC surfDesc;
- memset(&surfDesc, 0, sizeof(surfDesc));
- surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- surfDesc.res.array.hArray = cuArray;
-
- CUsurfObject surface;
- SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
- resource->m_cudaSurfObj = surface;
-
- returnComPtr(outResource, resource);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
- ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView) override
- {
- RefPtr<CUDAResourceView> view = new CUDAResourceView();
- view->m_desc = desc;
- view->textureResource = dynamic_cast<TextureCUDAResource*>(texture);
- returnComPtr(outView, view);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
- IBufferResource* buffer,
- IBufferResource* counterBuffer,
- IResourceView::Desc const& desc,
- IResourceView** outView) override
- {
- RefPtr<CUDAResourceView> view = new CUDAResourceView();
- view->m_desc = desc;
- view->memoryResource = dynamic_cast<MemoryCUDAResource*>(buffer);
- returnComPtr(outView, view);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createQueryPool(
- const IQueryPool::Desc& desc,
- IQueryPool** outPool) override
- {
- RefPtr<CUDAQueryPool> pool = new CUDAQueryPool();
- SLANG_RETURN_ON_FAIL(pool->init(desc));
- returnComPtr(outPool, pool);
- return SLANG_OK;
- }
-
- virtual Result createShaderObjectLayout(
- slang::TypeLayoutReflection* typeLayout,
- ShaderObjectLayoutBase** outLayout) override
- {
- RefPtr<CUDAShaderObjectLayout> cudaLayout;
- cudaLayout = new CUDAShaderObjectLayout(this, typeLayout);
- returnRefPtrMove(outLayout, cudaLayout);
- return SLANG_OK;
- }
-
- virtual Result createShaderObject(
- ShaderObjectLayoutBase* layout,
- IShaderObject** outObject) override
- {
- RefPtr<CUDAShaderObject> result = new CUDAShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
- returnComPtr(outObject, result);
- return SLANG_OK;
- }
-
- virtual Result createMutableShaderObject(
- ShaderObjectLayoutBase* layout,
- IShaderObject** outObject) override
- {
- RefPtr<CUDAMutableShaderObject> result = new CUDAMutableShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<CUDAShaderObjectLayout*>(layout)));
- returnComPtr(outObject, result);
- return SLANG_OK;
- }
-
- Result createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
- {
- auto cudaProgram = dynamic_cast<CUDAShaderProgram*>(program);
- auto cudaLayout = cudaProgram->layout;
-
- RefPtr<CUDARootShaderObject> result = new CUDARootShaderObject();
- SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
- returnRefPtrMove(outObject, result);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createProgram(
- const IShaderProgram::Desc& desc,
- IShaderProgram** outProgram,
- ISlangBlob** outDiagnosticBlob) override
- {
- // If this is a specializable program, we just keep a reference to the slang program and
- // don't actually create any kernels. This program will be specialized later when we know
- // the shader object bindings.
- RefPtr<CUDAShaderProgram> cudaProgram = new CUDAShaderProgram();
- cudaProgram->init(desc);
- cudaProgram->cudaContext = m_context;
- if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
- {
- cudaProgram->layout = new CUDAProgramLayout(this, desc.slangGlobalScope->getLayout());
- returnComPtr(outProgram, cudaProgram);
- return SLANG_OK;
- }
-
- ComPtr<ISlangBlob> kernelCode;
- ComPtr<ISlangBlob> diagnostics;
- auto compileResult = desc.slangGlobalScope->getEntryPointCode(
- (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
- if (diagnostics)
- {
- getDebugCallback()->handleMessage(
- compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
- DebugMessageSource::Slang,
- (char*)diagnostics->getBufferPointer());
- if (outDiagnosticBlob)
- returnComPtr(outDiagnosticBlob, diagnostics);
- }
- SLANG_RETURN_ON_FAIL(compileResult);
-
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
- cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
- &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
-
- auto slangGlobalScope = desc.slangGlobalScope;
- if( slangGlobalScope )
- {
- cudaProgram->slangGlobalScope = slangGlobalScope;
-
- auto slangProgramLayout = slangGlobalScope->getLayout();
- if(!slangProgramLayout)
- return SLANG_FAIL;
-
- RefPtr<CUDAProgramLayout> cudaLayout;
- cudaLayout = new CUDAProgramLayout(this, slangProgramLayout);
- cudaLayout->programLayout = slangProgramLayout;
- cudaProgram->layout = cudaLayout;
- }
-
- returnComPtr(outProgram, cudaProgram);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
- const ComputePipelineStateDesc& desc, IPipelineState** outState) override
- {
- RefPtr<CUDAPipelineState> state = new CUDAPipelineState();
- state->shaderProgram = static_cast<CUDAShaderProgram*>(desc.program);
- state->init(desc);
- returnComPtr(outState, state);
- return Result();
- }
-
- void* map(IBufferResource* buffer)
- {
- return static_cast<MemoryCUDAResource*>(buffer)->m_cudaMemory;
- }
-
- void unmap(IBufferResource* buffer)
- {
- SLANG_UNUSED(buffer);
- }
-
- virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override
- {
- return m_info;
- }
-
-public:
- virtual SLANG_NO_THROW Result SLANG_MCALL createTransientResourceHeap(
- const ITransientResourceHeap::Desc& desc,
- ITransientResourceHeap** outHeap) override
- {
- RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
- SLANG_RETURN_ON_FAIL(result->init(this, desc));
- returnComPtr(outHeap, result);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override
- {
- RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
- queue->init(this);
- returnComPtr(outQueue, queue);
- return SLANG_OK;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
- const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(window);
- SLANG_UNUSED(outSwapchain);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
- const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outLayout);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outFramebuffer);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
- const IRenderPassLayout::Desc& desc,
- IRenderPassLayout** outRenderPassLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outRenderPassLayout);
- return SLANG_FAIL;
- }
- virtual SLANG_NO_THROW Result SLANG_MCALL
- createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override
- {
- SLANG_UNUSED(desc);
- *outSampler = nullptr;
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
- IInputLayout::Desc const& desc,
- IInputLayout** outLayout) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outLayout);
- return SLANG_E_NOT_AVAILABLE;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
- const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override
- {
- SLANG_UNUSED(desc);
- SLANG_UNUSED(outState);
- return SLANG_E_NOT_AVAILABLE;
- }
-
- virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
- ITextureResource* texture,
- ResourceState state,
- ISlangBlob** outBlob,
- size_t* outRowPitch,
- size_t* outPixelSize) override
- {
- auto textureImpl = static_cast<TextureCUDAResource*>(texture);
- RefPtr<ListBlob> blob = new ListBlob();
-
- auto desc = textureImpl->getDesc();
- auto width = desc->size.width;
- auto height = desc->size.height;
- FormatInfo sizeInfo;
- SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
- size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
- size_t rowPitch = width * pixelSize;
- size_t size = height * rowPitch;
- blob->m_data.setCount((Index)size);
-
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.srcArray = textureImpl->m_cudaArray;
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.dstHost = blob->m_data.getBuffer();
- copyParam.dstPitch = rowPitch;
- copyParam.WidthInBytes = copyParam.dstPitch;
- copyParam.Height = height;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
-
- *outRowPitch = rowPitch;
- *outPixelSize = pixelSize;
- returnComPtr(outBlob, blob);
- return SLANG_OK;
- }
-
- virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
- IBufferResource* buffer,
- size_t offset,
- size_t size,
- ISlangBlob** outBlob) override
- {
- auto bufferImpl = static_cast<MemoryCUDAResource*>(buffer);
- RefPtr<ListBlob> blob = new ListBlob();
- blob->m_data.setCount((Index)size);
- cudaMemcpy(
- blob->m_data.getBuffer(),
- (uint8_t*)bufferImpl->m_cudaMemory + offset,
- size,
- cudaMemcpyDefault);
- returnComPtr(outBlob, blob);
- return SLANG_OK;
- }
-};
-
-SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
- m_layout = typeLayout;
-
- // If the layout tells us that there is any uniform data,
- // then we need to allocate a constant buffer to hold that data.
- //
- // TODO: Do we need to allocate a shadow copy for use from
- // the CPU?
- //
- // TODO: When/where do we bind this constant buffer into
- // a descriptor set for later use?
- //
- auto slangLayout = getLayout()->getElementTypeLayout();
- size_t uniformSize = slangLayout->getSize();
- if (uniformSize)
- {
- m_data.setCount((Index)uniformSize);
- }
-
- // If the layout specifies that we have any resources or sub-objects,
- // then we need to size the appropriate arrays to account for them.
- //
- // Note: the counts here are the *total* number of resources/sub-objects
- // and not just the number of resource/sub-object ranges.
- //
- resources.setCount(typeLayout->getResourceCount());
- m_objects.setCount(typeLayout->getSubObjectCount());
-
- for (auto subObjectRange : getLayout()->subObjectRanges)
- {
- RefPtr<CUDAShaderObjectLayout> subObjectLayout = subObjectRange.layout;
-
- // In the case where the sub-object range represents an
- // existential-type leaf field (e.g., an `IBar`), we
- // cannot pre-allocate the object(s) to go into that
- // range, since we can't possibly know what to allocate
- // at this point.
- //
- if (!subObjectLayout)
- continue;
- //
- // Otherwise, we will allocate a sub-object to fill
- // in each entry in this range, based on the layout
- // information we already have.
-
- auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
- for (Index i = 0; i < bindingRangeInfo.count; ++i)
- {
- RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject();
- SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
-
- ShaderOffset offset;
- offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
- offset.bindingRangeIndex = (GfxIndex)subObjectRange.bindingRangeIndex;
- offset.bindingArrayIndex = (GfxIndex)i;
-
- SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
- }
- }
- return SLANG_OK;
-}
-
-SlangResult CUDARootShaderObject::init(IDevice* device, CUDAShaderObjectLayout* typeLayout)
-{
- SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(device, typeLayout));
- auto programLayout = dynamic_cast<CUDAProgramLayout*>(typeLayout);
- for (auto& entryPoint : programLayout->entryPointLayouts)
- {
- RefPtr<CUDAEntryPointShaderObject> object = new CUDAEntryPointShaderObject();
- SLANG_RETURN_ON_FAIL(object->init(device, entryPoint));
- entryPointObjects.add(object);
- }
- return SLANG_OK;
-}
-
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
- RefPtr<CUDADevice> result = new CUDADevice();
- SLANG_RETURN_ON_FAIL(result->initialize(*desc));
- returnComPtr(outDevice, result);
- return SLANG_OK;
-}
-#else
-SlangResult SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
-{
- SLANG_UNUSED(desc);
- *outDevice = nullptr;
- return SLANG_FAIL;
-}
-#endif
-
-}