From 856d7d3705cedabcc2e9389a3f0ac730b0e33476 Mon Sep 17 00:00:00 2001 From: Yong He Date: Fri, 11 Dec 2020 09:42:23 -0800 Subject: Implements CUDA renderer in gfx. (#1637) * Implements CUDA renderer in gfx. * Revert unnecessary change. * Revert unnecessary changes. Co-authored-by: Tim Foley --- build/visual-studio/gfx/gfx.vcxproj | 2 + build/visual-studio/gfx/gfx.vcxproj.filters | 6 + premake5.lua | 20 +- tests/compute/array-param.slang | 1 + tests/compute/entry-point-uniform-params.slang | 3 +- tools/gfx/cuda/render-cuda.cpp | 1506 ++++++++++++++++++++++++ tools/gfx/cuda/render-cuda.h | 8 + tools/gfx/render.cpp | 5 + tools/render-test/render-test-main.cpp | 2 +- 9 files changed, 1547 insertions(+), 6 deletions(-) create mode 100644 tools/gfx/cuda/render-cuda.cpp create mode 100644 tools/gfx/cuda/render-cuda.h diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj index d04ac9c2a..841a11c03 100644 --- a/build/visual-studio/gfx/gfx.vcxproj +++ b/build/visual-studio/gfx/gfx.vcxproj @@ -171,6 +171,7 @@ + @@ -197,6 +198,7 @@ + diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters index ada84b968..d55d0d4bb 100644 --- a/build/visual-studio/gfx/gfx.vcxproj.filters +++ b/build/visual-studio/gfx/gfx.vcxproj.filters @@ -9,6 +9,9 @@ + + Header Files + Header Files @@ -83,6 +86,9 @@ + + Source Files + Source Files diff --git a/premake5.lua b/premake5.lua index 801847cde..588caaf79 100644 --- a/premake5.lua +++ b/premake5.lua @@ -723,8 +723,6 @@ toolSharedLibrary "render-test" includedirs { optixPath .. "include/" } end - links { "cuda", "cudart" } - filter { "platforms:x86" } libdirs { cudaPath .. "/lib/Win32/" } @@ -767,9 +765,23 @@ tool "gfx" addSourceDir "tools/gfx/d3d" addSourceDir "tools/gfx/d3d11" addSourceDir "tools/gfx/d3d12" - + addSourceDir "tools/gfx/cuda" addSourceDir "tools/gfx/windows" - + + if type(cudaPath) == "string" then + defines { "GFX_ENABLE_CUDA" } + includedirs { cudaPath .. "/include" } + includedirs { cudaPath .. "/include", cudaPath .. "/common/inc" } + if optixPath then + defines { "GFX_OPTIX" } + includedirs { optixPath .. "include/" } + end + links { "cuda", "cudart" } + filter { "platforms:x86" } + libdirs { cudaPath .. "/lib/Win32/" } + filter { "platforms:x64" } + libdirs { cudaPath .. "/lib/x64/" } + end elseif targetDetail == "mingw" or targetDetail == "cygwin" then -- Don't support any render techs... elseif os.target() == "macosx" then diff --git a/tests/compute/array-param.slang b/tests/compute/array-param.slang index ceac34e78..5f5528a10 100644 --- a/tests/compute/array-param.slang +++ b/tests/compute/array-param.slang @@ -1,3 +1,4 @@ +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -compile-arg -O3 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 diff --git a/tests/compute/entry-point-uniform-params.slang b/tests/compute/entry-point-uniform-params.slang index 736a4c05a..db43f4ab7 100644 --- a/tests/compute/entry-point-uniform-params.slang +++ b/tests/compute/entry-point-uniform-params.slang @@ -3,7 +3,8 @@ // Confirm that `uniform` parameters on // entry points are allowed, and work as expected. -//DISABLE_TEST:CPU_REFLECTION: -profile cs_5_0 -entry computeMain -target cpp +//DISABLE_TEST:CPU_REFLECTION: -profile cs_5_0 -entry computeMain -target cpp +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cpu //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp new file mode 100644 index 000000000..fcb27fe0b --- /dev/null +++ b/tools/gfx/cuda/render-cuda.cpp @@ -0,0 +1,1506 @@ +#include "render-cuda.h" + +#ifdef GFX_ENABLE_CUDA +#include "../render.h" +#include +#include +#include "../../source/core/slang-std-writers.h" +#include "slang.h" +#endif + +namespace gfx +{ +#ifdef GFX_ENABLE_CUDA +using namespace Slang; + +SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } +SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } + +// A enum used to control if errors are reported on failure of CUDA call. +enum class CUDAReportStyle +{ + Normal, + Silent, +}; + +struct CUDAErrorInfo +{ + CUDAErrorInfo( + const char* filePath, + int lineNo, + const char* errorName = nullptr, + const char* errorString = nullptr) + : m_filePath(filePath) + , m_lineNo(lineNo) + , m_errorName(errorName) + , m_errorString(errorString) + {} + SlangResult handle() const + { + StringBuilder builder; + builder << "Error: " << m_filePath << " (" << m_lineNo << ") :"; + + if (m_errorName) + { + builder << m_errorName << " : "; + } + if (m_errorString) + { + builder << m_errorString; + } + + StdWriters::getError().put(builder.getUnownedSlice()); + + // Slang::signalUnexpectedError(builder.getBuffer()); + return SLANG_FAIL; + } + + const char* m_filePath; + int m_lineNo; + const char* m_errorName; + const char* m_errorString; +}; + +# if 1 +// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream. + +static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line) +{ + CUDAErrorInfo info(file, line); + cuGetErrorString(cuResult, &info.m_errorString); + cuGetErrorName(cuResult, &info.m_errorName); + return info.handle(); +} + +static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line) +{ + return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle(); +} + +# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res, __FILE__, __LINE__) + +# else +// If this code path is enabled, errors are not reported, but can have an assert enabled + +static SlangResult _handleCUDAError(CUresult cuResult) +{ + SLANG_UNUSED(cuResult); + // SLANG_ASSERT(!"Failed CUDA call"); + return SLANG_FAIL; +} + +static SlangResult _handleCUDAError(cudaError_t error) +{ + SLANG_UNUSED(error); + // SLANG_ASSERT(!"Failed CUDA call"); + return SLANG_FAIL; +} + +# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res) +# endif + +# define SLANG_CUDA_RETURN_ON_FAIL(x) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + return SLANG_CUDA_HANDLE_ERROR(_res); \ + } +# define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + { \ + return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) \ + : SLANG_FAIL; \ + } \ + } + +# define SLANG_CUDA_ASSERT_ON_FAIL(x) \ + { \ + auto _res = x; \ + if (_isError(_res)) \ + { \ + SLANG_ASSERT(!"Failed CUDA call"); \ + }; \ + } + +# ifdef RENDER_TEST_OPTIX + +static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; } + +# if 1 +static SlangResult _handleOptixError(OptixResult result, char const* file, int line) +{ + fprintf( + stderr, + "%s(%d): optix: %s (%s)\n", + file, + line, + optixGetErrorString(result), + optixGetErrorName(result)); + return SLANG_FAIL; +} +# define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__) +# else +# define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL +# endif + +# define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) \ + do \ + { \ + auto _res = EXPR; \ + if (_isError(_res)) \ + return SLANG_OPTIX_HANDLE_ERROR(_res); \ + } while (0) + +void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData) +{ + fprintf(stderr, "optix: %s (%s)\n", message, tag); +} + +# endif + +class MemoryCUDAResource : public BufferResource +{ +public: + MemoryCUDAResource(const Desc& _desc) + : BufferResource(_desc) + {} + + ~MemoryCUDAResource() + { + if (m_cudaMemory) + { + SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + } + } + + uint64_t getBindlessHandle() { return (uint64_t)m_cudaMemory; } + + void* m_cudaMemory = nullptr; +}; + +class TextureCUDAResource : public TextureResource +{ +public: + TextureCUDAResource(const TextureResource::Desc& desc) + : TextureResource(desc) + {} + ~TextureCUDAResource() + { + if (m_cudaSurfObj) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj)); + } + if (m_cudaTexObj) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); + } + if (m_cudaArray) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); + } + if (m_cudaMipMappedArray) + { + SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray)); + } + } + + uint64_t getBindlessHandle() { return (uint64_t)m_cudaTexObj; } + + // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by + // a long long + CUtexObject m_cudaTexObj = CUtexObject(); + + // The surfObj is for reading/writing 'texture like' things, but not for sampling. + CUsurfObject m_cudaSurfObj = CUsurfObject(); + + CUarray m_cudaArray = CUarray(); + CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray(); +}; + +class CUDAResourceView : public ResourceView +{ +public: + Desc desc; + RefPtr memoryResource = nullptr; + RefPtr textureResource = nullptr; +}; + +class CUDAShaderProgram : public ShaderProgram +{ +public: + CUmodule cudaModule = nullptr; + CUfunction cudaKernel; + String kernelName; + ~CUDAShaderProgram() + { + if (cudaModule) + cuModuleUnload(cudaModule); + } +}; + +class CUDAPipelineState : public PipelineState +{ +public: + RefPtr shaderProgram; +}; + +class CUDAShaderObjectLayout : public ShaderObjectLayout +{ +public: + slang::TypeLayoutReflection* typeLayout = nullptr; + + struct BindingRangeInfo + { + slang::BindingType bindingType; + Index count; + Index baseIndex; + Index descriptorSetIndex; + Index rangeIndexInDescriptorSet; + Index uniformOffset; + // Index subObjectRangeIndex = -1; + }; + + struct SubObjectRangeInfo + { + RefPtr layout; + Index bindingRangeIndex; + }; + + List subObjectRanges; + List m_bindingRanges; + + Index m_resourceViewCount = 0; + Index m_samplerCount = 0; + Index m_combinedTextureSamplerCount = 0; + Index m_subObjectCount = 0; + Index m_varyingInputCount = 0; + Index m_varyingOutputCount = 0; + + slang::TypeLayoutReflection* unwrapParameterGroups(slang::TypeLayoutReflection* typeLayout) + { + for (;;) + { + if (!typeLayout->getType()) + { + if (auto elementTypeLayout = typeLayout->getElementTypeLayout()) + typeLayout = elementTypeLayout; + } + + switch (typeLayout->getKind()) + { + default: + return typeLayout; + + case slang::TypeReflection::Kind::ConstantBuffer: + case slang::TypeReflection::Kind::ParameterBlock: + typeLayout = typeLayout->getElementTypeLayout(); + continue; + } + } + } + + CUDAShaderObjectLayout(slang::TypeLayoutReflection* layout) + { + typeLayout = unwrapParameterGroups(layout); + + // Compute the binding ranges that are used to store + // the logical contents of the object in memory. These will relate + // to the descriptor ranges in the various sets, but not always + // in a one-to-one fashion. + + SlangInt bindingRangeCount = typeLayout->getBindingRangeCount(); + for (SlangInt r = 0; r < bindingRangeCount; ++r) + { + slang::BindingType slangBindingType = typeLayout->getBindingRangeType(r); + SlangInt count = typeLayout->getBindingRangeBindingCount(r); + slang::TypeLayoutReflection* slangLeafTypeLayout = + typeLayout->getBindingRangeLeafTypeLayout(r); + + SlangInt descriptorSetIndex = typeLayout->getBindingRangeDescriptorSetIndex(r); + SlangInt rangeIndexInDescriptorSet = + typeLayout->getBindingRangeFirstDescriptorRangeIndex(r); + + auto uniformOffset = typeLayout->getDescriptorSetDescriptorRangeIndexOffset( + descriptorSetIndex, rangeIndexInDescriptorSet); + + Index baseIndex = 0; + switch (slangBindingType) + { + case slang::BindingType::ConstantBuffer: + case slang::BindingType::ParameterBlock: + case slang::BindingType::ExistentialValue: + baseIndex = m_subObjectCount; + m_subObjectCount += count; + break; + + case slang::BindingType::Sampler: + baseIndex = m_samplerCount; + m_samplerCount += count; + break; + + case slang::BindingType::CombinedTextureSampler: + baseIndex = m_combinedTextureSamplerCount; + m_combinedTextureSamplerCount += count; + break; + + case slang::BindingType::VaryingInput: + baseIndex = m_varyingInputCount; + m_varyingInputCount += count; + break; + + case slang::BindingType::VaryingOutput: + baseIndex = m_varyingOutputCount; + m_varyingOutputCount += count; + break; + + default: + baseIndex = m_resourceViewCount; + m_resourceViewCount += count; + break; + } + + BindingRangeInfo bindingRangeInfo; + bindingRangeInfo.bindingType = slangBindingType; + bindingRangeInfo.count = count; + // bindingRangeInfo.descriptorSetIndex = descriptorSetIndex; + // bindingRangeInfo.rangeIndexInDescriptorSet = slotRangeIndex; + // bindingRangeInfo.subObjectRangeIndex = subObjectRangeIndex; + bindingRangeInfo.baseIndex = baseIndex; + bindingRangeInfo.descriptorSetIndex = descriptorSetIndex; + bindingRangeInfo.rangeIndexInDescriptorSet = rangeIndexInDescriptorSet; + bindingRangeInfo.uniformOffset = uniformOffset; + m_bindingRanges.add(bindingRangeInfo); + } + + SlangInt subObjectRangeCount = typeLayout->getSubObjectRangeCount(); + for (SlangInt r = 0; r < subObjectRangeCount; ++r) + { + SlangInt bindingRangeIndex = typeLayout->getSubObjectRangeBindingRangeIndex(r); + auto slangBindingType = typeLayout->getBindingRangeType(bindingRangeIndex); + slang::TypeLayoutReflection* slangLeafTypeLayout = + typeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex); + + // A sub-object range can either represent a sub-object of a known + // type, like a `ConstantBuffer` or `ParameterBlock` + // (in which case we can pre-compute a layout to use, based on + // the type `Foo`) *or* it can represent a sub-object of some + // existential type (e.g., `IBar`) in which case we cannot + // know the appropraite type/layout of sub-object to allocate. + // + RefPtr subObjectLayout; + if (slangBindingType != slang::BindingType::ExistentialValue) + { + subObjectLayout = + new CUDAShaderObjectLayout(slangLeafTypeLayout->getElementTypeLayout()); + } + + SubObjectRangeInfo subObjectRange; + subObjectRange.bindingRangeIndex = bindingRangeIndex; + subObjectRange.layout = subObjectLayout; + subObjectRanges.add(subObjectRange); + } + } +}; + +class CUDAProgramLayout : public CUDAShaderObjectLayout +{ +public: + slang::ProgramLayout* programLayout = nullptr; + List> entryPointLayouts; + CUDAProgramLayout(slang::ProgramLayout* inProgramLayout) + : CUDAShaderObjectLayout(inProgramLayout->getGlobalParamsTypeLayout()) + , programLayout(inProgramLayout) + { + for (UInt i =0; i< programLayout->getEntryPointCount(); i++) + { + entryPointLayouts.add(new CUDAShaderObjectLayout( + programLayout->getEntryPointByIndex(i)->getTypeLayout())); + } + + } + + int getKernelIndex(UnownedStringSlice kernelName) + { + for (int i = 0; i < (int)programLayout->getEntryPointCount(); i++) + { + auto entryPoint = programLayout->getEntryPointByIndex(i); + if (kernelName == entryPoint->getName()) + { + return i; + } + } + return -1; + } + + void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes) + { + auto entryPoint = programLayout->getEntryPointByIndex(kernelIndex); + entryPoint->getComputeThreadGroupSize(3, threadGroupSizes); + } +}; + +class CUDAShaderObject : public ShaderObject +{ +public: + RefPtr bufferResource; + RefPtr layout; + List> objects; + List> resources; + + virtual SlangResult init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout); + + virtual slang::TypeLayoutReflection* getElementTypeLayout() override + { + return layout->typeLayout; + } + virtual Slang::Index getEntryPointCount() override { return 0; } + virtual ShaderObject* getEntryPoint(Slang::Index index) override { return nullptr; } + virtual SlangResult setData(ShaderOffset const& offset, void const* data, size_t size) + { + size = Math::Min(size, bufferResource->getDesc().sizeInBytes - offset.uniformOffset); + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( + (uint8_t*)bufferResource->m_cudaMemory + offset.uniformOffset, + data, + size, + cudaMemcpyHostToDevice)); + return SLANG_OK; + } + virtual SlangResult getObject(ShaderOffset const& offset, ShaderObject** object) + { + auto subObjectIndex = + layout->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex; + if (subObjectIndex >= objects.getCount()) + { + *object = nullptr; + return SLANG_OK; + } + *object = objects[subObjectIndex].Ptr(); + return SLANG_OK; + } + virtual SlangResult setObject(ShaderOffset const& offset, ShaderObject* object) + { + auto subObjectIndex = + layout->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex; + SLANG_ASSERT( + offset.uniformOffset == + layout->m_bindingRanges[offset.bindingRangeIndex].uniformOffset + + offset.bindingArrayIndex * sizeof(void*)); + auto cudaObject = dynamic_cast(object); + if (subObjectIndex >= objects.getCount()) + objects.setCount(subObjectIndex + 1); + objects[subObjectIndex] = cudaObject; + return setData(offset, &cudaObject->bufferResource->m_cudaMemory, sizeof(void*)); + } + virtual SlangResult setResource(ShaderOffset const& offset, ResourceView* resourceView) + { + auto cudaView = dynamic_cast(resourceView); + if (offset.bindingRangeIndex >= resources.getCount()) + resources.setCount(offset.bindingRangeIndex + 1); + resources[offset.bindingRangeIndex] = cudaView; + if (cudaView->textureResource) + { + if (cudaView->desc.type == ResourceView::Type::UnorderedAccess) + { + auto handle = cudaView->textureResource->getBindlessHandle(); + setData(offset, &handle, sizeof(uint64_t)); + } + else + { + auto handle = cudaView->textureResource->m_cudaSurfObj; + setData(offset, &handle, sizeof(uint64_t)); + } + } + else + { + auto handle = cudaView->memoryResource->getBindlessHandle(); + setData(offset, &handle, sizeof(handle)); + auto sizeOffset = offset; + sizeOffset.uniformOffset += sizeof(handle); + auto& desc = cudaView->memoryResource->getDesc(); + size_t size = desc.sizeInBytes; + if (desc.elementSize > 1) + size /= desc.elementSize; + setData(sizeOffset, &size, sizeof(size)); + + } + return SLANG_OK; + } + virtual SlangResult setSampler(ShaderOffset const& offset, SamplerState* sampler) + { + SLANG_UNUSED(sampler); + SLANG_UNUSED(offset); + return SLANG_OK; + } + virtual SlangResult setCombinedTextureSampler( + ShaderOffset const& offset, ResourceView* textureView, SamplerState* sampler) + { + SLANG_UNUSED(sampler); + setResource(offset, textureView); + return SLANG_OK; + } +}; + +class CUDARootShaderObject : public CUDAShaderObject +{ +public: + List> entryPointObjects; + virtual SlangResult init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout) override; + virtual Slang::Index getEntryPointCount() override { return entryPointObjects.getCount(); } + virtual ShaderObject* getEntryPoint(Slang::Index index) override { return entryPointObjects[index].Ptr(); } +}; + +class CUDARenderer : public Renderer +{ +private: + static const CUDAReportStyle reportType = CUDAReportStyle::Normal; + static int _calcSMCountPerMultiProcessor(int major, int minor) + { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + struct SMInfo + { + int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version + int coreCount; + }; + + static const SMInfo infos[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}}; + + const int sm = ((major << 4) + minor); + for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i) + { + if (infos[i].sm == sm) + { + return infos[i].coreCount; + } + } + + const auto& last = infos[SLANG_COUNT_OF(infos) - 1]; + + // It must be newer presumably + SLANG_ASSERT(sm > last.sm); + + // Default to the last entry + return last.coreCount; + } + + static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex) + { + int smPerMultiproc = 0; + int maxPerfDevice = -1; + int deviceCount = 0; + int devicesProhibited = 0; + + uint64_t maxComputePerf = 0; + SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); + + // Find the best CUDA capable GPU device + for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) + { + int computeMode = -1, major = 0, minor = 0; + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != cudaComputeModeProhibited) + { + if (major == 9999 && minor == 9999) + { + smPerMultiproc = 1; + } + else + { + smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor); + } + + int multiProcessorCount = 0, clockRate = 0; + SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute( + &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL( + cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); + uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; + + if (compute_perf > maxComputePerf) + { + maxComputePerf = compute_perf; + maxPerfDevice = currentDevice; + } + } + else + { + devicesProhibited++; + } + } + + if (maxPerfDevice < 0) + { + return SLANG_FAIL; + } + + *outDeviceIndex = maxPerfDevice; + return SLANG_OK; + } + + static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal) + { + static CUresult res = cuInit(0); + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType); + return SLANG_OK; + } + +private: + int m_deviceIndex = -1; + CUdevice m_device = 0; + CUcontext m_context = nullptr; + CUDAPipelineState* currentPipeline = nullptr; + CUDARootShaderObject* currentRootObject = nullptr; + public: + ~CUDARenderer() + { + if (m_context) + { + cuCtxDestroy(m_context); + } + } + virtual SlangResult initialize(const Desc& desc, void* inWindowHandle) override + { + SLANG_RETURN_ON_FAIL(_initCuda(reportType)); + + SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex)); + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType); + + if (m_context) + { + cuCtxDestroy(m_context); + m_context = nullptr; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex)); + + SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, 0, m_device), reportType); + return SLANG_OK; + } + + virtual Result createTextureResource( + Resource::Usage initialUsage, + const TextureResource::Desc& desc, + const TextureResource::Data* initData, + TextureResource** outResource) override + { + RefPtr tex = new TextureCUDAResource(desc); + CUresourcetype resourceType; + size_t elementSize = 0; + + { + CUarray_format format = CU_AD_FORMAT_FLOAT; + int numChannels = 0; + + switch (desc.format) + { + case Format::R_Float32: + { + format = CU_AD_FORMAT_FLOAT; + numChannels = 1; + elementSize = sizeof(float); + break; + } + case Format::RGBA_Unorm_UInt8: + { + format = CU_AD_FORMAT_UNSIGNED_INT8; + numChannels = 4; + elementSize = sizeof(uint32_t); + break; + } + default: + { + SLANG_ASSERT(!"Only support R_Float32/RGBA_Unorm_UInt8 formats for now"); + return SLANG_FAIL; + } + } + + if (desc.numMipLevels > 1) + { + resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; + + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Width = desc.size.width; + arrayDesc.Height = desc.size.height; + arrayDesc.Depth = desc.size.depth; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + arrayDesc.Flags = 0; + + if (desc.arraySize > 1) + { + if (desc.type == Resource::Type::Texture1D || + desc.type == Resource::Type::Texture2D || + desc.type == Resource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED; + arrayDesc.Depth = desc.arraySize; + } + else + { + SLANG_ASSERT(!"Arrays only supported for 1D and 2D"); + return SLANG_FAIL; + } + } + + if (desc.type == Resource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + arrayDesc.Depth *= 6; + } + + SLANG_CUDA_RETURN_ON_FAIL( + cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels)); + } + else + { + resourceType = CU_RESOURCE_TYPE_ARRAY; + + if (desc.arraySize > 1) + { + if (desc.type == Resource::Type::Texture1D || + desc.type == Resource::Type::Texture2D || + desc.type == Resource::Type::TextureCube) + { + SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported"); + return SLANG_FAIL; + } + + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + // Set the depth as the array length + arrayDesc.Depth = desc.arraySize; + if (desc.type == Resource::Type::TextureCube) + { + arrayDesc.Depth *= 6; + } + + arrayDesc.Height = desc.size.height; + arrayDesc.Width = desc.size.width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + if (desc.type == Resource::Type::TextureCube) + { + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); + } + else if (desc.type == Resource::Type::Texture3D || + desc.type == Resource::Type::TextureCube) + { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Depth = desc.size.depth; + arrayDesc.Height = desc.size.height; + arrayDesc.Width = desc.size.width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + arrayDesc.Flags = 0; + + // Handle cube texture + if (desc.type == Resource::Type::TextureCube) + { + arrayDesc.Depth = 6; + arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); + } + else + { + CUDA_ARRAY_DESCRIPTOR arrayDesc; + memset(&arrayDesc, 0, sizeof(arrayDesc)); + + arrayDesc.Height = desc.size.height; + arrayDesc.Width = desc.size.width; + arrayDesc.Format = format; + arrayDesc.NumChannels = numChannels; + + // Allocate the array, will work for 1D or 2D case + SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); + } + } + } + + // Work space for holding data for uploading if it needs to be rearranged + List workspace; + auto width = desc.size.width; + auto height = desc.size.height; + auto depth = desc.size.depth; + for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel) + { + int mipWidth = width >> mipLevel; + int mipHeight = height >> mipLevel; + int mipDepth = depth >> mipLevel; + + mipWidth = (mipWidth == 0) ? 1 : mipWidth; + mipHeight = (mipHeight == 0) ? 1 : mipHeight; + mipDepth = (mipDepth == 0) ? 1 : mipDepth; + + // If it's a cubemap then the depth is always 6 + if (desc.type == Resource::Type::TextureCube) + { + mipDepth = 6; + } + + auto dstArray = tex->m_cudaArray; + if (tex->m_cudaMipMappedArray) + { + // Get the array for the mip level + SLANG_CUDA_RETURN_ON_FAIL( + cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel)); + } + SLANG_ASSERT(dstArray); + + // Check using the desc to see if it's plausible + { + CUDA_ARRAY_DESCRIPTOR arrayDesc; + SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray)); + + SLANG_ASSERT(mipWidth == arrayDesc.Width); + SLANG_ASSERT( + mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0)); + } + + const void* srcDataPtr = nullptr; + + if (desc.arraySize > 1) + { + SLANG_ASSERT( + desc.type == Resource::Type::Texture1D || + desc.type == Resource::Type::Texture2D || + desc.type == Resource::Type::TextureCube); + + // TODO(JS): Here I assume that arrays are just held contiguously within a 'face' + // This seems reasonable and works with the Copy3D. + const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; + + Index faceCount = desc.arraySize; + if (desc.type == Resource::Type::TextureCube) + { + faceCount *= 6; + } + + const size_t mipSizeInBytes = faceSizeInBytes * faceCount; + workspace.setCount(mipSizeInBytes); + + // We need to add the face data from each mip + // We iterate over face count so we copy all of the cubemap faces + if (initData) + { + for (Index j = 0; j < faceCount; j++) + { + const auto srcData = initData->subResources[mipLevel + j * desc.numMipLevels]; + // Copy over to the workspace to make contiguous + ::memcpy( + workspace.begin() + faceSizeInBytes * j, srcData, + faceSizeInBytes); + } + } + + srcDataPtr = workspace.getBuffer(); + } + else + { + if (desc.type == Resource::Type::TextureCube) + { + size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; + + workspace.setCount(faceSizeInBytes * 6); + + // Copy the data over to make contiguous + for (Index j = 0; j < 6; j++) + { + const auto srcData = + initData->subResources[mipLevel + j * desc.numMipLevels]; + ::memcpy( + workspace.getBuffer() + faceSizeInBytes * j, srcData, + faceSizeInBytes); + } + + srcDataPtr = workspace.getBuffer(); + } + else + { + const auto srcData = initData->subResources[mipLevel]; + srcDataPtr = srcData; + } + } + + if (desc.arraySize > 1) + { + SLANG_ASSERT( + desc.type == Resource::Type::Texture1D || + desc.type == Resource::Type::Texture2D || + desc.type == Resource::Type::TextureCube); + + CUDA_MEMCPY3D copyParam; + memset(©Param, 0, sizeof(copyParam)); + + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + // Set the depth to the array length + copyParam.Depth = desc.arraySize; + + if (desc.type == Resource::Type::TextureCube) + { + copyParam.Depth *= 6; + } + + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); + } + else + { + switch (desc.type) + { + case Resource::Type::Texture1D: + case Resource::Type::Texture2D: + { + CUDA_MEMCPY2D copyParam; + memset(©Param, 0, sizeof(copyParam)); + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); + break; + } + case Resource::Type::Texture3D: + case Resource::Type::TextureCube: + { + CUDA_MEMCPY3D copyParam; + memset(©Param, 0, sizeof(copyParam)); + + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = dstArray; + + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = srcDataPtr; + copyParam.srcPitch = mipWidth * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = mipHeight; + copyParam.Depth = mipDepth; + + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); + break; + } + + default: + { + SLANG_ASSERT(!"Not implemented"); + break; + } + } + } + } + + // Set up texture sampling parameters, and create final texture obj + + { + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); + resDesc.resType = resourceType; + + if (tex->m_cudaArray) + { + resDesc.res.array.hArray = tex->m_cudaArray; + } + if (tex->m_cudaMipMappedArray) + { + resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray; + } + + // Create handle for uav. + SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc)); + + // Create handle for sampling. + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); + texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; + texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + SLANG_CUDA_RETURN_ON_FAIL( + cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); + } + + *outResource = tex.detach(); + return SLANG_OK; + } + + virtual Result createBufferResource( + Resource::Usage initialUsage, + const BufferResource::Desc& desc, + const void* initData, + BufferResource** outResource) override + { + RefPtr resource = new MemoryCUDAResource(desc); + SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes)); + if (initData) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyHostToDevice)); + } + *outResource = resource.detach(); + return SLANG_OK; + } + + virtual Result createTextureView( + TextureResource* texture, ResourceView::Desc const& desc, ResourceView** outView) override + { + RefPtr view = new CUDAResourceView(); + view->desc = desc; + view->textureResource = dynamic_cast(texture); + *outView = view.detach(); + return SLANG_OK; + } + + virtual Result createBufferView( + BufferResource* buffer, ResourceView::Desc const& desc, ResourceView** outView) override + { + RefPtr view = new CUDAResourceView(); + view->desc = desc; + view->memoryResource = dynamic_cast(buffer); + *outView = view.detach(); + return SLANG_OK; + } + + virtual Result createShaderObjectLayout( + slang::TypeLayoutReflection* typeLayout, ShaderObjectLayout** outLayout) override + { + RefPtr cudaLayout; + cudaLayout = new CUDAShaderObjectLayout(typeLayout); + *outLayout = cudaLayout.detach(); + return SLANG_OK; + } + + virtual Result createRootShaderObjectLayout( + slang::ProgramLayout* layout, ShaderObjectLayout** outLayout) override + { + RefPtr cudaLayout; + cudaLayout = new CUDAProgramLayout(layout); + cudaLayout->programLayout = layout; + *outLayout = cudaLayout.detach(); + return SLANG_OK; + } + + virtual Result createShaderObject(ShaderObjectLayout* layout, ShaderObject** outObject) override + { + RefPtr result = new CUDAShaderObject(); + SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast(layout))); + *outObject = result.detach(); + return SLANG_OK; + } + + virtual Result createRootShaderObject(ShaderObjectLayout* layout, ShaderObject** outObject) override + { + RefPtr result = new CUDARootShaderObject(); + SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast(layout))); + *outObject = result.detach(); + return SLANG_OK; + } + + virtual Result bindRootShaderObject(PipelineType pipelineType, ShaderObject* object) override + { + currentRootObject = dynamic_cast(object); + if (currentRootObject) + return SLANG_OK; + return SLANG_E_INVALID_ARG; + } + + virtual Result createProgram(const ShaderProgram::Desc& desc, ShaderProgram** outProgram) override + { + if (desc.kernelCount != 1) + return SLANG_E_INVALID_ARG; + RefPtr cudaProgram = new CUDAShaderProgram(); + SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, desc.kernels[0].codeBegin)); + SLANG_CUDA_RETURN_ON_FAIL( + cuModuleGetFunction(&cudaProgram->cudaKernel, cudaProgram->cudaModule, desc.kernels[0].entryPointName)); + cudaProgram->kernelName = desc.kernels[0].entryPointName; + *outProgram = cudaProgram.detach(); + return SLANG_OK; + } + + virtual Result createComputePipelineState(const ComputePipelineStateDesc& desc, PipelineState** outState) override + { + RefPtr state = new CUDAPipelineState(); + state->shaderProgram = dynamic_cast(desc.program); + *outState = state.detach(); + return Result(); + } + + virtual void* map(BufferResource* buffer, MapFlavor flavor) override + { + return dynamic_cast(buffer)->m_cudaMemory; + } + + virtual void unmap(BufferResource* buffer) override + { + SLANG_UNUSED(buffer); + } + + virtual void setPipelineState(PipelineType pipelineType, PipelineState* state) override + { + SLANG_ASSERT(pipelineType == PipelineType::Compute); + currentPipeline = dynamic_cast(state); + } + + virtual void dispatchCompute(int x, int y, int z) override + { + // Find out thread group size from program reflection. + auto& kernelName = currentPipeline->shaderProgram->kernelName; + auto programLayout = dynamic_cast(currentRootObject->layout.Ptr()); + int kernelId = programLayout->getKernelIndex(kernelName.getUnownedSlice()); + SLANG_ASSERT(kernelId != -1); + UInt threadGroupSize[3]; + programLayout->getKernelThreadGroupSize(kernelId, threadGroupSize); + + int sharedSizeInBytes; + cuFuncGetAttribute( + &sharedSizeInBytes, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + currentPipeline->shaderProgram->cudaKernel); + + // Copy global parameter data to the `SLANG_globalParams` symbol. + { + CUdeviceptr globalParamsSymbol = 0; + size_t globalParamsSymbolSize = 0; + cuModuleGetGlobal( + &globalParamsSymbol, + &globalParamsSymbolSize, + currentPipeline->shaderProgram->cudaModule, + "SLANG_globalParams"); + + CUdeviceptr globalParamsCUDAData = + currentRootObject->bufferResource + ? currentRootObject->bufferResource->getBindlessHandle() + : 0; + cudaMemcpyAsync( + (void*)globalParamsSymbol, + (void*)globalParamsCUDAData, + globalParamsSymbolSize, + cudaMemcpyDeviceToDevice, + 0); + } + // + // In the case of the entry-point parameters, we have to deal with + // two different wrinkles. + // + // First, the `bindRoot` will have the entry-point argument data + // stored in a GPU-memory buffer, but we actually need it to be + // in host CPU memory. We handle that for now by allocating a + // temporary host memory buffer (if needed) and copying the data + // from device to host. + // + auto entryPointBuffer = currentRootObject->entryPointObjects[kernelId]->bufferResource.Ptr(); + size_t entryPointDataSize = entryPointBuffer ? entryPointBuffer->getDesc().sizeInBytes : 0; + void* entryPointHostData = nullptr; + if (entryPointDataSize) + { + entryPointHostData = alloca(entryPointDataSize); + cudaMemcpy( + entryPointHostData, + (void*)entryPointBuffer->getBindlessHandle(), + entryPointDataSize, + cudaMemcpyDeviceToHost); + } + // + // Second, the argument data for the entry-point parameters has + // been allocated and filled in as a single buffer, but `cuLaunchKernel` + // defaults to taking pointers to each of the kernel arguments. + // + // We could loop over the entry-point parameters using the refleciton + // information, and set up a pointer to each using the offset stored + // for it in the reflection data. Such an approach would require + // us to create and fill in a dynamically-sized array here. + // + // Instead, we take advantage of a documented but seldom-used feature + // of `cuLaunchKernel` that allows the argument data for all of the + // kernel "launch parameters" to be specified as a single buffer. + // + + void* extraOptions[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + (void*)entryPointHostData, + CU_LAUNCH_PARAM_BUFFER_SIZE, + &entryPointDataSize, + CU_LAUNCH_PARAM_END, + }; + + // Once we have all the decessary data extracted and/or + // set up, we can launch the kernel and see what happens. + // + auto cudaLaunchResult = cuLaunchKernel( + currentPipeline->shaderProgram->cudaKernel, + x, + y, + z, + int(threadGroupSize[0]), + int(threadGroupSize[1]), + int(threadGroupSize[2]), + sharedSizeInBytes, + 0, + nullptr, + extraOptions); + + SLANG_ASSERT(cudaLaunchResult == CUDA_SUCCESS); + } + + virtual void submitGpuWork() override {} + + virtual void waitForGpu() override + { + auto result = cudaDeviceSynchronize(); + SLANG_ASSERT(result == CUDA_SUCCESS); + } + + virtual RendererType getRendererType() const override { return RendererType::CUDA; } + +public: + // Unused public interfaces. These functions are not supported on CUDA. + virtual const Slang::List& getFeatures() override + { + static Slang::List featureSet; + return featureSet; + } + virtual void setClearColor(const float color[4]) override + { + SLANG_UNUSED(color); + } + virtual void clearFrame() override {} + virtual void presentFrame() override {} + virtual TextureResource::Desc getSwapChainTextureDesc() override + { + return TextureResource::Desc(); + } + + virtual Result createSamplerState(SamplerState::Desc const& desc, SamplerState** outSampler) override + { + SLANG_UNUSED(desc); + *outSampler = nullptr; + return SLANG_OK; + } + + virtual Result createInputLayout( + const InputElementDesc* inputElements, + UInt inputElementCount, + InputLayout** outLayout) override + { + SLANG_UNUSED(inputElements); + SLANG_UNUSED(inputElementCount); + SLANG_UNUSED(outLayout); + return SLANG_E_NOT_AVAILABLE; + } + virtual Result createDescriptorSetLayout( + const DescriptorSetLayout::Desc& desc, DescriptorSetLayout** outLayout) override + { + SLANG_UNUSED(desc); + SLANG_UNUSED(outLayout); + return SLANG_E_NOT_AVAILABLE; + } + virtual Result createPipelineLayout(const PipelineLayout::Desc& desc, PipelineLayout** outLayout) override + { + SLANG_UNUSED(desc); + SLANG_UNUSED(outLayout); + return SLANG_E_NOT_AVAILABLE; + } + virtual Result createDescriptorSet(DescriptorSetLayout* layout, DescriptorSet** outDescriptorSet) override + { + SLANG_UNUSED(layout); + SLANG_UNUSED(outDescriptorSet); + return SLANG_E_NOT_AVAILABLE; + } + virtual Result createGraphicsPipelineState(const GraphicsPipelineStateDesc& desc, PipelineState** outState) override + { + SLANG_UNUSED(desc); + SLANG_UNUSED(outState); + return SLANG_E_NOT_AVAILABLE; + } + virtual SlangResult captureScreenSurface(Surface& surfaceOut) override + { + SLANG_UNUSED(surfaceOut); + return SLANG_E_NOT_AVAILABLE; + } + virtual void setPrimitiveTopology(PrimitiveTopology topology) override + { + SLANG_UNUSED(topology); + } + virtual void setDescriptorSet( + PipelineType pipelineType, + PipelineLayout* layout, + UInt index, + DescriptorSet* descriptorSet) override + { + SLANG_UNUSED(pipelineType); + SLANG_UNUSED(layout); + SLANG_UNUSED(index); + SLANG_UNUSED(descriptorSet); + } + virtual void setVertexBuffers( + UInt startSlot, + UInt slotCount, + BufferResource* const* buffers, + const UInt* strides, + const UInt* offsets) override + { + SLANG_UNUSED(startSlot); + SLANG_UNUSED(slotCount); + SLANG_UNUSED(buffers); + SLANG_UNUSED(strides); + SLANG_UNUSED(offsets); + } + virtual void setIndexBuffer(BufferResource* buffer, Format indexFormat, UInt offset = 0) override + { + SLANG_UNUSED(buffer); + SLANG_UNUSED(indexFormat); + SLANG_UNUSED(offset); + } + virtual void setDepthStencilTarget(ResourceView* depthStencilView) override + { + SLANG_UNUSED(depthStencilView); + } + virtual void setViewports(UInt count, Viewport const* viewports) override + { + SLANG_UNUSED(count); + SLANG_UNUSED(viewports); + } + virtual void setScissorRects(UInt count, ScissorRect const* rects) override + { + SLANG_UNUSED(count); + SLANG_UNUSED(rects); + } + virtual void draw(UInt vertexCount, UInt startVertex) override + { + SLANG_UNUSED(vertexCount); + SLANG_UNUSED(startVertex); + } + virtual void drawIndexed(UInt indexCount, UInt startIndex, UInt baseVertex) override + { + SLANG_UNUSED(indexCount); + SLANG_UNUSED(startIndex); + SLANG_UNUSED(baseVertex); + } +}; + +SlangResult CUDAShaderObject::init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout) +{ + this->layout = typeLayout; + + // If the layout tells us that there is any uniform data, + // then we need to allocate a constant buffer to hold that data. + // + // TODO: Do we need to allocate a shadow copy for use from + // the CPU? + // + // TODO: When/where do we bind this constant buffer into + // a descriptor set for later use? + // + auto slangLayout = layout->typeLayout; + size_t uniformSize = layout->typeLayout->getSize(); + if (uniformSize) + { + BufferResource::Desc bufferDesc; + bufferDesc.init(uniformSize); + bufferDesc.cpuAccessFlags |= Resource::AccessFlag::Write; + RefPtr constantBuffer; + SLANG_RETURN_ON_FAIL(renderer->createBufferResource( + Resource::Usage::ConstantBuffer, bufferDesc, nullptr, constantBuffer.writeRef())); + bufferResource = dynamic_cast(constantBuffer.Ptr()); + } + + // If the layout specifies that we have any sub-objects, then + // we need to size the array to account for them. + // + Index subObjectCount = slangLayout->getSubObjectRangeCount(); + objects.setCount(subObjectCount); + + for (auto subObjectRange : layout->subObjectRanges) + { + RefPtr subObjectLayout = subObjectRange.layout; + + // In the case where the sub-object range represents an + // existential-type leaf field (e.g., an `IBar`), we + // cannot pre-allocate the object(s) to go into that + // range, since we can't possibly know what to allocate + // at this point. + // + if (!subObjectLayout) + continue; + // + // Otherwise, we will allocate a sub-object to fill + // in each entry in this range, based on the layout + // information we already have. + + auto& bindingRangeInfo = layout->m_bindingRanges[subObjectRange.bindingRangeIndex]; + for (Index i = 0; i < bindingRangeInfo.count; ++i) + { + RefPtr subObject = new CUDAShaderObject(); + SLANG_RETURN_ON_FAIL(subObject->init(renderer, subObjectLayout)); + objects[bindingRangeInfo.baseIndex + i] = subObject; + ShaderOffset offset; + offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i; + if (subObject->bufferResource) + SLANG_RETURN_ON_FAIL(setData(offset, &subObject->bufferResource->m_cudaMemory, sizeof(void*))); + } + } + return SLANG_OK; +} + +SlangResult CUDARootShaderObject::init(Renderer* renderer, CUDAShaderObjectLayout* typeLayout) +{ + SLANG_RETURN_ON_FAIL(CUDAShaderObject::init(renderer, typeLayout)); + auto programLayout = dynamic_cast(typeLayout); + for (auto& entryPoint : programLayout->entryPointLayouts) + { + RefPtr object = new CUDAShaderObject(); + SLANG_RETURN_ON_FAIL(object->init(renderer, entryPoint)); + entryPointObjects.add(object); + } + return SLANG_OK; +} + +Renderer* createCUDARenderer() { return new CUDARenderer(); } +#else +Renderer* createCUDARenderer() { return nullptr; } +#endif + +} diff --git a/tools/gfx/cuda/render-cuda.h b/tools/gfx/cuda/render-cuda.h new file mode 100644 index 000000000..cf63113c8 --- /dev/null +++ b/tools/gfx/cuda/render-cuda.h @@ -0,0 +1,8 @@ +#pragma once + +namespace gfx +{ +class Renderer; + +Renderer* createCUDARenderer(); +} diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp index 43a255817..cf3a0576c 100644 --- a/tools/gfx/render.cpp +++ b/tools/gfx/render.cpp @@ -7,6 +7,7 @@ #include "d3d12/render-d3d12.h" #include "open-gl/render-gl.h" #include "vulkan/render-vk.h" +#include "cuda/render-cuda.h" namespace gfx { using namespace Slang; @@ -432,6 +433,10 @@ ProjectionStyle RendererUtil::getProjectionStyle(RendererType type) { return &createVKRenderer; } + case RendererType::CUDA: + { + return &createCUDARenderer; + } #endif default: return nullptr; diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp index 5903c49f7..1c8dc1d72 100644 --- a/tools/render-test/render-test-main.cpp +++ b/tools/render-test/render-test-main.cpp @@ -1171,7 +1171,7 @@ static SlangResult _innerMain(Slang::StdWriters* stdWriters, SlangSession* sessi return SLANG_OK; } - if (options.rendererType == RendererType::CUDA) + if (options.rendererType == RendererType::CUDA && !options.useShaderObjects) { #if RENDER_TEST_CUDA // Check we have all the required features -- cgit v1.2.3