diff options
| -rw-r--r-- | CMakeLists.txt | 8 | ||||
| -rw-r--r-- | premake5.lua | 4 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-base.h | 1 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-buffer.cpp | 2 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-command-queue.cpp | 21 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-device.cpp | 112 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-helper-functions.cpp | 45 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-helper-functions.h | 4 | ||||
| -rw-r--r-- | tools/gfx/cuda/cuda-shader-object.cpp | 21 |
9 files changed, 116 insertions, 102 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b0b14d6d..49d2ab2f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -481,7 +481,7 @@ if(SLANG_ENABLE_GFX) slang Vulkan-Headers $<$<BOOL:${SLANG_ENABLE_XLIB}>:X11::X11> - $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cudart;CUDA::cuda_driver> + $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cuda_driver> LINK_WITH_FRAMEWORK Cocoa QuartzCore @@ -646,7 +646,7 @@ if(SLANG_ENABLE_TESTS AND SLANG_ENABLE_GFX AND SLANG_ENABLE_SLANGD AND SLANG_ENA gfx gfx-util platform - $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cudart;CUDA::cuda_driver> + $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cuda_driver> EXTRA_COMPILE_DEFINITIONS_PRIVATE $<$<BOOL:${SLANG_ENABLE_CUDA}>:RENDER_TEST_CUDA> $<$<BOOL:${SLANG_ENABLE_OPTIX}>:RENDER_TEST_OPTIX> @@ -676,7 +676,7 @@ if (SLANG_ENABLE_EXAMPLES AND SLANG_ENABLE_GFX) slang gfx platform - $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cudart;CUDA::cuda_driver> + $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cuda_driver> FOLDER examples ) @@ -698,7 +698,7 @@ if (SLANG_ENABLE_EXAMPLES AND SLANG_ENABLE_GFX) gfx gfx-util platform - $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cudart;CUDA::cuda_driver> + $<$<BOOL:${SLANG_ENABLE_CUDA}>:CUDA::cuda_driver> EXTRA_COMPILE_DEFINITIONS_PRIVATE $<$<BOOL:${SLANG_ENABLE_XLIB}>:SLANG_ENABLE_XLIB> REQUIRED_BY all-examples diff --git a/premake5.lua b/premake5.lua index 19fc9c6fa..1cfe33517 100644 --- a/premake5.lua +++ b/premake5.lua @@ -530,7 +530,7 @@ function addCUDAIfEnabled() filter {} includedirs { cudaPath .. "/include" } includedirs { cudaPath .. "/include", cudaPath .. "/common/inc" } - links { "cuda", "cudart" } + links { "cuda" } if optixPath then defines { "RENDER_TEST_OPTIX" } includedirs { optixPath .. "include/" } @@ -557,7 +557,7 @@ function addCUDAIfEnabled() filter { "platforms:x64" } libdirs { cudaPath .. "/lib64/" } filter {} - links { "cuda", "cudart" } + links { "cuda" } else print "Error: CUDA is enabled but --cuda-sdk-path is not specified." end diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h index 57a244089..63c280cde 100644 --- a/tools/gfx/cuda/cuda-base.h +++ b/tools/gfx/cuda/cuda-base.h @@ -4,7 +4,6 @@ #ifdef GFX_ENABLE_CUDA #include <cuda.h> -#include <cuda_runtime_api.h> #include "core/slang-basic.h" #include "core/slang-blob.h" #include "core/slang-std-writers.h" diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp index 1cd162841..bc4ef32ef 100644 --- a/tools/gfx/cuda/cuda-buffer.cpp +++ b/tools/gfx/cuda/cuda-buffer.cpp @@ -15,7 +15,7 @@ BufferResourceImpl::~BufferResourceImpl() { if (m_cudaMemory) { - SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree((CUdeviceptr)m_cudaMemory)); } } diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp index 4b0ab7d94..5c6f2db26 100644 --- a/tools/gfx/cuda/cuda-command-queue.cpp +++ b/tools/gfx/cuda/cuda-command-queue.cpp @@ -104,11 +104,10 @@ void CommandQueueImpl::dispatchCompute(int x, int y, int z) "SLANG_globalParams"); CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer(); - cudaMemcpyAsync( - (void*)globalParamsSymbol, - (void*)globalParamsCUDAData, + cuMemcpyAsync( + (CUdeviceptr)globalParamsSymbol, + (CUdeviceptr)globalParamsCUDAData, globalParamsSymbolSize, - cudaMemcpyDefault, 0); } // @@ -155,17 +154,19 @@ void CommandQueueImpl::copyBuffer( { auto dstImpl = static_cast<BufferResourceImpl*>(dst); auto srcImpl = static_cast<BufferResourceImpl*>(src); - cudaMemcpy( - (uint8_t*)dstImpl->m_cudaMemory + dstOffset, - (uint8_t*)srcImpl->m_cudaMemory + srcOffset, - size, - cudaMemcpyDefault); + cuMemcpy( + (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + dstOffset), + (CUdeviceptr)((uint8_t*)srcImpl->m_cudaMemory + srcOffset), + size); } void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data) { auto dstImpl = static_cast<BufferResourceImpl*>(dst); - cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault); + cuMemcpy( + (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + offset), + (CUdeviceptr)data, + size); } void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index) diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp index 0fcf9319e..b1b582d00 100644 --- a/tools/gfx/cuda/cuda-device.cpp +++ b/tools/gfx/cuda/cuda-device.cpp @@ -70,22 +70,21 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex) int devicesProhibited = 0; uint64_t maxComputePerf = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount)); // Find the best CUDA capable GPU device for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) { + CUdevice device; + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, currentDevice)); int computeMode = -1, major = 0, minor = 0; - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); // If this GPU is not running on Compute Mode prohibited, // then we can add it to the list - if (computeMode != cudaComputeModeProhibited) + if (computeMode != CU_COMPUTEMODE_PROHIBITED) { if (major == 9999 && minor == 9999) { @@ -97,10 +96,8 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex) } int multiProcessorCount = 0, clockRate = 0; - SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute( - &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); - SLANG_CUDA_RETURN_ON_FAIL( - cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; if (compute_perf > maxComputePerf) @@ -172,8 +169,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc) SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex)); } - SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType); - m_context = new CUDAContext(); SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex)); @@ -192,9 +187,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc) m_features.add("has-ptr"); } - cudaDeviceProp deviceProps; - cudaGetDeviceProperties(&deviceProps, m_deviceIndex); - // Initialize DeviceInfo { m_info.deviceType = DeviceType::CUDA; @@ -203,32 +195,53 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc) m_info.apiName = "CUDA"; static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 }; ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity)); - m_adapterName = deviceProps.name; + char deviceName[256]; + cuDeviceGetName(deviceName, sizeof(deviceName), m_device); + m_adapterName = deviceName; m_info.adapterName = m_adapterName.begin(); m_info.timestampFrequency = 1000000; } // Get device limits. { + CUresult lastResult = CUDA_SUCCESS; + auto getAttribute = [&](CUdevice_attribute attribute) -> int + { + int value; + CUresult result = cuDeviceGetAttribute(&value, attribute, m_device); + if (result != CUDA_SUCCESS) + lastResult = result; + return value; + }; + DeviceLimits limits = {}; - limits.maxTextureDimension1D = deviceProps.maxSurface1D; - limits.maxTextureDimension2D = Math::Min(deviceProps.maxSurface2D[0], deviceProps.maxSurface2D[1]); - limits.maxTextureDimension3D = Math::Min(deviceProps.maxSurface3D[0], Math::Min(deviceProps.maxSurface3D[1], deviceProps.maxSurface3D[2])); - limits.maxTextureDimensionCube = deviceProps.maxSurfaceCubemap; - limits.maxTextureArrayLayers = Math::Min(deviceProps.maxSurface1DLayered[1], deviceProps.maxSurface2DLayered[2]); + + limits.maxTextureDimension1D = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH); + limits.maxTextureDimension2D = Math::Min( + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH), + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT)); + limits.maxTextureDimension3D = Math::Min( + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH), + Math::Min( + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT), + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH))); + limits.maxTextureDimensionCube = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH); + limits.maxTextureArrayLayers = Math::Min( + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS), + getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS)); // limits.maxVertexInputElements // limits.maxVertexInputElementOffset // limits.maxVertexStreams // limits.maxVertexStreamStride - limits.maxComputeThreadsPerGroup = deviceProps.maxThreadsPerBlock; - limits.maxComputeThreadGroupSize[0] = deviceProps.maxThreadsDim[0]; - limits.maxComputeThreadGroupSize[1] = deviceProps.maxThreadsDim[1]; - limits.maxComputeThreadGroupSize[2] = deviceProps.maxThreadsDim[2]; - limits.maxComputeDispatchThreadGroups[0] = deviceProps.maxGridSize[0]; - limits.maxComputeDispatchThreadGroups[1] = deviceProps.maxGridSize[1]; - limits.maxComputeDispatchThreadGroups[2] = deviceProps.maxGridSize[2]; + limits.maxComputeThreadsPerGroup = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); + limits.maxComputeThreadGroupSize[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X); + limits.maxComputeThreadGroupSize[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y); + limits.maxComputeThreadGroupSize[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z); + limits.maxComputeDispatchThreadGroups[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X); + limits.maxComputeDispatchThreadGroups[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y); + limits.maxComputeDispatchThreadGroups[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z); // limits.maxViewports // limits.maxViewportDimensions @@ -237,6 +250,8 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc) // limits.maxShaderVisibleSamplers m_info.limits = limits; + + SLANG_CUDA_RETURN_ON_FAIL(lastResult); } return SLANG_OK; @@ -728,10 +743,16 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource( auto desc = fixupBufferDesc(descIn); RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc); resource->m_cudaContext = m_context; - SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemAllocManaged( + (CUdeviceptr*)(&resource->m_cudaMemory), + desc.sizeInBytes, + CU_MEM_ATTACH_GLOBAL)); if (initData) { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy( + (CUdeviceptr)resource->m_cudaMemory, + (CUdeviceptr)initData, + desc.sizeInBytes)); } returnComPtr(outResource, resource); return SLANG_OK; @@ -755,28 +776,28 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle( // "external memory" object, which represents the relationship // with another API's objects. In order to create this external // memory association, we first need to fill in a descriptor struct. - cudaExternalMemoryHandleDesc externalMemoryHandleDesc; + CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc; memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); switch (handle.api) { case InteropHandleAPI::D3D12: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; + externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE; break; case InteropHandleAPI::Vulkan: - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; + externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; break; default: return SLANG_FAIL; } externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; externalMemoryHandleDesc.size = desc.sizeInBytes; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED; // Once we have filled in the descriptor, we can request // that CUDA create the required association between the // external buffer and its own memory. - cudaExternalMemory_t externalMemory; - SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); + CUexternalMemory externalMemory; + SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); resource->m_cudaExternalMemory = externalMemory; // The CUDA "external memory" handle is not itself a device @@ -786,13 +807,13 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle( // Just as for the external memory, we fill in a descriptor // structure (although in this case we only need to specify // the size). - cudaExternalMemoryBufferDesc bufferDesc; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc; memset(&bufferDesc, 0, sizeof(bufferDesc)); bufferDesc.size = desc.sizeInBytes; // Finally, we can "map" the buffer to get a device address. void* deviceAddress; - SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc)); + SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedBuffer((CUdeviceptr*)&deviceAddress, externalMemory, &bufferDesc)); resource->m_cudaMemory = deviceAddress; returnComPtr(outResource, resource); @@ -833,7 +854,7 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle( } externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue; externalMemoryHandleDesc.size = size; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED; CUexternalMemory externalMemory; SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); @@ -1165,11 +1186,10 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource( List<uint8_t> blobData; blobData.setCount((Index)size); - cudaMemcpy( - blobData.getBuffer(), - (uint8_t*)bufferImpl->m_cudaMemory + offset, - size, - cudaMemcpyDefault); + cuMemcpy( + (CUdeviceptr)blobData.getBuffer(), + (CUdeviceptr)((uint8_t*)bufferImpl->m_cudaMemory + offset), + size); auto blob = ListBlob::moveCreate(blobData); diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp index 0a8d734d8..d478e8815 100644 --- a/tools/gfx/cuda/cuda-helper-functions.cpp +++ b/tools/gfx/cuda/cuda-helper-functions.cpp @@ -39,11 +39,6 @@ SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line) return info.handle(); } -SlangResult _handleCUDAError(cudaError_t error, const char* file, int line) -{ - return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle(); -} - # ifdef RENDER_TEST_OPTIX static bool _isError(OptixResult result) @@ -71,41 +66,41 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message, # endif # endif -AdapterLUID getAdapterLUID(int device) +AdapterLUID getAdapterLUID(int deviceIndex) { + CUdevice device; + cuDeviceGet(&device, deviceIndex); AdapterLUID luid = {}; -#if SLANG_WIN32 || SLANG_WIN64 - // LUID reported by CUDA is undefined i not on windows platform. - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); - SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(cudaDeviceProp::luid)); - memcpy(&luid, prop.luid, sizeof(cudaDeviceProp::luid)); -#else - SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(int)); - memcpy(&luid, &device, sizeof(int)); -#endif + unsigned int deviceNodeMask; + cuDeviceGetLuid((char*)&luid, &deviceNodeMask, device); return luid; } -} // namespace cuda - -Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters) +Result SLANG_MCALL getAdapters(List<AdapterInfo>& outAdapters) { int deviceCount; - cudaGetDeviceCount(&deviceCount); - for (int device = 0; device < deviceCount; device++) + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount)); + for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); + CUdevice device; + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, deviceIndex)); + AdapterInfo info = {}; - memcpy(info.name, prop.name, Math::Min(strlen(prop.name), sizeof(AdapterInfo::name) - 1)); - info.luid = cuda::getAdapterLUID(device); + SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetName(info.name, sizeof(info.name), device)); + info.luid = getAdapterLUID(deviceIndex); outAdapters.add(info); } return SLANG_OK; } +} // namespace cuda + +Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters) +{ + return cuda::getAdapters(outAdapters); +} + Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice) { RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl(); diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h index 7417249d5..2217c727c 100644 --- a/tools/gfx/cuda/cuda-helper-functions.h +++ b/tools/gfx/cuda/cuda-helper-functions.h @@ -13,7 +13,6 @@ using namespace Slang; namespace cuda { SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; } -SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; } // A enum used to control if errors are reported on failure of CUDA call. enum class CUDAReportStyle @@ -45,7 +44,6 @@ struct CUDAErrorInfo // If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream. SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line); -SlangResult _handleCUDAError(cudaError_t error, const char* file, int line); # define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__) @@ -99,7 +97,7 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message, # endif -AdapterLUID getAdapterLUID(int device); +AdapterLUID getAdapterLUID(int deviceIndex); } // namespace cuda #endif diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp index 3fc55e401..02b4cf695 100644 --- a/tools/gfx/cuda/cuda-shader-object.cpp +++ b/tools/gfx/cuda/cuda-shader-object.cpp @@ -37,7 +37,7 @@ Result ShaderObjectData::setCount(Index count) m_bufferResource = new BufferResourceImpl(desc); if (count) { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&m_bufferResource->m_cudaMemory, (size_t)count)); } IResourceView::Desc viewDesc = {}; viewDesc.type = IResourceView::Type::UnorderedAccess; @@ -51,17 +51,16 @@ Result ShaderObjectData::setCount(Index count) void* newMemory = nullptr; if (count) { - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&newMemory, (size_t)count)); } if (oldSize) { - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - newMemory, - m_bufferResource->m_cudaMemory, - Math::Min((size_t)count, oldSize), - cudaMemcpyDefault)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy( + (CUdeviceptr)newMemory, + (CUdeviceptr)m_bufferResource->m_cudaMemory, + Math::Min((size_t)count, oldSize))); } - cudaFree(m_bufferResource->m_cudaMemory); + cuMemFree((CUdeviceptr)m_bufferResource->m_cudaMemory); m_bufferResource->m_cudaMemory = newMemory; m_bufferResource->getDesc()->sizeInBytes = count; } @@ -189,8 +188,10 @@ SLANG_NO_THROW Result SLANG_MCALL { Size temp = m_data.getCount() - (Size)offset.uniformOffset; size = Math::Min(size, temp); - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy( - (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy( + (CUdeviceptr)((uint8_t*)m_data.getBuffer() + offset.uniformOffset), + (CUdeviceptr)data, + size)); return SLANG_OK; } |
