summaryrefslogtreecommitdiffstats
path: root/tools/gfx/cuda/cuda-device.cpp
diff options
context:
space:
mode:
authorskallweitNV <64953474+skallweitNV@users.noreply.github.com>2024-03-15 18:25:21 +0100
committerGitHub <noreply@github.com>2024-03-15 10:25:21 -0700
commit9ee88a43f4e67d9c714c27bf968401b6bf7524af (patch)
tree11c0f7b46168d7885f2ac220ae8bd8f51d40ac83 /tools/gfx/cuda/cuda-device.cpp
parentd40931cc8bde13520ea45769cf94e7cc6cc9065f (diff)
[gfx] use CUDA driver API (#3776)
Diffstat (limited to 'tools/gfx/cuda/cuda-device.cpp')
-rw-r--r--tools/gfx/cuda/cuda-device.cpp112
1 files changed, 66 insertions, 46 deletions
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
index 0fcf9319e..b1b582d00 100644
--- a/tools/gfx/cuda/cuda-device.cpp
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -70,22 +70,21 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
int devicesProhibited = 0;
uint64_t maxComputePerf = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount));
// Find the best CUDA capable GPU device
for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
{
+ CUdevice device;
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, currentDevice));
int computeMode = -1, major = 0, minor = 0;
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
// If this GPU is not running on Compute Mode prohibited,
// then we can add it to the list
- if (computeMode != cudaComputeModeProhibited)
+ if (computeMode != CU_COMPUTEMODE_PROHIBITED)
{
if (major == 9999 && minor == 9999)
{
@@ -97,10 +96,8 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
}
int multiProcessorCount = 0, clockRate = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
- &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
if (compute_perf > maxComputePerf)
@@ -172,8 +169,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
}
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
m_context = new CUDAContext();
SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
@@ -192,9 +187,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
m_features.add("has-ptr");
}
- cudaDeviceProp deviceProps;
- cudaGetDeviceProperties(&deviceProps, m_deviceIndex);
-
// Initialize DeviceInfo
{
m_info.deviceType = DeviceType::CUDA;
@@ -203,32 +195,53 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
m_info.apiName = "CUDA";
static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
- m_adapterName = deviceProps.name;
+ char deviceName[256];
+ cuDeviceGetName(deviceName, sizeof(deviceName), m_device);
+ m_adapterName = deviceName;
m_info.adapterName = m_adapterName.begin();
m_info.timestampFrequency = 1000000;
}
// Get device limits.
{
+ CUresult lastResult = CUDA_SUCCESS;
+ auto getAttribute = [&](CUdevice_attribute attribute) -> int
+ {
+ int value;
+ CUresult result = cuDeviceGetAttribute(&value, attribute, m_device);
+ if (result != CUDA_SUCCESS)
+ lastResult = result;
+ return value;
+ };
+
DeviceLimits limits = {};
- limits.maxTextureDimension1D = deviceProps.maxSurface1D;
- limits.maxTextureDimension2D = Math::Min(deviceProps.maxSurface2D[0], deviceProps.maxSurface2D[1]);
- limits.maxTextureDimension3D = Math::Min(deviceProps.maxSurface3D[0], Math::Min(deviceProps.maxSurface3D[1], deviceProps.maxSurface3D[2]));
- limits.maxTextureDimensionCube = deviceProps.maxSurfaceCubemap;
- limits.maxTextureArrayLayers = Math::Min(deviceProps.maxSurface1DLayered[1], deviceProps.maxSurface2DLayered[2]);
+
+ limits.maxTextureDimension1D = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH);
+ limits.maxTextureDimension2D = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT));
+ limits.maxTextureDimension3D = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH),
+ Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH)));
+ limits.maxTextureDimensionCube = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH);
+ limits.maxTextureArrayLayers = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS));
// limits.maxVertexInputElements
// limits.maxVertexInputElementOffset
// limits.maxVertexStreams
// limits.maxVertexStreamStride
- limits.maxComputeThreadsPerGroup = deviceProps.maxThreadsPerBlock;
- limits.maxComputeThreadGroupSize[0] = deviceProps.maxThreadsDim[0];
- limits.maxComputeThreadGroupSize[1] = deviceProps.maxThreadsDim[1];
- limits.maxComputeThreadGroupSize[2] = deviceProps.maxThreadsDim[2];
- limits.maxComputeDispatchThreadGroups[0] = deviceProps.maxGridSize[0];
- limits.maxComputeDispatchThreadGroups[1] = deviceProps.maxGridSize[1];
- limits.maxComputeDispatchThreadGroups[2] = deviceProps.maxGridSize[2];
+ limits.maxComputeThreadsPerGroup = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+ limits.maxComputeThreadGroupSize[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X);
+ limits.maxComputeThreadGroupSize[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y);
+ limits.maxComputeThreadGroupSize[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z);
+ limits.maxComputeDispatchThreadGroups[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X);
+ limits.maxComputeDispatchThreadGroups[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y);
+ limits.maxComputeDispatchThreadGroups[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z);
// limits.maxViewports
// limits.maxViewportDimensions
@@ -237,6 +250,8 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
// limits.maxShaderVisibleSamplers
m_info.limits = limits;
+
+ SLANG_CUDA_RETURN_ON_FAIL(lastResult);
}
return SLANG_OK;
@@ -728,10 +743,16 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
auto desc = fixupBufferDesc(descIn);
RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
resource->m_cudaContext = m_context;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAllocManaged(
+ (CUdeviceptr*)(&resource->m_cudaMemory),
+ desc.sizeInBytes,
+ CU_MEM_ATTACH_GLOBAL));
if (initData)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+ (CUdeviceptr)resource->m_cudaMemory,
+ (CUdeviceptr)initData,
+ desc.sizeInBytes));
}
returnComPtr(outResource, resource);
return SLANG_OK;
@@ -755,28 +776,28 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
// "external memory" object, which represents the relationship
// with another API's objects. In order to create this external
// memory association, we first need to fill in a descriptor struct.
- cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+ CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
switch (handle.api)
{
case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
break;
case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
break;
default:
return SLANG_FAIL;
}
externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
externalMemoryHandleDesc.size = desc.sizeInBytes;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+ externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
// Once we have filled in the descriptor, we can request
// that CUDA create the required association between the
// external buffer and its own memory.
- cudaExternalMemory_t externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ CUexternalMemory externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
resource->m_cudaExternalMemory = externalMemory;
// The CUDA "external memory" handle is not itself a device
@@ -786,13 +807,13 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
// Just as for the external memory, we fill in a descriptor
// structure (although in this case we only need to specify
// the size).
- cudaExternalMemoryBufferDesc bufferDesc;
+ CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc;
memset(&bufferDesc, 0, sizeof(bufferDesc));
bufferDesc.size = desc.sizeInBytes;
// Finally, we can "map" the buffer to get a device address.
void* deviceAddress;
- SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+ SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedBuffer((CUdeviceptr*)&deviceAddress, externalMemory, &bufferDesc));
resource->m_cudaMemory = deviceAddress;
returnComPtr(outResource, resource);
@@ -833,7 +854,7 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
}
externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
externalMemoryHandleDesc.size = size;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+ externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
CUexternalMemory externalMemory;
SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
@@ -1165,11 +1186,10 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
List<uint8_t> blobData;
blobData.setCount((Index)size);
- cudaMemcpy(
- blobData.getBuffer(),
- (uint8_t*)bufferImpl->m_cudaMemory + offset,
- size,
- cudaMemcpyDefault);
+ cuMemcpy(
+ (CUdeviceptr)blobData.getBuffer(),
+ (CUdeviceptr)((uint8_t*)bufferImpl->m_cudaMemory + offset),
+ size);
auto blob = ListBlob::moveCreate(blobData);