[gfx] use CUDA driver API (#3776)

author: skallweitNV <64953474+skallweitNV@users.noreply.github.com> 2024-03-15 18:25:21 +0100
committer: GitHub <noreply@github.com> 2024-03-15 10:25:21 -0700
commit: 9ee88a43f4e67d9c714c27bf968401b6bf7524af (patch)
tree: 11c0f7b46168d7885f2ac220ae8bd8f51d40ac83 /tools
parent: d40931cc8bde13520ea45769cf94e7cc6cc9065f (diff)
7 files changed, 110 insertions, 96 deletions
diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h
index 57a244089..63c280cde 100644
--- a/tools/gfx/cuda/cuda-base.h
+++ b/tools/gfx/cuda/cuda-base.h
@@ -4,7 +4,6 @@
 
 #ifdef GFX_ENABLE_CUDA
 #include <cuda.h>
-#include <cuda_runtime_api.h>
 #include "core/slang-basic.h"
 #include "core/slang-blob.h"
 #include "core/slang-std-writers.h"
diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp
index 1cd162841..bc4ef32ef 100644
--- a/tools/gfx/cuda/cuda-buffer.cpp
+++ b/tools/gfx/cuda/cuda-buffer.cpp
@@ -15,7 +15,7 @@ BufferResourceImpl::~BufferResourceImpl()
 {
     if (m_cudaMemory)
     {
-        SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+        SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree((CUdeviceptr)m_cudaMemory));
     }
 }
 
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
index 4b0ab7d94..5c6f2db26 100644
--- a/tools/gfx/cuda/cuda-command-queue.cpp
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -104,11 +104,10 @@ void CommandQueueImpl::dispatchCompute(int x, int y, int z)
             "SLANG_globalParams");
 
         CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
-        cudaMemcpyAsync(
-            (void*)globalParamsSymbol,
-            (void*)globalParamsCUDAData,
+        cuMemcpyAsync(
+            (CUdeviceptr)globalParamsSymbol,
+            (CUdeviceptr)globalParamsCUDAData,
             globalParamsSymbolSize,
-            cudaMemcpyDefault,
             0);
     }
     //
@@ -155,17 +154,19 @@ void CommandQueueImpl::copyBuffer(
 {
     auto dstImpl = static_cast<BufferResourceImpl*>(dst);
     auto srcImpl = static_cast<BufferResourceImpl*>(src);
-    cudaMemcpy(
-        (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
-        (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
-        size,
-        cudaMemcpyDefault);
+    cuMemcpy(
+        (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + dstOffset),
+        (CUdeviceptr)((uint8_t*)srcImpl->m_cudaMemory + srcOffset),
+        size);
 }
 
 void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
 {
     auto dstImpl = static_cast<BufferResourceImpl*>(dst);
-    cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+    cuMemcpy(
+        (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + offset),
+        (CUdeviceptr)data,
+        size);
 }
 
 void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
index 0fcf9319e..b1b582d00 100644
--- a/tools/gfx/cuda/cuda-device.cpp
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -70,22 +70,21 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
     int devicesProhibited = 0;
 
     uint64_t maxComputePerf = 0;
-    SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+    SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount));
 
     // Find the best CUDA capable GPU device
     for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
     {
+        CUdevice device;
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, currentDevice));
         int computeMode = -1, major = 0, minor = 0;
-        SLANG_CUDA_RETURN_ON_FAIL(
-            cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
-        SLANG_CUDA_RETURN_ON_FAIL(
-            cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
-        SLANG_CUDA_RETURN_ON_FAIL(
-            cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device));
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
 
         // If this GPU is not running on Compute Mode prohibited,
         // then we can add it to the list
-        if (computeMode != cudaComputeModeProhibited)
+        if (computeMode != CU_COMPUTEMODE_PROHIBITED)
         {
             if (major == 9999 && minor == 9999)
             {
@@ -97,10 +96,8 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
             }
 
             int multiProcessorCount = 0, clockRate = 0;
-            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
-                &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
-            SLANG_CUDA_RETURN_ON_FAIL(
-                cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+            SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
             uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
 
             if (compute_perf > maxComputePerf)
@@ -172,8 +169,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
         SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
     }
 
-    SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
     m_context = new CUDAContext();
 
     SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
@@ -192,9 +187,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
         m_features.add("has-ptr");
     }
 
-    cudaDeviceProp deviceProps;
-    cudaGetDeviceProperties(&deviceProps, m_deviceIndex);
-
     // Initialize DeviceInfo
     {
         m_info.deviceType = DeviceType::CUDA;
@@ -203,32 +195,53 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
         m_info.apiName = "CUDA";
         static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
         ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
-        m_adapterName = deviceProps.name;
+        char deviceName[256];
+        cuDeviceGetName(deviceName, sizeof(deviceName), m_device);
+        m_adapterName = deviceName;
         m_info.adapterName = m_adapterName.begin();
         m_info.timestampFrequency = 1000000;
     }
 
     // Get device limits.
     {
+        CUresult lastResult = CUDA_SUCCESS;
+        auto getAttribute = [&](CUdevice_attribute attribute) -> int
+        {
+            int value;
+            CUresult result = cuDeviceGetAttribute(&value, attribute, m_device);
+            if (result != CUDA_SUCCESS)
+                lastResult = result;
+            return value;
+        };
+
         DeviceLimits limits = {};
-        limits.maxTextureDimension1D = deviceProps.maxSurface1D;
-        limits.maxTextureDimension2D = Math::Min(deviceProps.maxSurface2D[0], deviceProps.maxSurface2D[1]);
-        limits.maxTextureDimension3D = Math::Min(deviceProps.maxSurface3D[0], Math::Min(deviceProps.maxSurface3D[1], deviceProps.maxSurface3D[2]));
-        limits.maxTextureDimensionCube = deviceProps.maxSurfaceCubemap;
-        limits.maxTextureArrayLayers = Math::Min(deviceProps.maxSurface1DLayered[1], deviceProps.maxSurface2DLayered[2]);
+
+        limits.maxTextureDimension1D = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH);
+        limits.maxTextureDimension2D = Math::Min(
+            getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH),
+            getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT));
+        limits.maxTextureDimension3D = Math::Min(
+            getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH),
+            Math::Min(
+                getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT),
+                getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH)));
+        limits.maxTextureDimensionCube = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH);
+        limits.maxTextureArrayLayers = Math::Min(
+            getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS),
+            getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS));
 
         // limits.maxVertexInputElements
         // limits.maxVertexInputElementOffset
         // limits.maxVertexStreams
         // limits.maxVertexStreamStride
 
-        limits.maxComputeThreadsPerGroup = deviceProps.maxThreadsPerBlock;
-        limits.maxComputeThreadGroupSize[0] = deviceProps.maxThreadsDim[0];
-        limits.maxComputeThreadGroupSize[1] = deviceProps.maxThreadsDim[1];
-        limits.maxComputeThreadGroupSize[2] = deviceProps.maxThreadsDim[2];
-        limits.maxComputeDispatchThreadGroups[0] = deviceProps.maxGridSize[0];
-        limits.maxComputeDispatchThreadGroups[1] = deviceProps.maxGridSize[1];
-        limits.maxComputeDispatchThreadGroups[2] = deviceProps.maxGridSize[2];
+        limits.maxComputeThreadsPerGroup = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+        limits.maxComputeThreadGroupSize[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X);
+        limits.maxComputeThreadGroupSize[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y);
+        limits.maxComputeThreadGroupSize[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z);
+        limits.maxComputeDispatchThreadGroups[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X);
+        limits.maxComputeDispatchThreadGroups[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y);
+        limits.maxComputeDispatchThreadGroups[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z);
 
         // limits.maxViewports
         // limits.maxViewportDimensions
@@ -237,6 +250,8 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
         // limits.maxShaderVisibleSamplers
 
         m_info.limits = limits;
+
+        SLANG_CUDA_RETURN_ON_FAIL(lastResult);
     }
 
     return SLANG_OK;
@@ -728,10 +743,16 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
     auto desc = fixupBufferDesc(descIn);
     RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
     resource->m_cudaContext = m_context;
-    SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+    SLANG_CUDA_RETURN_ON_FAIL(cuMemAllocManaged(
+        (CUdeviceptr*)(&resource->m_cudaMemory),
+        desc.sizeInBytes,
+        CU_MEM_ATTACH_GLOBAL));
     if (initData)
     {
-        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+            (CUdeviceptr)resource->m_cudaMemory,
+            (CUdeviceptr)initData,
+            desc.sizeInBytes));
     }
     returnComPtr(outResource, resource);
     return SLANG_OK;
@@ -755,28 +776,28 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
     // "external memory" object, which represents the relationship
     // with another API's objects. In order to create this external
     // memory association, we first need to fill in a descriptor struct.
-    cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
     memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
     switch (handle.api)
     {
     case InteropHandleAPI::D3D12:
-        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
         break;
     case InteropHandleAPI::Vulkan:
-        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+        externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
         break;
     default:
         return SLANG_FAIL;
     }
     externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
     externalMemoryHandleDesc.size = desc.sizeInBytes;
-    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+    externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
 
     // Once we have filled in the descriptor, we can request
     // that CUDA create the required association between the
     // external buffer and its own memory.
-    cudaExternalMemory_t externalMemory;
-    SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+    CUexternalMemory externalMemory;
+    SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
     resource->m_cudaExternalMemory = externalMemory;
 
     // The CUDA "external memory" handle is not itself a device
@@ -786,13 +807,13 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
     // Just as for the external memory, we fill in a descriptor
     // structure (although in this case we only need to specify
     // the size).
-    cudaExternalMemoryBufferDesc bufferDesc;
+    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc;
     memset(&bufferDesc, 0, sizeof(bufferDesc));
     bufferDesc.size = desc.sizeInBytes;
 
     // Finally, we can "map" the buffer to get a device address.
     void* deviceAddress;
-    SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+    SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedBuffer((CUdeviceptr*)&deviceAddress, externalMemory, &bufferDesc));
     resource->m_cudaMemory = deviceAddress;
 
     returnComPtr(outResource, resource);
@@ -833,7 +854,7 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
     }
     externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
     externalMemoryHandleDesc.size = size;
-    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+    externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
 
     CUexternalMemory externalMemory;
     SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
@@ -1165,11 +1186,10 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
     List<uint8_t> blobData;
 
     blobData.setCount((Index)size);
-    cudaMemcpy(
-        blobData.getBuffer(),
-        (uint8_t*)bufferImpl->m_cudaMemory + offset,
-        size,
-        cudaMemcpyDefault);
+    cuMemcpy(
+        (CUdeviceptr)blobData.getBuffer(),
+        (CUdeviceptr)((uint8_t*)bufferImpl->m_cudaMemory + offset),
+        size);
 
     auto blob = ListBlob::moveCreate(blobData);
 
diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp
index 0a8d734d8..d478e8815 100644
--- a/tools/gfx/cuda/cuda-helper-functions.cpp
+++ b/tools/gfx/cuda/cuda-helper-functions.cpp
@@ -39,11 +39,6 @@ SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
     return info.handle();
 }
 
-SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
-    return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
 #    ifdef RENDER_TEST_OPTIX
 
 static bool _isError(OptixResult result)
@@ -71,41 +66,41 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message,
 #       endif
 #    endif
 
-AdapterLUID getAdapterLUID(int device)
+AdapterLUID getAdapterLUID(int deviceIndex)
 {
+    CUdevice device;
+    cuDeviceGet(&device, deviceIndex);
     AdapterLUID luid = {};
-#if SLANG_WIN32 || SLANG_WIN64
-    // LUID reported by CUDA is undefined i not on windows platform.
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, device);
-    SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(cudaDeviceProp::luid));
-    memcpy(&luid, prop.luid, sizeof(cudaDeviceProp::luid));
-#else
-    SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(int));
-    memcpy(&luid, &device, sizeof(int));
-#endif
+    unsigned int deviceNodeMask;
+    cuDeviceGetLuid((char*)&luid, &deviceNodeMask, device);
     return luid;
 }
 
-} // namespace cuda
-
-Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters)
+Result SLANG_MCALL getAdapters(List<AdapterInfo>& outAdapters)
 {
     int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    for (int device = 0; device < deviceCount; device++)
+    SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount));
+    for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++)
     {
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, device);
+        CUdevice device;
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, deviceIndex));
+
         AdapterInfo info = {};
-        memcpy(info.name, prop.name, Math::Min(strlen(prop.name), sizeof(AdapterInfo::name) - 1));
-        info.luid = cuda::getAdapterLUID(device);
+        SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetName(info.name, sizeof(info.name), device));
+        info.luid = getAdapterLUID(deviceIndex);
         outAdapters.add(info);
     }
 
     return SLANG_OK;
 }
 
+} // namespace cuda
+
+Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters)
+{
+    return cuda::getAdapters(outAdapters);
+}
+
 Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
 {
     RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl();
diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h
index 7417249d5..2217c727c 100644
--- a/tools/gfx/cuda/cuda-helper-functions.h
+++ b/tools/gfx/cuda/cuda-helper-functions.h
@@ -13,7 +13,6 @@ using namespace Slang;
 namespace cuda
 {
 SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; }
 
 // A enum used to control if errors are reported on failure of CUDA call.
 enum class CUDAReportStyle
@@ -45,7 +44,6 @@ struct CUDAErrorInfo
 // If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
 
 SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line);
-SlangResult _handleCUDAError(cudaError_t error, const char* file, int line);
 
 #    define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
 
@@ -99,7 +97,7 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message,
 
 #    endif
 
-AdapterLUID getAdapterLUID(int device);
+AdapterLUID getAdapterLUID(int deviceIndex);
 
 } // namespace cuda
 #endif
diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp
index 3fc55e401..02b4cf695 100644
--- a/tools/gfx/cuda/cuda-shader-object.cpp
+++ b/tools/gfx/cuda/cuda-shader-object.cpp
@@ -37,7 +37,7 @@ Result ShaderObjectData::setCount(Index count)
         m_bufferResource = new BufferResourceImpl(desc);
         if (count)
         {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
+            SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&m_bufferResource->m_cudaMemory, (size_t)count));
         }
         IResourceView::Desc viewDesc = {};
         viewDesc.type = IResourceView::Type::UnorderedAccess;
@@ -51,17 +51,16 @@ Result ShaderObjectData::setCount(Index count)
         void* newMemory = nullptr;
         if (count)
         {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
+            SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&newMemory, (size_t)count));
         }
         if (oldSize)
         {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-                newMemory,
-                m_bufferResource->m_cudaMemory,
-                Math::Min((size_t)count, oldSize),
-                cudaMemcpyDefault));
+            SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+                (CUdeviceptr)newMemory,
+                (CUdeviceptr)m_bufferResource->m_cudaMemory,
+                Math::Min((size_t)count, oldSize)));
         }
-        cudaFree(m_bufferResource->m_cudaMemory);
+        cuMemFree((CUdeviceptr)m_bufferResource->m_cudaMemory);
         m_bufferResource->m_cudaMemory = newMemory;
         m_bufferResource->getDesc()->sizeInBytes = count;
     }
@@ -189,8 +188,10 @@ SLANG_NO_THROW Result SLANG_MCALL
 {
     Size temp = m_data.getCount() - (Size)offset.uniformOffset;
     size = Math::Min(size, temp);
-    SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-        (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
+    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+        (CUdeviceptr)((uint8_t*)m_data.getBuffer() + offset.uniformOffset),
+        (CUdeviceptr)data,
+        size));
     return SLANG_OK;
 }
author	skallweitNV <64953474+skallweitNV@users.noreply.github.com>	2024-03-15 18:25:21 +0100
committer	GitHub <noreply@github.com>	2024-03-15 10:25:21 -0700
commit	9ee88a43f4e67d9c714c27bf968401b6bf7524af (patch)
tree	11c0f7b46168d7885f2ac220ae8bd8f51d40ac83 /tools
parent	d40931cc8bde13520ea45769cf94e7cc6cc9065f (diff)