summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorskallweitNV <64953474+skallweitNV@users.noreply.github.com>2024-03-15 18:25:21 +0100
committerGitHub <noreply@github.com>2024-03-15 10:25:21 -0700
commit9ee88a43f4e67d9c714c27bf968401b6bf7524af (patch)
tree11c0f7b46168d7885f2ac220ae8bd8f51d40ac83 /tools
parentd40931cc8bde13520ea45769cf94e7cc6cc9065f (diff)
[gfx] use CUDA driver API (#3776)
Diffstat (limited to 'tools')
-rw-r--r--tools/gfx/cuda/cuda-base.h1
-rw-r--r--tools/gfx/cuda/cuda-buffer.cpp2
-rw-r--r--tools/gfx/cuda/cuda-command-queue.cpp21
-rw-r--r--tools/gfx/cuda/cuda-device.cpp112
-rw-r--r--tools/gfx/cuda/cuda-helper-functions.cpp45
-rw-r--r--tools/gfx/cuda/cuda-helper-functions.h4
-rw-r--r--tools/gfx/cuda/cuda-shader-object.cpp21
7 files changed, 110 insertions, 96 deletions
diff --git a/tools/gfx/cuda/cuda-base.h b/tools/gfx/cuda/cuda-base.h
index 57a244089..63c280cde 100644
--- a/tools/gfx/cuda/cuda-base.h
+++ b/tools/gfx/cuda/cuda-base.h
@@ -4,7 +4,6 @@
#ifdef GFX_ENABLE_CUDA
#include <cuda.h>
-#include <cuda_runtime_api.h>
#include "core/slang-basic.h"
#include "core/slang-blob.h"
#include "core/slang-std-writers.h"
diff --git a/tools/gfx/cuda/cuda-buffer.cpp b/tools/gfx/cuda/cuda-buffer.cpp
index 1cd162841..bc4ef32ef 100644
--- a/tools/gfx/cuda/cuda-buffer.cpp
+++ b/tools/gfx/cuda/cuda-buffer.cpp
@@ -15,7 +15,7 @@ BufferResourceImpl::~BufferResourceImpl()
{
if (m_cudaMemory)
{
- SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+ SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree((CUdeviceptr)m_cudaMemory));
}
}
diff --git a/tools/gfx/cuda/cuda-command-queue.cpp b/tools/gfx/cuda/cuda-command-queue.cpp
index 4b0ab7d94..5c6f2db26 100644
--- a/tools/gfx/cuda/cuda-command-queue.cpp
+++ b/tools/gfx/cuda/cuda-command-queue.cpp
@@ -104,11 +104,10 @@ void CommandQueueImpl::dispatchCompute(int x, int y, int z)
"SLANG_globalParams");
CUdeviceptr globalParamsCUDAData = (CUdeviceptr)currentRootObject->getBuffer();
- cudaMemcpyAsync(
- (void*)globalParamsSymbol,
- (void*)globalParamsCUDAData,
+ cuMemcpyAsync(
+ (CUdeviceptr)globalParamsSymbol,
+ (CUdeviceptr)globalParamsCUDAData,
globalParamsSymbolSize,
- cudaMemcpyDefault,
0);
}
//
@@ -155,17 +154,19 @@ void CommandQueueImpl::copyBuffer(
{
auto dstImpl = static_cast<BufferResourceImpl*>(dst);
auto srcImpl = static_cast<BufferResourceImpl*>(src);
- cudaMemcpy(
- (uint8_t*)dstImpl->m_cudaMemory + dstOffset,
- (uint8_t*)srcImpl->m_cudaMemory + srcOffset,
- size,
- cudaMemcpyDefault);
+ cuMemcpy(
+ (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + dstOffset),
+ (CUdeviceptr)((uint8_t*)srcImpl->m_cudaMemory + srcOffset),
+ size);
}
void CommandQueueImpl::uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
{
auto dstImpl = static_cast<BufferResourceImpl*>(dst);
- cudaMemcpy((uint8_t*)dstImpl->m_cudaMemory + offset, data, size, cudaMemcpyDefault);
+ cuMemcpy(
+ (CUdeviceptr)((uint8_t*)dstImpl->m_cudaMemory + offset),
+ (CUdeviceptr)data,
+ size);
}
void CommandQueueImpl::writeTimestamp(IQueryPool* pool, SlangInt index)
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
index 0fcf9319e..b1b582d00 100644
--- a/tools/gfx/cuda/cuda-device.cpp
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -70,22 +70,21 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
int devicesProhibited = 0;
uint64_t maxComputePerf = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount));
// Find the best CUDA capable GPU device
for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
{
+ CUdevice device;
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, currentDevice));
int computeMode = -1, major = 0, minor = 0;
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
// If this GPU is not running on Compute Mode prohibited,
// then we can add it to the list
- if (computeMode != cudaComputeModeProhibited)
+ if (computeMode != CU_COMPUTEMODE_PROHIBITED)
{
if (major == 9999 && minor == 9999)
{
@@ -97,10 +96,8 @@ SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
}
int multiProcessorCount = 0, clockRate = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
- &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(
- cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
if (compute_perf > maxComputePerf)
@@ -172,8 +169,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
}
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
m_context = new CUDAContext();
SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
@@ -192,9 +187,6 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
m_features.add("has-ptr");
}
- cudaDeviceProp deviceProps;
- cudaGetDeviceProperties(&deviceProps, m_deviceIndex);
-
// Initialize DeviceInfo
{
m_info.deviceType = DeviceType::CUDA;
@@ -203,32 +195,53 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
m_info.apiName = "CUDA";
static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
- m_adapterName = deviceProps.name;
+ char deviceName[256];
+ cuDeviceGetName(deviceName, sizeof(deviceName), m_device);
+ m_adapterName = deviceName;
m_info.adapterName = m_adapterName.begin();
m_info.timestampFrequency = 1000000;
}
// Get device limits.
{
+ CUresult lastResult = CUDA_SUCCESS;
+ auto getAttribute = [&](CUdevice_attribute attribute) -> int
+ {
+ int value;
+ CUresult result = cuDeviceGetAttribute(&value, attribute, m_device);
+ if (result != CUDA_SUCCESS)
+ lastResult = result;
+ return value;
+ };
+
DeviceLimits limits = {};
- limits.maxTextureDimension1D = deviceProps.maxSurface1D;
- limits.maxTextureDimension2D = Math::Min(deviceProps.maxSurface2D[0], deviceProps.maxSurface2D[1]);
- limits.maxTextureDimension3D = Math::Min(deviceProps.maxSurface3D[0], Math::Min(deviceProps.maxSurface3D[1], deviceProps.maxSurface3D[2]));
- limits.maxTextureDimensionCube = deviceProps.maxSurfaceCubemap;
- limits.maxTextureArrayLayers = Math::Min(deviceProps.maxSurface1DLayered[1], deviceProps.maxSurface2DLayered[2]);
+
+ limits.maxTextureDimension1D = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH);
+ limits.maxTextureDimension2D = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT));
+ limits.maxTextureDimension3D = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH),
+ Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH)));
+ limits.maxTextureDimensionCube = getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH);
+ limits.maxTextureArrayLayers = Math::Min(
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS),
+ getAttribute(CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS));
// limits.maxVertexInputElements
// limits.maxVertexInputElementOffset
// limits.maxVertexStreams
// limits.maxVertexStreamStride
- limits.maxComputeThreadsPerGroup = deviceProps.maxThreadsPerBlock;
- limits.maxComputeThreadGroupSize[0] = deviceProps.maxThreadsDim[0];
- limits.maxComputeThreadGroupSize[1] = deviceProps.maxThreadsDim[1];
- limits.maxComputeThreadGroupSize[2] = deviceProps.maxThreadsDim[2];
- limits.maxComputeDispatchThreadGroups[0] = deviceProps.maxGridSize[0];
- limits.maxComputeDispatchThreadGroups[1] = deviceProps.maxGridSize[1];
- limits.maxComputeDispatchThreadGroups[2] = deviceProps.maxGridSize[2];
+ limits.maxComputeThreadsPerGroup = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+ limits.maxComputeThreadGroupSize[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X);
+ limits.maxComputeThreadGroupSize[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y);
+ limits.maxComputeThreadGroupSize[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z);
+ limits.maxComputeDispatchThreadGroups[0] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X);
+ limits.maxComputeDispatchThreadGroups[1] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y);
+ limits.maxComputeDispatchThreadGroups[2] = getAttribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z);
// limits.maxViewports
// limits.maxViewportDimensions
@@ -237,6 +250,8 @@ SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
// limits.maxShaderVisibleSamplers
m_info.limits = limits;
+
+ SLANG_CUDA_RETURN_ON_FAIL(lastResult);
}
return SLANG_OK;
@@ -728,10 +743,16 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
auto desc = fixupBufferDesc(descIn);
RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
resource->m_cudaContext = m_context;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAllocManaged(
+ (CUdeviceptr*)(&resource->m_cudaMemory),
+ desc.sizeInBytes,
+ CU_MEM_ATTACH_GLOBAL));
if (initData)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+ (CUdeviceptr)resource->m_cudaMemory,
+ (CUdeviceptr)initData,
+ desc.sizeInBytes));
}
returnComPtr(outResource, resource);
return SLANG_OK;
@@ -755,28 +776,28 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
// "external memory" object, which represents the relationship
// with another API's objects. In order to create this external
// memory association, we first need to fill in a descriptor struct.
- cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+ CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
switch (handle.api)
{
case InteropHandleAPI::D3D12:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
break;
case InteropHandleAPI::Vulkan:
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
break;
default:
return SLANG_FAIL;
}
externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
externalMemoryHandleDesc.size = desc.sizeInBytes;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+ externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
// Once we have filled in the descriptor, we can request
// that CUDA create the required association between the
// external buffer and its own memory.
- cudaExternalMemory_t externalMemory;
- SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ CUexternalMemory externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
resource->m_cudaExternalMemory = externalMemory;
// The CUDA "external memory" handle is not itself a device
@@ -786,13 +807,13 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
// Just as for the external memory, we fill in a descriptor
// structure (although in this case we only need to specify
// the size).
- cudaExternalMemoryBufferDesc bufferDesc;
+ CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc;
memset(&bufferDesc, 0, sizeof(bufferDesc));
bufferDesc.size = desc.sizeInBytes;
// Finally, we can "map" the buffer to get a device address.
void* deviceAddress;
- SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+ SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedBuffer((CUdeviceptr*)&deviceAddress, externalMemory, &bufferDesc));
resource->m_cudaMemory = deviceAddress;
returnComPtr(outResource, resource);
@@ -833,7 +854,7 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
}
externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
externalMemoryHandleDesc.size = size;
- externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+ externalMemoryHandleDesc.flags = CUDA_EXTERNAL_MEMORY_DEDICATED;
CUexternalMemory externalMemory;
SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
@@ -1165,11 +1186,10 @@ SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
List<uint8_t> blobData;
blobData.setCount((Index)size);
- cudaMemcpy(
- blobData.getBuffer(),
- (uint8_t*)bufferImpl->m_cudaMemory + offset,
- size,
- cudaMemcpyDefault);
+ cuMemcpy(
+ (CUdeviceptr)blobData.getBuffer(),
+ (CUdeviceptr)((uint8_t*)bufferImpl->m_cudaMemory + offset),
+ size);
auto blob = ListBlob::moveCreate(blobData);
diff --git a/tools/gfx/cuda/cuda-helper-functions.cpp b/tools/gfx/cuda/cuda-helper-functions.cpp
index 0a8d734d8..d478e8815 100644
--- a/tools/gfx/cuda/cuda-helper-functions.cpp
+++ b/tools/gfx/cuda/cuda-helper-functions.cpp
@@ -39,11 +39,6 @@ SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
return info.handle();
}
-SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
- return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
# ifdef RENDER_TEST_OPTIX
static bool _isError(OptixResult result)
@@ -71,41 +66,41 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message,
# endif
# endif
-AdapterLUID getAdapterLUID(int device)
+AdapterLUID getAdapterLUID(int deviceIndex)
{
+ CUdevice device;
+ cuDeviceGet(&device, deviceIndex);
AdapterLUID luid = {};
-#if SLANG_WIN32 || SLANG_WIN64
- // LUID reported by CUDA is undefined i not on windows platform.
- cudaDeviceProp prop;
- cudaGetDeviceProperties(&prop, device);
- SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(cudaDeviceProp::luid));
- memcpy(&luid, prop.luid, sizeof(cudaDeviceProp::luid));
-#else
- SLANG_ASSERT(sizeof(AdapterLUID) >= sizeof(int));
- memcpy(&luid, &device, sizeof(int));
-#endif
+ unsigned int deviceNodeMask;
+ cuDeviceGetLuid((char*)&luid, &deviceNodeMask, device);
return luid;
}
-} // namespace cuda
-
-Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters)
+Result SLANG_MCALL getAdapters(List<AdapterInfo>& outAdapters)
{
int deviceCount;
- cudaGetDeviceCount(&deviceCount);
- for (int device = 0; device < deviceCount; device++)
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetCount(&deviceCount));
+ for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++)
{
- cudaDeviceProp prop;
- cudaGetDeviceProperties(&prop, device);
+ CUdevice device;
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&device, deviceIndex));
+
AdapterInfo info = {};
- memcpy(info.name, prop.name, Math::Min(strlen(prop.name), sizeof(AdapterInfo::name) - 1));
- info.luid = cuda::getAdapterLUID(device);
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGetName(info.name, sizeof(info.name), device));
+ info.luid = getAdapterLUID(deviceIndex);
outAdapters.add(info);
}
return SLANG_OK;
}
+} // namespace cuda
+
+Result SLANG_MCALL getCUDAAdapters(List<AdapterInfo>& outAdapters)
+{
+ return cuda::getAdapters(outAdapters);
+}
+
Result SLANG_MCALL createCUDADevice(const IDevice::Desc* desc, IDevice** outDevice)
{
RefPtr<cuda::DeviceImpl> result = new cuda::DeviceImpl();
diff --git a/tools/gfx/cuda/cuda-helper-functions.h b/tools/gfx/cuda/cuda-helper-functions.h
index 7417249d5..2217c727c 100644
--- a/tools/gfx/cuda/cuda-helper-functions.h
+++ b/tools/gfx/cuda/cuda-helper-functions.h
@@ -13,7 +13,6 @@ using namespace Slang;
namespace cuda
{
SLANG_FORCE_INLINE bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE bool _isError(cudaError_t result) { return result != 0; }
// A enum used to control if errors are reported on failure of CUDA call.
enum class CUDAReportStyle
@@ -45,7 +44,6 @@ struct CUDAErrorInfo
// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line);
-SlangResult _handleCUDAError(cudaError_t error, const char* file, int line);
# define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(x, __FILE__, __LINE__)
@@ -99,7 +97,7 @@ void _optixLogCallback(unsigned int level, const char* tag, const char* message,
# endif
-AdapterLUID getAdapterLUID(int device);
+AdapterLUID getAdapterLUID(int deviceIndex);
} // namespace cuda
#endif
diff --git a/tools/gfx/cuda/cuda-shader-object.cpp b/tools/gfx/cuda/cuda-shader-object.cpp
index 3fc55e401..02b4cf695 100644
--- a/tools/gfx/cuda/cuda-shader-object.cpp
+++ b/tools/gfx/cuda/cuda-shader-object.cpp
@@ -37,7 +37,7 @@ Result ShaderObjectData::setCount(Index count)
m_bufferResource = new BufferResourceImpl(desc);
if (count)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&m_bufferResource->m_cudaMemory, (size_t)count));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&m_bufferResource->m_cudaMemory, (size_t)count));
}
IResourceView::Desc viewDesc = {};
viewDesc.type = IResourceView::Type::UnorderedAccess;
@@ -51,17 +51,16 @@ Result ShaderObjectData::setCount(Index count)
void* newMemory = nullptr;
if (count)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&newMemory, (size_t)count));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc((CUdeviceptr*)&newMemory, (size_t)count));
}
if (oldSize)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- newMemory,
- m_bufferResource->m_cudaMemory,
- Math::Min((size_t)count, oldSize),
- cudaMemcpyDefault));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+ (CUdeviceptr)newMemory,
+ (CUdeviceptr)m_bufferResource->m_cudaMemory,
+ Math::Min((size_t)count, oldSize)));
}
- cudaFree(m_bufferResource->m_cudaMemory);
+ cuMemFree((CUdeviceptr)m_bufferResource->m_cudaMemory);
m_bufferResource->m_cudaMemory = newMemory;
m_bufferResource->getDesc()->sizeInBytes = count;
}
@@ -189,8 +188,10 @@ SLANG_NO_THROW Result SLANG_MCALL
{
Size temp = m_data.getCount() - (Size)offset.uniformOffset;
size = Math::Min(size, temp);
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (uint8_t*)m_data.getBuffer() + offset.uniformOffset, data, size, cudaMemcpyDefault));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy(
+ (CUdeviceptr)((uint8_t*)m_data.getBuffer() + offset.uniformOffset),
+ (CUdeviceptr)data,
+ size));
return SLANG_OK;
}