summaryrefslogtreecommitdiffstats
path: root/tools/gfx/cuda/cuda-device.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tools/gfx/cuda/cuda-device.cpp')
-rw-r--r--tools/gfx/cuda/cuda-device.cpp1115
1 files changed, 1115 insertions, 0 deletions
diff --git a/tools/gfx/cuda/cuda-device.cpp b/tools/gfx/cuda/cuda-device.cpp
new file mode 100644
index 000000000..1a4a142d0
--- /dev/null
+++ b/tools/gfx/cuda/cuda-device.cpp
@@ -0,0 +1,1115 @@
+// cuda-device.cpp
+#include "cuda-device.h"
+
+#include "cuda-buffer.h"
+#include "cuda-command-queue.h"
+#include "cuda-pipeline-state.h"
+#include "cuda-query.h"
+#include "cuda-shader-object.h"
+#include "cuda-shader-object-layout.h"
+#include "cuda-shader-program.h"
+#include "cuda-resource-views.h"
+#include "cuda-texture.h"
+
+namespace gfx
+{
+#ifdef GFX_ENABLE_CUDA
+using namespace Slang;
+
+namespace cuda
+{
+
+int DeviceImpl::_calcSMCountPerMultiProcessor(int major, int minor)
+{
+ // Defines for GPU Architecture types (using the SM version to determine
+ // the # of cores per SM
+ struct SMInfo
+ {
+ int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+ int coreCount;
+ };
+
+ static const SMInfo infos[] = {
+ {0x30, 192},
+ {0x32, 192},
+ {0x35, 192},
+ {0x37, 192},
+ {0x50, 128},
+ {0x52, 128},
+ {0x53, 128},
+ {0x60, 64},
+ {0x61, 128},
+ {0x62, 128},
+ {0x70, 64},
+ {0x72, 64},
+ {0x75, 64} };
+
+ const int sm = ((major << 4) + minor);
+ for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+ {
+ if (infos[i].sm == sm)
+ {
+ return infos[i].coreCount;
+ }
+ }
+
+ const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+ // It must be newer presumably
+ SLANG_ASSERT(sm > last.sm);
+
+ // Default to the last entry
+ return last.coreCount;
+}
+
+SlangResult DeviceImpl::_findMaxFlopsDeviceIndex(int* outDeviceIndex)
+{
+ int smPerMultiproc = 0;
+ int maxPerfDevice = -1;
+ int deviceCount = 0;
+ int devicesProhibited = 0;
+
+ uint64_t maxComputePerf = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+ // Find the best CUDA capable GPU device
+ for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+ {
+ int computeMode = -1, major = 0, minor = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+ // If this GPU is not running on Compute Mode prohibited,
+ // then we can add it to the list
+ if (computeMode != cudaComputeModeProhibited)
+ {
+ if (major == 9999 && minor == 9999)
+ {
+ smPerMultiproc = 1;
+ }
+ else
+ {
+ smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+ }
+
+ int multiProcessorCount = 0, clockRate = 0;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(
+ &multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+ uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+ if (compute_perf > maxComputePerf)
+ {
+ maxComputePerf = compute_perf;
+ maxPerfDevice = currentDevice;
+ }
+ }
+ else
+ {
+ devicesProhibited++;
+ }
+ }
+
+ if (maxPerfDevice < 0)
+ {
+ return SLANG_FAIL;
+ }
+
+ *outDeviceIndex = maxPerfDevice;
+ return SLANG_OK;
+}
+
+SlangResult DeviceImpl::_initCuda(CUDAReportStyle reportType)
+{
+ static CUresult res = cuInit(0);
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::getNativeDeviceHandles(InteropHandles* outHandles)
+{
+ outHandles->handles[0].handleValue = (uint64_t)m_device;
+ outHandles->handles[0].api = InteropHandleAPI::CUDA;
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::initialize(const Desc& desc)
+{
+ SLANG_RETURN_ON_FAIL(slangContext.initialize(
+ desc.slang,
+ SLANG_PTX,
+ "sm_5_1",
+ makeArray(slang::PreprocessorMacroDesc{ "__CUDA_COMPUTE__", "1" }).getView()));
+
+ SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
+
+ SLANG_RETURN_ON_FAIL(_initCuda(reportType));
+
+ SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
+
+ m_context = new CUDAContext();
+
+ int count = -1;
+ cuDeviceGetCount(&count);
+ SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
+
+ SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(
+ cuCtxCreate(&m_context->m_context, 0, m_device), reportType);
+
+ // Not clear how to detect half support on CUDA. For now we'll assume we have it
+ {
+ m_features.add("half");
+ }
+
+ // Initialize DeviceInfo
+ {
+ m_info.deviceType = DeviceType::CUDA;
+ m_info.bindingStyle = BindingStyle::CUDA;
+ m_info.projectionStyle = ProjectionStyle::DirectX;
+ m_info.apiName = "CUDA";
+ static const float kIdentity[] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
+ ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
+ cudaDeviceProp deviceProperties;
+ cudaGetDeviceProperties(&deviceProperties, m_deviceIndex);
+ m_adapterName = deviceProperties.name;
+ m_info.adapterName = m_adapterName.begin();
+ m_info.timestampFrequency = 1000000;
+ }
+
+ return SLANG_OK;
+}
+
+Result DeviceImpl::getCUDAFormat(Format format, CUarray_format* outFormat)
+{
+ // TODO: Expand to cover all available formats that can be supported in CUDA
+ switch (format)
+ {
+ case Format::R32G32B32A32_FLOAT:
+ case Format::R32G32B32_FLOAT:
+ case Format::R32G32_FLOAT:
+ case Format::R32_FLOAT:
+ case Format::D32_FLOAT:
+ *outFormat = CU_AD_FORMAT_FLOAT;
+ return SLANG_OK;
+ case Format::R16G16B16A16_FLOAT:
+ case Format::R16G16_FLOAT:
+ case Format::R16_FLOAT:
+ *outFormat = CU_AD_FORMAT_HALF;
+ return SLANG_OK;
+ case Format::R32G32B32A32_UINT:
+ case Format::R32G32B32_UINT:
+ case Format::R32G32_UINT:
+ case Format::R32_UINT:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT32;
+ return SLANG_OK;
+ case Format::R16G16B16A16_UINT:
+ case Format::R16G16_UINT:
+ case Format::R16_UINT:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT16;
+ return SLANG_OK;
+ case Format::R8G8B8A8_UINT:
+ case Format::R8G8_UINT:
+ case Format::R8_UINT:
+ case Format::R8G8B8A8_UNORM:
+ *outFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+ return SLANG_OK;
+ case Format::R32G32B32A32_SINT:
+ case Format::R32G32B32_SINT:
+ case Format::R32G32_SINT:
+ case Format::R32_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT32;
+ return SLANG_OK;
+ case Format::R16G16B16A16_SINT:
+ case Format::R16G16_SINT:
+ case Format::R16_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT16;
+ return SLANG_OK;
+ case Format::R8G8B8A8_SINT:
+ case Format::R8G8_SINT:
+ case Format::R8_SINT:
+ *outFormat = CU_AD_FORMAT_SIGNED_INT8;
+ return SLANG_OK;
+ default:
+ SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+ return SLANG_FAIL;
+ }
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureResource(
+ const ITextureResource::Desc& desc,
+ const ITextureResource::SubresourceData* initData,
+ ITextureResource** outResource)
+{
+ TextureResource::Desc srcDesc = fixupTextureDesc(desc);
+
+ RefPtr<TextureResourceImpl> tex = new TextureResourceImpl(srcDesc);
+ tex->m_cudaContext = m_context;
+
+ CUresourcetype resourceType;
+
+ // The size of the element/texel in bytes
+ size_t elementSize = 0;
+
+ // Our `ITextureResource::Desc` uses an enumeration to specify
+ // the "shape"/rank of a texture (1D, 2D, 3D, Cube), but CUDA's
+ // `cuMipmappedArrayCreate` seemingly relies on a policy where
+ // the extents of the array in dimenions above the rank are
+ // specified as zero (e.g., a 1D texture requires `height==0`).
+ //
+ // We will start by massaging the extents as specified by the
+ // user into a form that CUDA wants/expects, based on the
+ // texture shape as specified in the `desc`.
+ //
+ int width = desc.size.width;
+ int height = desc.size.height;
+ int depth = desc.size.depth;
+ switch (desc.type)
+ {
+ case IResource::Type::Texture1D:
+ height = 0;
+ depth = 0;
+ break;
+
+ case IResource::Type::Texture2D:
+ depth = 0;
+ break;
+
+ case IResource::Type::Texture3D:
+ break;
+
+ case IResource::Type::TextureCube:
+ depth = 1;
+ break;
+ }
+
+ {
+ CUarray_format format = CU_AD_FORMAT_FLOAT;
+ int numChannels = 0;
+
+ SLANG_RETURN_ON_FAIL(getCUDAFormat(desc.format, &format));
+ FormatInfo info;
+ gfxGetFormatInfo(desc.format, &info);
+ numChannels = info.channelCount;
+
+ switch (format)
+ {
+ case CU_AD_FORMAT_FLOAT:
+ {
+ elementSize = sizeof(float) * numChannels;
+ break;
+ }
+ case CU_AD_FORMAT_HALF:
+ {
+ elementSize = sizeof(uint16_t) * numChannels;
+ break;
+ }
+ case CU_AD_FORMAT_UNSIGNED_INT8:
+ {
+ elementSize = sizeof(uint8_t) * numChannels;
+ break;
+ }
+ default:
+ {
+ SLANG_ASSERT(!"Only support R32_FLOAT/R8G8B8A8_UNORM formats for now");
+ return SLANG_FAIL;
+ }
+ }
+
+ if (desc.numMipLevels > 1)
+ {
+ resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Width = width;
+ arrayDesc.Height = height;
+ arrayDesc.Depth = depth;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+ arrayDesc.Flags = 0;
+
+ if (desc.arraySize > 1)
+ {
+ if (desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
+ arrayDesc.Depth = desc.arraySize;
+ }
+ else
+ {
+ SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
+ return SLANG_FAIL;
+ }
+ }
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ arrayDesc.Depth *= 6;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, desc.numMipLevels));
+ }
+ else
+ {
+ resourceType = CU_RESOURCE_TYPE_ARRAY;
+
+ if (desc.arraySize > 1)
+ {
+ if (desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
+ return SLANG_FAIL;
+ }
+
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ // Set the depth as the array length
+ arrayDesc.Depth = desc.arraySize;
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Depth *= 6;
+ }
+
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ else if (desc.type == IResource::Type::Texture3D ||
+ desc.type == IResource::Type::TextureCube)
+ {
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Depth = depth;
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ arrayDesc.Flags = 0;
+
+ // Handle cube texture
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ arrayDesc.Depth = 6;
+ arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ else
+ {
+ CUDA_ARRAY_DESCRIPTOR arrayDesc;
+ memset(&arrayDesc, 0, sizeof(arrayDesc));
+
+ arrayDesc.Height = height;
+ arrayDesc.Width = width;
+ arrayDesc.Format = format;
+ arrayDesc.NumChannels = numChannels;
+
+ // Allocate the array, will work for 1D or 2D case
+ SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
+ }
+ }
+ }
+
+ // Work space for holding data for uploading if it needs to be rearranged
+ if (initData)
+ {
+ List<uint8_t> workspace;
+ for (int mipLevel = 0; mipLevel < desc.numMipLevels; ++mipLevel)
+ {
+ int mipWidth = width >> mipLevel;
+ int mipHeight = height >> mipLevel;
+ int mipDepth = depth >> mipLevel;
+
+ mipWidth = (mipWidth == 0) ? 1 : mipWidth;
+ mipHeight = (mipHeight == 0) ? 1 : mipHeight;
+ mipDepth = (mipDepth == 0) ? 1 : mipDepth;
+
+ // If it's a cubemap then the depth is always 6
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ mipDepth = 6;
+ }
+
+ auto dstArray = tex->m_cudaArray;
+ if (tex->m_cudaMipMappedArray)
+ {
+ // Get the array for the mip level
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
+ }
+ SLANG_ASSERT(dstArray);
+
+ // Check using the desc to see if it's plausible
+ {
+ CUDA_ARRAY_DESCRIPTOR arrayDesc;
+ SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
+
+ SLANG_ASSERT(mipWidth == arrayDesc.Width);
+ SLANG_ASSERT(
+ mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
+ }
+
+ const void* srcDataPtr = nullptr;
+
+ if (desc.arraySize > 1)
+ {
+ SLANG_ASSERT(
+ desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube);
+
+ // TODO(JS): Here I assume that arrays are just held contiguously within a
+ // 'face' This seems reasonable and works with the Copy3D.
+ const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+ Index faceCount = desc.arraySize;
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ faceCount *= 6;
+ }
+
+ const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
+ workspace.setCount(mipSizeInBytes);
+
+ // We need to add the face data from each mip
+ // We iterate over face count so we copy all of the cubemap faces
+ for (Index j = 0; j < faceCount; j++)
+ {
+ const auto srcData = initData[mipLevel + j * desc.numMipLevels].data;
+ // Copy over to the workspace to make contiguous
+ ::memcpy(
+ workspace.begin() + faceSizeInBytes * j, srcData, faceSizeInBytes);
+ }
+
+ srcDataPtr = workspace.getBuffer();
+ }
+ else
+ {
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
+
+ workspace.setCount(faceSizeInBytes * 6);
+ // Copy the data over to make contiguous
+ for (Index j = 0; j < 6; j++)
+ {
+ const auto srcData =
+ initData[mipLevel + j * desc.numMipLevels].data;
+ ::memcpy(
+ workspace.getBuffer() + faceSizeInBytes * j,
+ srcData,
+ faceSizeInBytes);
+ }
+ srcDataPtr = workspace.getBuffer();
+ }
+ else
+ {
+ const auto srcData = initData[mipLevel].data;
+ srcDataPtr = srcData;
+ }
+ }
+
+ if (desc.arraySize > 1)
+ {
+ SLANG_ASSERT(
+ desc.type == IResource::Type::Texture1D ||
+ desc.type == IResource::Type::Texture2D ||
+ desc.type == IResource::Type::TextureCube);
+
+ CUDA_MEMCPY3D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ // Set the depth to the array length
+ copyParam.Depth = desc.arraySize;
+
+ if (desc.type == IResource::Type::TextureCube)
+ {
+ copyParam.Depth *= 6;
+ }
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+ }
+ else
+ {
+ switch (desc.type)
+ {
+ case IResource::Type::Texture1D:
+ case IResource::Type::Texture2D:
+ {
+ CUDA_MEMCPY2D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+ break;
+ }
+ case IResource::Type::Texture3D:
+ case IResource::Type::TextureCube:
+ {
+ CUDA_MEMCPY3D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = dstArray;
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = srcDataPtr;
+ copyParam.srcPitch = mipWidth * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = mipHeight;
+ copyParam.Depth = mipDepth;
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
+ break;
+ }
+
+ default:
+ {
+ SLANG_ASSERT(!"Not implemented");
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Set up texture sampling parameters, and create final texture obj
+
+ {
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+ resDesc.resType = resourceType;
+
+ if (tex->m_cudaArray)
+ {
+ resDesc.res.array.hArray = tex->m_cudaArray;
+ }
+ if (tex->m_cudaMipMappedArray)
+ {
+ resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
+ }
+
+ // If the texture might be used as a UAV, then we need to allocate
+ // a CUDA "surface" for it.
+ //
+ // Note: We cannot do this unconditionally, because it will fail
+ // on surfaces that are not usable as UAVs (e.g., those with
+ // mipmaps).
+ //
+ // TODO: We should really only be allocating the array at the
+ // time we create a resource, and then allocate the surface or
+ // texture objects as part of view creation.
+ //
+ if (desc.allowedStates.contains(ResourceState::UnorderedAccess))
+ {
+ // On CUDA surfaces only support a single MIP map
+ SLANG_ASSERT(desc.numMipLevels == 1);
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
+ }
+
+
+ // Create handle for sampling.
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+ texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+ texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ SLANG_CUDA_RETURN_ON_FAIL(
+ cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
+ }
+
+ returnComPtr(outResource, tex);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferResource(
+ const IBufferResource::Desc& descIn,
+ const void* initData,
+ IBufferResource** outResource)
+{
+ auto desc = fixupBufferDesc(descIn);
+ RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMallocManaged(&resource->m_cudaMemory, desc.sizeInBytes));
+ if (initData)
+ {
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(resource->m_cudaMemory, initData, desc.sizeInBytes, cudaMemcpyDefault));
+ }
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferFromSharedHandle(
+ InteropHandle handle,
+ const IBufferResource::Desc& desc,
+ IBufferResource** outResource)
+{
+ if (handle.handleValue == 0)
+ {
+ *outResource = nullptr;
+ return SLANG_OK;
+ }
+
+ RefPtr<BufferResourceImpl> resource = new BufferResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+
+ // CUDA manages sharing of buffers through the idea of an
+ // "external memory" object, which represents the relationship
+ // with another API's objects. In order to create this external
+ // memory association, we first need to fill in a descriptor struct.
+ cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
+ memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+ switch (handle.api)
+ {
+ case InteropHandleAPI::D3D12:
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+ break;
+ case InteropHandleAPI::Vulkan:
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+ break;
+ default:
+ return SLANG_FAIL;
+ }
+ externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+ externalMemoryHandleDesc.size = desc.sizeInBytes;
+ externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+ // Once we have filled in the descriptor, we can request
+ // that CUDA create the required association between the
+ // external buffer and its own memory.
+ cudaExternalMemory_t externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ resource->m_cudaExternalMemory = externalMemory;
+
+ // The CUDA "external memory" handle is not itself a device
+ // pointer, so we need to query for a suitable device address
+ // for the buffer with another call.
+ //
+ // Just as for the external memory, we fill in a descriptor
+ // structure (although in this case we only need to specify
+ // the size).
+ cudaExternalMemoryBufferDesc bufferDesc;
+ memset(&bufferDesc, 0, sizeof(bufferDesc));
+ bufferDesc.size = desc.sizeInBytes;
+
+ // Finally, we can "map" the buffer to get a device address.
+ void* deviceAddress;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaExternalMemoryGetMappedBuffer(&deviceAddress, externalMemory, &bufferDesc));
+ resource->m_cudaMemory = deviceAddress;
+
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureFromSharedHandle(
+ InteropHandle handle,
+ const ITextureResource::Desc& desc,
+ const size_t size,
+ ITextureResource** outResource)
+{
+ if (handle.handleValue == 0)
+ {
+ *outResource = nullptr;
+ return SLANG_OK;
+ }
+
+ RefPtr<TextureResourceImpl> resource = new TextureResourceImpl(desc);
+ resource->m_cudaContext = m_context;
+
+ // CUDA manages sharing of buffers through the idea of an
+ // "external memory" object, which represents the relationship
+ // with another API's objects. In order to create this external
+ // memory association, we first need to fill in a descriptor struct.
+ CUDA_EXTERNAL_MEMORY_HANDLE_DESC externalMemoryHandleDesc;
+ memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
+ switch (handle.api)
+ {
+ case InteropHandleAPI::D3D12:
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE;
+ break;
+ case InteropHandleAPI::Vulkan:
+ externalMemoryHandleDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32;
+ break;
+ default:
+ return SLANG_FAIL;
+ }
+ externalMemoryHandleDesc.handle.win32.handle = (void*)handle.handleValue;
+ externalMemoryHandleDesc.size = size;
+ externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
+
+ CUexternalMemory externalMemory;
+ SLANG_CUDA_RETURN_ON_FAIL(cuImportExternalMemory(&externalMemory, &externalMemoryHandleDesc));
+ resource->m_cudaExternalMemory = externalMemory;
+
+ FormatInfo formatInfo;
+ SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc.format, &formatInfo));
+ CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+ arrayDesc.Depth = desc.size.depth;
+ arrayDesc.Height = desc.size.height;
+ arrayDesc.Width = desc.size.width;
+ arrayDesc.NumChannels = formatInfo.channelCount;
+ getCUDAFormat(desc.format, &arrayDesc.Format);
+ arrayDesc.Flags = 0; // TODO: Flags? CUDA_ARRAY_LAYERED/SURFACE_LDST/CUBEMAP/TEXTURE_GATHER
+
+ CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC externalMemoryMipDesc;
+ memset(&externalMemoryMipDesc, 0, sizeof(externalMemoryMipDesc));
+ externalMemoryMipDesc.offset = 0;
+ externalMemoryMipDesc.arrayDesc = arrayDesc;
+ externalMemoryMipDesc.numLevels = desc.numMipLevels;
+
+ CUmipmappedArray mipArray;
+ SLANG_CUDA_RETURN_ON_FAIL(cuExternalMemoryGetMappedMipmappedArray(&mipArray, externalMemory, &externalMemoryMipDesc));
+ resource->m_cudaMipMappedArray = mipArray;
+
+ CUarray cuArray;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&cuArray, mipArray, 0));
+ resource->m_cudaArray = cuArray;
+
+ CUDA_RESOURCE_DESC surfDesc;
+ memset(&surfDesc, 0, sizeof(surfDesc));
+ surfDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ surfDesc.res.array.hArray = cuArray;
+
+ CUsurfObject surface;
+ SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&surface, &surfDesc));
+ resource->m_cudaSurfObj = surface;
+
+ returnComPtr(outResource, resource);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTextureView(
+ ITextureResource* texture, IResourceView::Desc const& desc, IResourceView** outView)
+{
+ RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+ view->m_desc = desc;
+ view->textureResource = dynamic_cast<TextureResourceImpl*>(texture);
+ returnComPtr(outView, view);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createBufferView(
+ IBufferResource* buffer,
+ IBufferResource* counterBuffer,
+ IResourceView::Desc const& desc,
+ IResourceView** outView)
+{
+ RefPtr<ResourceViewImpl> view = new ResourceViewImpl();
+ view->m_desc = desc;
+ view->memoryResource = dynamic_cast<BufferResourceImpl*>(buffer);
+ returnComPtr(outView, view);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createQueryPool(
+ const IQueryPool::Desc& desc,
+ IQueryPool** outPool)
+{
+ RefPtr<QueryPoolImpl> pool = new QueryPoolImpl();
+ SLANG_RETURN_ON_FAIL(pool->init(desc));
+ returnComPtr(outPool, pool);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObjectLayout(
+ slang::TypeLayoutReflection* typeLayout,
+ ShaderObjectLayoutBase** outLayout)
+{
+ RefPtr<ShaderObjectLayoutImpl> cudaLayout;
+ cudaLayout = new ShaderObjectLayoutImpl(this, typeLayout);
+ returnRefPtrMove(outLayout, cudaLayout);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject)
+{
+ RefPtr<ShaderObjectImpl> result = new ShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+ returnComPtr(outObject, result);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createMutableShaderObject(
+ ShaderObjectLayoutBase* layout,
+ IShaderObject** outObject)
+{
+ RefPtr<MutableShaderObjectImpl> result = new MutableShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, dynamic_cast<ShaderObjectLayoutImpl*>(layout)));
+ returnComPtr(outObject, result);
+ return SLANG_OK;
+}
+
+Result DeviceImpl::createRootShaderObject(IShaderProgram* program, ShaderObjectBase** outObject)
+{
+ auto cudaProgram = dynamic_cast<ShaderProgramImpl*>(program);
+ auto cudaLayout = cudaProgram->layout;
+
+ RefPtr<RootShaderObjectImpl> result = new RootShaderObjectImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, cudaLayout));
+ returnRefPtrMove(outObject, result);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createProgram(
+ const IShaderProgram::Desc& desc,
+ IShaderProgram** outProgram,
+ ISlangBlob** outDiagnosticBlob)
+{
+ // If this is a specializable program, we just keep a reference to the slang program and
+ // don't actually create any kernels. This program will be specialized later when we know
+ // the shader object bindings.
+ RefPtr<ShaderProgramImpl> cudaProgram = new ShaderProgramImpl();
+ cudaProgram->init(desc);
+ cudaProgram->cudaContext = m_context;
+ if (desc.slangGlobalScope->getSpecializationParamCount() != 0)
+ {
+ cudaProgram->layout = new RootShaderObjectLayoutImpl(this, desc.slangGlobalScope->getLayout());
+ returnComPtr(outProgram, cudaProgram);
+ return SLANG_OK;
+ }
+
+ ComPtr<ISlangBlob> kernelCode;
+ ComPtr<ISlangBlob> diagnostics;
+ auto compileResult = desc.slangGlobalScope->getEntryPointCode(
+ (SlangInt)0, 0, kernelCode.writeRef(), diagnostics.writeRef());
+ if (diagnostics)
+ {
+ getDebugCallback()->handleMessage(
+ compileResult == SLANG_OK ? DebugMessageType::Warning : DebugMessageType::Error,
+ DebugMessageSource::Slang,
+ (char*)diagnostics->getBufferPointer());
+ if (outDiagnosticBlob)
+ returnComPtr(outDiagnosticBlob, diagnostics);
+ }
+ SLANG_RETURN_ON_FAIL(compileResult);
+
+ SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&cudaProgram->cudaModule, kernelCode->getBufferPointer()));
+ cudaProgram->kernelName = desc.slangGlobalScope->getLayout()->getEntryPointByIndex(0)->getName();
+ SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(
+ &cudaProgram->cudaKernel, cudaProgram->cudaModule, cudaProgram->kernelName.getBuffer()));
+
+ auto slangGlobalScope = desc.slangGlobalScope;
+ if (slangGlobalScope)
+ {
+ cudaProgram->slangGlobalScope = slangGlobalScope;
+
+ auto slangProgramLayout = slangGlobalScope->getLayout();
+ if (!slangProgramLayout)
+ return SLANG_FAIL;
+
+ RefPtr<RootShaderObjectLayoutImpl> cudaLayout;
+ cudaLayout = new RootShaderObjectLayoutImpl(this, slangProgramLayout);
+ cudaLayout->programLayout = slangProgramLayout;
+ cudaProgram->layout = cudaLayout;
+ }
+
+ returnComPtr(outProgram, cudaProgram);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createComputePipelineState(
+ const ComputePipelineStateDesc& desc, IPipelineState** outState)
+{
+ RefPtr<ComputePipelineStateImpl> state = new ComputePipelineStateImpl();
+ state->shaderProgram = static_cast<ShaderProgramImpl*>(desc.program);
+ state->init(desc);
+ returnComPtr(outState, state);
+ return Result();
+}
+
+void* DeviceImpl::map(IBufferResource* buffer)
+{
+ return static_cast<BufferResourceImpl*>(buffer)->m_cudaMemory;
+}
+
+void DeviceImpl::unmap(IBufferResource* buffer)
+{
+ SLANG_UNUSED(buffer);
+}
+
+SLANG_NO_THROW const DeviceInfo& SLANG_MCALL DeviceImpl::getDeviceInfo() const
+{
+ return m_info;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createTransientResourceHeap(
+ const ITransientResourceHeap::Desc& desc,
+ ITransientResourceHeap** outHeap)
+{
+ RefPtr<TransientResourceHeapImpl> result = new TransientResourceHeapImpl();
+ SLANG_RETURN_ON_FAIL(result->init(this, desc));
+ returnComPtr(outHeap, result);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue)
+{
+ RefPtr<CommandQueueImpl> queue = new CommandQueueImpl();
+ queue->init(this);
+ returnComPtr(outQueue, queue);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createSwapchain(
+ const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(window);
+ SLANG_UNUSED(outSwapchain);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createFramebufferLayout(
+ const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outLayout);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outFramebuffer);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createRenderPassLayout(
+ const IRenderPassLayout::Desc& desc,
+ IRenderPassLayout** outRenderPassLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outRenderPassLayout);
+ return SLANG_FAIL;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL
+ DeviceImpl::createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler)
+{
+ SLANG_UNUSED(desc);
+ *outSampler = nullptr;
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createInputLayout(
+ IInputLayout::Desc const& desc,
+ IInputLayout** outLayout)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outLayout);
+ return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::createGraphicsPipelineState(
+ const GraphicsPipelineStateDesc& desc, IPipelineState** outState)
+{
+ SLANG_UNUSED(desc);
+ SLANG_UNUSED(outState);
+ return SLANG_E_NOT_AVAILABLE;
+}
+
+SLANG_NO_THROW SlangResult SLANG_MCALL DeviceImpl::readTextureResource(
+ ITextureResource* texture,
+ ResourceState state,
+ ISlangBlob** outBlob,
+ size_t* outRowPitch,
+ size_t* outPixelSize)
+{
+ auto textureImpl = static_cast<TextureResourceImpl*>(texture);
+ RefPtr<ListBlob> blob = new ListBlob();
+
+ auto desc = textureImpl->getDesc();
+ auto width = desc->size.width;
+ auto height = desc->size.height;
+ FormatInfo sizeInfo;
+ SLANG_RETURN_ON_FAIL(gfxGetFormatInfo(desc->format, &sizeInfo));
+ size_t pixelSize = sizeInfo.blockSizeInBytes / sizeInfo.pixelsPerBlock;
+ size_t rowPitch = width * pixelSize;
+ size_t size = height * rowPitch;
+ blob->m_data.setCount((Index)size);
+
+ CUDA_MEMCPY2D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+
+ copyParam.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.srcArray = textureImpl->m_cudaArray;
+
+ copyParam.dstMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.dstHost = blob->m_data.getBuffer();
+ copyParam.dstPitch = rowPitch;
+ copyParam.WidthInBytes = copyParam.dstPitch;
+ copyParam.Height = height;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+
+ *outRowPitch = rowPitch;
+ *outPixelSize = pixelSize;
+ returnComPtr(outBlob, blob);
+ return SLANG_OK;
+}
+
+SLANG_NO_THROW Result SLANG_MCALL DeviceImpl::readBufferResource(
+ IBufferResource* buffer,
+ size_t offset,
+ size_t size,
+ ISlangBlob** outBlob)
+{
+ auto bufferImpl = static_cast<BufferResourceImpl*>(buffer);
+ RefPtr<ListBlob> blob = new ListBlob();
+ blob->m_data.setCount((Index)size);
+ cudaMemcpy(
+ blob->m_data.getBuffer(),
+ (uint8_t*)bufferImpl->m_cudaMemory + offset,
+ size,
+ cudaMemcpyDefault);
+ returnComPtr(outBlob, blob);
+ return SLANG_OK;
+}
+
+} // namespace cuda
+#endif
+} // namespace gfx