#include "cuda-compute-util.h" #include "../../slang-com-helper.h" #include "../../source/core/slang-std-writers.h" #include "../../source/core/slang-token-reader.h" #include "../bind-location.h" #include #include namespace renderer_test { using namespace Slang; SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } #if 0 #define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_FAIL; } #else #define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); return SLANG_FAIL; } } #endif #define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; } class CUDAResource : public CUDAComputeUtil::ResourceBase { public: typedef CUDAComputeUtil::ResourceBase Super; /// Dtor ~CUDAResource() { if (m_cudaMemory) { SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory)); } } static CUDAResource* getCUDAResource(BindSet::Value* value) { return value ? dynamic_cast(value->m_target.Ptr()) : nullptr; } /// Helper function to get the cuda memory pointer when given a value static CUdeviceptr getCUDAData(BindSet::Value* value) { auto resource = getCUDAResource(value); return resource ? resource->m_cudaMemory : CUdeviceptr(); } CUdeviceptr m_cudaMemory = CUdeviceptr(); }; class CUDATextureResource : public CUDAComputeUtil::ResourceBase { public: typedef CUDAComputeUtil::ResourceBase Super; ~CUDATextureResource() { if (m_cudaTexObj) { SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); } if (m_cudaArray) { SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); } if (m_cudaMipMappedArray) { SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray)); } } static CUDATextureResource* getCUDATextureResource(BindSet::Value* value) { return value ? dynamic_cast(value->m_target.Ptr()) : nullptr; } static CUtexObject getCUDATexObject(BindSet::Value* value) { auto resource = getCUDATextureResource(value); // It's an assumption here that 0 is okay for null. Seems to work... return resource ? resource->m_cudaTexObj : CUtexObject(0); } // This is an opaque type, that's backed by a long long CUtexObject m_cudaTexObj = CUtexObject(); CUarray m_cudaArray = CUarray(); CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray(); }; class ScopeCUDAModule { public: operator CUmodule () const { return m_module; } ScopeCUDAModule(): m_module(nullptr) {} SlangResult load(const void* image) { release(); SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&m_module, image)); return SLANG_OK; } void release() { if (m_module) { cuModuleUnload(m_module); m_module = nullptr; } } ~ScopeCUDAModule() { release(); } CUmodule m_module; }; class ScopeCUDAStream { public: SlangResult init(unsigned int flags) { release(); SLANG_ASSERT(m_stream == nullptr); SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags)); return SLANG_OK; } SlangResult sync() { if (m_stream) { SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream)); } else { SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize()); } return SLANG_OK; } void release() { if (m_stream) { sync(); SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream)); m_stream = nullptr; } } ScopeCUDAStream():m_stream(nullptr) {} ~ScopeCUDAStream() { release(); } operator CUstream () const { return m_stream; } CUstream m_stream; }; static int _calcSMCountPerMultiProcessor(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine // the # of cores per SM struct SMInfo { int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version int coreCount; }; static const SMInfo infos[] = { {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, {0x52, 128}, {0x53, 128}, {0x60, 64}, {0x61, 128}, {0x62, 128}, {0x70, 64}, {0x72, 64}, {0x75, 64} }; const int sm = ((major << 4) + minor); for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i) { if (infos[i].sm == sm) { return infos[i].coreCount; } } const auto& last = infos[SLANG_COUNT_OF(infos) - 1]; // It must be newer presumably SLANG_ASSERT(sm > last.coreCount ); // Default to the last entry return last.coreCount; } static SlangResult _findMaxFlopsDeviceId(int* outDevice) { int smPerMultiproc = 0; int maxPerfDevice = -1; int deviceCount = 0; int devicesProhibited = 0; uint64_t maxComputePerf = 0; SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount)); // Find the best CUDA capable GPU device for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice) { int computeMode = -1, major = 0, minor = 0; SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice)); SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice)); SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice)); // If this GPU is not running on Compute Mode prohibited, // then we can add it to the list if (computeMode != cudaComputeModeProhibited) { if (major == 9999 && minor == 9999) { smPerMultiproc = 1; } else { smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor); } int multiProcessorCount = 0, clockRate = 0; SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice)); SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice)); uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate; if (compute_perf > maxComputePerf) { maxComputePerf = compute_perf; maxPerfDevice = currentDevice; } } else { devicesProhibited++; } } if (maxPerfDevice < 0) { return SLANG_FAIL; } *outDevice = maxPerfDevice; return SLANG_OK; } static SlangResult _initCuda() { static CUresult res = cuInit(0); SLANG_CUDA_RETURN_ON_FAIL(res); return SLANG_OK; } class ScopeCUDAContext { public: ScopeCUDAContext() : m_context(nullptr) {} SlangResult init(unsigned int flags, CUdevice device) { SLANG_RETURN_ON_FAIL(_initCuda()); if (m_context) { cuCtxDestroy(m_context); m_context = nullptr; } if (_isError(cuCtxCreate(&m_context, flags, device))) { return SLANG_FAIL; } return SLANG_OK; } SlangResult init(unsigned int flags) { SLANG_RETURN_ON_FAIL(_initCuda()); int deviceId; SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId)); SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId)); if (m_context) { cuCtxDestroy(m_context); m_context = nullptr; } if (_isError(cuCtxCreate(&m_context, flags, deviceId))) { return SLANG_FAIL; } return SLANG_OK; } ~ScopeCUDAContext() { if (m_context) { cuCtxDestroy(m_context); } } SLANG_FORCE_INLINE operator CUcontext () const { return m_context; } CUcontext m_context; }; /* static */bool CUDAComputeUtil::canCreateDevice() { ScopeCUDAContext context; return SLANG_SUCCEEDED(context.init(0)); } /* static */SlangResult CUDAComputeUtil::createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr& outResource) { auto type = typeLayout->getType(); auto shape = type->getResourceShape(); auto access = type->getResourceAccess(); CUresourcetype resourceType = CU_RESOURCE_TYPE_ARRAY; auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK; slang::TypeReflection* typeReflection = typeLayout->getResourceResultType(); const auto& textureDesc = srcEntry.textureDesc; // CUDA wants the unused dimensions to be 0. // Might need to specially handle elsewhere int width = textureDesc.size; int height = 0; int depth = 0; switch (baseShape) { case SLANG_TEXTURE_1D: { break; } case SLANG_TEXTURE_2D: { height = textureDesc.size; break; } case SLANG_TEXTURE_3D: { height = textureDesc.size; depth = textureDesc.size; break; } case SLANG_TEXTURE_CUBE: { height = width; depth = 1; break; } default: { SLANG_ASSERT(!"Type not supported"); return SLANG_FAIL; } } TextureData texData; generateTextureData(texData, textureDesc); auto mipLevels = texData.mipLevels; RefPtr tex = new CUDATextureResource; size_t elementSize = 0; { CUarray_format format = CU_AD_FORMAT_FLOAT; int numChannels = 0; switch (textureDesc.format) { case Format::R_Float32: { format = CU_AD_FORMAT_FLOAT; numChannels = 1; elementSize = sizeof(float); break; } case Format::RGBA_Unorm_UInt8: { format = CU_AD_FORMAT_UNSIGNED_INT8; numChannels = 4; elementSize = sizeof(uint32_t); break; } default: { SLANG_ASSERT(!"Only support R_Float32/RGBA_Unorm_UInt8 formats for now"); return SLANG_FAIL; } } if (mipLevels > 1) { resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; CUDA_ARRAY3D_DESCRIPTOR arrayDesc; memset(&arrayDesc, 0, sizeof(arrayDesc)); arrayDesc.Width = width; arrayDesc.Height = height; arrayDesc.Depth = depth; arrayDesc.Format = format; arrayDesc.NumChannels = numChannels; arrayDesc.Flags = 0; if (textureDesc.arrayLength > 1) { if (baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE) { arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED; arrayDesc.Depth = textureDesc.arrayLength; } else { SLANG_ASSERT(!"Arrays only supported for 1D and 2D"); return SLANG_FAIL; } } if (baseShape == SLANG_TEXTURE_CUBE) { arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; arrayDesc.Depth *= 6; } SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, mipLevels)); } else { resourceType = CU_RESOURCE_TYPE_ARRAY; if (textureDesc.arrayLength > 1) { if (baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE) { SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported"); return SLANG_FAIL; } CUDA_ARRAY3D_DESCRIPTOR arrayDesc; memset(&arrayDesc, 0, sizeof(arrayDesc)); // Set the depth as the array length arrayDesc.Depth = textureDesc.arrayLength; if (baseShape == SLANG_TEXTURE_CUBE) { arrayDesc.Depth *= 6; } arrayDesc.Height = height; arrayDesc.Width = width; arrayDesc.Format = format; arrayDesc.NumChannels = numChannels; SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); } else if (baseShape == SLANG_TEXTURE_3D || baseShape == SLANG_TEXTURE_CUBE) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; memset(&arrayDesc, 0, sizeof(arrayDesc)); arrayDesc.Depth = depth; arrayDesc.Height = height; arrayDesc.Width = width; arrayDesc.Format = format; arrayDesc.NumChannels = numChannels; arrayDesc.Flags = 0; // Handle cube texture if (baseShape == SLANG_TEXTURE_CUBE) { arrayDesc.Depth = 6; arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP; } SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc)); } else { CUDA_ARRAY_DESCRIPTOR arrayDesc; memset(&arrayDesc, 0, sizeof(arrayDesc)); arrayDesc.Width = width; arrayDesc.Height = height; arrayDesc.Format = format; arrayDesc.NumChannels = numChannels; // Allocate the array, will work for 1D or 2D case SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); } } } // Work space for holding data for uploading if it needs to be rearranged List workspace; for (int mipLevel = 0; mipLevel < mipLevels; ++mipLevel) { int mipWidth = width >> mipLevel; int mipHeight = height >> mipLevel; int mipDepth = depth >> mipLevel; mipWidth = (mipWidth == 0) ? 1 : mipWidth; mipHeight = (mipHeight == 0) ? 1 : mipHeight; mipDepth = (mipDepth == 0) ? 1 : mipDepth; // If it's a cubemap then the depth is always 6 if (baseShape == SLANG_TEXTURE_CUBE) { mipDepth = 6; } auto dstArray = tex->m_cudaArray; if (tex->m_cudaMipMappedArray) { // Get the array for the mip level SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel)); } SLANG_ASSERT(dstArray); // Check using the desc to see if it's plausible { CUDA_ARRAY_DESCRIPTOR arrayDesc; SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray)); SLANG_ASSERT(mipWidth == arrayDesc.Width); SLANG_ASSERT(mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0)); } const void* srcDataPtr = nullptr; if (textureDesc.arrayLength > 1) { SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE); // TODO(JS): Here I assume that arrays are just held contiguously within a 'face' // This seems reasonable and works with the Copy3D. const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; Index faceCount = textureDesc.arrayLength; if (baseShape == SLANG_TEXTURE_CUBE) { faceCount *= 6; } const size_t mipSizeInBytes = faceSizeInBytes * faceCount; workspace.setCount(mipSizeInBytes); // We need to add the face data from each mip // We iterate over face count so we copy all of the cubemap faces for (Index j = 0; j < faceCount; j++) { const auto& srcData = texData.dataBuffer[mipLevel + j * mipLevels]; // Copy over to the workspace to make contiguous ::memcpy(workspace.begin() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes); } srcDataPtr = workspace.getBuffer(); } else { if (baseShape == SLANG_TEXTURE_CUBE) { size_t faceSizeInBytes = elementSize * mipWidth * mipHeight; workspace.setCount(faceSizeInBytes * 6); // Copy the data over to make contiguous for (Index j = 0; j < 6; j++) { const auto& srcData = texData.dataBuffer[mipLevels * j + mipLevel]; SLANG_ASSERT(mipWidth * mipHeight == srcData.getCount()); ::memcpy(workspace.getBuffer() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes); } srcDataPtr = workspace.getBuffer(); } else { const auto& srcData = texData.dataBuffer[mipLevel]; SLANG_ASSERT(mipWidth * mipHeight * mipDepth == srcData.getCount()); srcDataPtr = srcData.getBuffer(); } } if (textureDesc.arrayLength > 1) { SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE); CUDA_MEMCPY3D copyParam; memset(©Param, 0, sizeof(copyParam)); copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; copyParam.dstArray = dstArray; copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; copyParam.srcHost = srcDataPtr; copyParam.srcPitch = mipWidth * elementSize; copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = mipHeight; // Set the depth to the array length copyParam.Depth = textureDesc.arrayLength; if (baseShape == SLANG_TEXTURE_CUBE) { copyParam.Depth *= 6; } SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); } else { switch (baseShape) { case SLANG_TEXTURE_1D: case SLANG_TEXTURE_2D: { CUDA_MEMCPY2D copyParam; memset(©Param, 0, sizeof(copyParam)); copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; copyParam.dstArray = dstArray; copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; copyParam.srcHost = srcDataPtr; copyParam.srcPitch = mipWidth * elementSize; copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = mipHeight; SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); break; } case SLANG_TEXTURE_3D: case SLANG_TEXTURE_CUBE: { CUDA_MEMCPY3D copyParam; memset(©Param, 0, sizeof(copyParam)); copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; copyParam.dstArray = dstArray; copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; copyParam.srcHost = srcDataPtr; copyParam.srcPitch = mipWidth * elementSize; copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = mipHeight; copyParam.Depth = mipDepth; SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(©Param)); break; } default: { SLANG_ASSERT(!"Not implemented"); break; } } } } // Set up texture sampling parameters, and create final texture obj { CUDA_RESOURCE_DESC resDesc; memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); resDesc.resType = resourceType; if (tex->m_cudaArray) { resDesc.res.array.hArray = tex->m_cudaArray; } if (tex->m_cudaMipMappedArray) { resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray; } CUDA_TEXTURE_DESC texDesc; memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); } outResource = tex; return SLANG_OK; } static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], CUDAComputeUtil::Context& outContext) { auto& bindSet = outContext.m_bindSet; auto& bindRoot = outContext.m_bindRoot; auto request = outputAndLayout.output.request; auto reflection = (slang::ShaderReflection*) spGetReflection(request); slang::EntryPointReflection* entryPoint = nullptr; auto entryPointCount = reflection->getEntryPointCount(); SLANG_ASSERT(entryPointCount == 1); entryPoint = reflection->getEntryPointByIndex(0); const char* entryPointName = entryPoint->getName(); // Get the entry point CUfunction kernel; SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName)); // A default stream, will act as a global stream. Calling sync will globally sync ScopeCUDAStream cudaStream; //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking)); { // Okay now we need to set up binding bindRoot.init(&bindSet, reflection, 0); // Will set up any root buffers bindRoot.addDefaultValues(); // Now set up the Values from the test auto outStream = StdWriters::getOut(); SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot)); ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers); // First create all of the resources for the values { const auto& values = bindSet.getValues(); const auto& entries = outputAndLayout.layout.entries; for (BindSet::Value* value : values) { auto typeLayout = value->m_type; // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer; switch (kind) { case slang::TypeReflection::Kind::ConstantBuffer: case slang::TypeReflection::Kind::ParameterBlock: { // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first RefPtr resource = new CUDAResource; SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes)); value->m_target = resource; break; } case slang::TypeReflection::Kind::Resource: { auto type = typeLayout->getType(); auto shape = type->getResourceShape(); auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK; switch (baseShape) { case SLANG_TEXTURE_1D: case SLANG_TEXTURE_2D: case SLANG_TEXTURE_3D: case SLANG_TEXTURE_CUBE: { RefPtr resource; SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource)); value->m_target = resource; break; } case SLANG_TEXTURE_BUFFER: { // Need a CUDA impl for these... // For now we can just leave as target will just be nullptr break; } case SLANG_BYTE_ADDRESS_BUFFER: case SLANG_STRUCTURED_BUFFER: { // On CPU we just use the memory in the BindSet buffer, so don't need to create anything RefPtr resource = new CUDAResource; SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes)); value->m_target = resource; break; } } } default: break; } } } // Now we need to go through all of the bindings and set the appropriate data { List locations; List values; bindSet.getBindings(locations, values); for (Index i = 0; i < locations.getCount(); ++i) { const auto& location = locations[i]; BindSet::Value* value = values[i]; // Okay now we need to set up the actual handles that CPU will follow. auto typeLayout = location.getTypeLayout(); const auto kind = typeLayout->getKind(); switch (kind) { case slang::TypeReflection::Kind::Array: { auto elementCount = int(typeLayout->getElementCount()); if (elementCount == 0) { CUDAComputeUtil::Array array = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) { array.data = resource->m_cudaMemory; array.count = value->m_elementCount; } location.setUniform(&array, sizeof(array)); } break; } case slang::TypeReflection::Kind::ConstantBuffer: case slang::TypeReflection::Kind::ParameterBlock: { // These map down to just pointers *location.getUniform() = CUDAResource::getCUDAData(value); break; } case slang::TypeReflection::Kind::Resource: { auto type = typeLayout->getType(); auto shape = type->getResourceShape(); //auto access = type->getResourceAccess(); switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) { case SLANG_STRUCTURED_BUFFER: { CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) { buffer.data = resource->m_cudaMemory; buffer.count = value->m_elementCount; } location.setUniform(&buffer, sizeof(buffer)); break; } case SLANG_BYTE_ADDRESS_BUFFER: { CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) { buffer.data = resource->m_cudaMemory; buffer.sizeInBytes = value->m_sizeInBytes; } location.setUniform(&buffer, sizeof(buffer)); break; } case SLANG_TEXTURE_1D: case SLANG_TEXTURE_2D: case SLANG_TEXTURE_3D: case SLANG_TEXTURE_CUBE: { *location.getUniform() = CUDATextureResource::getCUDATexObject(value); break; } } break; } default: break; } } } // Okay now the memory is all set up, we can copy everything over { const auto& values = bindSet.getValues(); for (BindSet::Value* value : values) { CUdeviceptr cudaMem = CUDAResource::getCUDAData(value); if (value && value->m_data && cudaMem) { // Okay copy the data over... SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes)); } } } // Now we can execute the kernel { // Get the max threads per block for this function int maxTheadsPerBlock; SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel)); int sharedSizeInBytes; SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel)); // Work out the args CUdeviceptr uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue()); CUdeviceptr entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue()); // NOTE! These are pointers to the cuda memory pointers void* args[] = { &entryPointCUDAData , &uniformCUDAData }; SlangUInt numThreadsPerAxis[3]; entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); // Launch auto cudaLaunchResult = cuLaunchKernel(kernel, dispatchSize[0], dispatchSize[1], dispatchSize[2], int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block 0, // Shared memory size cudaStream, // Stream. 0 is no stream. args, // Args nullptr); // extra SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult); // Do a sync here. Makes sure any issues are detected early and not on some implicit sync SLANG_RETURN_ON_FAIL(cudaStream.sync()); } // Finally we need to copy the data back { const auto& entries = outputAndLayout.layout.entries; for (Index i = 0; i < entries.getCount(); ++i) { const auto& entry = entries[i]; BindSet::Value* value = outContext.m_buffers[i]; if (entry.isOutput) { // Copy back to CPU memory CUdeviceptr cudaMem = CUDAResource::getCUDAData(value); if (value && value->m_data && cudaMem) { // Okay copy the data back... SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes)); } } } } } // Release all othe CUDA resource/allocations bindSet.releaseValueTargets(); return SLANG_OK; } /* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext) { ScopeCUDAContext cudaContext; SLANG_RETURN_ON_FAIL(cudaContext.init(0)); const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute); if (index < 0) { return SLANG_FAIL; } const auto& kernel = outputAndLayout.output.kernelDescs[index]; ScopeCUDAModule cudaModule; SLANG_RETURN_ON_FAIL(cudaModule.load(kernel.codeBegin)); SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, dispatchSize, outContext)); return SLANG_OK; } } // renderer_test