summaryrefslogtreecommitdiffstats
path: root/tools/render-test/cuda/cuda-compute-util.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tools/render-test/cuda/cuda-compute-util.cpp')
-rw-r--r--tools/render-test/cuda/cuda-compute-util.cpp1872
1 files changed, 0 insertions, 1872 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
deleted file mode 100644
index bd77919a2..000000000
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ /dev/null
@@ -1,1872 +0,0 @@
-
-#include "cuda-compute-util.h"
-
-#include "slang-com-helper.h"
-
-#include "source/core/slang-std-writers.h"
-#include "source/core/slang-token-reader.h"
-#include "source/core/slang-semantic-version.h"
-
-#include "../bind-location.h"
-
-#include <cuda.h>
-
-#include <cuda_runtime_api.h>
-
-// TODO: should conditionalize this on OptiX being present
-#ifdef RENDER_TEST_OPTIX
-
-// The `optix_stubs.h` header produces warnings when compiled with MSVC
-#ifdef _MSC_VER
-#pragma warning(disable: 4996)
-#endif
-#include <optix.h>
-#include <optix_function_table_definition.h>
-#include <optix_stubs.h>
-#endif
-
-namespace renderer_test {
-using namespace Slang;
-
-SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
-SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
-
-// A enum used to control if errors are reported on failure of CUDA call.
-enum class CUDAReportStyle
-{
- Normal,
- Silent,
-};
-
-struct CUDAErrorInfo
-{
- CUDAErrorInfo(const char* filePath, int lineNo, const char* errorName = nullptr, const char* errorString = nullptr):
- m_filePath(filePath),
- m_lineNo(lineNo),
- m_errorName(errorName),
- m_errorString(errorString)
- {
- }
- SlangResult handle() const
- {
- StringBuilder builder;
- builder << "Error: " << m_filePath << " (" << m_lineNo << ") :";
-
- if (m_errorName)
- {
- builder << m_errorName << " : ";
- }
- if (m_errorString)
- {
- builder << m_errorString;
- }
-
- StdWriters::getError().put(builder.getUnownedSlice());
-
- //Slang::signalUnexpectedError(builder.getBuffer());
- return SLANG_FAIL;
- }
-
- const char* m_filePath;
- int m_lineNo;
- const char* m_errorName;
- const char* m_errorString;
-};
-
-#if 1
-// If this code path is enabled, CUDA errors will be reported directly to StdWriter::out stream.
-
-static SlangResult _handleCUDAError(CUresult cuResult, const char* file, int line)
-{
- CUDAErrorInfo info(file, line);
- cuGetErrorString(cuResult, &info.m_errorString);
- cuGetErrorName(cuResult, &info.m_errorName);
- return info.handle();
-}
-
-static SlangResult _handleCUDAError(cudaError_t error, const char* file, int line)
-{
- return CUDAErrorInfo(file, line, cudaGetErrorName(error), cudaGetErrorString(error)).handle();
-}
-
-#define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res, __FILE__, __LINE__)
-
-#else
-// If this code path is enabled, errors are not reported, but can have an assert enabled
-
-static SlangResult _handleCUDAError(CUresult cuResult)
-{
- SLANG_UNUSED(cuResult);
- //SLANG_ASSERT(!"Failed CUDA call");
- return SLANG_FAIL;
-}
-
-static SlangResult _handleCUDAError(cudaError_t error)
-{
- SLANG_UNUSED(error);
- //SLANG_ASSERT(!"Failed CUDA call");
- return SLANG_FAIL;
-}
-
-#define SLANG_CUDA_HANDLE_ERROR(x) _handleCUDAError(_res)
-#endif
-
-#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_CUDA_HANDLE_ERROR(_res); }
-#define SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(x, r) \
- { \
- auto _res = x; \
- if (_isError(_res)) \
- { \
- return (r == CUDAReportStyle::Normal) ? SLANG_CUDA_HANDLE_ERROR(_res) : SLANG_FAIL; \
- } \
- } \
-
-#define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; }
-
-#ifdef RENDER_TEST_OPTIX
-
-static bool _isError(OptixResult result) { return result != OPTIX_SUCCESS; }
-
-#if 1
-static SlangResult _handleOptixError(OptixResult result, char const* file, int line)
-{
- fprintf(stderr, "%s(%d): optix: %s (%s)\n",
- file,
- line,
- optixGetErrorString(result),
- optixGetErrorName(result));
- return SLANG_FAIL;
-}
-#define SLANG_OPTIX_HANDLE_ERROR(RESULT) _handleOptixError(RESULT, __FILE__, __LINE__)
-#else
-#define SLANG_OPTIX_HANDLE_ERROR(RESULT) SLANG_FAIL
-#endif
-
-#define SLANG_OPTIX_RETURN_ON_FAIL(EXPR) do { auto _res = EXPR; if(_isError(_res)) return SLANG_OPTIX_HANDLE_ERROR(_res); } while(0)
-
-void _optixLogCallback(unsigned int level, const char* tag, const char* message, void* userData)
-{
- fprintf(stderr, "optix: %s (%s)\n",
- message,
- tag);
-}
-
-#endif
-
-class MemoryCUDAResource : public CUDAResource
-{
-public:
- typedef CUDAResource Super;
-
- /// Dtor
- ~MemoryCUDAResource()
- {
- if (m_cudaMemory)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
- }
- }
-
- static MemoryCUDAResource* asResource(BindSet::Value* value)
- {
- return value ? dynamic_cast<MemoryCUDAResource*>(value->m_target.Ptr()) : nullptr;
- }
- /// Helper function to get the CUDA memory pointer when given a value
- static CUdeviceptr getCUDAData(BindSet::Value* value)
- {
- auto resource = asResource(value);
- return resource ? resource->m_cudaMemory : CUdeviceptr();
- }
-
- virtual uint64_t getBindlessHandle() override
- {
- return (uint64_t)m_cudaMemory;
- }
-
- CUdeviceptr m_cudaMemory = CUdeviceptr();
-};
-
-class TextureCUDAResource : public CUDAResource
-{
-public:
- typedef CUDAResource Super;
-
- ~TextureCUDAResource()
- {
- if (m_cudaSurfObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuSurfObjectDestroy(m_cudaSurfObj));
- }
- if (m_cudaTexObj)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
- }
- if (m_cudaArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
- }
- if (m_cudaMipMappedArray)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuMipmappedArrayDestroy(m_cudaMipMappedArray));
- }
- }
-
- static TextureCUDAResource* asResource(BindSet::Value* value)
- {
- return value ? dynamic_cast<TextureCUDAResource*>(value->m_target.Ptr()) : nullptr;
- }
-
- static CUtexObject getTexObject(BindSet::Value* value)
- {
- auto resource = asResource(value);
- // It's an assumption here that 0 is okay for null. Seems to work...
- return resource ? resource->m_cudaTexObj : CUtexObject(0);
- }
-
- static CUsurfObject getSurfObject(BindSet::Value* value)
- {
- auto resource = asResource(value);
- return resource ? resource->m_cudaSurfObj : CUsurfObject(0);
- }
-
- virtual uint64_t getBindlessHandle() override
- {
- return (uint64_t)m_cudaTexObj;
- }
-
- // The texObject is for reading 'texture' like things. This is an opaque type, that's backed by a long long
- CUtexObject m_cudaTexObj = CUtexObject();
-
- // The surfObj is for reading/writing 'texture like' things, but not for sampling.
- CUsurfObject m_cudaSurfObj = CUsurfObject();
-
- CUarray m_cudaArray = CUarray();
- CUmipmappedArray m_cudaMipMappedArray = CUmipmappedArray();
-};
-
-class ScopeCUDAModule
-{
-public:
-
- operator CUmodule () const { return m_module; }
-
- ScopeCUDAModule(): m_module(nullptr) {}
- SlangResult load(const void* image)
- {
- release();
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&m_module, image));
- return SLANG_OK;
- }
- void release()
- {
- if (m_module)
- {
- cuModuleUnload(m_module);
- m_module = nullptr;
- }
- }
-
- ~ScopeCUDAModule() { release(); }
-
- CUmodule m_module;
-};
-
-class ScopeCUDAStream
-{
-public:
-
- SlangResult init(unsigned int flags)
- {
- release();
- SLANG_ASSERT(m_stream == nullptr);
- SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags));
- return SLANG_OK;
- }
-
- SlangResult sync()
- {
- if (m_stream)
- {
- SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream));
- }
- else
- {
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize());
- }
- return SLANG_OK;
- }
-
- void release()
- {
- if (m_stream)
- {
- sync();
- SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream));
- m_stream = nullptr;
- }
- }
-
- ScopeCUDAStream():m_stream(nullptr) {}
-
- ~ScopeCUDAStream() { release(); }
-
- operator CUstream () const { return m_stream; }
-
- CUstream m_stream;
-};
-
-static int _calcSMCountPerMultiProcessor(int major, int minor)
-{
- // Defines for GPU Architecture types (using the SM version to determine
- // the # of cores per SM
- struct SMInfo
- {
- int sm; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
- int coreCount;
- };
-
- static const SMInfo infos[] =
- {
- {0x30, 192},
- {0x32, 192},
- {0x35, 192},
- {0x37, 192},
- {0x50, 128},
- {0x52, 128},
- {0x53, 128},
- {0x60, 64},
- {0x61, 128},
- {0x62, 128},
- {0x70, 64},
- {0x72, 64},
- {0x75, 64}
- };
-
- const int sm = ((major << 4) + minor);
- for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
- {
- if (infos[i].sm == sm)
- {
- return infos[i].coreCount;
- }
- }
-
- const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
-
- // It must be newer presumably
- SLANG_ASSERT(sm > last.sm);
-
- // Default to the last entry
- return last.coreCount;
-}
-
-static SlangResult _findMaxFlopsDeviceIndex(int* outDeviceIndex)
-{
- int smPerMultiproc = 0;
- int maxPerfDevice = -1;
- int deviceCount = 0;
- int devicesProhibited = 0;
-
- uint64_t maxComputePerf = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
-
- // Find the best CUDA capable GPU device
- for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
- {
- int computeMode = -1, major = 0, minor = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
-
- // If this GPU is not running on Compute Mode prohibited,
- // then we can add it to the list
- if (computeMode != cudaComputeModeProhibited)
- {
- if (major == 9999 && minor == 9999)
- {
- smPerMultiproc = 1;
- }
- else
- {
- smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
- }
-
- int multiProcessorCount = 0, clockRate = 0;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
- uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
-
- if (compute_perf > maxComputePerf)
- {
- maxComputePerf = compute_perf;
- maxPerfDevice = currentDevice;
- }
- }
- else
- {
- devicesProhibited++;
- }
- }
-
- if (maxPerfDevice < 0)
- {
- return SLANG_FAIL;
- }
-
- *outDeviceIndex = maxPerfDevice;
- return SLANG_OK;
-}
-
-static SlangResult _initCuda(CUDAReportStyle reportType = CUDAReportStyle::Normal)
-{
- static CUresult res = cuInit(0);
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(res, reportType);
- return SLANG_OK;
-}
-
-class ScopeCUDAContext
-{
-public:
- ScopeCUDAContext() :
- m_context(nullptr),
- m_device(-1),
- m_deviceIndex(-1)
- {}
-
- SlangResult init(unsigned int flags, int deviceIndex, CUDAReportStyle reportType = CUDAReportStyle::Normal)
- {
- SLANG_RETURN_ON_FAIL(_initCuda(reportType));
-
- if (m_context)
- {
- cuCtxDestroy(m_context);
- m_context = nullptr;
- }
-
- m_deviceIndex = deviceIndex;
- SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, deviceIndex));
-
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, flags, m_device), reportType);
- return SLANG_OK;
- }
-
- SlangResult init(unsigned int flags, CUDAReportStyle reportType = CUDAReportStyle::Normal)
- {
- SLANG_RETURN_ON_FAIL(_initCuda(reportType));
-
- SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceIndex(&m_deviceIndex));
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cudaSetDevice(m_deviceIndex), reportType);
-
- if (m_context)
- {
- cuCtxDestroy(m_context);
- m_context = nullptr;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuDeviceGet(&m_device, m_deviceIndex));
-
- SLANG_CUDA_RETURN_WITH_REPORT_ON_FAIL(cuCtxCreate(&m_context, flags, m_device), reportType);
- return SLANG_OK;
- }
-
- ~ScopeCUDAContext()
- {
- if (m_context)
- {
- cuCtxDestroy(m_context);
- }
- }
- SLANG_FORCE_INLINE operator CUcontext () const { return m_context; }
-
- int m_deviceIndex;
- CUdevice m_device;
- CUcontext m_context;
-};
-
-/* static */SlangResult CUDAComputeUtil::parseFeature(const Slang::UnownedStringSlice& feature, bool& outResult)
-{
- outResult = false;
-
- if (feature.startsWith("cuda_sm_"))
- {
- const UnownedStringSlice versionSlice = UnownedStringSlice(feature.begin() + 8, feature.end());
- SemanticVersion requiredVersion;
- SLANG_RETURN_ON_FAIL(SemanticVersion::parse(versionSlice, '_', requiredVersion));
-
- // Need to get the version from the cuda device
- ScopeCUDAContext context;
- SLANG_RETURN_ON_FAIL(context.init(0, CUDAReportStyle::Silent));
-
- const int deviceIndex = context.m_deviceIndex;
-
- int computeMode = -1;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, deviceIndex));
-
- // If we don't have compute mode availability, we can't execute
- if (computeMode == cudaComputeModeProhibited)
- {
- return SLANG_FAIL;
- }
-
- int major, minor;
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex));
- SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex));
-
- SemanticVersion actualVersion;
- actualVersion.set(major, minor);
-
- outResult = actualVersion >= requiredVersion;
-
- return SLANG_OK;
- }
-
- return SLANG_FAIL;
-}
-
-/* static */bool CUDAComputeUtil::hasFeature(const Slang::UnownedStringSlice& feature)
-{
- bool res;
- return SLANG_SUCCEEDED(parseFeature(feature, res)) ? res : false;
-}
-
-/* static */bool CUDAComputeUtil::canCreateDevice()
-{
- ScopeCUDAContext context;
- return SLANG_SUCCEEDED(context.init(0, CUDAReportStyle::Silent));
-}
-
-static bool _hasReadAccess(SlangResourceAccess access)
-{
- return access = SLANG_RESOURCE_ACCESS_READ || access == SLANG_RESOURCE_ACCESS_READ_WRITE;
-}
-
-static bool _hasWriteAccess(SlangResourceAccess access)
-{
- return access == SLANG_RESOURCE_ACCESS_READ_WRITE;
-}
-
-/* static */SlangResult CUDAComputeUtil::createTextureResource(const ShaderInputLayoutEntry& srcEntry, slang::TypeLayoutReflection* typeLayout, RefPtr<CUDAResource>& outResource)
-{
- SlangResourceAccess access = SLANG_RESOURCE_ACCESS_READ;
- SlangResourceShape baseShape = SLANG_TEXTURE_2D;
- if (typeLayout)
- {
- auto type = typeLayout->getType();
- auto shape = type->getResourceShape();
- access = type->getResourceAccess();
-
- if (!(access == SLANG_RESOURCE_ACCESS_READ || access == SLANG_RESOURCE_ACCESS_READ_WRITE))
- {
- SLANG_ASSERT(!"Only read or read write currently supported");
- return SLANG_FAIL;
- }
- baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
- }
- else
- {
- if (srcEntry.textureDesc.isCube)
- {
- baseShape = SLANG_TEXTURE_CUBE;
- }
- else
- {
- switch (srcEntry.textureDesc.dimension)
- {
- case 1:
- baseShape = SLANG_TEXTURE_1D;
- break;
- case 2:
- baseShape = SLANG_TEXTURE_2D;
- break;
- case 3:
- baseShape = SLANG_TEXTURE_3D;
- break;
- default:
- break;
- }
- }
- if (srcEntry.textureDesc.isRWTexture)
- access = SLANG_RESOURCE_ACCESS_READ_WRITE;
- }
- CUresourcetype resourceType = CU_RESOURCE_TYPE_ARRAY;
-
- InputTextureDesc textureDesc = srcEntry.textureDesc;
-
- if (_hasWriteAccess(access))
- {
- textureDesc.mipMapCount = 1;
- }
-
- // CUDA wants the unused dimensions to be 0.
- // Might need to specially handle elsewhere
- int width = textureDesc.size;
- int height = 0;
- int depth = 0;
-
- switch (baseShape)
- {
- case SLANG_TEXTURE_1D:
- {
- break;
- }
- case SLANG_TEXTURE_2D:
- {
- height = textureDesc.size;
- break;
- }
- case SLANG_TEXTURE_3D:
- {
- height = textureDesc.size;
- depth = textureDesc.size;
- break;
- }
- case SLANG_TEXTURE_CUBE:
- {
- height = width;
- depth = 1;
- break;
- }
- default:
- {
- SLANG_ASSERT(!"Type not supported");
- return SLANG_FAIL;
- }
- }
-
- TextureData texData;
- generateTextureData(texData, textureDesc);
-
- auto mipLevels = texData.mipLevels;
-
- RefPtr<TextureCUDAResource> tex = new TextureCUDAResource;
-
- size_t elementSize = 0;
-
- {
- CUarray_format format = CU_AD_FORMAT_FLOAT;
- int numChannels = 0;
-
- switch (textureDesc.format)
- {
- case Format::R_Float32:
- {
- format = CU_AD_FORMAT_FLOAT;
- numChannels = 1;
- elementSize = sizeof(float);
- break;
- }
- case Format::RGBA_Unorm_UInt8:
- {
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- numChannels = 4;
- elementSize = sizeof(uint32_t);
- break;
- }
- default:
- {
- SLANG_ASSERT(!"Only support R_Float32/RGBA_Unorm_UInt8 formats for now");
- return SLANG_FAIL;
- }
- }
-
- if (mipLevels > 1)
- {
- resourceType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Width = width;
- arrayDesc.Height = height;
- arrayDesc.Depth = depth;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
- arrayDesc.Flags = 0;
-
- if (textureDesc.arrayLength > 1)
- {
- if (baseShape == SLANG_TEXTURE_1D ||
- baseShape == SLANG_TEXTURE_2D ||
- baseShape == SLANG_TEXTURE_CUBE)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_LAYERED;
- arrayDesc.Depth = textureDesc.arrayLength;
- }
- else
- {
- SLANG_ASSERT(!"Arrays only supported for 1D and 2D");
- return SLANG_FAIL;
- }
- }
-
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- arrayDesc.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayCreate(&tex->m_cudaMipMappedArray, &arrayDesc, mipLevels));
- }
- else
- {
- resourceType = CU_RESOURCE_TYPE_ARRAY;
-
- if (textureDesc.arrayLength > 1)
- {
- if (baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE)
- {
- SLANG_ASSERT(!"Only 1D, 2D and Cube arrays supported");
- return SLANG_FAIL;
- }
-
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- // Set the depth as the array length
- arrayDesc.Depth = textureDesc.arrayLength;
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- arrayDesc.Depth *= 6;
- }
-
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else if (baseShape == SLANG_TEXTURE_3D || baseShape == SLANG_TEXTURE_CUBE)
- {
- CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Depth = depth;
- arrayDesc.Height = height;
- arrayDesc.Width = width;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- arrayDesc.Flags = 0;
-
- // Handle cube texture
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- arrayDesc.Depth = 6;
- arrayDesc.Flags |= CUDA_ARRAY3D_CUBEMAP;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuArray3DCreate(&tex->m_cudaArray, &arrayDesc));
- }
- else
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- memset(&arrayDesc, 0, sizeof(arrayDesc));
-
- arrayDesc.Width = width;
- arrayDesc.Height = height;
- arrayDesc.Format = format;
- arrayDesc.NumChannels = numChannels;
-
- // Allocate the array, will work for 1D or 2D case
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
- }
- }
- }
-
- // Work space for holding data for uploading if it needs to be rearranged
- List<uint8_t> workspace;
-
- for (int mipLevel = 0; mipLevel < mipLevels; ++mipLevel)
- {
- int mipWidth = width >> mipLevel;
- int mipHeight = height >> mipLevel;
- int mipDepth = depth >> mipLevel;
-
- mipWidth = (mipWidth == 0) ? 1 : mipWidth;
- mipHeight = (mipHeight == 0) ? 1 : mipHeight;
- mipDepth = (mipDepth == 0) ? 1 : mipDepth;
-
- // If it's a cubemap then the depth is always 6
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- mipDepth = 6;
- }
-
- auto dstArray = tex->m_cudaArray;
- if (tex->m_cudaMipMappedArray)
- {
- // Get the array for the mip level
- SLANG_CUDA_RETURN_ON_FAIL(cuMipmappedArrayGetLevel(&dstArray, tex->m_cudaMipMappedArray, mipLevel));
- }
- SLANG_ASSERT(dstArray);
-
- // Check using the desc to see if it's plausible
- {
- CUDA_ARRAY_DESCRIPTOR arrayDesc;
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayGetDescriptor(&arrayDesc, dstArray));
-
- SLANG_ASSERT(mipWidth == arrayDesc.Width);
- SLANG_ASSERT(mipHeight == arrayDesc.Height || (mipHeight == 1 && arrayDesc.Height == 0));
- }
-
- const void* srcDataPtr = nullptr;
-
- if (textureDesc.arrayLength > 1)
- {
- SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE);
-
- // TODO(JS): Here I assume that arrays are just held contiguously within a 'face'
- // This seems reasonable and works with the Copy3D.
- const size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- Index faceCount = textureDesc.arrayLength;
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- faceCount *= 6;
- }
-
- const size_t mipSizeInBytes = faceSizeInBytes * faceCount;
- workspace.setCount(mipSizeInBytes);
-
- // We need to add the face data from each mip
- // We iterate over face count so we copy all of the cubemap faces
- for (Index j = 0; j < faceCount; j++)
- {
- const auto& srcData = texData.dataBuffer[mipLevel + j * mipLevels];
- // Copy over to the workspace to make contiguous
- ::memcpy(workspace.begin() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes);
- }
-
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- size_t faceSizeInBytes = elementSize * mipWidth * mipHeight;
-
- workspace.setCount(faceSizeInBytes * 6);
-
- // Copy the data over to make contiguous
- for (Index j = 0; j < 6; j++)
- {
- const auto& srcData = texData.dataBuffer[mipLevels * j + mipLevel];
- SLANG_ASSERT(mipWidth * mipHeight == srcData.getCount());
-
- ::memcpy(workspace.getBuffer() + faceSizeInBytes * j, srcData.getBuffer(), faceSizeInBytes);
- }
-
- srcDataPtr = workspace.getBuffer();
- }
- else
- {
- const auto& srcData = texData.dataBuffer[mipLevel];
- SLANG_ASSERT(mipWidth * mipHeight * mipDepth == srcData.getCount());
-
- srcDataPtr = srcData.getBuffer();
- }
- }
-
- if (textureDesc.arrayLength > 1)
- {
- SLANG_ASSERT(baseShape == SLANG_TEXTURE_1D || baseShape == SLANG_TEXTURE_2D || baseShape == SLANG_TEXTURE_CUBE);
-
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- // Set the depth to the array length
- copyParam.Depth = textureDesc.arrayLength;
-
- if (baseShape == SLANG_TEXTURE_CUBE)
- {
- copyParam.Depth *= 6;
- }
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- }
- else
- {
- switch (baseShape)
- {
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_2D:
- {
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
- break;
- }
- case SLANG_TEXTURE_3D:
- case SLANG_TEXTURE_CUBE:
- {
- CUDA_MEMCPY3D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
-
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = dstArray;
-
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = srcDataPtr;
- copyParam.srcPitch = mipWidth * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = mipHeight;
- copyParam.Depth = mipDepth;
-
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy3D(&copyParam));
- break;
- }
-
- default:
- {
- SLANG_ASSERT(!"Not implemented");
- break;
- }
- }
- }
- }
-
- // Set up texture sampling parameters, and create final texture obj
-
- {
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
- resDesc.resType = resourceType;
-
- if (tex->m_cudaArray)
- {
- resDesc.res.array.hArray = tex->m_cudaArray;
- }
- if (tex->m_cudaMipMappedArray)
- {
- resDesc.res.mipmap.hMipmappedArray = tex->m_cudaMipMappedArray;
- }
-
- if (_hasWriteAccess(access))
- {
- // If has write access it's effectively UAV, and so doesn't have sampling available
- SLANG_CUDA_RETURN_ON_FAIL(cuSurfObjectCreate(&tex->m_cudaSurfObj, &resDesc));
- }
- else
- {
- // If read only it's a SRV and can sample, but cannot write
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
- texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
- texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
- }
-
- }
-
- outResource = tex;
- return SLANG_OK;
-}
-
- /// Load kernel code and invoke a compute program
- ///
- /// Assumes that data for binding the kernel parameters is already
- /// set up in `outContext.`
- ///
-static SlangResult _invokeComputeProgram(
- CUcontext cudaContext,
- ScopeCUDAStream& cudaStream,
- ScopeCUDAModule& cudaModule,
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- const uint32_t dispatchSize[3],
- CUDAComputeUtil::Context& outContext)
-{
- auto reflection = slang::ProgramLayout::get(outputAndLayout.output.getRequestForReflection());
-
- auto& bindSet = outContext.m_bindSet;
- auto& bindRoot = outContext.m_bindRoot;
-
- // The global-scope shader parameters in the input Slang program
- // will be collected into a single `__constant__` global variable
- // in the output CUDA module.
- //
- // We need to query the address of the `__constant__` variable
- // so that we can copy parameter data into it when invoking
- // a kernel.
- //
- // The Slang compiler always names this symbol `SLANG_globalParams`
- // so that it is easy to look up independent of the module or
- // entry point in question.
- //
- CUdeviceptr globalParamsSymbol = 0;
- size_t globalParamsSymbolSize = 0;
- cuModuleGetGlobal(&globalParamsSymbol, &globalParamsSymbolSize, cudaModule, "SLANG_globalParams");
-
- slang::EntryPointReflection* entryPoint = nullptr;
- auto entryPointCount = reflection->getEntryPointCount();
- SLANG_ASSERT(entryPointCount == 1);
-
- entryPoint = reflection->getEntryPointByIndex(0);
-
- const char* entryPointName = entryPoint->getName();
-
- // Get the entry point
- CUfunction cudaEntryPoint;
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&cudaEntryPoint, cudaModule, entryPointName));
-
- // Get the max threads per block for this function
-
- int maxTheadsPerBlock;
- SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cudaEntryPoint));
-
- int sharedSizeInBytes;
- SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));
-
- // A single CUDA kernel can be invoked with thread groups
- // of different shapes/sizes, but an HLSL/Slang compute
- // kernel always has a fixed thread group shape baked in.
- // We use reflection to query the thread-group size that
- // the kernel expects, so that we can use the right size
- // when invoking the kernel.
- //
- SlangUInt numThreadsPerAxis[3];
- entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
-
- // The argument data for the kernel has been set up in `bindRoot`,
- // which encapsulates global buffers for both the global and
- // entry-point parameter data.
- //
- // In the case of global parameters, we just need to extract the
- // device address of the parameter data, so we can copy it into
- // the `SLANG_globalParams` symbol.
- //
- {
- CUdeviceptr globalParamsCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
- cudaMemcpyAsync(
- (void*) globalParamsSymbol,
- (void*) globalParamsCUDAData,
- globalParamsSymbolSize,
- cudaMemcpyDeviceToDevice,
- cudaStream);
- }
- //
- // In the case of the entry-point parameters, we have to deal with
- // two different wrinkles.
- //
- // First, the `bindRoot` will have the entry-point argument data
- // stored in a GPU-memory buffer, but we actually need it to be
- // in host CPU memory. We handle that for now by allocating a
- // temporary host memory buffer (if needed) and copying the data
- // from device to host.
- //
- auto entryPointBindValue = bindRoot.getEntryPointValue();
- CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(entryPointBindValue);
- size_t entryPointDataSize = entryPointBindValue ? entryPointBindValue->m_sizeInBytes : 0;
- void* entryPointHostData = nullptr;
- if(entryPointDataSize)
- {
- entryPointHostData = alloca(entryPointDataSize);
- cudaMemcpy(entryPointHostData, (void*)entryPointCUDAData, entryPointDataSize, cudaMemcpyDeviceToHost);
- }
- //
- // Second, the argument data for the entry-point parameters has
- // been allocated and filled in as a single buffer, but `cuLaunchKernel`
- // defaults to taking pointers to each of the kernel arguments.
- //
- // We could loop over the entry-point parameters using the refleciton
- // information, and set up a pointer to each using the offset stored
- // for it in the reflection data. Such an approach would require
- // us to create and fill in a dynamically-sized array here.
- //
- // Instead, we take advantage of a documented but seldom-used feature
- // of `cuLaunchKernel` that allows the argument data for all of the
- // kernel "launch parameters" to be specified as a single buffer.
- //
- void* extraOptions[] = {
- CU_LAUNCH_PARAM_BUFFER_POINTER, (void*) entryPointHostData,
- CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize,
- CU_LAUNCH_PARAM_END,
- };
-
- // Once we have all the decessary data extracted and/or
- // set up, we can launch the kernel and see what happens.
- //
- auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
- dispatchSize[0], dispatchSize[1], dispatchSize[2],
- int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
- 0, // Shared memory size
- cudaStream, // Stream. 0 is no stream.
- nullptr, // Not using traditional argument passing
- extraOptions); // Instead passing kernel arguments via "extra" options
- SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
-
- // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
- SLANG_RETURN_ON_FAIL(cudaStream.sync());
-
- return SLANG_OK;
-}
-
-#ifdef RENDER_TEST_OPTIX
- /// Load kernel code and invoke a ray-tracing program
- ///
- /// Assumes that data for binding the kernel parameters is already
- /// set up in `outContext.`
- ///
- /// Currently only works for programs that have a single
- /// ray generation shader and no other entry points.
- ///
-static SlangResult _loadAndInvokeRayTracingProgram(
- CUcontext cudaContext,
- ScopeCUDAStream& cudaStream,
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- const uint32_t dispatchSize[3],
- CUDAComputeUtil::Context& outContext)
-{
- SLANG_OPTIX_RETURN_ON_FAIL(optixInit());
-
- OptixDeviceContextOptions optixOptions = {};
-
-#if _DEBUG
- optixOptions.logCallbackFunction = &_optixLogCallback;
- optixOptions.logCallbackLevel = 4;
-#endif
-
- OptixDeviceContext optixContext = nullptr;
- SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));
-
- enum { kOptixLogSize = 2*1024 };
- char log[kOptixLogSize];
- size_t logSize = sizeof(log);
-
- OptixPipelineCompileOptions optixPipelineCompileOptions = {};
- optixPipelineCompileOptions.pipelineLaunchParamsVariableName = "SLANG_globalParams";
-
- // We need to load modules from the PTX code available to us,
- // and then also create program groups from the kernels
- // in those modules.
- //
- // For now we will only support program groups with a single
- // kernel in them, and will create one per entry point.
- //
- Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
- List<OptixProgramGroup> optixProgramGroups;
- List<String> names;
-
- OptixShaderBindingTable optixSBT = {};
-
- for( Index ee = 0; ee < entryPointCount; ++ee )
- {
- auto& kernel = outputAndLayout.output.kernelDescs[ee];
-
- // TODO: The logic here assumes that each kernel will
- // come from its own independent module, and has no
- // provisiion for loading modules that might contain
- // multiple entry points.
- //
- OptixModuleCompileOptions optixModuleCompileOptions = {};
- OptixModule optixModule;
- SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
- optixContext,
- &optixModuleCompileOptions,
- &optixPipelineCompileOptions,
- (char const*) kernel.codeBegin,
- kernel.getCodeSize(),
- log,
- &logSize,
- &optixModule));
-
- // TODO: The logic here only handles ray-generation entry points.
- //
- // It would seem simple to extend this to handle other entry
- // point types, by inspecting the stage of the entry points
- // being loaded, and this is indeed true for the subset of
- // stages that map one-to-one with OptiX "program groups."
- //
- // The sticking point is "hit groups" which require a collection
- // of entry points to be specified together (insersection,
- // any hit, and closest hit). A hit group can comprise between
- // zero and three entry points.
- //
- // The catch for us is how to determine which entry points
- // should be grouped to form hit groups. Should this be
- // implied in the input code (either by naming convention
- // or by new Slang language features)? Should this be set
- // up via command-line arguments or something akin to
- // `//TEST_INPUT` lines?
-
- OptixProgramGroupOptions optixProgramGroupOptions = {};
-
- OptixProgramGroupDesc optixProgramGroupDesc = {};
- optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- optixProgramGroupDesc.raygen.module = optixModule;
-
- String name = String("__raygen__") + kernel.entryPointName;
- names.add(name);
- optixProgramGroupDesc.raygen.entryFunctionName = name.begin();
-
- OptixProgramGroup optixProgramGroup = nullptr;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &optixProgramGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &optixProgramGroup));
-
- optixProgramGroups.add(optixProgramGroup);
-
- {
- CUdeviceptr rayGenRecordPtr;
- size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) rayGenRecordPtr,
- &rayGenRecordData,
- rayGenRecordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.raygenRecord = rayGenRecordPtr;
- }
- }
-
- OptixPipeline optixPipeline = nullptr;
-
- OptixPipelineLinkOptions optixPipelineLinkOptions = {};
- optixPipelineLinkOptions.maxTraceDepth = 5;
- optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
- optixPipelineLinkOptions.overrideUsesMotionBlur = false;
- SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
- optixContext,
- &optixPipelineCompileOptions,
- &optixPipelineLinkOptions,
- optixProgramGroups.getBuffer(),
- (unsigned int)optixProgramGroups.getCount(),
- log,
- &logSize,
- &optixPipeline));
-
-
- {
- // The OptiX API complains if we don't fill in a miss record
- // in the SBT, so we will create a dummy one here to represent
- // the lack of any miss shaders.
- //
- OptixProgramGroupOptions optixProgramGroupOptions = {};
- OptixProgramGroupDesc missGroupDesc = {};
- missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- OptixProgramGroup missProgramGroup;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &missGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &missProgramGroup));
-
-
- CUdeviceptr missRecordPtr;
- size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) missRecordPtr,
- &missRecordData,
- missRecordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.missRecordBase = missRecordPtr;
- optixSBT.missRecordCount = 1;
- optixSBT.missRecordStrideInBytes = (unsigned int)missRecordSize;
- }
- {
- // Okay, we also need a dummy hit group.
-
- OptixProgramGroupOptions optixProgramGroupOptions = {};
- OptixProgramGroupDesc hitGroupDesc = {};
- hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- OptixProgramGroup programGroup;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &hitGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &programGroup));
-
-
- CUdeviceptr recordPtr;
- size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) recordPtr,
- &recordData,
- recordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.hitgroupRecordBase = recordPtr;
- optixSBT.hitgroupRecordCount = 1;
- optixSBT.hitgroupRecordStrideInBytes = (unsigned int)recordSize;
- }
-
- // Work out the args
-
- auto& bindRoot = outContext.m_bindRoot;
-
- CUdeviceptr globalParams = 0;
- size_t globalParamsSize;
-
- if( auto globalArg = bindRoot.getRootValue() )
- {
- globalParams = MemoryCUDAResource::getCUDAData(globalArg);
- globalParamsSize = globalArg->m_sizeInBytes;
- }
-
- // TODO: The data for entry point parameters needs to be stored
- // into the SBT.
- //
- // The simplest solution here would be to copy data from the `bindRoot`
- // into the SBT at the point where we are setting up the SBT, but
- // a more optimized approach (more similar to what a real applicaiton
- // would do) would be to allocate the SBT first and then have the
- // binding logic write directly into its entries.
- //
- // One big complication here is that there need not necessarily be
- // a one-to-one relationship between the entry points (or entry-point
- // groups) in a compiled ray-tracing pipeline and the entries in
- // the SBT. Each SBT entry is conceptually an "instance" of one
- // of the entry-point groups in the program, and there can be
- // zero, one, or many instances of a given group.
- //
- // Modelling this more completely in `render-test` requires that
- // we start having a model for the "scene" that is being rendered,
- // and how entry point groups are associated with the objects in
- // that scene.
- //
- CUdeviceptr entryPointParams = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
-
- SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
- optixPipeline,
- cudaStream,
- globalParams,
- globalParamsSize,
- &optixSBT,
- dispatchSize[0],
- dispatchSize[1],
- dispatchSize[2]));
-
- SLANG_RETURN_ON_FAIL(cudaStream.sync());
-
- return SLANG_OK;
-}
-#endif
-
- // Fill in runtime handles (e.g. RTTI pointers values and bindless resource handles) in input buffers.
-static SlangResult _fillRuntimeHandlesInBuffers(
- const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout,
- CUDAComputeUtil::Context& context,
- ScopeCUDAModule& cudaModule)
-{
- Slang::ComPtr<slang::ISession> linkage;
- spCompileRequest_getSession(compilationAndLayout.output.getRequestForReflection(), linkage.writeRef());
- auto& inputLayout = compilationAndLayout.layout;
- for (auto& entry : inputLayout.entries)
- {
- for (auto& rtti : entry.rttiEntries)
- {
- uint64_t ptrValue = 0;
- switch (rtti.type)
- {
- case RTTIDataEntryType::RTTIObject:
- {
- auto reflection =
- slang::ShaderReflection::get(compilationAndLayout.output.getRequestForReflection());
- auto concreteType = reflection->findTypeByName(rtti.typeName.getBuffer());
- ComPtr<ISlangBlob> outName;
- linkage->getTypeRTTIMangledName(concreteType, outName.writeRef());
- if (!outName)
- return SLANG_FAIL;
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetGlobal(
- (CUdeviceptr*)&ptrValue,
- nullptr,
- cudaModule.m_module,
- (char*)outName->getBufferPointer()));
- }
- break;
- case RTTIDataEntryType::WitnessTable:
- {
- auto reflection =
- slang::ShaderReflection::get(compilationAndLayout.output.getRequestForReflection());
- auto concreteType = reflection->findTypeByName(rtti.typeName.getBuffer());
- if (!concreteType)
- return SLANG_FAIL;
- auto interfaceType = reflection->findTypeByName(rtti.interfaceName.getBuffer());
- if (!interfaceType)
- return SLANG_FAIL;
- uint32_t id = 0xFFFFFFFF;
- linkage->getTypeConformanceWitnessSequentialID(
- concreteType, interfaceType, &id);
- ptrValue = id;
- break;
- }
- default:
- break;
- }
- if (rtti.offset >= 0 &&
- rtti.offset + sizeof(ptrValue) <=
- entry.bufferData.getCount() * sizeof(decltype(entry.bufferData[0])))
- {
- memcpy(
- ((char*)entry.bufferData.getBuffer()) + rtti.offset,
- &ptrValue,
- sizeof(ptrValue));
- }
- else
- {
- return SLANG_FAIL;
- }
- }
-
- for (auto& handle : entry.bindlessHandleEntry)
- {
- RefPtr<CUDAResource> resource;
- uint64_t handleValue = 0;
- if (context.m_bindlessResources.TryGetValue(handle.name, resource))
- {
- handleValue = resource->getBindlessHandle();
- }
- else
- {
- return SLANG_FAIL;
- }
- if (handle.offset >= 0 &&
- handle.offset + sizeof(uint64_t) <=
- entry.bufferData.getCount() * sizeof(decltype(entry.bufferData[0])))
- {
- memcpy(
- ((char*)entry.bufferData.getBuffer()) + handle.offset,
- &handleValue,
- sizeof(handleValue));
- }
- else
- {
- return SLANG_FAIL;
- }
- }
- }
- return SLANG_OK;
-}
-
-static SlangResult _createBindlessResources(
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- CUDAComputeUtil::Context& outContext)
-{
- auto outStream = StdWriters::getOut();
- for (auto& entry : outputAndLayout.layout.entries)
- {
- if (!entry.isBindlessObject)
- continue;
- switch (entry.type)
- {
- case ShaderInputType::Texture:
- {
- RefPtr<CUDAResource> resource;
- CUDAComputeUtil::createTextureResource(entry, nullptr, resource);
- outContext.m_bindlessResources.Add(entry.name, resource);
- break;
- }
- default:
- outStream.print("Unsupported bindless resource type.\n");
- return SLANG_FAIL;
- }
- }
- return SLANG_OK;
-}
-
- /// Fill in the binding information for arguments of a CUDA program.
-static SlangResult _setUpArguments(
- CUcontext cudaContext,
- ScopeCUDAStream& cudaStream,
- ScopeCUDAModule& cudaModule,
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- const uint32_t dispatchSize[3],
- CUDAComputeUtil::Context& outContext)
-{
- auto reflection = slang::ProgramLayout::get(outputAndLayout.output.getRequestForReflection());
-
- auto& bindSet = outContext.m_bindSet;
- auto& bindRoot = outContext.m_bindRoot;
-
- // Okay now we need to set up binding
- bindRoot.init(&bindSet, reflection, 0);
-
- // Will set up any root buffers
- bindRoot.addDefaultValues();
-
- // Now set up the Values from the test
-
- auto outStream = StdWriters::getOut();
-
- _createBindlessResources(outputAndLayout, outContext);
-
- // Fill in RTTI pointers and bindless handles in input buffers before copying
- // it to GPU memory.
- // TODO: enable this for Optix path after it is refactored so that context
- // creation and module loading happens before _setUpArguments.
- if (outputAndLayout.output.desc.pipelineType == PipelineType::Compute)
- {
- SLANG_RETURN_ON_FAIL(_fillRuntimeHandlesInBuffers(outputAndLayout, outContext, cudaModule));
- }
-
- SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
-
- ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
-
- // First create all of the resources for the values
-
- {
- const auto& values = bindSet.getValues();
- const auto& entries = outputAndLayout.layout.entries;
-
- for (BindSet::Value* value : values)
- {
- auto typeLayout = value->m_type;
-
- // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
- slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
-
- switch (kind)
- {
- case slang::TypeReflection::Kind::ConstantBuffer:
- case slang::TypeReflection::Kind::ParameterBlock:
- {
- // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
- value->m_target = resource;
- break;
- }
- case slang::TypeReflection::Kind::Resource:
- {
- auto type = typeLayout->getType();
- auto shape = type->getResourceShape();
-
- auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
-
- switch (baseShape)
- {
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_2D:
- case SLANG_TEXTURE_3D:
- case SLANG_TEXTURE_CUBE:
- {
- RefPtr<CUDAResource> resource;
- SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
- value->m_target = resource;
- break;
- }
- case SLANG_TEXTURE_BUFFER:
- {
- // Need a CUDA impl for these...
- // For now we can just leave as target will just be nullptr
- break;
- }
-
- case SLANG_BYTE_ADDRESS_BUFFER:
- case SLANG_STRUCTURED_BUFFER:
- {
- // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
- value->m_target = resource;
- break;
- }
- }
- }
- default: break;
- }
- }
- }
-
- // Now we need to go through all of the bindings and set the appropriate data
-
- {
- List<BindLocation> locations;
- List<BindSet::Value*> values;
- bindSet.getBindings(locations, values);
-
- for (Index i = 0; i < locations.getCount(); ++i)
- {
- const auto& location = locations[i];
- BindSet::Value* value = values[i];
-
- // Okay now we need to set up the actual handles that CPU will follow.
- auto typeLayout = location.getTypeLayout();
-
- const auto kind = typeLayout->getKind();
- switch (kind)
- {
- case slang::TypeReflection::Kind::Array:
- {
- auto elementCount = int(typeLayout->getElementCount());
- if (elementCount == 0)
- {
- CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
- auto resource = MemoryCUDAResource::asResource(value);
- if (resource)
- {
- array.data = resource->m_cudaMemory;
- array.count = value->m_elementCount;
- }
-
- location.setUniform(&array, sizeof(array));
- }
- break;
- }
- case slang::TypeReflection::Kind::ConstantBuffer:
- case slang::TypeReflection::Kind::ParameterBlock:
- {
- // These map down to just pointers
- *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
- break;
- }
- case slang::TypeReflection::Kind::Resource:
- {
- auto type = typeLayout->getType();
- auto shape = type->getResourceShape();
-
- auto access = type->getResourceAccess();
-
- const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
-
- switch (baseShape)
- {
- case SLANG_STRUCTURED_BUFFER:
- {
- CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
- auto resource = MemoryCUDAResource::asResource(value);
- if (resource)
- {
- buffer.data = resource->m_cudaMemory;
- buffer.count = value->m_elementCount;
- }
-
- location.setUniform(&buffer, sizeof(buffer));
- break;
- }
- case SLANG_BYTE_ADDRESS_BUFFER:
- {
- CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
-
- auto resource = MemoryCUDAResource::asResource(value);
- if (resource)
- {
- buffer.data = resource->m_cudaMemory;
- buffer.sizeInBytes = value->m_sizeInBytes;
- }
-
- location.setUniform(&buffer, sizeof(buffer));
- break;
- }
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_2D:
- case SLANG_TEXTURE_3D:
- case SLANG_TEXTURE_CUBE:
- {
- if (_hasWriteAccess(access))
- {
- *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
- }
- else
- {
- *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
- }
- break;
- }
-
- }
- break;
- }
- default: break;
- }
- }
- }
-
- // Okay now the memory is all set up, we can copy everything over
- {
- const auto& values = bindSet.getValues();
- for (BindSet::Value* value : values)
- {
- CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
- if (value && value->m_data && cudaMem)
- {
- // Okay copy the data over...
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
- }
- }
- }
-
- return SLANG_OK;
-}
-
- /// Read back any output arguments from a CUDA program.
-static SlangResult _readBackOutputs(
- CUcontext cudaContext,
- ScopeCUDAStream& cudaStream,
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- const uint32_t dispatchSize[3],
- CUDAComputeUtil::Context& outContext)
-{
- const auto& entries = outputAndLayout.layout.entries;
-
- for (Index i = 0; i < entries.getCount(); ++i)
- {
- const auto& entry = entries[i];
- BindSet::Value* value = outContext.m_buffers[i];
-
- if (entry.isOutput)
- {
- // Copy back to CPU memory
- CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
- if (value && value->m_data && cudaMem)
- {
- // Okay copy the data back...
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
- }
- }
- }
-
- return SLANG_OK;
-}
-
-SlangResult _loadCUDAModule(
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- ScopeCUDAModule& outModule)
-{
- const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
- if (index < 0)
- {
- return SLANG_FAIL;
- }
- const auto& kernelDesc = outputAndLayout.output.kernelDescs[index];
- SLANG_RETURN_ON_FAIL(outModule.load(kernelDesc.codeBegin));
- return SLANG_OK;
-}
-
- /// Load and invoke a CUDA program (either compute or ray-tracing)
-SlangResult _loadAndInvokeKernel(
- CUcontext cudaContext,
- ScopeCUDAStream& cudaStream,
- ScopeCUDAModule& cudaModule,
- const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
- const uint32_t dispatchSize[3],
- CUDAComputeUtil::Context& outContext)
-{
- switch( outputAndLayout.output.desc.pipelineType )
- {
- case PipelineType::Compute:
- return _invokeComputeProgram(
- cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext);
-
- case PipelineType::RayTracing:
-#ifdef RENDER_TEST_OPTIX
- return _loadAndInvokeRayTracingProgram(
- cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
-#endif
- break;
-
- default: break;
- }
-
- return SLANG_FAIL;
-}
-
- /// Execute a CUDA program (either compute or ray-tracing)
- ///
- /// This function handles loading code and argument data,
- /// invoking the kernel(s), and reading back results.
- ///
-/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
-{
- ScopeCUDAContext cudaContext;
- SLANG_RETURN_ON_FAIL(cudaContext.init(0));
-
- // A default stream, will act as a global stream. Calling sync will globally sync
- ScopeCUDAStream cudaStream;
- //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
-
- ScopeCUDAModule cudaModule;
-
- auto& bindSet = outContext.m_bindSet;
- auto& bindRoot = outContext.m_bindRoot;
-
- auto request = outputAndLayout.output.getRequestForReflection();
- auto reflection = (slang::ShaderReflection*) spGetReflection(request);
-
- // Load cuda module first so its symbols may be queried and filled into argument buffers.
- // TODO: refactor optix path to also front-load its context creation and module loading here.
- // For now just front-load compute kernels.
- if (outputAndLayout.output.desc.pipelineType == PipelineType::Compute)
- {
- SLANG_RETURN_ON_FAIL(_loadCUDAModule(outputAndLayout, cudaModule));
- }
-
- SLANG_RETURN_ON_FAIL(_setUpArguments(
- cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext));
-
- SLANG_RETURN_ON_FAIL(_loadAndInvokeKernel(
- cudaContext, cudaStream, cudaModule, outputAndLayout, dispatchSize, outContext));
-
- // Finally we need to copy the data back
- SLANG_RETURN_ON_FAIL(_readBackOutputs(
- cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
-
- // Release all othe CUDA resource/allocations
- bindSet.releaseValueTargets();
- outContext.releaseBindlessResources();
-
- return SLANG_OK;
-}
-
-
-void CUDAComputeUtil::Context::releaseBindlessResources()
-{
- m_bindlessResources = decltype(m_bindlessResources)();
-}
-
-} // namespace renderer_test