summaryrefslogtreecommitdiffstats
path: root/tools/render-test
diff options
context:
space:
mode:
Diffstat (limited to 'tools/render-test')
-rw-r--r--tools/render-test/cuda/cuda-compute-util.cpp977
1 files changed, 549 insertions, 428 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index b2006a7e8..5d6f82499 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -15,9 +15,13 @@
// TODO: should conditionalize this on OptiX being present
#ifdef RENDER_TEST_OPTIX
+
+// The `optix_stubs.h` header produces warnings when compiled with MSVC
+#ifdef _MSC_VER
+#pragma warning(disable: 4996)
+#endif
#include <optix.h>
#include <optix_function_table_definition.h>
-#define _CRT_SECURE_NO_WARNINGS 1
#include <optix_stubs.h>
#endif
@@ -947,13 +951,33 @@ static bool _hasWriteAccess(SlangResourceAccess access)
return SLANG_OK;
}
-static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], CUDAComputeUtil::Context& outContext)
+ /// Load kernel code and invoke a compute program
+ ///
+ /// Assumes that data for binding the kernel parameters is already
+ /// set up in `outContext.`
+ ///
+static SlangResult _loadAndInvokeComputeProgram(
+ CUcontext cudaContext,
+ ScopeCUDAStream& cudaStream,
+ const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+ const uint32_t dispatchSize[3],
+ CUDAComputeUtil::Context& outContext)
{
+ auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);
+
auto& bindSet = outContext.m_bindSet;
auto& bindRoot = outContext.m_bindRoot;
- auto request = outputAndLayout.output.request;
- auto reflection = (slang::ShaderReflection*) spGetReflection(request);
+ const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
+ if (index < 0)
+ {
+ return SLANG_FAIL;
+ }
+
+ const auto& kernelDesc = outputAndLayout.output.kernelDescs[index];
+
+ ScopeCUDAModule cudaModule;
+ SLANG_RETURN_ON_FAIL(cudaModule.load(kernelDesc.codeBegin));
slang::EntryPointReflection* entryPoint = nullptr;
auto entryPointCount = reflection->getEntryPointCount();
@@ -964,514 +988,611 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
const char* entryPointName = entryPoint->getName();
// Get the entry point
- CUfunction kernel;
- SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName));
+ CUfunction cudaEntryPoint;
+ SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&cudaEntryPoint, cudaModule, entryPointName));
- // A default stream, will act as a global stream. Calling sync will globally sync
- ScopeCUDAStream cudaStream;
- //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
+ // Get the max threads per block for this function
- {
- // Okay now we need to set up binding
- bindRoot.init(&bindSet, reflection, 0);
+ int maxTheadsPerBlock;
+ SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cudaEntryPoint));
+
+ int sharedSizeInBytes;
+ SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));
+
+ // Work out the args
+ CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
+ CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+
+ // NOTE! These are pointers to the cuda memory pointers
+ void* args[] = { &entryPointCUDAData , &uniformCUDAData };
+
+ SlangUInt numThreadsPerAxis[3];
+ entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
+
+ // Launch
+ auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
+ dispatchSize[0], dispatchSize[1], dispatchSize[2],
+ int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
+ 0, // Shared memory size
+ cudaStream, // Stream. 0 is no stream.
+ args, // Args
+ nullptr); // extra
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
+
+ // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
+ SLANG_RETURN_ON_FAIL(cudaStream.sync());
+
+ return SLANG_OK;
+}
+
+#ifdef RENDER_TEST_OPTIX
+ /// Load kernel code and invoke a ray-tracing program
+ ///
+ /// Assumes that data for binding the kernel parameters is already
+ /// set up in `outContext.`
+ ///
+ /// Currently only works for programs that have a single
+ /// ray generation shader and no other entry points.
+ ///
+static SlangResult _loadAndInvokeRayTracingProgram(
+ CUcontext cudaContext,
+ ScopeCUDAStream& cudaStream,
+ const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+ const uint32_t dispatchSize[3],
+ CUDAComputeUtil::Context& outContext)
+{
+ SLANG_OPTIX_RETURN_ON_FAIL(optixInit());
+
+ OptixDeviceContextOptions optixOptions = {};
- // Will set up any root buffers
- bindRoot.addDefaultValues();
+#if _DEBUG
+ optixOptions.logCallbackFunction = &_optixLogCallback;
+ optixOptions.logCallbackLevel = 4;
+#endif
+
+ OptixDeviceContext optixContext = nullptr;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));
- // Now set up the Values from the test
+ enum { kOptixLogSize = 2*1024 };
+ char log[kOptixLogSize];
+ size_t logSize = sizeof(log);
- auto outStream = StdWriters::getOut();
- SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
+ OptixPipelineCompileOptions optixPipelineCompileOptions = {};
+ optixPipelineCompileOptions.pipelineLaunchParamsVariableName = "SLANG_globalParams";
- ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
+ // We need to load modules from the PTX code available to us,
+ // and then also create program groups from the kernels
+ // in those modules.
+ //
+ // For now we will only support program groups with a single
+ // kernel in them, and will create one per entry point.
+ //
+ Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
+ List<OptixProgramGroup> optixProgramGroups;
+ List<String> names;
- // First create all of the resources for the values
+ OptixShaderBindingTable optixSBT = {};
+
+ for( Index ee = 0; ee < entryPointCount; ++ee )
+ {
+ auto& kernel = outputAndLayout.output.kernelDescs[ee];
+
+ // TODO: The logic here assumes that each kernel will
+ // come from its own independent module, and has no
+ // provisiion for loading modules that might contain
+ // multiple entry points.
+ //
+ OptixModuleCompileOptions optixModuleCompileOptions = {};
+ OptixModule optixModule;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
+ optixContext,
+ &optixModuleCompileOptions,
+ &optixPipelineCompileOptions,
+ (char const*) kernel.codeBegin,
+ kernel.getCodeSize(),
+ log,
+ &logSize,
+ &optixModule));
+
+ // TODO: The logic here only handles ray-generation entry points.
+ //
+ // It would seem simple to extend this to handle other entry
+ // point types, by inspecting the stage of the entry points
+ // being loaded, and this is indeed true for the subset of
+ // stages that map one-to-one with OptiX "program groups."
+ //
+ // The sticking point is "hit groups" which require a collection
+ // of entry points to be specified together (insersection,
+ // any hit, and closest hit). A hit group can comprise between
+ // zero and three entry points.
+ //
+ // The catch for us is how to determine which entry points
+ // should be grouped to form hit groups. Should this be
+ // implied in the input code (either by naming convention
+ // or by new Slang language features)? Should this be set
+ // up via command-line arguments or something akin to
+ // `//TEST_INPUT` lines?
+
+ OptixProgramGroupOptions optixProgramGroupOptions = {};
+
+ OptixProgramGroupDesc optixProgramGroupDesc = {};
+ optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ optixProgramGroupDesc.raygen.module = optixModule;
+
+ String name = String("__raygen__") + kernel.entryPointName;
+ names.add(name);
+ optixProgramGroupDesc.raygen.entryFunctionName = name.begin();
+
+ OptixProgramGroup optixProgramGroup = nullptr;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+ optixContext,
+ &optixProgramGroupDesc,
+ 1,
+ &optixProgramGroupOptions,
+ log,
+ &logSize,
+ &optixProgramGroup));
+
+ optixProgramGroups.add(optixProgramGroup);
{
- const auto& values = bindSet.getValues();
- const auto& entries = outputAndLayout.layout.entries;
+ CUdeviceptr rayGenRecordPtr;
+ size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
- for (BindSet::Value* value : values)
- {
- auto typeLayout = value->m_type;
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));
+
+ struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+ (void*) rayGenRecordPtr,
+ &rayGenRecordData,
+ rayGenRecordSize,
+ cudaMemcpyHostToDevice));
+
+ optixSBT.raygenRecord = rayGenRecordPtr;
+ }
+ }
+
+ OptixPipeline optixPipeline = nullptr;
+
+ OptixPipelineLinkOptions optixPipelineLinkOptions = {};
+ optixPipelineLinkOptions.maxTraceDepth = 5;
+ optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ optixPipelineLinkOptions.overrideUsesMotionBlur = false;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
+ optixContext,
+ &optixPipelineCompileOptions,
+ &optixPipelineLinkOptions,
+ optixProgramGroups.getBuffer(),
+ (unsigned int)optixProgramGroups.getCount(),
+ log,
+ &logSize,
+ &optixPipeline));
+
+
+ {
+ // The OptiX API complains if we don't fill in a miss record
+ // in the SBT, so we will create a dummy one here to represent
+ // the lack of any miss shaders.
+ //
+ OptixProgramGroupOptions optixProgramGroupOptions = {};
+ OptixProgramGroupDesc missGroupDesc = {};
+ missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ OptixProgramGroup missProgramGroup;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+ optixContext,
+ &missGroupDesc,
+ 1,
+ &optixProgramGroupOptions,
+ log,
+ &logSize,
+ &missProgramGroup));
+
+
+ CUdeviceptr missRecordPtr;
+ size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));
+
+ struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+ (void*) missRecordPtr,
+ &missRecordData,
+ missRecordSize,
+ cudaMemcpyHostToDevice));
+
+ optixSBT.missRecordBase = missRecordPtr;
+ optixSBT.missRecordCount = 1;
+ optixSBT.missRecordStrideInBytes = (unsigned int)missRecordSize;
+ }
+ {
+ // Okay, we also need a dummy hit group.
+
+ OptixProgramGroupOptions optixProgramGroupOptions = {};
+ OptixProgramGroupDesc hitGroupDesc = {};
+ hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ OptixProgramGroup programGroup;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+ optixContext,
+ &hitGroupDesc,
+ 1,
+ &optixProgramGroupOptions,
+ log,
+ &logSize,
+ &programGroup));
+
+
+ CUdeviceptr recordPtr;
+ size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));
+
+ struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
+ SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));
+
+ SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+ (void*) recordPtr,
+ &recordData,
+ recordSize,
+ cudaMemcpyHostToDevice));
+
+ optixSBT.hitgroupRecordBase = recordPtr;
+ optixSBT.hitgroupRecordCount = 1;
+ optixSBT.hitgroupRecordStrideInBytes = (unsigned int)recordSize;
+ }
+
+ // Work out the args
+
+ auto& bindRoot = outContext.m_bindRoot;
+
+ CUdeviceptr globalParams = 0;
+ size_t globalParamsSize;
+
+ if( auto globalArg = bindRoot.getRootValue() )
+ {
+ globalParams = MemoryCUDAResource::getCUDAData(globalArg);
+ globalParamsSize = globalArg->m_sizeInBytes;
+ }
+
+ // TODO: The data for entry point parameters needs to be stored
+ // into the SBT.
+ //
+ // The simplest solution here would be to copy data from the `bindRoot`
+ // into the SBT at the point where we are setting up the SBT, but
+ // a more optimized approach (more similar to what a real applicaiton
+ // would do) would be to allocate the SBT first and then have the
+ // binding logic write directly into its entries.
+ //
+ // One big complication here is that there need not necessarily be
+ // a one-to-one relationship between the entry points (or entry-point
+ // groups) in a compiled ray-tracing pipeline and the entries in
+ // the SBT. Each SBT entry is conceptually an "instance" of one
+ // of the entry-point groups in the program, and there can be
+ // zero, one, or many instances of a given group.
+ //
+ // Modelling this more completely in `render-test` requires that
+ // we start having a model for the "scene" that is being rendered,
+ // and how entry point groups are associated with the objects in
+ // that scene.
+ //
+ CUdeviceptr entryPointParams = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+
+ SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
+ optixPipeline,
+ cudaStream,
+ globalParams,
+ globalParamsSize,
+ &optixSBT,
+ dispatchSize[0],
+ dispatchSize[1],
+ dispatchSize[2]));
+
+ SLANG_RETURN_ON_FAIL(cudaStream.sync());
+
+ return SLANG_OK;
+}
+#endif
+
+ /// Fill in the binding information for arguments of a CUDA program.
+static SlangResult _setUpArguments(
+ CUcontext cudaContext,
+ ScopeCUDAStream& cudaStream,
+ const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+ const uint32_t dispatchSize[3],
+ CUDAComputeUtil::Context& outContext)
+{
+ auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);
+
+ auto& bindSet = outContext.m_bindSet;
+ auto& bindRoot = outContext.m_bindRoot;
+
+ // Okay now we need to set up binding
+ bindRoot.init(&bindSet, reflection, 0);
+
+ // Will set up any root buffers
+ bindRoot.addDefaultValues();
+
+ // Now set up the Values from the test
+
+ auto outStream = StdWriters::getOut();
+ SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
+
+ ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
+
+ // First create all of the resources for the values
+
+ {
+ const auto& values = bindSet.getValues();
+ const auto& entries = outputAndLayout.layout.entries;
+
+ for (BindSet::Value* value : values)
+ {
+ auto typeLayout = value->m_type;
- // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
- slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
+ // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
+ slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
- switch (kind)
+ switch (kind)
+ {
+ case slang::TypeReflection::Kind::ConstantBuffer:
+ case slang::TypeReflection::Kind::ParameterBlock:
{
- case slang::TypeReflection::Kind::ConstantBuffer:
- case slang::TypeReflection::Kind::ParameterBlock:
- {
- // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
- value->m_target = resource;
- break;
- }
- case slang::TypeReflection::Kind::Resource:
- {
- auto type = typeLayout->getType();
- auto shape = type->getResourceShape();
+ // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
+ RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+ value->m_target = resource;
+ break;
+ }
+ case slang::TypeReflection::Kind::Resource:
+ {
+ auto type = typeLayout->getType();
+ auto shape = type->getResourceShape();
- auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+ auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
- switch (baseShape)
+ switch (baseShape)
+ {
+ case SLANG_TEXTURE_1D:
+ case SLANG_TEXTURE_2D:
+ case SLANG_TEXTURE_3D:
+ case SLANG_TEXTURE_CUBE:
{
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_2D:
- case SLANG_TEXTURE_3D:
- case SLANG_TEXTURE_CUBE:
- {
- RefPtr<CUDAResource> resource;
- SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
- value->m_target = resource;
- break;
- }
- case SLANG_TEXTURE_BUFFER:
- {
- // Need a CUDA impl for these...
- // For now we can just leave as target will just be nullptr
- break;
- }
+ RefPtr<CUDAResource> resource;
+ SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
+ value->m_target = resource;
+ break;
+ }
+ case SLANG_TEXTURE_BUFFER:
+ {
+ // Need a CUDA impl for these...
+ // For now we can just leave as target will just be nullptr
+ break;
+ }
- case SLANG_BYTE_ADDRESS_BUFFER:
- case SLANG_STRUCTURED_BUFFER:
- {
- // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
- RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
- value->m_target = resource;
- break;
- }
+ case SLANG_BYTE_ADDRESS_BUFFER:
+ case SLANG_STRUCTURED_BUFFER:
+ {
+ // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
+ RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+ value->m_target = resource;
+ break;
}
}
- default: break;
}
+ default: break;
}
}
+ }
- // Now we need to go through all of the bindings and set the appropriate data
+ // Now we need to go through all of the bindings and set the appropriate data
+
+ {
+ List<BindLocation> locations;
+ List<BindSet::Value*> values;
+ bindSet.getBindings(locations, values);
+ for (Index i = 0; i < locations.getCount(); ++i)
{
- List<BindLocation> locations;
- List<BindSet::Value*> values;
- bindSet.getBindings(locations, values);
+ const auto& location = locations[i];
+ BindSet::Value* value = values[i];
- for (Index i = 0; i < locations.getCount(); ++i)
- {
- const auto& location = locations[i];
- BindSet::Value* value = values[i];
+ // Okay now we need to set up the actual handles that CPU will follow.
+ auto typeLayout = location.getTypeLayout();
- // Okay now we need to set up the actual handles that CPU will follow.
- auto typeLayout = location.getTypeLayout();
+ const auto kind = typeLayout->getKind();
+ switch (kind)
+ {
+ case slang::TypeReflection::Kind::Array:
+ {
+ auto elementCount = int(typeLayout->getElementCount());
+ if (elementCount == 0)
+ {
+ CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
+ auto resource = MemoryCUDAResource::asResource(value);
+ if (resource)
+ {
+ array.data = resource->m_cudaMemory;
+ array.count = value->m_elementCount;
+ }
- const auto kind = typeLayout->getKind();
- switch (kind)
+ location.setUniform(&array, sizeof(array));
+ }
+ break;
+ }
+ case slang::TypeReflection::Kind::ConstantBuffer:
+ case slang::TypeReflection::Kind::ParameterBlock:
+ {
+ // These map down to just pointers
+ *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
+ break;
+ }
+ case slang::TypeReflection::Kind::Resource:
{
- case slang::TypeReflection::Kind::Array:
+ auto type = typeLayout->getType();
+ auto shape = type->getResourceShape();
+
+ auto access = type->getResourceAccess();
+
+ const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+
+ switch (baseShape)
{
- auto elementCount = int(typeLayout->getElementCount());
- if (elementCount == 0)
+ case SLANG_STRUCTURED_BUFFER:
{
- CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
+ CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
auto resource = MemoryCUDAResource::asResource(value);
if (resource)
{
- array.data = resource->m_cudaMemory;
- array.count = value->m_elementCount;
+ buffer.data = resource->m_cudaMemory;
+ buffer.count = value->m_elementCount;
}
- location.setUniform(&array, sizeof(array));
+ location.setUniform(&buffer, sizeof(buffer));
+ break;
}
- break;
- }
- case slang::TypeReflection::Kind::ConstantBuffer:
- case slang::TypeReflection::Kind::ParameterBlock:
- {
- // These map down to just pointers
- *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
- break;
- }
- case slang::TypeReflection::Kind::Resource:
- {
- auto type = typeLayout->getType();
- auto shape = type->getResourceShape();
-
- auto access = type->getResourceAccess();
-
- const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
-
- switch (baseShape)
+ case SLANG_BYTE_ADDRESS_BUFFER:
{
- case SLANG_STRUCTURED_BUFFER:
+ CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
+
+ auto resource = MemoryCUDAResource::asResource(value);
+ if (resource)
{
- CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
- auto resource = MemoryCUDAResource::asResource(value);
- if (resource)
- {
- buffer.data = resource->m_cudaMemory;
- buffer.count = value->m_elementCount;
- }
-
- location.setUniform(&buffer, sizeof(buffer));
- break;
+ buffer.data = resource->m_cudaMemory;
+ buffer.sizeInBytes = value->m_sizeInBytes;
}
- case SLANG_BYTE_ADDRESS_BUFFER:
- {
- CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
-
- auto resource = MemoryCUDAResource::asResource(value);
- if (resource)
- {
- buffer.data = resource->m_cudaMemory;
- buffer.sizeInBytes = value->m_sizeInBytes;
- }
- location.setUniform(&buffer, sizeof(buffer));
- break;
+ location.setUniform(&buffer, sizeof(buffer));
+ break;
+ }
+ case SLANG_TEXTURE_1D:
+ case SLANG_TEXTURE_2D:
+ case SLANG_TEXTURE_3D:
+ case SLANG_TEXTURE_CUBE:
+ {
+ if (_hasWriteAccess(access))
+ {
+ *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
}
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_2D:
- case SLANG_TEXTURE_3D:
- case SLANG_TEXTURE_CUBE:
+ else
{
- if (_hasWriteAccess(access))
- {
- *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
- }
- else
- {
- *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
- }
- break;
+ *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
}
-
+ break;
}
- break;
+
}
- default: break;
+ break;
}
+ default: break;
}
}
+ }
- // Okay now the memory is all set up, we can copy everything over
+ // Okay now the memory is all set up, we can copy everything over
+ {
+ const auto& values = bindSet.getValues();
+ for (BindSet::Value* value : values)
{
- const auto& values = bindSet.getValues();
- for (BindSet::Value* value : values)
+ CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
+ if (value && value->m_data && cudaMem)
{
- CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
- if (value && value->m_data && cudaMem)
- {
- // Okay copy the data over...
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
- }
+ // Okay copy the data over...
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
}
}
+ }
- // Now we can execute the kernel
-
- {
- // Get the max threads per block for this function
-
- int maxTheadsPerBlock;
- SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel));
-
- int sharedSizeInBytes;
- SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
-
- // Work out the args
- CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
- CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
-
- // NOTE! These are pointers to the cuda memory pointers
- void* args[] = { &entryPointCUDAData , &uniformCUDAData };
-
- SlangUInt numThreadsPerAxis[3];
- entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
-
- // Launch
- auto cudaLaunchResult = cuLaunchKernel(kernel,
- dispatchSize[0], dispatchSize[1], dispatchSize[2],
- int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
- 0, // Shared memory size
- cudaStream, // Stream. 0 is no stream.
- args, // Args
- nullptr); // extra
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
+ return SLANG_OK;
+}
- // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
- SLANG_RETURN_ON_FAIL(cudaStream.sync());
- }
+ /// Read back any output arguments from a CUDA program.
+static SlangResult _readBackOutputs(
+ CUcontext cudaContext,
+ ScopeCUDAStream& cudaStream,
+ const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+ const uint32_t dispatchSize[3],
+ CUDAComputeUtil::Context& outContext)
+{
+ const auto& entries = outputAndLayout.layout.entries;
- // Finally we need to copy the data back
+ for (Index i = 0; i < entries.getCount(); ++i)
+ {
+ const auto& entry = entries[i];
+ BindSet::Value* value = outContext.m_buffers[i];
+ if (entry.isOutput)
{
- const auto& entries = outputAndLayout.layout.entries;
-
- for (Index i = 0; i < entries.getCount(); ++i)
+ // Copy back to CPU memory
+ CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
+ if (value && value->m_data && cudaMem)
{
- const auto& entry = entries[i];
- BindSet::Value* value = outContext.m_buffers[i];
-
- if (entry.isOutput)
- {
- // Copy back to CPU memory
- CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
- if (value && value->m_data && cudaMem)
- {
- // Okay copy the data back...
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
- }
- }
+ // Okay copy the data back...
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
}
}
}
- // Release all othe CUDA resource/allocations
- bindSet.releaseValueTargets();
-
return SLANG_OK;
}
-/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
+ /// Load and invoke a CUDA program (either compute or ray-tracing)
+SlangResult _loadAndInvokeKernel(
+ CUcontext cudaContext,
+ ScopeCUDAStream& cudaStream,
+ const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+ const uint32_t dispatchSize[3],
+ CUDAComputeUtil::Context& outContext)
{
- ScopeCUDAContext cudaContext;
- SLANG_RETURN_ON_FAIL(cudaContext.init(0));
-
-
switch( outputAndLayout.output.desc.pipelineType )
{
default:
return SLANG_FAIL;
case PipelineType::Compute:
- {
- const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
- if (index < 0)
- {
- return SLANG_FAIL;
- }
-
- const auto& kernel = outputAndLayout.output.kernelDescs[index];
-
- ScopeCUDAModule cudaModule;
- SLANG_RETURN_ON_FAIL(cudaModule.load(kernel.codeBegin));
- SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, dispatchSize, outContext));
- }
- break;
+ return _loadAndInvokeComputeProgram(cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
case PipelineType::RayTracing:
- {
-#ifdef RENDER_TEST_OPTIX
- SLANG_OPTIX_RETURN_ON_FAIL(optixInit());
-
- OptixDeviceContextOptions optixOptions = {};
-
- // TODO: set log callback
- optixOptions.logCallbackFunction = &_optixLogCallback;
- optixOptions.logCallbackLevel = 4;
-
- OptixDeviceContext optixContext = nullptr;
- SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));
-
- enum { kOptixLogSize = 2*1024 };
- char log[kOptixLogSize];
- size_t logSize = sizeof(log);
-
- OptixPipelineCompileOptions optixPipelineCompileOptions = {};
-
- // We need to load modules from the PTX code available to us,
- // and then also create program groups from the kernels
- // in those modules.
- //
- // For now we will only support program groups with a single
- // kernel in them, and will create one per entry point.
- //
- Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
- List<OptixProgramGroup> optixProgramGroups;
- List<String> names;
-
- OptixShaderBindingTable optixSBT = {};
-
- for( Index ee = 0; ee < entryPointCount; ++ee )
- {
- auto& kernel = outputAndLayout.output.kernelDescs[ee];
-
- OptixModuleCompileOptions optixModuleCompileOptions = {};
-
- OptixModule optixModule;
- SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
- optixContext,
- &optixModuleCompileOptions,
- &optixPipelineCompileOptions,
- (char const*) kernel.codeBegin,
- kernel.getCodeSize(),
- log,
- &logSize,
- &optixModule));
-
-
- OptixProgramGroupOptions optixProgramGroupOptions = {};
-
- OptixProgramGroupDesc optixProgramGroupDesc = {};
- optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- optixProgramGroupDesc.raygen.module = optixModule;
-
- String name = String("__raygen__") + kernel.entryPointName;
- names.add(name);
- optixProgramGroupDesc.raygen.entryFunctionName = name.begin();
-
- OptixProgramGroup optixProgramGroup = nullptr;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &optixProgramGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &optixProgramGroup));
-
- optixProgramGroups.add(optixProgramGroup);
-
- {
- CUdeviceptr rayGenRecordPtr;
- size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) rayGenRecordPtr,
- &rayGenRecordData,
- rayGenRecordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.raygenRecord = rayGenRecordPtr;
- }
- }
-
-
-
- OptixPipeline optixPipeline = nullptr;
-
- OptixPipelineLinkOptions optixPipelineLinkOptions = {};
- optixPipelineLinkOptions.maxTraceDepth = 5;
- optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
- optixPipelineLinkOptions.overrideUsesMotionBlur = false;
- SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
- optixContext,
- &optixPipelineCompileOptions,
- &optixPipelineLinkOptions,
- optixProgramGroups.getBuffer(),
- (unsigned int)optixProgramGroups.getCount(),
- log,
- &logSize,
- &optixPipeline));
-
+ return _loadAndInvokeRayTracingProgram(cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
+ }
+}
- {
- // The OptiX API complains if we don't fill in a miss record
- // in the SBT, so we will create a dummy one here to represent
- // the lack of any miss shaders.
- //
- OptixProgramGroupOptions optixProgramGroupOptions = {};
- OptixProgramGroupDesc missGroupDesc = {};
- missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- OptixProgramGroup missProgramGroup;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &missGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &missProgramGroup));
-
-
- CUdeviceptr missRecordPtr;
- size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) missRecordPtr,
- &missRecordData,
- missRecordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.missRecordBase = missRecordPtr;
- optixSBT.missRecordCount = 1;
- optixSBT.missRecordStrideInBytes = missRecordSize;
- }
- {
- // Okay, we also need a dummy hit group.
-
- OptixProgramGroupOptions optixProgramGroupOptions = {};
- OptixProgramGroupDesc hitGroupDesc = {};
- hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- OptixProgramGroup programGroup;
- SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
- optixContext,
- &hitGroupDesc,
- 1,
- &optixProgramGroupOptions,
- log,
- &logSize,
- &programGroup));
-
-
- CUdeviceptr recordPtr;
- size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));
-
- struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
- SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));
-
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
- (void*) recordPtr,
- &recordData,
- recordSize,
- cudaMemcpyHostToDevice));
-
- optixSBT.hitgroupRecordBase = recordPtr;
- optixSBT.hitgroupRecordCount = 1;
- optixSBT.hitgroupRecordStrideInBytes = recordSize;
- }
+ /// Execute a CUDA program (either compute or ray-tracing)
+ ///
+ /// This function handles loading code and argument data,
+ /// invoking the kernel(s), and reading back results.
+ ///
+/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
+{
+ ScopeCUDAContext cudaContext;
+ SLANG_RETURN_ON_FAIL(cudaContext.init(0));
- ScopeCUDAStream cudaStream;
+ // A default stream, will act as a global stream. Calling sync will globally sync
+ ScopeCUDAStream cudaStream;
+ //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
- CUdeviceptr globalParams = 0;
- size_t globalParamsSize = 0;
+ auto& bindSet = outContext.m_bindSet;
+ auto& bindRoot = outContext.m_bindRoot;
- unsigned int gridSizeX = 1;
- unsigned int gridSizeY = 1;
- unsigned int gridSizeZ = 1;
+ auto request = outputAndLayout.output.request;
+ auto reflection = (slang::ShaderReflection*) spGetReflection(request);
+ SLANG_RETURN_ON_FAIL(_setUpArguments(
+ cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
- SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
- optixPipeline,
- cudaStream,
- globalParams,
- globalParamsSize,
- &optixSBT,
- gridSizeX,
- gridSizeY,
- gridSizeZ));
+ SLANG_RETURN_ON_FAIL(_loadAndInvokeKernel(
+ cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
+ // Finally we need to copy the data back
+ SLANG_RETURN_ON_FAIL(_readBackOutputs(
+ cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
- SLANG_RETURN_ON_FAIL(cudaStream.sync());
-#endif
- }
- break;
- }
+ // Release all othe CUDA resource/allocations
+ bindSet.releaseValueTargets();
return SLANG_OK;
}