Add support for global shader parameters to OptiX path (#1323)

There are two main pieces here. First, we specialize the code generaiton for CUDA kernels to account for the way that shader parameters are passed differently for ordinary compute kernels vs. ray-tracing kernels. Both global and entry-point shader parameters in Slang are translated to kernel function parameters for CUDA compute kernels, while for OptiX ray tracing kernels we need to use a global `__constant__` variable for the global parameters, and the SBT data (accessed via an OptiX API function) for entry-point shader parameters. This choice bakes in a few pieces of policy when it comes to how Slang ray-tracing shaders translate to OptiX: * It fixes the name used for the global `__constant__` variable for global shader parameters to be `SLANG_globalParams`. Since that name has to be specified when creating a pipeline with the OptiX API, the choice of name effectively becomes an ABI contract for Slang's code generation. * It fixes the choice that global parameters in Slang map to per-launch parameters in OptiX, and entry-point parameters in Slang map to SBT-backed parameters in OptiX. This is a reasonable policy, and it is also one that we are likely to need to codify for Vulkan as well, but it is always a bit unfortunate to bake policy choices like this into the compiler (especially when shaders compiled for D3D can often decouple the form of their HLSL/Slang code from how things are bound in the API). The second piece is a lot of refactoring of the logic in `render-test/cuda/cuda-compute-util.cpp`, so that the logic for setting up (and reading back) the buffers of parameter data can be shared between the compute and ray-tracing paths. The result may not be a true global optimum for how the code is organized, but it at least serves the goal of not duplicating the parameter-binding logic between compute and ray-tracing.
author: Tim Foley <tfoleyNV@users.noreply.github.com> 2020-04-17 08:53:41 -0700
committer: GitHub <noreply@github.com> 2020-04-17 08:53:41 -0700
commit: acb1c39b4e29358cf496c07dc325e52f39be71f4 (patch)
tree: d76c44aded40d46cdb0d76af91112a1a3fc34d2f /tools/render-test
parent: 12b30afb24ac03d69f091f18c25ed2bbefae1acd (diff)
1 files changed, 549 insertions, 428 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index b2006a7e8..5d6f82499 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -15,9 +15,13 @@
 
 // TODO: should conditionalize this on OptiX being present
 #ifdef RENDER_TEST_OPTIX
+
+// The `optix_stubs.h` header produces warnings when compiled with MSVC
+#ifdef _MSC_VER
+#pragma warning(disable: 4996)
+#endif
 #include <optix.h>
 #include <optix_function_table_definition.h>
-#define _CRT_SECURE_NO_WARNINGS 1
 #include <optix_stubs.h>
 #endif
 
@@ -947,13 +951,33 @@ static bool _hasWriteAccess(SlangResourceAccess access)
     return SLANG_OK;
 }
 
-static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], CUDAComputeUtil::Context& outContext)
+    /// Load kernel code and invoke a compute program
+    ///
+    /// Assumes that data for binding the kernel parameters is already
+    /// set up in `outContext.`
+    ///
+static SlangResult _loadAndInvokeComputeProgram(
+    CUcontext cudaContext,
+    ScopeCUDAStream& cudaStream,
+    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+    const uint32_t dispatchSize[3],
+    CUDAComputeUtil::Context& outContext)
 {
+    auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);
+
     auto& bindSet = outContext.m_bindSet;
     auto& bindRoot = outContext.m_bindRoot;
 
-    auto request = outputAndLayout.output.request;
-    auto reflection = (slang::ShaderReflection*) spGetReflection(request);
+    const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
+    if (index < 0)
+    {
+        return SLANG_FAIL;
+    }
+
+    const auto& kernelDesc = outputAndLayout.output.kernelDescs[index];
+
+    ScopeCUDAModule cudaModule;
+    SLANG_RETURN_ON_FAIL(cudaModule.load(kernelDesc.codeBegin));
 
     slang::EntryPointReflection* entryPoint = nullptr;
     auto entryPointCount = reflection->getEntryPointCount();
@@ -964,514 +988,611 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
     const char* entryPointName = entryPoint->getName();
 
     // Get the entry point
-    CUfunction kernel;
-    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName));
+    CUfunction cudaEntryPoint;
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&cudaEntryPoint, cudaModule, entryPointName));
 
-    // A default stream, will act as a global stream. Calling sync will globally sync
-    ScopeCUDAStream cudaStream;
-    //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
+    // Get the max threads per block for this function
 
-    {
-        // Okay now we need to set up binding
-        bindRoot.init(&bindSet, reflection, 0);
+    int maxTheadsPerBlock;
+    SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cudaEntryPoint));
+
+    int sharedSizeInBytes;
+    SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));
+
+    // Work out the args
+    CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
+    CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+
+    // NOTE! These are pointers to the cuda memory pointers
+    void* args[] = { &entryPointCUDAData , &uniformCUDAData };
+
+    SlangUInt numThreadsPerAxis[3];
+    entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
+
+    // Launch
+    auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
+        dispatchSize[0], dispatchSize[1], dispatchSize[2], 
+        int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
+        0,              // Shared memory size
+        cudaStream,     // Stream. 0 is no stream.
+        args,           // Args
+        nullptr);       // extra
+
+    SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
+
+    // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
+    SLANG_RETURN_ON_FAIL(cudaStream.sync());
+
+    return SLANG_OK;
+}
+
+#ifdef RENDER_TEST_OPTIX
+    /// Load kernel code and invoke a ray-tracing program
+    ///
+    /// Assumes that data for binding the kernel parameters is already
+    /// set up in `outContext.`
+    ///
+    /// Currently only works for programs that have a single
+    /// ray generation shader and no other entry points.
+    ///
+static SlangResult _loadAndInvokeRayTracingProgram(
+    CUcontext cudaContext,
+    ScopeCUDAStream& cudaStream,
+    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+    const uint32_t dispatchSize[3],
+    CUDAComputeUtil::Context& outContext)
+{
+    SLANG_OPTIX_RETURN_ON_FAIL(optixInit());
+
+    OptixDeviceContextOptions optixOptions = {};
 
-        // Will set up any root buffers
-        bindRoot.addDefaultValues();
+#if _DEBUG
+    optixOptions.logCallbackFunction = &_optixLogCallback;
+    optixOptions.logCallbackLevel = 4;
+#endif
+
+    OptixDeviceContext optixContext = nullptr;
+    SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));
 
-        // Now set up the Values from the test
+    enum { kOptixLogSize = 2*1024 };
+    char log[kOptixLogSize];
+    size_t logSize = sizeof(log);
 
-        auto outStream = StdWriters::getOut();
-        SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
+    OptixPipelineCompileOptions optixPipelineCompileOptions = {};
+    optixPipelineCompileOptions.pipelineLaunchParamsVariableName = "SLANG_globalParams";
 
-        ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
+    // We need to load modules from the PTX code available to us,
+    // and then also create program groups from the kernels
+    // in those modules.
+    //
+    // For now we will only support program groups with a single
+    // kernel in them, and will create one per entry point.
+    //
+    Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
+    List<OptixProgramGroup> optixProgramGroups;
+    List<String> names;
 
-        // First create all of the resources for the values
+    OptixShaderBindingTable optixSBT = {};
+
+    for( Index ee = 0; ee < entryPointCount; ++ee )
+    {
+        auto& kernel = outputAndLayout.output.kernelDescs[ee];
+
+        // TODO: The logic here assumes that each kernel will
+        // come from its own independent module, and has no
+        // provisiion for loading modules that might contain
+        // multiple entry points.
+        //
+        OptixModuleCompileOptions optixModuleCompileOptions = {};
+        OptixModule optixModule;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
+            optixContext,
+            &optixModuleCompileOptions,
+            &optixPipelineCompileOptions,
+            (char const*) kernel.codeBegin,
+            kernel.getCodeSize(),
+            log,
+            &logSize,
+            &optixModule));
+
+        // TODO: The logic here only handles ray-generation entry points.
+        //
+        // It would seem simple to extend this to handle other entry
+        // point types, by inspecting the stage of the entry points
+        // being loaded, and this is indeed true for the subset of
+        // stages that map one-to-one with OptiX "program groups."
+        //
+        // The sticking point is "hit groups" which require a collection
+        // of entry points to be specified together (insersection,
+        // any hit, and closest hit). A hit group can comprise between
+        // zero and three entry points.
+        //
+        // The catch for us is how to determine which entry points
+        // should be grouped to form hit groups. Should this be
+        // implied in the input code (either by naming convention
+        // or by new Slang language features)? Should this be set
+        // up via command-line arguments or something akin to
+        // `//TEST_INPUT` lines?
+
+        OptixProgramGroupOptions optixProgramGroupOptions = {};
+
+        OptixProgramGroupDesc optixProgramGroupDesc = {};
+        optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+        optixProgramGroupDesc.raygen.module = optixModule;
+
+        String name = String("__raygen__") + kernel.entryPointName;
+        names.add(name);
+        optixProgramGroupDesc.raygen.entryFunctionName = name.begin();
+
+        OptixProgramGroup optixProgramGroup = nullptr;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+            optixContext,
+            &optixProgramGroupDesc,
+            1,
+            &optixProgramGroupOptions,
+            log,
+            &logSize,
+            &optixProgramGroup));
+
+        optixProgramGroups.add(optixProgramGroup);
 
         {
-            const auto& values = bindSet.getValues();
-            const auto& entries = outputAndLayout.layout.entries;
+            CUdeviceptr rayGenRecordPtr;
+            size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
 
-            for (BindSet::Value* value : values)
-            {
-                auto typeLayout = value->m_type;
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));
+
+            struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
+            SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));
+
+            SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+                (void*) rayGenRecordPtr,
+                &rayGenRecordData,
+                rayGenRecordSize,
+                cudaMemcpyHostToDevice));
+
+            optixSBT.raygenRecord = rayGenRecordPtr;
+        }
+    }
+
+    OptixPipeline optixPipeline = nullptr;
+
+    OptixPipelineLinkOptions optixPipelineLinkOptions = {};
+    optixPipelineLinkOptions.maxTraceDepth = 5;
+    optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+    optixPipelineLinkOptions.overrideUsesMotionBlur = false;
+    SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
+        optixContext,
+        &optixPipelineCompileOptions,
+        &optixPipelineLinkOptions,
+        optixProgramGroups.getBuffer(),
+        (unsigned int)optixProgramGroups.getCount(),
+        log,
+        &logSize,
+        &optixPipeline));
+
+
+    {
+        // The OptiX API complains if we don't fill in a miss record
+        // in the SBT, so we will create a dummy one here to represent
+        // the lack of any miss shaders.
+        //
+        OptixProgramGroupOptions optixProgramGroupOptions = {};
+        OptixProgramGroupDesc missGroupDesc = {};
+        missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+        OptixProgramGroup missProgramGroup;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+            optixContext,
+            &missGroupDesc,
+            1,
+            &optixProgramGroupOptions,
+            log,
+            &logSize,
+            &missProgramGroup));
+
+
+        CUdeviceptr missRecordPtr;
+        size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
+
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));
+
+        struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));
+
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+            (void*) missRecordPtr,
+            &missRecordData,
+            missRecordSize,
+            cudaMemcpyHostToDevice));
+
+        optixSBT.missRecordBase = missRecordPtr;
+        optixSBT.missRecordCount = 1;
+        optixSBT.missRecordStrideInBytes = (unsigned int)missRecordSize;
+    }
+    {
+        // Okay, we also need a dummy hit group.
+
+        OptixProgramGroupOptions optixProgramGroupOptions = {};
+        OptixProgramGroupDesc hitGroupDesc = {};
+        hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+        OptixProgramGroup programGroup;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
+            optixContext,
+            &hitGroupDesc,
+            1,
+            &optixProgramGroupOptions,
+            log,
+            &logSize,
+            &programGroup));
+
+
+        CUdeviceptr recordPtr;
+        size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
+
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));
+
+        struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
+        SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));
+
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+            (void*) recordPtr,
+            &recordData,
+            recordSize,
+            cudaMemcpyHostToDevice));
+
+        optixSBT.hitgroupRecordBase = recordPtr;
+        optixSBT.hitgroupRecordCount = 1;
+        optixSBT.hitgroupRecordStrideInBytes = (unsigned int)recordSize;
+    }
+
+    // Work out the args
+
+    auto& bindRoot = outContext.m_bindRoot;
+
+    CUdeviceptr globalParams = 0;
+    size_t globalParamsSize;
+
+    if( auto globalArg = bindRoot.getRootValue() )
+    {
+        globalParams = MemoryCUDAResource::getCUDAData(globalArg);
+        globalParamsSize = globalArg->m_sizeInBytes;
+    }
+
+    // TODO: The data for entry point parameters needs to be stored
+    // into the SBT.
+    //
+    // The simplest solution here would be to copy data from the `bindRoot`
+    // into the SBT at the point where we are setting up the SBT, but
+    // a more optimized approach (more similar to what a real applicaiton
+    // would do) would be to allocate the SBT first and then have the
+    // binding logic write directly into its entries.
+    //
+    // One big complication here is that there need not necessarily be
+    // a one-to-one relationship between the entry points (or entry-point
+    // groups) in a compiled ray-tracing pipeline and the entries in
+    // the SBT. Each SBT entry is conceptually an "instance" of one
+    // of the entry-point groups in the program, and there can be
+    // zero, one, or many instances of a given group.
+    //
+    // Modelling this more completely in `render-test` requires that
+    // we start having a model for the "scene" that is being rendered,
+    // and how entry point groups are associated with the objects in
+    // that scene.
+    //
+    CUdeviceptr entryPointParams = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+
+    SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
+        optixPipeline,
+        cudaStream,
+        globalParams,
+        globalParamsSize,
+        &optixSBT,
+        dispatchSize[0],
+        dispatchSize[1],
+        dispatchSize[2]));
+
+    SLANG_RETURN_ON_FAIL(cudaStream.sync());
+
+    return SLANG_OK;
+}
+#endif
+
+    /// Fill in the binding information for arguments of a CUDA program.
+static SlangResult _setUpArguments(
+    CUcontext cudaContext,
+    ScopeCUDAStream& cudaStream,
+    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+    const uint32_t dispatchSize[3],
+    CUDAComputeUtil::Context& outContext)
+{
+    auto reflection = slang::ProgramLayout::get(outputAndLayout.output.request);
+
+    auto& bindSet = outContext.m_bindSet;
+    auto& bindRoot = outContext.m_bindRoot;
+
+    // Okay now we need to set up binding
+    bindRoot.init(&bindSet, reflection, 0);
+
+    // Will set up any root buffers
+    bindRoot.addDefaultValues();
+
+    // Now set up the Values from the test
+
+    auto outStream = StdWriters::getOut();
+    SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
+
+    ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
+
+    // First create all of the resources for the values
+
+    {
+        const auto& values = bindSet.getValues();
+        const auto& entries = outputAndLayout.layout.entries;
+
+        for (BindSet::Value* value : values)
+        {
+            auto typeLayout = value->m_type;
                
-                // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
-                slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
+            // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
+            slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
                
-                switch (kind)
+            switch (kind)
+            {
+                case slang::TypeReflection::Kind::ConstantBuffer:
+                case slang::TypeReflection::Kind::ParameterBlock:
                 {
-                    case slang::TypeReflection::Kind::ConstantBuffer:
-                    case slang::TypeReflection::Kind::ParameterBlock:
-                    {
-                        // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
-                        RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
-                        SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
-                        value->m_target = resource;
-                        break;
-                    }
-                    case slang::TypeReflection::Kind::Resource:
-                    {
-                        auto type = typeLayout->getType();
-                        auto shape = type->getResourceShape();
+                    // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
+                    RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
+                    SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+                    value->m_target = resource;
+                    break;
+                }
+                case slang::TypeReflection::Kind::Resource:
+                {
+                    auto type = typeLayout->getType();
+                    auto shape = type->getResourceShape();
 
-                        auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+                    auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
 
-                        switch (baseShape)
+                    switch (baseShape)
+                    {
+                        case SLANG_TEXTURE_1D: 
+                        case SLANG_TEXTURE_2D:
+                        case SLANG_TEXTURE_3D:
+                        case SLANG_TEXTURE_CUBE:
                         {
-                            case SLANG_TEXTURE_1D: 
-                            case SLANG_TEXTURE_2D:
-                            case SLANG_TEXTURE_3D:
-                            case SLANG_TEXTURE_CUBE:
-                            {
-                                RefPtr<CUDAResource> resource;
-                                SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
-                                value->m_target = resource;
-                                break;
-                            }
-                            case SLANG_TEXTURE_BUFFER:
-                            {
-                                // Need a CUDA impl for these...
-                                // For now we can just leave as target will just be nullptr
-                                break;
-                            }
+                            RefPtr<CUDAResource> resource;
+                            SLANG_RETURN_ON_FAIL(CUDAComputeUtil::createTextureResource(entries[value->m_userIndex], typeLayout, resource));
+                            value->m_target = resource;
+                            break;
+                        }
+                        case SLANG_TEXTURE_BUFFER:
+                        {
+                            // Need a CUDA impl for these...
+                            // For now we can just leave as target will just be nullptr
+                            break;
+                        }
 
-                            case SLANG_BYTE_ADDRESS_BUFFER:
-                            case SLANG_STRUCTURED_BUFFER:
-                            {
-                                // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
-                                RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
-                                SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
-                                value->m_target = resource;
-                                break;
-                            }
+                        case SLANG_BYTE_ADDRESS_BUFFER:
+                        case SLANG_STRUCTURED_BUFFER:
+                        {
+                            // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
+                            RefPtr<MemoryCUDAResource> resource = new MemoryCUDAResource;
+                            SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+                            value->m_target = resource;
+                            break;
                         }
                     }
-                    default: break;
                 }
+                default: break;
             }
         }
+    }
     
-        // Now we need to go through all of the bindings and set the appropriate data
+    // Now we need to go through all of the bindings and set the appropriate data
+
+    {
+        List<BindLocation> locations;
+        List<BindSet::Value*> values;
+        bindSet.getBindings(locations, values);
 
+        for (Index i = 0; i < locations.getCount(); ++i)
         {
-            List<BindLocation> locations;
-            List<BindSet::Value*> values;
-            bindSet.getBindings(locations, values);
+            const auto& location = locations[i];
+            BindSet::Value* value = values[i];
 
-            for (Index i = 0; i < locations.getCount(); ++i)
-            {
-                const auto& location = locations[i];
-                BindSet::Value* value = values[i];
+            // Okay now we need to set up the actual handles that CPU will follow.
+            auto typeLayout = location.getTypeLayout();
 
-                // Okay now we need to set up the actual handles that CPU will follow.
-                auto typeLayout = location.getTypeLayout();
+            const auto kind = typeLayout->getKind();
+            switch (kind)
+            {
+                case slang::TypeReflection::Kind::Array:
+                {
+                    auto elementCount = int(typeLayout->getElementCount());
+                    if (elementCount == 0)
+                    {
+                        CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
+                        auto resource = MemoryCUDAResource::asResource(value);
+                        if (resource)
+                        {
+                            array.data = resource->m_cudaMemory;
+                            array.count = value->m_elementCount;
+                        }
 
-                const auto kind = typeLayout->getKind();
-                switch (kind)
+                        location.setUniform(&array, sizeof(array));
+                    }
+                    break;
+                }
+                case slang::TypeReflection::Kind::ConstantBuffer:
+                case slang::TypeReflection::Kind::ParameterBlock:
+                {
+                    // These map down to just pointers
+                    *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
+                    break;
+                }
+                case slang::TypeReflection::Kind::Resource:
                 {
-                    case slang::TypeReflection::Kind::Array:
+                    auto type = typeLayout->getType();
+                    auto shape = type->getResourceShape();
+
+                    auto access = type->getResourceAccess();
+
+                    const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+
+                    switch (baseShape)
                     {
-                        auto elementCount = int(typeLayout->getElementCount());
-                        if (elementCount == 0)
+                        case SLANG_STRUCTURED_BUFFER:
                         {
-                            CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
+                            CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
                             auto resource = MemoryCUDAResource::asResource(value);
                             if (resource)
                             {
-                                array.data = resource->m_cudaMemory;
-                                array.count = value->m_elementCount;
+                                buffer.data = resource->m_cudaMemory;
+                                buffer.count = value->m_elementCount;
                             }
 
-                            location.setUniform(&array, sizeof(array));
+                            location.setUniform(&buffer, sizeof(buffer));
+                            break;
                         }
-                        break;
-                    }
-                    case slang::TypeReflection::Kind::ConstantBuffer:
-                    case slang::TypeReflection::Kind::ParameterBlock:
-                    {
-                        // These map down to just pointers
-                        *location.getUniform<CUdeviceptr>() = MemoryCUDAResource::getCUDAData(value);
-                        break;
-                    }
-                    case slang::TypeReflection::Kind::Resource:
-                    {
-                        auto type = typeLayout->getType();
-                        auto shape = type->getResourceShape();
-
-                        auto access = type->getResourceAccess();
-
-                        const auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
-
-                        switch (baseShape)
+                        case SLANG_BYTE_ADDRESS_BUFFER:
                         {
-                            case SLANG_STRUCTURED_BUFFER:
+                            CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
+
+                            auto resource = MemoryCUDAResource::asResource(value);
+                            if (resource)
                             {
-                                CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
-                                auto resource = MemoryCUDAResource::asResource(value);
-                                if (resource)
-                                {
-                                    buffer.data = resource->m_cudaMemory;
-                                    buffer.count = value->m_elementCount;
-                                }
-
-                                location.setUniform(&buffer, sizeof(buffer));
-                                break;
+                                buffer.data = resource->m_cudaMemory;
+                                buffer.sizeInBytes = value->m_sizeInBytes;
                             }
-                            case SLANG_BYTE_ADDRESS_BUFFER:
-                            {
-                                CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
-
-                                auto resource = MemoryCUDAResource::asResource(value);
-                                if (resource)
-                                {
-                                    buffer.data = resource->m_cudaMemory;
-                                    buffer.sizeInBytes = value->m_sizeInBytes;
-                                }
 
-                                location.setUniform(&buffer, sizeof(buffer));
-                                break;
+                            location.setUniform(&buffer, sizeof(buffer));
+                            break;
+                        }
+                        case SLANG_TEXTURE_1D:
+                        case SLANG_TEXTURE_2D:
+                        case SLANG_TEXTURE_3D:
+                        case SLANG_TEXTURE_CUBE:
+                        {
+                            if (_hasWriteAccess(access))
+                            {
+                                *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
                             }
-                            case SLANG_TEXTURE_1D:
-                            case SLANG_TEXTURE_2D:
-                            case SLANG_TEXTURE_3D:
-                            case SLANG_TEXTURE_CUBE:
+                            else
                             {
-                                if (_hasWriteAccess(access))
-                                {
-                                    *location.getUniform<CUsurfObject>() = TextureCUDAResource::getSurfObject(value);
-                                }
-                                else
-                                {
-                                    *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
-                                }
-                                break;
+                                *location.getUniform<CUtexObject>() = TextureCUDAResource::getTexObject(value);
                             }
-
+                            break;
                         }
-                        break;
+
                     }
-                    default: break;
+                    break;
                 }
+                default: break;
             }
         }
+    }
 
-        // Okay now the memory is all set up, we can copy everything over
+    // Okay now the memory is all set up, we can copy everything over
+    {
+        const auto& values = bindSet.getValues();
+        for (BindSet::Value* value : values)
         {
-            const auto& values = bindSet.getValues();
-            for (BindSet::Value* value : values)
+            CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
+            if (value && value->m_data && cudaMem)
             {
-                CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
-                if (value && value->m_data && cudaMem)
-                {
-                    // Okay copy the data over...
-                    SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
-                }
+                // Okay copy the data over...
+                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
             }
         }
+    }
 
-        // Now we can execute the kernel
-
-        {
-            // Get the max threads per block for this function
-
-            int maxTheadsPerBlock;
-            SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel));
-
-            int sharedSizeInBytes;
-            SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
-
-            // Work out the args
-            CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
-            CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
-
-            // NOTE! These are pointers to the cuda memory pointers
-            void* args[] = { &entryPointCUDAData , &uniformCUDAData };
-
-            SlangUInt numThreadsPerAxis[3];
-            entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
-
-            // Launch
-            auto cudaLaunchResult = cuLaunchKernel(kernel,
-                dispatchSize[0], dispatchSize[1], dispatchSize[2], 
-                int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
-                0,              // Shared memory size
-                cudaStream,     // Stream. 0 is no stream.
-                args,           // Args
-                nullptr);       // extra
-
-            SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
+    return SLANG_OK;
+}
 
-            // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
-            SLANG_RETURN_ON_FAIL(cudaStream.sync());
-        }
+    /// Read back any output arguments from a CUDA program.
+static SlangResult _readBackOutputs(
+    CUcontext cudaContext,
+    ScopeCUDAStream& cudaStream,
+    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+    const uint32_t dispatchSize[3],
+    CUDAComputeUtil::Context& outContext)
+{
+    const auto& entries = outputAndLayout.layout.entries;
 
-        // Finally we need to copy the data back
+    for (Index i = 0; i < entries.getCount(); ++i)
+    {
+        const auto& entry = entries[i];
+        BindSet::Value* value = outContext.m_buffers[i];
 
+        if (entry.isOutput)
         {
-            const auto& entries = outputAndLayout.layout.entries;
-
-            for (Index i = 0; i < entries.getCount(); ++i)
+            // Copy back to CPU memory
+            CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
+            if (value && value->m_data && cudaMem)
             {
-                const auto& entry = entries[i];
-                BindSet::Value* value = outContext.m_buffers[i];
-
-                if (entry.isOutput)
-                {
-                    // Copy back to CPU memory
-                   CUdeviceptr cudaMem = MemoryCUDAResource::getCUDAData(value);
-                    if (value && value->m_data && cudaMem)
-                    {
-                        // Okay copy the data back...
-                        SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
-                    }
-                }
+                // Okay copy the data back...
+                SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
             }
         }
     }
 
-    // Release all othe CUDA resource/allocations
-    bindSet.releaseValueTargets();
-
     return SLANG_OK;
 }
 
-/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
+    /// Load and invoke a CUDA program (either compute or ray-tracing)
+SlangResult _loadAndInvokeKernel(
+    CUcontext cudaContext,
+    ScopeCUDAStream& cudaStream,
+    const ShaderCompilerUtil::OutputAndLayout& outputAndLayout,
+    const uint32_t dispatchSize[3],
+    CUDAComputeUtil::Context& outContext)
 {
-    ScopeCUDAContext cudaContext;
-    SLANG_RETURN_ON_FAIL(cudaContext.init(0));
-
-
     switch( outputAndLayout.output.desc.pipelineType )
     {
     default:
         return SLANG_FAIL;
 
     case PipelineType::Compute:
-        {
-            const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
-            if (index < 0)
-            {
-                return SLANG_FAIL;
-            }
-
-            const auto& kernel = outputAndLayout.output.kernelDescs[index];
-
-            ScopeCUDAModule cudaModule;
-            SLANG_RETURN_ON_FAIL(cudaModule.load(kernel.codeBegin));
-            SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, dispatchSize, outContext));
-        }
-        break;
+        return _loadAndInvokeComputeProgram(cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
 
     case PipelineType::RayTracing:
-        {
-#ifdef RENDER_TEST_OPTIX
-            SLANG_OPTIX_RETURN_ON_FAIL(optixInit());
-
-            OptixDeviceContextOptions optixOptions = {};
-
-            // TODO: set log callback
-            optixOptions.logCallbackFunction = &_optixLogCallback;
-            optixOptions.logCallbackLevel = 4;
-
-            OptixDeviceContext optixContext = nullptr;
-            SLANG_OPTIX_RETURN_ON_FAIL(optixDeviceContextCreate(cudaContext, &optixOptions, &optixContext));
-
-            enum { kOptixLogSize = 2*1024 };
-            char log[kOptixLogSize];
-            size_t logSize = sizeof(log);
-
-            OptixPipelineCompileOptions optixPipelineCompileOptions = {};
-
-            // We need to load modules from the PTX code available to us,
-            // and then also create program groups from the kernels
-            // in those modules.
-            //
-            // For now we will only support program groups with a single
-            // kernel in them, and will create one per entry point.
-            //
-            Index entryPointCount = outputAndLayout.output.kernelDescs.getCount();
-            List<OptixProgramGroup> optixProgramGroups;
-            List<String> names;
-
-            OptixShaderBindingTable optixSBT = {};
-
-            for( Index ee = 0; ee < entryPointCount; ++ee )
-            {
-                auto& kernel = outputAndLayout.output.kernelDescs[ee];
-
-                OptixModuleCompileOptions optixModuleCompileOptions = {};
-
-                OptixModule optixModule;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixModuleCreateFromPTX(
-                    optixContext,
-                    &optixModuleCompileOptions,
-                    &optixPipelineCompileOptions,
-                    (char const*) kernel.codeBegin,
-                    kernel.getCodeSize(),
-                    log,
-                    &logSize,
-                    &optixModule));
-
-
-                OptixProgramGroupOptions optixProgramGroupOptions = {};
-
-                OptixProgramGroupDesc optixProgramGroupDesc = {};
-                optixProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
-                optixProgramGroupDesc.raygen.module = optixModule;
-
-                String name = String("__raygen__") + kernel.entryPointName;
-                names.add(name);
-                optixProgramGroupDesc.raygen.entryFunctionName = name.begin();
-
-                OptixProgramGroup optixProgramGroup = nullptr;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
-                    optixContext,
-                    &optixProgramGroupDesc,
-                    1,
-                    &optixProgramGroupOptions,
-                    log,
-                    &logSize,
-                    &optixProgramGroup));
-
-                optixProgramGroups.add(optixProgramGroup);
-
-                {
-                    CUdeviceptr rayGenRecordPtr;
-                    size_t rayGenRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
-                    SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &rayGenRecordPtr, rayGenRecordSize));
-
-                    struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } rayGenRecordData;
-                    SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(optixProgramGroup, &rayGenRecordData));
-
-                    SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-                        (void*) rayGenRecordPtr,
-                        &rayGenRecordData,
-                        rayGenRecordSize,
-                        cudaMemcpyHostToDevice));
-
-                    optixSBT.raygenRecord = rayGenRecordPtr;
-                }
-            }
-
-
-
-            OptixPipeline optixPipeline = nullptr;
-
-            OptixPipelineLinkOptions optixPipelineLinkOptions = {};
-            optixPipelineLinkOptions.maxTraceDepth = 5;
-            optixPipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
-            optixPipelineLinkOptions.overrideUsesMotionBlur = false;
-            SLANG_OPTIX_RETURN_ON_FAIL(optixPipelineCreate(
-                optixContext,
-                &optixPipelineCompileOptions,
-                &optixPipelineLinkOptions,
-                optixProgramGroups.getBuffer(),
-                (unsigned int)optixProgramGroups.getCount(),
-                log,
-                &logSize,
-                &optixPipeline));
-
+        return _loadAndInvokeRayTracingProgram(cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext);
+    }
+}
 
-            {
-                // The OptiX API complains if we don't fill in a miss record
-                // in the SBT, so we will create a dummy one here to represent
-                // the lack of any miss shaders.
-                //
-                OptixProgramGroupOptions optixProgramGroupOptions = {};
-                OptixProgramGroupDesc missGroupDesc = {};
-                missGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
-                OptixProgramGroup missProgramGroup;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
-                    optixContext,
-                    &missGroupDesc,
-                    1,
-                    &optixProgramGroupOptions,
-                    log,
-                    &logSize,
-                    &missProgramGroup));
-
-
-                CUdeviceptr missRecordPtr;
-                size_t missRecordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &missRecordPtr, missRecordSize));
-
-                struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } missRecordData;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(missProgramGroup, &missRecordData));
-
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-                    (void*) missRecordPtr,
-                    &missRecordData,
-                    missRecordSize,
-                    cudaMemcpyHostToDevice));
-
-                optixSBT.missRecordBase = missRecordPtr;
-                optixSBT.missRecordCount = 1;
-                optixSBT.missRecordStrideInBytes = missRecordSize;
-            }
-            {
-                // Okay, we also need a dummy hit group.
-
-                OptixProgramGroupOptions optixProgramGroupOptions = {};
-                OptixProgramGroupDesc hitGroupDesc = {};
-                hitGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
-                OptixProgramGroup programGroup;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixProgramGroupCreate(
-                    optixContext,
-                    &hitGroupDesc,
-                    1,
-                    &optixProgramGroupOptions,
-                    log,
-                    &logSize,
-                    &programGroup));
-
-
-                CUdeviceptr recordPtr;
-                size_t recordSize = OPTIX_SBT_RECORD_HEADER_SIZE;
-
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc((void**) &recordPtr, recordSize));
-
-                struct { char data[OPTIX_SBT_RECORD_HEADER_SIZE]; } recordData;
-                SLANG_OPTIX_RETURN_ON_FAIL(optixSbtRecordPackHeader(programGroup, &recordData));
-
-                SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
-                    (void*) recordPtr,
-                    &recordData,
-                    recordSize,
-                    cudaMemcpyHostToDevice));
-
-                optixSBT.hitgroupRecordBase = recordPtr;
-                optixSBT.hitgroupRecordCount = 1;
-                optixSBT.hitgroupRecordStrideInBytes = recordSize;
-            }
+    /// Execute a CUDA program (either compute or ray-tracing)
+    ///
+    /// This function handles loading code and argument data,
+    /// invoking the kernel(s), and reading back results.
+    ///
+/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
+{
+    ScopeCUDAContext cudaContext;
+    SLANG_RETURN_ON_FAIL(cudaContext.init(0));
 
-            ScopeCUDAStream cudaStream;
+    // A default stream, will act as a global stream. Calling sync will globally sync
+    ScopeCUDAStream cudaStream;
+    //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
 
-            CUdeviceptr globalParams = 0;
-            size_t globalParamsSize = 0;
+    auto& bindSet = outContext.m_bindSet;
+    auto& bindRoot = outContext.m_bindRoot;
 
-            unsigned int gridSizeX = 1;
-            unsigned int gridSizeY = 1;
-            unsigned int gridSizeZ = 1;
+    auto request = outputAndLayout.output.request;
+    auto reflection = (slang::ShaderReflection*) spGetReflection(request);
 
+    SLANG_RETURN_ON_FAIL(_setUpArguments(
+        cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
 
-            SLANG_OPTIX_RETURN_ON_FAIL(optixLaunch(
-                optixPipeline,
-                cudaStream,
-                globalParams,
-                globalParamsSize,
-                &optixSBT,
-                gridSizeX,
-                gridSizeY,
-                gridSizeZ));
+    SLANG_RETURN_ON_FAIL(_loadAndInvokeKernel(
+        cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
 
+        // Finally we need to copy the data back
+    SLANG_RETURN_ON_FAIL(_readBackOutputs(
+        cudaContext, cudaStream, outputAndLayout, dispatchSize, outContext));
 
-            SLANG_RETURN_ON_FAIL(cudaStream.sync());
-#endif
-        }
-        break;
-    }
+    // Release all othe CUDA resource/allocations
+    bindSet.releaseValueTargets();
 
     return SLANG_OK;
 }
author	Tim Foley <tfoleyNV@users.noreply.github.com>	2020-04-17 08:53:41 -0700
committer	GitHub <noreply@github.com>	2020-04-17 08:53:41 -0700
commit	acb1c39b4e29358cf496c07dc325e52f39be71f4 (patch)
tree	d76c44aded40d46cdb0d76af91112a1a3fc34d2f /tools/render-test
parent	12b30afb24ac03d69f091f18c25ed2bbefae1acd (diff)