8 files changed, 223 insertions, 83 deletions
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index ce3dc8957..4c6c89ef5 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -660,6 +660,11 @@ String CLikeSourceEmitter::scrubName(const String& name)
     return sb.ProduceString();
 }
 
+String CLikeSourceEmitter::generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor)
+{
+    return entryPointDecor->getName()->getStringSlice();
+}
+
 String CLikeSourceEmitter::generateName(IRInst* inst)
 {
     // If the instruction names something
@@ -686,7 +691,7 @@ String CLikeSourceEmitter::generateName(IRInst* inst)
             return "main";
         }
 
-        return entryPointDecor->getName()->getStringSlice();
+        return generateEntryPointNameImpl(entryPointDecor);
     }
 
     // If we have a name hint on the instruction, then we will try to use that
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index d813a819e..b89d5d1c4 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -156,6 +156,8 @@ public:
     String scrubName(const String& name);
 
     String generateName(IRInst* inst);
+    virtual String generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor);
+
     String getName(IRInst* inst);
 
     void emitDeclarator(IRDeclaratorInfo* declarator);    
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 6f24d5b74..acd913865 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -239,7 +239,15 @@ void CUDASourceEmitter::emitLayoutSemanticsImpl(IRInst* inst, char const* unifor
 
 void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniformParameterGroupType* type)
 {
-    Super::emitParameterGroupImpl(varDecl, type);
+    auto elementType = type->getElementType();
+
+    m_writer->emit("extern \"C\" __constant__ ");
+    emitType(elementType, "SLANG_globalParams");
+    m_writer->emit(";\n");
+
+    m_writer->emit("#define ");
+    m_writer->emit(getName(varDecl));
+    m_writer->emit(" (&SLANG_globalParams)\n");
 }
 
 void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor)
@@ -260,6 +268,59 @@ void CUDASourceEmitter::emitFunctionPreambleImpl(IRInst* inst)
     }
 }
 
+String CUDASourceEmitter::generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor)
+{
+    // We have an entry-point function in the IR module, which we
+    // will want to emit as a `__global__` function in the generated
+    // CUDA C++.
+    //
+    // The most common case will be a compute kernel, in which case
+    // we will emit the function more or less as-is, including
+    // usingits original name as the name of the global symbol.
+    //
+    String funcName = Super::generateEntryPointNameImpl(entryPointDecor);
+    String globalSymbolName = funcName;
+
+    // We also suport emitting ray tracing kernels for use with
+    // OptiX, and in that case the name of the global symbol
+    // must be prefixed to indicate to the OptiX runtime what
+    // stage it is to be compiled for.
+    //
+    auto stage = entryPointDecor->getProfile().getStage();
+    switch( stage )
+    {
+    default:
+        break;
+
+#define CASE(STAGE, PREFIX) \
+    case Stage::STAGE: globalSymbolName = #PREFIX + funcName; break
+
+    // Optix 7 Guide, Section 6.1 (Program input)
+    //
+    // > The input PTX should include one or more NVIDIA OptiX programs.
+    // > The type of program affects how the program can be used during
+    // > the execution of the pipeline. These program types are specified
+    // by prefixing the program’s name with the following:
+    //
+    // >    Program type        Function name prefix
+    CASE(   RayGeneration,      __raygen__);
+    CASE(   Intersection,       __intersection__);
+    CASE(   AnyHit,             __anyhit__);
+    CASE(   ClosestHit,         __closesthit__);
+    CASE(   Miss,               __miss__);
+    CASE(   Callable,           __direct_callable__);
+    //
+    // There are two stages (or "program types") supported by OptiX
+    // that Slang currently cannot target:
+    //
+    // CASE(ContinuationCallable,   __continuation_callable__);
+    // CASE(Exception,              __exception__);
+    //
+#undef CASE
+    }
+
+    return globalSymbolName;
+}
 
 void CUDASourceEmitter::emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec)
 {
@@ -642,6 +703,24 @@ void CUDASourceEmitter::emitPreprocessorDirectivesImpl()
     }
 }
 
+bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType)
+{
+    // A global shader parameter in the IR for CUDA output will
+    // either be the unique constant buffer that wraps all the
+    // global-scope parameters in the original code (which is
+    // handled as a special-case before this routine would be
+    // called), or it is one of the system-defined varying inputs
+    // like `threadIdx`. We won't need to emit anything in the
+    // output code for the latter case, so we need to emit
+    // nothing here and return `true` so that the base class
+    // uses our logic instead of the default.
+    //
+    SLANG_UNUSED(varDecl);
+    SLANG_UNUSED(varType);
+    return true;
+}
+
+
 void CUDASourceEmitter::emitModuleImpl(IRModule* module)
 {
     // Setup all built in types used in the module
@@ -660,51 +739,7 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module)
 
     // TODO(JS): We may need to generate types (for example for matrices)
 
-    // TODO(JS): We need to determine which functions we need to inline
-
-    // The IR will usually come in an order that respects
-    // dependencies between global declarations, but this
-    // isn't guaranteed, so we need to be careful about
-    // the order in which we emit things.
-
-    List<EmitAction> actions;
-
-    computeEmitActions(module, actions);
-
-
-    _emitForwardDeclarations(actions);
-
-    // Output group shared variables
-
-    {
-        for (auto action : actions)
-        {
-            if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && as<IRGroupSharedRate>(action.inst->getRate()))
-            {
-                emitGlobalInst(action.inst);   
-            }
-        }
-    }
-
-    {
-        // Output all the thread locals 
-        for (auto action : actions)
-        {
-            if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && !as<IRGroupSharedRate>(action.inst->getRate()))
-            {
-                emitGlobalInst(action.inst);
-            }
-        }
-
-        // Finally output the functions as methods on the context
-        for (auto action : actions)
-        {
-            if (action.level == EmitAction::Level::Definition && as<IRFunc>(action.inst))
-            {
-                emitGlobalInst(action.inst);
-            }
-        }
-    }
+    CLikeSourceEmitter::emitModuleImpl(module);
 }
 
 
diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h
index ae78a5e57..156d5fab1 100644
--- a/source/slang/slang-emit-cuda.h
+++ b/source/slang/slang-emit-cuda.h
@@ -57,12 +57,13 @@ protected:
     virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE;
     virtual void emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE;
     virtual void emitFunctionPreambleImpl(IRInst* inst) SLANG_OVERRIDE;
+    virtual String generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor) SLANG_OVERRIDE;
 
     virtual void emitLoopControlDecorationImpl(IRLoopControlDecoration* decl) SLANG_OVERRIDE;
 
     virtual void handleCallExprDecorationsImpl(IRInst* funcValue) SLANG_OVERRIDE;
 
-    //virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE;
+    virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE;
     virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE;
 
     virtual void emitPreprocessorDirectivesImpl() SLANG_OVERRIDE;
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 7d8a4074e..af01e4b0d 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -242,20 +242,19 @@ Result linkAndOptimizeIR(
         CollectEntryPointUniformParamsOptions passOptions;
         switch( target )
         {
-        default:
+        case CodeGenTarget::CUDASource:
             break;
 
         case CodeGenTarget::CPPSource:
-        case CodeGenTarget::CUDASource:
             passOptions.alwaysCreateCollectedParam = true;
+        default:
+            collectEntryPointUniformParams(irModule, passOptions);
+        #if 0
+            dumpIRIfEnabled(compileRequest, irModule, "ENTRY POINT UNIFORMS COLLECTED");
+        #endif
+            validateIRModuleIfEnabled(compileRequest, irModule);
             break;
         }
-
-        collectEntryPointUniformParams(irModule, passOptions);
-    #if 0
-        dumpIRIfEnabled(compileRequest, irModule, "ENTRY POINT UNIFORMS COLLECTED");
-    #endif
-        validateIRModuleIfEnabled(compileRequest, irModule);
     }
 
     switch( target )
@@ -637,7 +636,10 @@ Result linkAndOptimizeIR(
     case CodeGenTarget::CUDASource:
         moveGlobalVarInitializationToEntryPoints(irModule);
         introduceExplicitGlobalContext(irModule, target);
-        convertEntryPointPtrParamsToRawPtrs(irModule);
+        if(target == CodeGenTarget::CPPSource)
+        {
+            convertEntryPointPtrParamsToRawPtrs(irModule);
+        }
     #if 0
         dumpIRIfEnabled(compileRequest, irModule, "EXPLICIT GLOBAL CONTEXT INTRODUCED");
     #endif
diff --git a/source/slang/slang-ir-explicit-global-context.cpp b/source/slang/slang-ir-explicit-global-context.cpp
index 8f11bce2c..32efd51e8 100644
--- a/source/slang/slang-ir-explicit-global-context.cpp
+++ b/source/slang/slang-ir-explicit-global-context.cpp
@@ -31,17 +31,6 @@ struct IntroduceExplicitGlobalContextPass
 
         IRBuilder builder(&sharedBuilder);
 
-        // The global context will be represneted by a `struct`
-        // type with a name hint of `KernelContext`.
-        //
-        m_contextStructType = builder.createStructType();
-        builder.addNameHintDecoration(m_contextStructType, UnownedTerminatedStringSlice("KernelContext"));
-
-        // The context will usually be passed around by pointer,
-        // so we get and cache that pointer type up front.
-        //
-        m_contextStructPtrType = builder.getPtrType(m_contextStructType);
-
         // The transformation we will perform will need to affect
         // global variables, global shader parameters, and entry-point
         // function (at the very least), and we start with an explicit
@@ -107,7 +96,13 @@ struct IntroduceExplicitGlobalContextPass
                     // Note: If we ever changed out mind about the representation
                     // and wanted to support multiple global parameters, we could
                     // easily generalize this code to work with a list.
-                    //
+
+                    // For CUDA output, we want to leave the global uniform
+                    // parameter where it is, because it will translate to
+                    // a global `__constant__` variable.
+                    if(m_target == CodeGenTarget::CUDASource)
+                        continue;
+
                     SLANG_ASSERT(!m_globalUniformsParam);
                     m_globalUniformsParam = globalParam;
                 }
@@ -132,9 +127,36 @@ struct IntroduceExplicitGlobalContextPass
             }
         }
 
+        // If there are no global-scope entities that require processing,
+        // then we can completely skip the work of this pass for CUDA.
+        //
+        // Note: We cannot skip the rest of the pass for CPU, because
+        // it is responsible for introducing the explicit entry-point
+        // parameter that is used for passing in the global param(s).
+        //
+        if( m_target == CodeGenTarget::CUDASource )
+        {
+            if( !m_globalUniformsParam && (m_globalVars.getCount() == 0) )
+            {
+                return;
+            }
+        }
+
         // Now that we've capture all the relevant global entities from the IR,
         // we can being to transform them in an appropriate order.
         //
+        // The global context will be represneted by a `struct`
+        // type with a name hint of `KernelContext`.
+        //
+        m_contextStructType = builder.createStructType();
+        builder.addNameHintDecoration(m_contextStructType, UnownedTerminatedStringSlice("KernelContext"));
+
+        // The context will usually be passed around by pointer,
+        // so we get and cache that pointer type up front.
+        //
+        m_contextStructPtrType = builder.getPtrType(m_contextStructType);
+
+
         // The first step will be to create fields in the `KernelContext`
         // type to represent any global parameters or global variables.
         //
@@ -270,9 +292,9 @@ struct IntroduceExplicitGlobalContextPass
             //
             globalUniformsParam->insertBefore(firstOrdinary);
         }
-        else
+        else if(m_target == CodeGenTarget::CPPSource)
         {
-            // The nature of our current ABI for entry points on CPU/CUDA
+            // The nature of our current ABI for entry points on CPU
             // means that we need an explicit parameter to be *declared*
             // for the global uniforms, even if it is never used.
             //
diff --git a/source/slang/slang.vcxproj b/source/slang/slang.vcxproj
index f20a4a322..aaece7095 100644
--- a/source/slang/slang.vcxproj
+++ b/source/slang/slang.vcxproj
@@ -404,4 +404,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project> 
-\ No newline at end of file
+</Project>
+\ No newline at end of file
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index 304784518..5acddf94f 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -979,6 +979,22 @@ static SlangResult _loadAndInvokeComputeProgram(
     ScopeCUDAModule cudaModule;
     SLANG_RETURN_ON_FAIL(cudaModule.load(kernelDesc.codeBegin));
 
+    // The global-scope shader parameters in the input Slang program
+    // will be collected into a single `__constant__` global variable
+    // in the output CUDA module.
+    //
+    // We need to query the address of the `__constant__` variable
+    // so that we can copy parameter data into it when invoking
+    // a kernel.
+    //
+    // The Slang compiler always names this symbol `SLANG_globalParams`
+    // so that it is easy to look up independent of the module or
+    // entry point in question.
+    //
+    CUdeviceptr globalParamsSymbol = 0;
+    size_t globalParamsSymbolSize = 0;
+    cuModuleGetGlobal(&globalParamsSymbol, &globalParamsSymbolSize, cudaModule, "SLANG_globalParams");
+
     slang::EntryPointReflection* entryPoint = nullptr;
     auto entryPointCount = reflection->getEntryPointCount();
     SLANG_ASSERT(entryPointCount == 1);
@@ -999,25 +1015,82 @@ static SlangResult _loadAndInvokeComputeProgram(
     int sharedSizeInBytes;
     SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));
 
-    // Work out the args
-    CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
-    CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
-
-    // NOTE! These are pointers to the cuda memory pointers
-    void* args[] = { &entryPointCUDAData , &uniformCUDAData };
-
+    // A single CUDA kernel can be invoked with thread groups
+    // of different shapes/sizes, but an HLSL/Slang compute
+    // kernel always has a fixed thread group shape baked in.
+    // We use reflection to query the thread-group size that
+    // the kernel expects, so that we can use the right size
+    // when invoking the kernel.
+    //
     SlangUInt numThreadsPerAxis[3];
     entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
 
-    // Launch
+    // The argument data for the kernel has been set up in `bindRoot`,
+    // which encapsulates global buffers for both the global and
+    // entry-point parameter data.
+    //
+    // In the case of global parameters, we just need to extract the
+    // device address of the parameter data, so we can copy it into
+    // the `SLANG_globalParams` symbol.
+    //
+    {
+        CUdeviceptr globalParamsCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
+        cudaMemcpyAsync(
+            (void*) globalParamsSymbol,
+            (void*) globalParamsCUDAData,
+            globalParamsSymbolSize,
+            cudaMemcpyDeviceToDevice,
+            cudaStream);
+    }
+    //
+    // In the case of the entry-point parameters, we have to deal with
+    // two different wrinkles.
+    //
+    // First, the `bindRoot` will have the entry-point argument data
+    // stored in a GPU-memory buffer, but we actually need it to be
+    // in host CPU memory. We handle that for now by allocating a
+    // temporary host memory buffer (if needed) and copying the data
+    // from device to host.
+    //
+    auto entryPointBindValue = bindRoot.getEntryPointValue();
+    CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(entryPointBindValue);
+    size_t entryPointDataSize = entryPointBindValue ? entryPointBindValue->m_sizeInBytes : 0;
+    void* entryPointHostData = nullptr;
+    if(entryPointDataSize)
+    {
+        entryPointHostData = alloca(entryPointDataSize);
+        cudaMemcpy(entryPointHostData, (void*)entryPointCUDAData, entryPointDataSize, cudaMemcpyDeviceToHost);
+    }
+    //
+    // Second, the argument data for the entry-point parameters has
+    // been allocated and filled in as a single buffer, but `cuLaunchKernel`
+    // defaults to taking pointers to each of the kernel arguments.
+    //
+    // We could loop over the entry-point parameters using the refleciton
+    // information, and set up a pointer to each using the offset stored
+    // for it in the reflection data. Such an approach would require
+    // us to create and fill in a dynamically-sized array here.
+    //
+    // Instead, we take advantage of a documented but seldom-used feature
+    // of `cuLaunchKernel` that allows the argument data for all of the
+    // kernel "launch parameters" to be specified as a single buffer.
+    //
+    void* extraOptions[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER, (void*) entryPointHostData,
+        CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize,
+        CU_LAUNCH_PARAM_END,
+    };
+
+    // Once we have all the decessary data extracted and/or
+    // set up, we can launch the kernel and see what happens.
+    //
     auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
         dispatchSize[0], dispatchSize[1], dispatchSize[2], 
         int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
         0,              // Shared memory size
         cudaStream,     // Stream. 0 is no stream.
-        args,           // Args
-        nullptr);       // extra
-
+        nullptr,        // Not using traditional argument passing
+        extraOptions);  // Instead passing kernel arguments via "extra" options
     SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
 
     // Do a sync here. Makes sure any issues are detected early and not on some implicit sync