summaryrefslogtreecommitdiffstats
path: root/tools/render-test
diff options
context:
space:
mode:
Diffstat (limited to 'tools/render-test')
-rw-r--r--tools/render-test/cuda/cuda-compute-util.cpp95
1 files changed, 84 insertions, 11 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index 304784518..5acddf94f 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -979,6 +979,22 @@ static SlangResult _loadAndInvokeComputeProgram(
ScopeCUDAModule cudaModule;
SLANG_RETURN_ON_FAIL(cudaModule.load(kernelDesc.codeBegin));
+ // The global-scope shader parameters in the input Slang program
+ // will be collected into a single `__constant__` global variable
+ // in the output CUDA module.
+ //
+ // We need to query the address of the `__constant__` variable
+ // so that we can copy parameter data into it when invoking
+ // a kernel.
+ //
+ // The Slang compiler always names this symbol `SLANG_globalParams`
+ // so that it is easy to look up independent of the module or
+ // entry point in question.
+ //
+ CUdeviceptr globalParamsSymbol = 0;
+ size_t globalParamsSymbolSize = 0;
+ cuModuleGetGlobal(&globalParamsSymbol, &globalParamsSymbolSize, cudaModule, "SLANG_globalParams");
+
slang::EntryPointReflection* entryPoint = nullptr;
auto entryPointCount = reflection->getEntryPointCount();
SLANG_ASSERT(entryPointCount == 1);
@@ -999,25 +1015,82 @@ static SlangResult _loadAndInvokeComputeProgram(
int sharedSizeInBytes;
SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cudaEntryPoint));
- // Work out the args
- CUdeviceptr uniformCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
- CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getEntryPointValue());
-
- // NOTE! These are pointers to the cuda memory pointers
- void* args[] = { &entryPointCUDAData , &uniformCUDAData };
-
+ // A single CUDA kernel can be invoked with thread groups
+ // of different shapes/sizes, but an HLSL/Slang compute
+ // kernel always has a fixed thread group shape baked in.
+ // We use reflection to query the thread-group size that
+ // the kernel expects, so that we can use the right size
+ // when invoking the kernel.
+ //
SlangUInt numThreadsPerAxis[3];
entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
- // Launch
+ // The argument data for the kernel has been set up in `bindRoot`,
+ // which encapsulates global buffers for both the global and
+ // entry-point parameter data.
+ //
+ // In the case of global parameters, we just need to extract the
+ // device address of the parameter data, so we can copy it into
+ // the `SLANG_globalParams` symbol.
+ //
+ {
+ CUdeviceptr globalParamsCUDAData = MemoryCUDAResource::getCUDAData(bindRoot.getRootValue());
+ cudaMemcpyAsync(
+ (void*) globalParamsSymbol,
+ (void*) globalParamsCUDAData,
+ globalParamsSymbolSize,
+ cudaMemcpyDeviceToDevice,
+ cudaStream);
+ }
+ //
+ // In the case of the entry-point parameters, we have to deal with
+ // two different wrinkles.
+ //
+ // First, the `bindRoot` will have the entry-point argument data
+ // stored in a GPU-memory buffer, but we actually need it to be
+ // in host CPU memory. We handle that for now by allocating a
+ // temporary host memory buffer (if needed) and copying the data
+ // from device to host.
+ //
+ auto entryPointBindValue = bindRoot.getEntryPointValue();
+ CUdeviceptr entryPointCUDAData = MemoryCUDAResource::getCUDAData(entryPointBindValue);
+ size_t entryPointDataSize = entryPointBindValue ? entryPointBindValue->m_sizeInBytes : 0;
+ void* entryPointHostData = nullptr;
+ if(entryPointDataSize)
+ {
+ entryPointHostData = alloca(entryPointDataSize);
+ cudaMemcpy(entryPointHostData, (void*)entryPointCUDAData, entryPointDataSize, cudaMemcpyDeviceToHost);
+ }
+ //
+ // Second, the argument data for the entry-point parameters has
+ // been allocated and filled in as a single buffer, but `cuLaunchKernel`
+ // defaults to taking pointers to each of the kernel arguments.
+ //
+ // We could loop over the entry-point parameters using the refleciton
+ // information, and set up a pointer to each using the offset stored
+ // for it in the reflection data. Such an approach would require
+ // us to create and fill in a dynamically-sized array here.
+ //
+ // Instead, we take advantage of a documented but seldom-used feature
+ // of `cuLaunchKernel` that allows the argument data for all of the
+ // kernel "launch parameters" to be specified as a single buffer.
+ //
+ void* extraOptions[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER, (void*) entryPointHostData,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &entryPointDataSize,
+ CU_LAUNCH_PARAM_END,
+ };
+
+ // Once we have all the decessary data extracted and/or
+ // set up, we can launch the kernel and see what happens.
+ //
auto cudaLaunchResult = cuLaunchKernel(cudaEntryPoint,
dispatchSize[0], dispatchSize[1], dispatchSize[2],
int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
0, // Shared memory size
cudaStream, // Stream. 0 is no stream.
- args, // Args
- nullptr); // extra
-
+ nullptr, // Not using traditional argument passing
+ extraOptions); // Instead passing kernel arguments via "extra" options
SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
// Do a sync here. Makes sure any issues are detected early and not on some implicit sync