summaryrefslogtreecommitdiffstats
path: root/tools/render-test/cuda/cuda-compute-util.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-02-08 11:19:31 -0500
committerGitHub <noreply@github.com>2020-02-08 11:19:31 -0500
commit0eed0125fa5e5f425d546efdc2b284b09ffc2785 (patch)
tree4ded20c4966f05881a056fad8146e34ac595063e /tools/render-test/cuda/cuda-compute-util.cpp
parent7de90c1e0b42b565a5f46e2f9f7580e1f577d414 (diff)
Fixes to make all CPU compute shaders work on CUDA (#1211)
* Launch CUDA test taking into account dispatch size. * Enable isCPUOnly hack to work on CUDA. * Rename 'isCPUOnly' hack to 'onlyCPULikeBinding'. * Add $T special type. Support SampleLevel on CUDA. * Fix typo.
Diffstat (limited to 'tools/render-test/cuda/cuda-compute-util.cpp')
-rw-r--r--tools/render-test/cuda/cuda-compute-util.cpp11
1 files changed, 4 insertions, 7 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index a50295063..c6862d2d3 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -347,7 +347,7 @@ public:
return SLANG_SUCCEEDED(context.init(0));
}
-static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, CUDAComputeUtil::Context& outContext)
+static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], CUDAComputeUtil::Context& outContext)
{
auto& bindSet = outContext.m_bindSet;
auto& bindRoot = outContext.m_bindRoot;
@@ -680,11 +680,8 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
// Launch
- // TODO(JS): We probably want to do something a little more clever here using the maxThreadsPerBlock,
- // but for now just launch a single block, and hope it all fits.
-
auto cudaLaunchResult = cuLaunchKernel(kernel,
- 1, 1, 1, // Blocks
+ dispatchSize[0], dispatchSize[1], dispatchSize[2],
int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
0, // Shared memory size
cudaStream, // Stream. 0 is no stream.
@@ -727,7 +724,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
return SLANG_OK;
}
-/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext)
+/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, const uint32_t dispatchSize[3], Context& outContext)
{
ScopeCUDAContext cudaContext;
SLANG_RETURN_ON_FAIL(cudaContext.init(0));
@@ -742,7 +739,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
ScopeCUDAModule cudaModule;
SLANG_RETURN_ON_FAIL(cudaModule.load(kernel.codeBegin));
- SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, outContext));
+ SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, dispatchSize, outContext));
return SLANG_OK;
}