diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2019-09-17 12:25:45 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-09-17 12:25:45 -0400 |
| commit | 3af404da7f7f125464b78159940cb3fc06e69cc5 (patch) | |
| tree | d1640fc1ac08be8a15420a8603eba991833a1792 /tools/render-test | |
| parent | 3758a50dae81973b00541f2a151e3ee9cd2d1645 (diff) | |
CPU ABI improvements (#1056)
* WIP: Improving CPU performance/ABI
* Optionally output code on CPU for groupThreadID and groupID.
* Added ability to set compute dispatch size on command line for render-test.
Dispatch compute tests taking into account dispatch size.
Added test that semantics are working.
* Test using GroupRange.
* Fix problem with adding \n for external diagnostic - only add it if there isn't a \n at the end. Change the output order (put result before) so the last value is the diagnostic string.
Diffstat (limited to 'tools/render-test')
| -rw-r--r-- | tools/render-test/cpu-compute-util.cpp | 75 | ||||
| -rw-r--r-- | tools/render-test/cpu-compute-util.h | 2 | ||||
| -rw-r--r-- | tools/render-test/options.cpp | 28 | ||||
| -rw-r--r-- | tools/render-test/options.h | 2 | ||||
| -rw-r--r-- | tools/render-test/render-test-main.cpp | 4 |
5 files changed, 92 insertions, 19 deletions
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp index 4294ad539..1b1adef82 100644 --- a/tools/render-test/cpu-compute-util.cpp +++ b/tools/render-test/cpu-compute-util.cpp @@ -301,7 +301,7 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) return SLANG_OK; } -/* static */SlangResult CPUComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context) +/* static */SlangResult CPUComputeUtil::execute(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context) { auto request = compilationAndLayout.output.request; auto reflection = (slang::ShaderReflection*) spGetReflection(request); @@ -313,10 +313,12 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) struct UniformState; typedef void(*Func)(CPPPrelude::ComputeVaryingInput* varyingInput, CPPPrelude::UniformEntryPointParams* uniformEntryPointParams, UniformState* uniformState); + typedef void(*GroupRangeFunc)(CPPPrelude::GroupComputeVaryingInput* varyingInput, CPPPrelude::UniformEntryPointParams* uniformEntryPointParams, UniformState* uniformState); slang::EntryPointReflection* entryPoint = nullptr; Func func = nullptr; Func groupFunc = nullptr; + GroupRangeFunc groupRangeFunc = nullptr; { auto entryPointCount = reflection->getEntryPointCount(); SLANG_ASSERT(entryPointCount == 1); @@ -326,27 +328,58 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) const char* entryPointName = entryPoint->getName(); func = (Func)sharedLibrary->findFuncByName(entryPointName); - StringBuilder groupEntryPointName; - groupEntryPointName << entryPointName << "_Group"; + { + StringBuilder groupEntryPointName; + groupEntryPointName << entryPointName << "_Group"; + + groupFunc = (Func)sharedLibrary->findFuncByName(groupEntryPointName.getBuffer()); + } - groupFunc = (Func)sharedLibrary->findFuncByName(groupEntryPointName.getBuffer()); + { + StringBuilder 
groupRangeEntryPointName; + groupRangeEntryPointName << entryPointName << "_GroupRange"; + + groupRangeFunc = (GroupRangeFunc)sharedLibrary->findFuncByName(groupRangeEntryPointName.getBuffer()); + } - if (func == nullptr && groupFunc == nullptr) + if (func == nullptr && groupFunc == nullptr && groupRangeFunc == nullptr) { return SLANG_FAIL; } } // If we have the group function, that's the faster way to execute all threads in group... - if (groupFunc) + if (groupRangeFunc) { UniformState* uniformState = (UniformState*)context.binding.m_rootBuffer.m_data; CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; + CPPPrelude::GroupComputeVaryingInput varying; + varying.startGroupID = {}; + varying.endGroupID = { dispatchSize[0], dispatchSize[1], dispatchSize[2] }; + + groupRangeFunc(&varying, uniformEntryPointParams, uniformState); + } + else if (groupFunc) + { CPPPrelude::ComputeVaryingInput varying; - varying.groupID = {}; - groupFunc(&varying, uniformEntryPointParams, uniformState); + for (uint32_t groupZ = 0; groupZ < dispatchSize[2]; ++groupZ) + { + for (uint32_t groupY = 0; groupY < dispatchSize[1]; ++groupY) + { + for (uint32_t groupX = 0; groupX < dispatchSize[0]; ++groupX) + { + UniformState* uniformState = (UniformState*)context.binding.m_rootBuffer.m_data; + CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; + + varying.groupID = {groupX, groupY, groupZ}; + + groupFunc(&varying, uniformEntryPointParams, uniformState); + } + } + } + } else { @@ -359,19 +392,29 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; CPPPrelude::ComputeVaryingInput varying; - varying.groupID = {}; - for (int z = 0; z < int(numThreadsPerAxis[2]); 
++z) + for (uint32_t groupZ = 0; groupZ < dispatchSize[2]; ++groupZ) { - varying.groupThreadID.z = z; - for (int y = 0; y < int(numThreadsPerAxis[1]); ++y) + for (uint32_t groupY = 0; groupY < dispatchSize[1]; ++groupY) { - varying.groupThreadID.y = y; - for (int x = 0; x < int(numThreadsPerAxis[0]); ++x) + for (uint32_t groupX = 0; groupX < dispatchSize[0]; ++groupX) { - varying.groupThreadID.x = x; + varying.groupID = {groupX, groupY, groupZ}; - func(&varying, uniformEntryPointParams, uniformState); + for (int z = 0; z < int(numThreadsPerAxis[2]); ++z) + { + varying.groupThreadID.z = z; + for (int y = 0; y < int(numThreadsPerAxis[1]); ++y) + { + varying.groupThreadID.y = y; + for (int x = 0; x < int(numThreadsPerAxis[0]); ++x) + { + varying.groupThreadID.x = x; + + func(&varying, uniformEntryPointParams, uniformState); + } + } + } } } } diff --git a/tools/render-test/cpu-compute-util.h b/tools/render-test/cpu-compute-util.h index cbc4e6e58..b30ef146b 100644 --- a/tools/render-test/cpu-compute-util.h +++ b/tools/render-test/cpu-compute-util.h @@ -29,7 +29,7 @@ struct CPUComputeUtil static SlangResult calcBindings(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext); - static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext); + static SlangResult execute(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext); static SlangResult writeBindings(const ShaderInputLayout& layout, const List<CPUMemoryBinding::Buffer>& buffers, const Slang::String& fileName); }; diff --git a/tools/render-test/options.cpp b/tools/render-test/options.cpp index 1cf0ffbe8..d2f21a5d9 100644 --- a/tools/render-test/options.cpp +++ b/tools/render-test/options.cpp @@ -179,6 +179,34 @@ SlangResult parseOptions(int argc, const char*const* argv, Slang::WriterHelper s gOptions.adapter = *argCursor++; } + else if (strcmp(arg, "-compute-dispatch") == 0) 
+ { + if (argCursor == argEnd) + { + stdError.print("error: comma separated compute dispatch size for '%s'\n", arg); + return SLANG_FAIL; + } + List<UnownedStringSlice> slices; + StringUtil::split(UnownedStringSlice(*argCursor++), ',', slices); + if (slices.getCount() != 3) + { + stdError.print("error: expected 3 comma separated integers for compute dispatch size for '%s'\n", arg); + return SLANG_FAIL; + } + + String string; + for (Index i = 0; i < 3; ++i) + { + string = slices[i]; + int v = StringToInt(string); + if (v < 1) + { + stdError.print("error: expected 3 comma positive integers for compute dispatch size for '%s'\n", arg); + return SLANG_FAIL; + } + gOptions.computeDispatchSize[i] = v; + } + } else { // Lookup diff --git a/tools/render-test/options.h b/tools/render-test/options.h index a57c94ed0..67eae6603 100644 --- a/tools/render-test/options.h +++ b/tools/render-test/options.h @@ -64,6 +64,8 @@ struct Options Slang::List<Slang::CommandLine::Arg> compileArgs; Slang::String adapter; ///< The adapter to use either name or index + + uint32_t computeDispatchSize[3] = { 1, 1, 1 }; }; extern Options gOptions; diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp index 0e457f9e4..2a0b9a6c9 100644 --- a/tools/render-test/render-test-main.cpp +++ b/tools/render-test/render-test-main.cpp @@ -232,7 +232,7 @@ void RenderTestApp::runCompute() auto pipelineType = PipelineType::Compute; m_renderer->setPipelineState(pipelineType, m_pipelineState); m_bindingState->apply(m_renderer, pipelineType); - m_renderer->dispatchCompute(1, 1, 1); + m_renderer->dispatchCompute(m_options.computeDispatchSize[0], m_options.computeDispatchSize[1], m_options.computeDispatchSize[2]); } void RenderTestApp::finalize() @@ -461,7 +461,7 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe CPUComputeUtil::Context context; SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcBindings(compilationAndLayout, context)); - 
SLANG_RETURN_ON_FAIL(CPUComputeUtil::execute(compilationAndLayout, context)); + SLANG_RETURN_ON_FAIL(CPUComputeUtil::execute(gOptions.computeDispatchSize, compilationAndLayout, context)); // Dump everything out that was written return CPUComputeUtil::writeBindings(compilationAndLayout.layout, context.buffers, gOptions.outputPath); |
