diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2019-09-18 11:40:59 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-09-18 11:40:59 -0400 |
| commit | 31c7abcc27a33d63ac8d335387a0ce7b3ad74954 (patch) | |
| tree | 3b4254df7bdbf8b497aa8a3e5f08f8927c1afbc6 /tools/render-test | |
| parent | 3af404da7f7f125464b78159940cb3fc06e69cc5 (diff) | |
Improvements to testing and ABI for CPU (#1057)
* WIP: Improving CPU performance/ABI
* Optionally output code on CPU for groupThreadID and groupID.
* Added ability to set compute dispatch size on command line for render-test.
Dispatch compute tests taking into account dispatch size.
Added test that semantics are working.
* Test using GroupRange.
* Fix problem with adding \n for external diagnostic - only do it if there isn't a \n at the end. Change the output order (put result before) so last value is diagnostic string.
* Made GroupRange the default exposed CPU ABI entry point style.
Removed CPU_EXECUTE test style - as tested via the now cross-platform render-test
* Split out execution from setup for execution to improve perf.
* For better code coverage/testing test all styles of CPU compute entry point.
* Improve documentation for ABI changes for CPU code.
Add 'expecting' to error message from review.
* Fix small typos.
Diffstat (limited to 'tools/render-test')
| -rw-r--r-- | tools/render-test/cpu-compute-util.cpp | 223 | ||||
| -rw-r--r-- | tools/render-test/cpu-compute-util.h | 29 | ||||
| -rw-r--r-- | tools/render-test/options.cpp | 2 | ||||
| -rw-r--r-- | tools/render-test/render-test-main.cpp | 23 |
4 files changed, 205 insertions, 72 deletions
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp index 1b1adef82..81325ce80 100644 --- a/tools/render-test/cpu-compute-util.cpp +++ b/tools/render-test/cpu-compute-util.cpp @@ -301,127 +301,220 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) return SLANG_OK; } -/* static */SlangResult CPUComputeUtil::execute(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context) +/* static */SlangResult CPUComputeUtil::calcExecuteInfo(ExecuteStyle style, const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context, ExecuteInfo& out) { auto request = compilationAndLayout.output.request; auto reflection = (slang::ShaderReflection*) spGetReflection(request); + slang::EntryPointReflection* entryPoint = nullptr; + auto entryPointCount = reflection->getEntryPointCount(); + SLANG_ASSERT(entryPointCount == 1); + + entryPoint = reflection->getEntryPointByIndex(0); + + const char* entryPointName = entryPoint->getName(); + ComPtr<ISlangSharedLibrary> sharedLibrary; SLANG_RETURN_ON_FAIL(spGetEntryPointHostCallable(request, 0, 0, sharedLibrary.writeRef())); - // Use reflection to find the entry point name - - struct UniformState; - typedef void(*Func)(CPPPrelude::ComputeVaryingInput* varyingInput, CPPPrelude::UniformEntryPointParams* uniformEntryPointParams, UniformState* uniformState); - typedef void(*GroupRangeFunc)(CPPPrelude::GroupComputeVaryingInput* varyingInput, CPPPrelude::UniformEntryPointParams* uniformEntryPointParams, UniformState* uniformState); - - slang::EntryPointReflection* entryPoint = nullptr; - Func func = nullptr; - Func groupFunc = nullptr; - GroupRangeFunc groupRangeFunc = nullptr; + // Copy dispatch size + for (int i = 0; i < 3; ++i) { - auto entryPointCount = reflection->getEntryPointCount(); - SLANG_ASSERT(entryPointCount == 1); - - entryPoint = reflection->getEntryPointByIndex(0); + 
out.m_dispatchSize[i] = dispatchSize[i]; + } - const char* entryPointName = entryPoint->getName(); - func = (Func)sharedLibrary->findFuncByName(entryPointName); + out.m_style = style; + out.m_uniformState = (void*)context.binding.m_rootBuffer.m_data; + out.m_uniformEntryPointParams = (void*)context.binding.m_entryPointBuffer.m_data; + switch (style) + { + case ExecuteStyle::Group: { StringBuilder groupEntryPointName; groupEntryPointName << entryPointName << "_Group"; - groupFunc = (Func)sharedLibrary->findFuncByName(groupEntryPointName.getBuffer()); - } + CPPPrelude::ComputeFunc groupFunc = (CPPPrelude::ComputeFunc)sharedLibrary->findFuncByName(groupEntryPointName.getBuffer()); + if (!groupFunc) + { + return SLANG_FAIL; + } + out.m_func = (ExecuteInfo::Func)groupFunc; + break; + } + case ExecuteStyle::GroupRange: { - StringBuilder groupRangeEntryPointName; - groupRangeEntryPointName << entryPointName << "_GroupRange"; - - groupRangeFunc = (GroupRangeFunc)sharedLibrary->findFuncByName(groupRangeEntryPointName.getBuffer()); + CPPPrelude::ComputeFunc groupRangeFunc = nullptr; + groupRangeFunc = (CPPPrelude::ComputeFunc)sharedLibrary->findFuncByName(entryPointName); + if (!groupRangeFunc) + { + return SLANG_FAIL; + } + out.m_func = (ExecuteInfo::Func)groupRangeFunc; + break; } + case ExecuteStyle::Thread: + { + StringBuilder threadEntryPointName; + threadEntryPointName << entryPointName << "_Thread"; - if (func == nullptr && groupFunc == nullptr && groupRangeFunc == nullptr) + CPPPrelude::ComputeThreadFunc threadFunc = (CPPPrelude::ComputeThreadFunc)sharedLibrary->findFuncByName(threadEntryPointName.getBuffer()); + if (!threadFunc) + { + return SLANG_FAIL; + } + + SlangUInt numThreadsPerAxis[3]; + entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); + for (int i = 0; i < 3; ++i) + { + out.m_numThreadsPerAxis[i] = uint32_t(numThreadsPerAxis[i]); + } + out.m_func = (ExecuteInfo::Func)threadFunc; + break; + } + default: { return SLANG_FAIL; } } - // If we have the 
group function, that's the faster way to execute all threads in group... - if (groupRangeFunc) - { - UniformState* uniformState = (UniformState*)context.binding.m_rootBuffer.m_data; - CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; - CPPPrelude::GroupComputeVaryingInput varying; - - varying.startGroupID = {}; - varying.endGroupID = { dispatchSize[0], dispatchSize[1], dispatchSize[2] }; - - groupRangeFunc(&varying, uniformEntryPointParams, uniformState); - } - else if (groupFunc) - { - CPPPrelude::ComputeVaryingInput varying; + return SLANG_OK; +} + +/* static */SlangResult CPUComputeUtil::execute(const ExecuteInfo& info) +{ + CPPPrelude::UniformState* uniformState = (CPPPrelude::UniformState*)info.m_uniformState; + CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)info.m_uniformEntryPointParams; - for (uint32_t groupZ = 0; groupZ < dispatchSize[2]; ++groupZ) + switch (info.m_style) + { + case ExecuteStyle::Group: { - for (uint32_t groupY = 0; groupY < dispatchSize[1]; ++groupY) - { - for (uint32_t groupX = 0; groupX < dispatchSize[0]; ++groupX) - { - UniformState* uniformState = (UniformState*)context.binding.m_rootBuffer.m_data; - CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; + CPPPrelude::ComputeFunc groupFunc = (CPPPrelude::ComputeFunc)info.m_func; + CPPPrelude::ComputeVaryingInput varying; - varying.groupID = {groupX, groupY, groupZ}; + const uint32_t groupXCount = info.m_dispatchSize[0]; + const uint32_t groupYCount = info.m_dispatchSize[1]; + const uint32_t groupZCount = info.m_dispatchSize[2]; - groupFunc(&varying, uniformEntryPointParams, uniformState); + for (uint32_t groupZ = 0; groupZ < groupZCount; ++groupZ) + { + for (uint32_t groupY = 0; groupY < groupYCount; ++groupY) + { + for (uint32_t groupX = 0; groupX < 
groupXCount; ++groupX) + { + varying.startGroupID = { groupX, groupY, groupZ }; + groupFunc(&varying, uniformEntryPointParams, uniformState); + } } } + break; } + case ExecuteStyle::GroupRange: + { + CPPPrelude::ComputeFunc groupRangeFunc = (CPPPrelude::ComputeFunc)info.m_func; + CPPPrelude::ComputeVaryingInput varying; - } - else - { - // We can also fire off each thread individually - SlangUInt numThreadsPerAxis[3]; - entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); + varying.startGroupID = {}; + varying.endGroupID = { info.m_dispatchSize[0], info.m_dispatchSize[1], info.m_dispatchSize[2] }; + groupRangeFunc(&varying, uniformEntryPointParams, uniformState); + break; + } + case ExecuteStyle::Thread: { - UniformState* uniformState = (UniformState*)context.binding.m_rootBuffer.m_data; - CPPPrelude::UniformEntryPointParams* uniformEntryPointParams = (CPPPrelude::UniformEntryPointParams*)context.binding.m_entryPointBuffer.m_data; + CPPPrelude::ComputeThreadFunc threadFunc = (CPPPrelude::ComputeThreadFunc)info.m_func; + CPPPrelude::ComputeThreadVaryingInput varying; - CPPPrelude::ComputeVaryingInput varying; + const uint32_t groupXCount = info.m_dispatchSize[0]; + const uint32_t groupYCount = info.m_dispatchSize[1]; + const uint32_t groupZCount = info.m_dispatchSize[2]; + + const uint32_t threadXCount = uint32_t(info.m_numThreadsPerAxis[0]); + const uint32_t threadYCount = uint32_t(info.m_numThreadsPerAxis[1]); + const uint32_t threadZCount = uint32_t(info.m_numThreadsPerAxis[2]); - for (uint32_t groupZ = 0; groupZ < dispatchSize[2]; ++groupZ) + for (uint32_t groupZ = 0; groupZ < groupZCount; ++groupZ) { - for (uint32_t groupY = 0; groupY < dispatchSize[1]; ++groupY) + for (uint32_t groupY = 0; groupY < groupYCount; ++groupY) { - for (uint32_t groupX = 0; groupX < dispatchSize[0]; ++groupX) + for (uint32_t groupX = 0; groupX < groupXCount; ++groupX) { - varying.groupID = {groupX, groupY, groupZ}; + varying.groupID = { groupX, groupY, groupZ }; - for (int z 
= 0; z < int(numThreadsPerAxis[2]); ++z) + for (uint32_t z = 0; z < threadZCount; ++z) { varying.groupThreadID.z = z; - for (int y = 0; y < int(numThreadsPerAxis[1]); ++y) + for (uint32_t y = 0; y < threadYCount; ++y) { varying.groupThreadID.y = y; - for (int x = 0; x < int(numThreadsPerAxis[0]); ++x) + for (uint32_t x = 0; x < threadXCount; ++x) { varying.groupThreadID.x = x; - func(&varying, uniformEntryPointParams, uniformState); + threadFunc(&varying, uniformEntryPointParams, uniformState); } } } } } } + break; + } + default: return SLANG_FAIL; + } + + return SLANG_OK; +} + + +/* static */ SlangResult CPUComputeUtil::checkStyleConsistency(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout) +{ + Context context; + SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcBindings(compilationAndLayout, context)); + + // Run the thread style to test against + { + ExecuteInfo info; + SLANG_RETURN_ON_FAIL(calcExecuteInfo(ExecuteStyle::Thread, dispatchSize, compilationAndLayout, context, info)); + SLANG_RETURN_ON_FAIL(execute(info)); + } + + ExecuteStyle styles[] = { ExecuteStyle::Group, ExecuteStyle::GroupRange }; + for (auto style: styles) + { + Context checkContext; + SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcBindings(compilationAndLayout, checkContext)); + + ExecuteInfo info; + SLANG_RETURN_ON_FAIL(calcExecuteInfo(style, dispatchSize, compilationAndLayout, checkContext, info)); + SLANG_RETURN_ON_FAIL(execute(info)); + + // Make sure the out buffers are all the same + + const auto& entries = compilationAndLayout.layout.entries; + + for (int i = 0; i < entries.getCount(); ++i) + { + const auto& entry = entries[i]; + if (entry.isOutput) + { + const auto& buffer = context.buffers[i]; + const auto& checkBuffer = checkContext.buffers[i]; + + if (buffer.m_sizeInBytes != checkBuffer.m_sizeInBytes || + memcmp(buffer.m_data, checkBuffer.m_data, buffer.m_sizeInBytes) != 0) + { + return SLANG_FAIL; + } + } } } return SLANG_OK; } + } // 
renderer_test diff --git a/tools/render-test/cpu-compute-util.h b/tools/render-test/cpu-compute-util.h index b30ef146b..1284735c0 100644 --- a/tools/render-test/cpu-compute-util.h +++ b/tools/render-test/cpu-compute-util.h @@ -11,6 +11,14 @@ namespace renderer_test { struct CPUComputeUtil { + enum class ExecuteStyle + { + Unknown, + Thread, + Group, + GroupRange, + }; + struct Resource : public RefObject { void* getInterface() const { return m_interface; } @@ -27,9 +35,28 @@ struct CPUComputeUtil List<RefPtr<Resource> > m_resources; }; + struct ExecuteInfo + { + typedef void (*Func)(); + + ExecuteStyle m_style; + Func m_func; + uint32_t m_dispatchSize[3]; + uint32_t m_numThreadsPerAxis[3]; + + void* m_uniformState; + void* m_uniformEntryPointParams; + }; + + + /// Runs code across run styles and makes sure output buffers match + static SlangResult checkStyleConsistency(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout); + static SlangResult calcBindings(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext); - static SlangResult execute(const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext); + static SlangResult calcExecuteInfo(ExecuteStyle style, const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context, ExecuteInfo& out); + + static SlangResult execute(const ExecuteInfo& info); static SlangResult writeBindings(const ShaderInputLayout& layout, const List<CPUMemoryBinding::Buffer>& buffers, const Slang::String& fileName); }; diff --git a/tools/render-test/options.cpp b/tools/render-test/options.cpp index d2f21a5d9..e13a2b88f 100644 --- a/tools/render-test/options.cpp +++ b/tools/render-test/options.cpp @@ -183,7 +183,7 @@ SlangResult parseOptions(int argc, const char*const* argv, Slang::WriterHelper s { if (argCursor == argEnd) { - stdError.print("error: comma separated 
compute dispatch size for '%s'\n", arg); + stdError.print("error: expecting a comma separated compute dispatch size for '%s'\n", arg); return SLANG_FAIL; } List<UnownedStringSlice> slices; diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp index 2a0b9a6c9..3a8871618 100644 --- a/tools/render-test/render-test-main.cpp +++ b/tools/render-test/render-test-main.cpp @@ -459,12 +459,25 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe ShaderCompilerUtil::OutputAndLayout compilationAndLayout; SLANG_RETURN_ON_FAIL(ShaderCompilerUtil::compileWithLayout(session, gOptions.sourcePath, gOptions.compileArgs, gOptions.shaderType, input, compilationAndLayout)); - CPUComputeUtil::Context context; - SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcBindings(compilationAndLayout, context)); - SLANG_RETURN_ON_FAIL(CPUComputeUtil::execute(gOptions.computeDispatchSize, compilationAndLayout, context)); + + { + CPUComputeUtil::Context context; + SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcBindings(compilationAndLayout, context)); + + CPUComputeUtil::ExecuteInfo info; + SLANG_RETURN_ON_FAIL(CPUComputeUtil::calcExecuteInfo(CPUComputeUtil::ExecuteStyle::GroupRange, gOptions.computeDispatchSize, compilationAndLayout, context, info)); + SLANG_RETURN_ON_FAIL(CPUComputeUtil::execute(info)); + + // Dump everything out that was written + SLANG_RETURN_ON_FAIL(CPUComputeUtil::writeBindings(compilationAndLayout.layout, context.buffers, gOptions.outputPath)); + } + + { + // Check all execution styles produce the same result + SLANG_RETURN_ON_FAIL(CPUComputeUtil::checkStyleConsistency(gOptions.computeDispatchSize, compilationAndLayout)); + } - // Dump everything out that was written - return CPUComputeUtil::writeBindings(compilationAndLayout.layout, context.buffers, gOptions.outputPath); + return SLANG_OK; } Slang::RefPtr<Renderer> renderer; |
