diff options
| author | Konstantin <const@const.me> | 2023-01-21 21:23:08 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-21 21:23:08 +0100 |
| commit | cacec67bb649702db7a877de1b6482a46123f175 (patch) | |
| tree | 07456739a57274cfb8cf6750d2acd480b23d0de1 | |
| parent | 040281bccc424341c964788aa7ca17876a5ac6a4 (diff) | |
Experimental, alternative busy-wait implementation
Disabled with a `constexpr` flag because, on a desktop with a discrete GPU,
it made things about 20% slower, even though the CPU load is about zero.
Still need to test on iGPUs; thermal shenanigans might make a difference
there.
| -rw-r--r-- | Whisper/Utils/DelayExecution.cpp | 66 | ||||
| -rw-r--r-- | Whisper/Utils/DelayExecution.h | 24 | ||||
| -rw-r--r-- | Whisper/Utils/GpuProfiler.cpp | 15 | ||||
| -rw-r--r-- | Whisper/Utils/GpuProfiler.h | 2 | ||||
| -rw-r--r-- | Whisper/Utils/GpuProfilerSimple.h | 2 | ||||
| -rw-r--r-- | Whisper/Whisper.vcxproj | 2 | ||||
| -rw-r--r-- | Whisper/Whisper.vcxproj.filters | 2 |
7 files changed, 103 insertions, 10 deletions
diff --git a/Whisper/Utils/DelayExecution.cpp b/Whisper/Utils/DelayExecution.cpp new file mode 100644 index 0000000..d238e53 --- /dev/null +++ b/Whisper/Utils/DelayExecution.cpp @@ -0,0 +1,66 @@ +#include "stdafx.h" +#include "DelayExecution.h" + +namespace +{ + constexpr bool useHighRezTimer = false; + + constexpr int64_t sleepMicroseconds = 200; + + inline HRESULT sleepImpl( HANDLE timer ) + { + constexpr int64_t sleepTicks = sleepMicroseconds * 10; + + LARGE_INTEGER li; + // Negative values indicate relative time + li.QuadPart = -sleepTicks; + if( !SetWaitableTimerEx( timer, &li, 0, nullptr, nullptr, nullptr, 0 ) ) + return getLastHr(); + const DWORD res = WaitForSingleObject( timer, 50 ); + if( res == WAIT_OBJECT_0 ) + return S_OK; + if( res == WAIT_FAILED ) + return getLastHr(); + return E_FAIL; + } +} + +void DelayExecution::sleepOnTheTimer( const DelayExecution& delay ) +{ + HRESULT hr = sleepImpl( delay.timer ); + if( SUCCEEDED( hr ) ) + return; + logWarningHr( hr, u8"DelayExecution.sleepOnTheTimer" ); +} + +void DelayExecution::spinWait( const DelayExecution& ) +{ + for( size_t i = 0; i < 1024; i++ ) + _mm_pause(); +} + +void DelayExecution::sleep( const DelayExecution& ) +{ + Sleep( 0 ); +} + +DelayExecution::DelayExecution() +{ + if constexpr( useHighRezTimer ) + { + constexpr DWORD flags = CREATE_WAITABLE_TIMER_HIGH_RESOLUTION; + HANDLE h = CreateWaitableTimerEx( nullptr, nullptr, flags, TIMER_ALL_ACCESS ); + if( nullptr != h ) + { + timer.Attach( h ); + pfn = &sleepOnTheTimer; + return; + } + + const HRESULT hr = getLastHr(); + logWarningHr( hr, u8"CreateWaitableTimerEx" ); + } + + pfn = &spinWait; + // pfn = &sleep; +}
\ No newline at end of file diff --git a/Whisper/Utils/DelayExecution.h b/Whisper/Utils/DelayExecution.h new file mode 100644 index 0000000..23d0262 --- /dev/null +++ b/Whisper/Utils/DelayExecution.h @@ -0,0 +1,24 @@ +#pragma once +#include <atlbase.h> + +// Utility class implementing a high-resolution Sleep() function +class DelayExecution +{ + using pfnDelay = void( * )( const DelayExecution& de ); + pfnDelay pfn = nullptr; + CHandle timer; + + static void sleepOnTheTimer( const DelayExecution& delay ); + static void spinWait( const DelayExecution& ); + static void sleep( const DelayExecution& ); + +public: + DelayExecution(); + DelayExecution( const DelayExecution& ) = delete; + ~DelayExecution() = default; + + void delay() const + { + pfn( *this ); + } +};
\ No newline at end of file diff --git a/Whisper/Utils/GpuProfiler.cpp b/Whisper/Utils/GpuProfiler.cpp index 6f19415..f7d8dc3 100644 --- a/Whisper/Utils/GpuProfiler.cpp +++ b/Whisper/Utils/GpuProfiler.cpp @@ -45,7 +45,7 @@ HRESULT GpuProfiler::Queue::create() namespace { - static uint64_t getTimestamp( ID3D11Query* query ) + static uint64_t getTimestamp( ID3D11Query* query, const DelayExecution& delay ) { ID3D11DeviceContext* const ctx = context(); @@ -56,12 +56,7 @@ namespace check( hr ); if( S_OK == hr ) return res; -#if 0 - Sleep( 1 ); -#else - for( size_t i = 0; i < 1024; i++ ) - _mm_pause(); -#endif + delay.delay(); } } @@ -86,7 +81,7 @@ void GpuProfiler::Queue::Entry::join( GpuProfiler& owner ) { assert( nullptr != block ); - uint64_t res = getTimestamp( query ); + uint64_t res = getTimestamp( query, owner.delay ); #if PROFILER_COLLECT_TAGS block->haveTimestamp( event, shader, tag, res, owner ); #else @@ -350,8 +345,8 @@ HRESULT GpuProfilerSimple::time( uint64_t& rdi ) const try { const D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dtsd = waitForDisjointData( disjoint ); - const uint64_t t1 = getTimestamp( begin ); - const uint64_t t2 = getTimestamp( end ); + const uint64_t t2 = getTimestamp( end, delay ); + const uint64_t t1 = getTimestamp( begin, delay ); if( !dtsd.Disjoint ) { diff --git a/Whisper/Utils/GpuProfiler.h b/Whisper/Utils/GpuProfiler.h index fbc284e..c6dcd74 100644 --- a/Whisper/Utils/GpuProfiler.h +++ b/Whisper/Utils/GpuProfiler.h @@ -1,6 +1,7 @@ #pragma once #include "../D3D/device.h" #include "ProfileCollection.h" +#include "DelayExecution.h" namespace DirectCompute { @@ -19,6 +20,7 @@ namespace DirectCompute class GpuProfiler { + DelayExecution delay; CComPtr<ID3D11Query> disjoint; enum struct eEvent diff --git a/Whisper/Utils/GpuProfilerSimple.h b/Whisper/Utils/GpuProfilerSimple.h index 7938b44..e4fba8d 100644 --- a/Whisper/Utils/GpuProfilerSimple.h +++ b/Whisper/Utils/GpuProfilerSimple.h @@ -1,11 +1,13 @@ #pragma once #include "../D3D/device.h" 
+#include "DelayExecution.h" namespace DirectCompute { // A simple profiler which doesn't collect anything, used to measure time it took to load the model class GpuProfilerSimple { + DelayExecution delay; CComPtr<ID3D11Query> disjoint, begin, end; public: HRESULT create(); diff --git a/Whisper/Whisper.vcxproj b/Whisper/Whisper.vcxproj index 5cf4f08..237db29 100644 --- a/Whisper/Whisper.vcxproj +++ b/Whisper/Whisper.vcxproj @@ -110,6 +110,7 @@ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions</EnableEnhancedInstructionSet> </ClCompile> <ClCompile Include="CPU\HybridLoader.cpp" /> + <ClCompile Include="Utils\DelayExecution.cpp" /> <ClCompile Include="Hybrid\HybridContext.cpp" /> <ClCompile Include="CPU\ParallelForRunner.cpp"> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions</EnableEnhancedInstructionSet> @@ -235,6 +236,7 @@ <ClInclude Include="CPU\Tensor.h" /> <ClInclude Include="CPU\DecoderTensors.h" /> <ClInclude Include="CPU\HybridLoader.h" /> + <ClInclude Include="Utils\DelayExecution.h" /> <ClInclude Include="Hybrid\HybridContext.h" /> <ClInclude Include="CPU\ParallelForRunner.h" /> <ClInclude Include="CPU\LargeBuffer.h" /> diff --git a/Whisper/Whisper.vcxproj.filters b/Whisper/Whisper.vcxproj.filters index 8a7c371..0ffd30c 100644 --- a/Whisper/Whisper.vcxproj.filters +++ b/Whisper/Whisper.vcxproj.filters @@ -83,6 +83,7 @@ <ClCompile Include="CPU\mulMatImpl.avx2.cpp" /> <ClCompile Include="CPU\mulMatImpl.panel.cpp" /> <ClCompile Include="ML\Reshaper.cpp" /> + <ClCompile Include="Utils\DelayExecution.cpp" /> </ItemGroup> <ItemGroup> <ClInclude Include="source\ggml.h" /> @@ -188,6 +189,7 @@ <ClInclude Include="ML\Reshaper.h" /> <ClInclude Include="ML\reshapedMultiply.h" /> <ClInclude Include="API\eGpuModelFlags.h" /> + <ClInclude Include="Utils\DelayExecution.h" /> </ItemGroup> <ItemGroup> <None Include="whisper.def" /> |
