summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-21 21:23:08 +0100
committerKonstantin <const@const.me>2023-01-21 21:23:08 +0100
commitcacec67bb649702db7a877de1b6482a46123f175 (patch)
tree07456739a57274cfb8cf6750d2acd480b23d0de1
parent040281bccc424341c964788aa7ca17876a5ac6a4 (diff)
Experimental, alternative busy wait implementation
Disabled with a `constexpr` flag because on a desktop with a discrete GPU this was about 20% slower, although the CPU load is about zero. Still needs testing on iGPUs; thermal shenanigans might make a difference there.
-rw-r--r--Whisper/Utils/DelayExecution.cpp66
-rw-r--r--Whisper/Utils/DelayExecution.h24
-rw-r--r--Whisper/Utils/GpuProfiler.cpp15
-rw-r--r--Whisper/Utils/GpuProfiler.h2
-rw-r--r--Whisper/Utils/GpuProfilerSimple.h2
-rw-r--r--Whisper/Whisper.vcxproj2
-rw-r--r--Whisper/Whisper.vcxproj.filters2
7 files changed, 103 insertions, 10 deletions
diff --git a/Whisper/Utils/DelayExecution.cpp b/Whisper/Utils/DelayExecution.cpp
new file mode 100644
index 0000000..d238e53
--- /dev/null
+++ b/Whisper/Utils/DelayExecution.cpp
@@ -0,0 +1,66 @@
+#include "stdafx.h"
+#include "DelayExecution.h"
+
+namespace
+{
+	// Compile-time switch for the waitable timer strategy.
+	// While false, the constructor never creates the timer and uses the spin-wait instead.
+	constexpr bool useHighRezTimer = false;
+
+	// Duration of a single timer-based delay, in microseconds
+	constexpr int64_t sleepMicroseconds = 200;
+
+	// Arm the waitable timer for sleepMicroseconds, then block until it is signaled.
+	// Returns S_OK once the timer fires.
+	// Returns getLastHr() when arming the timer or the wait itself fails.
+	// Returns HRESULT_FROM_WIN32( WAIT_TIMEOUT ) when the timer has not fired within the 50 ms wait,
+	// so the caller's log can tell a timeout apart from any other unexpected wait result ( E_FAIL ).
+	inline HRESULT sleepImpl( HANDLE timer )
+	{
+		// The due time is expressed in 100-nanosecond intervals, i.e. 10 ticks per microsecond
+		constexpr int64_t sleepTicks = sleepMicroseconds * 10;
+
+		LARGE_INTEGER li;
+		// Negative values indicate relative time
+		li.QuadPart = -sleepTicks;
+		if( !SetWaitableTimerEx( timer, &li, 0, nullptr, nullptr, nullptr, 0 ) )
+			return getLastHr();
+
+		// The timeout is far longer than the requested delay; it only guards against waiting forever
+		const DWORD res = WaitForSingleObject( timer, 50 );
+		switch( res )
+		{
+		case WAIT_OBJECT_0:
+			return S_OK;
+		case WAIT_FAILED:
+			return getLastHr();
+		case WAIT_TIMEOUT:
+			return HRESULT_FROM_WIN32( WAIT_TIMEOUT );
+		default:
+			return E_FAIL;
+		}
+	}
+}
+
+// Timer-based delay strategy: waits on the timer owned by `delay`, and logs a warning when that fails
+void DelayExecution::sleepOnTheTimer( const DelayExecution& delay )
+{
+	const HRESULT status = sleepImpl( delay.timer );
+	if( FAILED( status ) )
+		logWarningHr( status, u8"DelayExecution.sleepOnTheTimer" );
+}
+
+// Spin-wait delay strategy: executes 1024 pause instructions; the argument is unused
+void DelayExecution::spinWait( const DelayExecution& )
+{
+	for( size_t remaining = 1024; remaining != 0; remaining-- )
+		_mm_pause();
+}
+
+// Delay strategy which calls Sleep() with a zero timeout; the argument is unused.
+// The constructor does not select this one at the moment, the assignment there is commented out.
+void DelayExecution::sleep( const DelayExecution& )
+{
+	constexpr DWORD zeroTimeout = 0;
+	Sleep( zeroTimeout );
+}
+
+// Selects the delay strategy: the high-resolution waitable timer when it is enabled and can be created, the spin-wait otherwise
+DelayExecution::DelayExecution()
+{
+	// Start with the fallback; it is only replaced below once the timer has been created
+	pfn = &spinWait;
+	// pfn = &sleep;
+
+	if constexpr( useHighRezTimer )
+	{
+		HANDLE h = CreateWaitableTimerEx( nullptr, nullptr, CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS );
+		if( nullptr == h )
+		{
+			// Could not create the timer: report why, and keep the fallback
+			logWarningHr( getLastHr(), u8"CreateWaitableTimerEx" );
+			return;
+		}
+
+		timer.Attach( h );
+		pfn = &sleepOnTheTimer;
+	}
+} \ No newline at end of file
diff --git a/Whisper/Utils/DelayExecution.h b/Whisper/Utils/DelayExecution.h
new file mode 100644
index 0000000..23d0262
--- /dev/null
+++ b/Whisper/Utils/DelayExecution.h
@@ -0,0 +1,24 @@
+#pragma once
+#include <atlbase.h>
+
+// Utility class implementing a high-resolution Sleep() function.
+// The constructor picks one delay strategy, and delay() dispatches to it through a function pointer.
+class DelayExecution
+{
+	// Signature shared by all delay strategies
+	using pfnDelay = void( * )( const DelayExecution& de );
+	// The strategy picked by the constructor
+	pfnDelay pfn = nullptr;
+	// Waitable timer handle; only attached when the timer-based strategy is in use
+	CHandle timer;
+
+	// The available strategies
+	static void sleepOnTheTimer( const DelayExecution& delay );
+	static void spinWait( const DelayExecution& );
+	static void sleep( const DelayExecution& );
+
+public:
+	DelayExecution();
+	DelayExecution( const DelayExecution& ) = delete;
+	// Deleted along with the copy constructor: the implicit version would assign the CHandle member,
+	// and ATL's CHandle assignment transfers ownership of the handle away from the source object
+	DelayExecution& operator=( const DelayExecution& ) = delete;
+	~DelayExecution() = default;
+
+	// Pause the calling thread for a short while, using the strategy picked by the constructor
+	void delay() const
+	{
+		pfn( *this );
+	}
+}; \ No newline at end of file
diff --git a/Whisper/Utils/GpuProfiler.cpp b/Whisper/Utils/GpuProfiler.cpp
index 6f19415..f7d8dc3 100644
--- a/Whisper/Utils/GpuProfiler.cpp
+++ b/Whisper/Utils/GpuProfiler.cpp
@@ -45,7 +45,7 @@ HRESULT GpuProfiler::Queue::create()
namespace
{
- static uint64_t getTimestamp( ID3D11Query* query )
+ static uint64_t getTimestamp( ID3D11Query* query, const DelayExecution& delay )
{
ID3D11DeviceContext* const ctx = context();
@@ -56,12 +56,7 @@ namespace
check( hr );
if( S_OK == hr )
return res;
-#if 0
- Sleep( 1 );
-#else
- for( size_t i = 0; i < 1024; i++ )
- _mm_pause();
-#endif
+ delay.delay();
}
}
@@ -86,7 +81,7 @@ void GpuProfiler::Queue::Entry::join( GpuProfiler& owner )
{
assert( nullptr != block );
- uint64_t res = getTimestamp( query );
+ uint64_t res = getTimestamp( query, owner.delay );
#if PROFILER_COLLECT_TAGS
block->haveTimestamp( event, shader, tag, res, owner );
#else
@@ -350,8 +345,8 @@ HRESULT GpuProfilerSimple::time( uint64_t& rdi ) const
try
{
const D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dtsd = waitForDisjointData( disjoint );
- const uint64_t t1 = getTimestamp( begin );
- const uint64_t t2 = getTimestamp( end );
+ const uint64_t t2 = getTimestamp( end, delay );
+ const uint64_t t1 = getTimestamp( begin, delay );
if( !dtsd.Disjoint )
{
diff --git a/Whisper/Utils/GpuProfiler.h b/Whisper/Utils/GpuProfiler.h
index fbc284e..c6dcd74 100644
--- a/Whisper/Utils/GpuProfiler.h
+++ b/Whisper/Utils/GpuProfiler.h
@@ -1,6 +1,7 @@
#pragma once
#include "../D3D/device.h"
#include "ProfileCollection.h"
+#include "DelayExecution.h"
namespace DirectCompute
{
@@ -19,6 +20,7 @@ namespace DirectCompute
class GpuProfiler
{
+ DelayExecution delay;
CComPtr<ID3D11Query> disjoint;
enum struct eEvent
diff --git a/Whisper/Utils/GpuProfilerSimple.h b/Whisper/Utils/GpuProfilerSimple.h
index 7938b44..e4fba8d 100644
--- a/Whisper/Utils/GpuProfilerSimple.h
+++ b/Whisper/Utils/GpuProfilerSimple.h
@@ -1,11 +1,13 @@
#pragma once
#include "../D3D/device.h"
+#include "DelayExecution.h"
namespace DirectCompute
{
// A simple profiler which doesn't collect anything, used to measure time it took to load the model
class GpuProfilerSimple
{
+ DelayExecution delay;
CComPtr<ID3D11Query> disjoint, begin, end;
public:
HRESULT create();
diff --git a/Whisper/Whisper.vcxproj b/Whisper/Whisper.vcxproj
index 5cf4f08..237db29 100644
--- a/Whisper/Whisper.vcxproj
+++ b/Whisper/Whisper.vcxproj
@@ -110,6 +110,7 @@
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions</EnableEnhancedInstructionSet>
</ClCompile>
<ClCompile Include="CPU\HybridLoader.cpp" />
+ <ClCompile Include="Utils\DelayExecution.cpp" />
<ClCompile Include="Hybrid\HybridContext.cpp" />
<ClCompile Include="CPU\ParallelForRunner.cpp">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions</EnableEnhancedInstructionSet>
@@ -235,6 +236,7 @@
<ClInclude Include="CPU\Tensor.h" />
<ClInclude Include="CPU\DecoderTensors.h" />
<ClInclude Include="CPU\HybridLoader.h" />
+ <ClInclude Include="Utils\DelayExecution.h" />
<ClInclude Include="Hybrid\HybridContext.h" />
<ClInclude Include="CPU\ParallelForRunner.h" />
<ClInclude Include="CPU\LargeBuffer.h" />
diff --git a/Whisper/Whisper.vcxproj.filters b/Whisper/Whisper.vcxproj.filters
index 8a7c371..0ffd30c 100644
--- a/Whisper/Whisper.vcxproj.filters
+++ b/Whisper/Whisper.vcxproj.filters
@@ -83,6 +83,7 @@
<ClCompile Include="CPU\mulMatImpl.avx2.cpp" />
<ClCompile Include="CPU\mulMatImpl.panel.cpp" />
<ClCompile Include="ML\Reshaper.cpp" />
+ <ClCompile Include="Utils\DelayExecution.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="source\ggml.h" />
@@ -188,6 +189,7 @@
<ClInclude Include="ML\Reshaper.h" />
<ClInclude Include="ML\reshapedMultiply.h" />
<ClInclude Include="API\eGpuModelFlags.h" />
+ <ClInclude Include="Utils\DelayExecution.h" />
</ItemGroup>
<ItemGroup>
<None Include="whisper.def" />