summary | refs | log | tree | commit | diff | stats
path: root/Whisper/Utils/GpuProfiler.cpp
diff options
context:
space:
mode:
author: Konstantin <const@const.me>, 2023-01-16 14:52:43 +0100
committer: Konstantin <const@const.me>, 2023-01-16 14:52:43 +0100
commit: 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree: 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/Utils/GpuProfiler.cpp
parent: 990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Whisper/Utils/GpuProfiler.cpp')
-rw-r--r-- Whisper/Utils/GpuProfiler.cpp 374
1 files changed, 374 insertions, 0 deletions
diff --git a/Whisper/Utils/GpuProfiler.cpp b/Whisper/Utils/GpuProfiler.cpp
new file mode 100644
index 0000000..6f19415
--- /dev/null
+++ b/Whisper/Utils/GpuProfiler.cpp
@@ -0,0 +1,374 @@
+#include "stdafx.h"
+#include "GpuProfiler.h"
+#include "GpuProfilerSimple.h"
+using namespace DirectCompute;
+
+// Clears the pending counters with one unaligned 16-byte store of zeros which begins at callsPending.
+// NOTE(review): presumably timePending is laid out right after callsPending so the store covers both fields — confirm against the structure declaration.
+inline void GpuProfiler::sProfilerData::reset()
+{
+	const __m128i zero = _mm_setzero_si128();
+	_mm_storeu_si128( ( __m128i* )&callsPending, zero );
+}
+
+// Records one more pending call, and adds `time` to the pending total.
+// Callers in this file pass a difference between two GPU timestamps.
+inline void GpuProfiler::sProfilerData::addPending( int64_t time )
+{
+	timePending += time;
+	++callsPending;
+}
+
+// Discards the pending measurements without publishing them: both pending counters become zero
+inline void GpuProfiler::sProfilerData::dropPending()
+{
+	timePending = 0;
+	callsPending = 0;
+}
+
+// Publishes the pending measurements into the destination record.
+// The pending call count is added to dest->count; the pending ticks are converted by the global ::makeTime with the supplied frequency and added to dest->totalTicks.
+// Both pending counters are zero afterwards.
+inline void GpuProfiler::sProfilerData::makeTime( uint64_t freq )
+{
+	dest->count += callsPending;
+	dest->totalTicks += ::makeTime( timePending, freq );
+	// Everything has been published, start accumulating from scratch
+	dropPending();
+}
+
+// Creates one D3D11 timestamp query for every entry of the queue, and marks each entry as unused.
+// Returns S_OK when all queries were created.
+// NOTE(review): CHECK presumably returns the failed HRESULT from this function — confirm against the macro definition.
+HRESULT GpuProfiler::Queue::create()
+{
+	ID3D11Device* const dev = device();
+	const CD3D11_QUERY_DESC timestampDesc{ D3D11_QUERY_TIMESTAMP };
+
+	for( Entry& entry : queue )
+	{
+		CHECK( dev->CreateQuery( &timestampDesc, &entry.query ) );
+
+		// No block owns this slot yet
+		entry.block = nullptr;
+		entry.event = eEvent::None;
+		entry.shader = EmptyShader;
+	}
+	return S_OK;
+}
+
+namespace
+{
+	// Blocks the calling thread until the value of the timestamp query is available, and returns that value.
+	// Every HRESULT returned by GetData is passed to check(); any status other than S_OK causes another poll.
+	// NOTE(review): check() presumably throws the failed HRESULT, GpuProfilerSimple::time has a matching catch — confirm.
+	static uint64_t getTimestamp( ID3D11Query* query )
+	{
+		ID3D11DeviceContext* const ctx = context();
+
+		uint64_t ticks = 0;
+		for( ;; )
+		{
+			const HRESULT status = ctx->GetData( query, &ticks, sizeof( ticks ), 0 );
+			check( status );
+			if( S_OK == status )
+				break;
+
+			// The data is not there yet: spin for a short while with pause instructions, as opposed to sleeping
+			for( size_t spin = 0; spin < 1024; spin++ )
+				_mm_pause();
+		}
+		return ticks;
+	}
+
+	// Ends the timestamp disjoint query, then blocks until its data is available and returns the data.
+	// Sleeps for 1 millisecond between the polls. The HRESULT codes are passed to check(), same as in getTimestamp.
+	static D3D11_QUERY_DATA_TIMESTAMP_DISJOINT waitForDisjointData( ID3D11Query* query )
+	{
+		ID3D11DeviceContext* const ctx = context();
+		ctx->End( query );
+
+		D3D11_QUERY_DATA_TIMESTAMP_DISJOINT data;
+		HRESULT status = ctx->GetData( query, &data, sizeof( data ), 0 );
+		check( status );
+		while( S_OK != status )
+		{
+			Sleep( 1 );
+			status = ctx->GetData( query, &data, sizeof( data ), 0 );
+			check( status );
+		}
+		return data;
+	}
+}
+
+// Waits for the timestamp of this entry, hands it over to the block which submitted the entry, and marks the entry as unused.
+// The entry must be in use, i.e. `block` is not null.
+// When tags are not collected, the block receives zero in place of the tag.
+void GpuProfiler::Queue::Entry::join( GpuProfiler& owner )
+{
+	assert( nullptr != block );
+
+#if PROFILER_COLLECT_TAGS
+	const uint16_t entryTag = tag;
+#else
+	const uint16_t entryTag = 0;
+#endif
+	const uint64_t timestamp = getTimestamp( query );
+	block->haveTimestamp( event, shader, entryTag, timestamp, owner );
+
+	// The slot is free again
+	shader = EmptyShader;
+	event = eEvent::None;
+	block = nullptr;
+}
+
+// Issues a timestamp query for an event of the block, using the next slot of the ring buffer.
+// If that slot still holds an earlier query, the earlier query is joined first, i.e. this call may block.
+// `tag` is only stored when PROFILER_COLLECT_TAGS is enabled.
+void GpuProfiler::Queue::submit( BlockState* block, eEvent evt, uint16_t shader, uint16_t tag )
+{
+	assert( nullptr != block );
+
+	Entry& slot = queue[ nextEntry ];
+	if( nullptr != slot.block )
+	{
+		// The ring has wrapped around: deliver the old timestamp before reusing the query
+		slot.join( owner );
+	}
+
+	slot.shader = shader;
+	slot.event = evt;
+	slot.block = block;
+#if PROFILER_COLLECT_TAGS
+	slot.tag = tag;
+#endif
+	// For timestamp queries, End is the call which records the timestamp
+	context()->End( slot.query );
+
+	nextEntry = ( 1 + nextEntry ) % queueLength;
+}
+
+// Waits for every outstanding timestamp query in the ring, and delivers the results to their blocks, oldest first.
+//
+// submit() writes into queue[ nextEntry ] and then advances nextEntry, so the pending entries always occupy
+// the slots immediately before nextEntry in ring order, and the slots starting at nextEntry are either free or the oldest pending ones.
+// Walking the complete ring once, starting at nextEntry, skips the free slots and visits the pending entries in submission order.
+//
+// The previous implementation returned as soon as queue[ nextEntry ] was free. nextEntry is the next slot to write,
+// so unless the ring was completely full the method delivered nothing, and the caller published or dropped the results
+// before the timestamps of the block which has just ended were received.
+//
+// nextEntry is the same before and after the call, which matches the state the old loop left after joining a full ring.
+void GpuProfiler::Queue::join()
+{
+	for( size_t i = 0; i < queueLength; i++ )
+	{
+		Entry& e = queue[ ( nextEntry + i ) % queueLength ];
+		if( nullptr != e.block )
+			e.join( owner );
+	}
+}
+
+// Packs a compute shader ID and a tag into a single 32-bit key: the shader goes to the upper 16 bits, the tag to the lower 16 bits
+static inline uint32_t makeTagKey( uint16_t cs, uint16_t tag )
+{
+	return ( static_cast<uint32_t>( cs ) << 16 ) | static_cast<uint32_t>( tag );
+}
+
+// Finishes the measurement of the shader which was last started in this block, if any.
+// `time` is the timestamp which ends the shader; the elapsed ticks since shaderStart are added to the pending data of that shader.
+// The per-shader record in profiler.results is created on first use, and bound to the destination returned by profiler.dest.measure.
+// With PROFILER_COLLECT_TAGS and a non-zero tag, the same ticks are also added to the record keyed by the ( shader, tag ) pair.
+// Does nothing when no shader is being measured, i.e. shaderStart is -1.
+void GpuProfiler::BlockState::completePrevShader( uint64_t time, GpuProfiler& profiler )
+{
+	if( -1 == shaderStart )
+		return;
+	assert( EmptyShader != prevShader );
+
+	const int64_t ticks = (int64_t)time - shaderStart;
+
+	sProfilerData* accum;
+	if( auto* const found = profiler.results.Lookup( prevShader ) )
+		accum = &found->m_value;
+	else
+	{
+		// First time this shader is seen, make a new record for it
+		sProfilerData& created = profiler.results[ prevShader ];
+		created.dest = &profiler.dest.measure( (eComputeShader)prevShader );
+		accum = &created;
+	}
+	accum->addPending( ticks );
+
+#if PROFILER_COLLECT_TAGS
+	if( 0 != prevShaderTag )
+	{
+		const uint32_t key = makeTagKey( prevShader, prevShaderTag );
+
+		sProfilerData* accumTagged;
+		if( auto* const foundTagged = profiler.resultsTagged.Lookup( key ) )
+			accumTagged = &foundTagged->m_value;
+		else
+		{
+			sProfilerData& created = profiler.resultsTagged[ key ];
+			created.dest = &profiler.dest.measure( (eComputeShader)prevShader, prevShaderTag );
+			accumTagged = &created;
+		}
+		accumTagged->addPending( ticks );
+	}
+#endif
+
+	// No shader is being measured anymore
+	shaderStart = -1;
+	prevShaderTag = 0;
+	prevShader = EmptyShader;
+}
+
+// Consumes a timestamp received from the GPU for an event which was previously submitted for this block.
+// - Shader: completes the previous shader of the block, and starts measuring the `cs` shader with the supplied tag.
+// - BlockStart: remembers the start time of the block, and completes the shader which was running in the parent block, if there is a parent.
+// - BlockEnd: completes the last shader of the block, and adds the duration of the block to the pending data of destBlock.
+// Any other event fails an assertion.
+void GpuProfiler::BlockState::haveTimestamp( eEvent evt, uint16_t cs, uint16_t tag, uint64_t time, GpuProfiler& profiler )
+{
+	switch( evt )
+	{
+	case eEvent::Shader:
+		assert( cs != EmptyShader );
+		completePrevShader( time, profiler );
+		prevShader = cs;
+		prevShaderTag = tag;
+		shaderStart = (int64_t)time;
+		break;
+
+	case eEvent::BlockStart:
+		assert( -1 == timeStart );
+		assert( -1 == shaderStart );
+		assert( cs == EmptyShader );
+		timeStart = (int64_t)time;
+		if( nullptr != parentBlock )
+			parentBlock->completePrevShader( time, profiler );
+		break;
+
+	case eEvent::BlockEnd:
+	{
+		assert( -1 != timeStart );
+		assert( cs == EmptyShader );
+		completePrevShader( time, profiler );
+		const int64_t blockTicks = (int64_t)time - timeStart;
+		destBlock->addPending( blockTicks );
+		timeStart = -1;
+		break;
+	}
+
+	default:
+		assert( false );
+		break;
+	}
+}
+
+// Creates the GPU resources of the profiler: the timestamp disjoint query, and the queue of timestamp queries.
+// Also reserves space in the stack of blocks for `maxDepth` nested blocks.
+// Returns S_OK on success.
+// NOTE(review): CHECK presumably returns the failed HRESULT from this function — confirm against the macro definition.
+HRESULT GpuProfiler::create( size_t maxDepth )
+{
+	const CD3D11_QUERY_DESC disjointDesc{ D3D11_QUERY_TIMESTAMP_DISJOINT };
+	ID3D11Device* const dev = device();
+	CHECK( dev->CreateQuery( &disjointDesc, &disjoint ) );
+
+	CHECK( queries.create() );
+
+	stack.reserve( maxDepth );
+	return S_OK;
+}
+
+// Starts a profiler block, and pushes it onto the stack of blocks.
+// For the outermost block, this also begins the timestamp disjoint query.
+// The state of the block is created on first use, together with its record in `results` which is bound to the destination returned by dest.measure.
+void GpuProfiler::blockStart( eProfilerBlock which )
+{
+	BlockState* parent = nullptr;
+	if( !stack.empty() )
+		parent = *stack.rbegin();
+	else
+	{
+		// This is the outermost block, start the disjoint bracket
+		context()->Begin( disjoint );
+	}
+
+	BlockState* state;
+	if( auto* const found = blockStates.Lookup( which ) )
+		state = &found->m_value;
+	else
+	{
+		BlockState& created = blockStates[ which ];
+		created.destBlock = &results[ (uint16_t)which ];
+		created.destBlock->dest = &dest.measure( which );
+		state = &created;
+	}
+
+	state->parentBlock = parent;
+	queries.submit( state, eEvent::BlockStart );
+	stack.push_back( state );
+}
+
+// Ends the innermost profiler block, and pops it from the stack of blocks.
+// When that was the outermost block, also ends the timestamp disjoint query, waits for its data, joins the queue of timestamp queries,
+// and then either publishes the pending measurements using the reported frequency, or drops them when the timestamps are unreliable.
+void GpuProfiler::blockEnd()
+{
+	assert( !stack.empty() );
+	BlockState* const current = *stack.rbegin();
+	queries.submit( current, eEvent::BlockEnd );
+	stack.pop_back();
+
+	// Nested block: the outermost one is responsible for collecting the results
+	if( !stack.empty() )
+		return;
+
+	const D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dtsd = waitForDisjointData( disjoint );
+	queries.join();
+
+	if( dtsd.Disjoint )
+	{
+		// Something occurred in between the query's ID3D11DeviceContext::Begin and ID3D11DeviceContext::End calls
+		// that caused the timestamp counter to become discontinuous or disjoint, such as unplugging the AC cord on a laptop, overheating, or throttling up/down due to laptop savings events.
+		// The timestamp returned by ID3D11DeviceContext::GetData for a timestamp query is only reliable if Disjoint is FALSE.
+		resultsDropPending();
+		return;
+	}
+
+	// Fortunately, these timers appear to be relatively high resolution.
+	// Specifically, on the iGPU inside Ryzen 7 5700G the frequency is 1E+8 = 100 MHz
+	// On nVidia 1080Ti, the frequency is 1E+9 = 1 GHz
+	const uint64_t freq = dtsd.Frequency;
+	resultsMakeTime( freq );
+}
+
+// Submits a timestamp query which marks the start of a compute shader within the innermost block.
+// Must be called inside a block. Does nothing when profileShaders is false.
+// With PROFILER_COLLECT_TAGS, the query carries the tag set by setNextTag, and the tag is then reset to zero.
+void GpuProfiler::computeShader( eComputeShader cs )
+{
+	assert( !stack.empty() );
+	if( profileShaders )
+	{
+		BlockState* const current = *stack.rbegin();
+		const uint16_t id = (uint16_t)cs;
+#if PROFILER_COLLECT_TAGS
+		queries.submit( current, eEvent::Shader, id, m_nextTag );
+		// The tag only applies to a single shader
+		m_nextTag = 0;
+#else
+		queries.submit( current, eEvent::Shader, id );
+#endif
+	}
+}
+
+// Discards the pending measurements of every record, including the tagged records when PROFILER_COLLECT_TAGS is enabled
+void GpuProfiler::resultsDropPending()
+{
+	POSITION pos = results.GetStartPosition();
+	while( nullptr != pos )
+		results.GetNextValue( pos ).dropPending();
+
+#if PROFILER_COLLECT_TAGS
+	pos = resultsTagged.GetStartPosition();
+	while( nullptr != pos )
+		resultsTagged.GetNextValue( pos ).dropPending();
+#endif
+}
+
+// Publishes the pending measurements of every record using the supplied timestamp frequency, including the tagged records when PROFILER_COLLECT_TAGS is enabled
+void GpuProfiler::resultsMakeTime( uint64_t freq )
+{
+	POSITION pos = results.GetStartPosition();
+	while( nullptr != pos )
+		results.GetNextValue( pos ).makeTime( freq );
+
+#if PROFILER_COLLECT_TAGS
+	pos = resultsTagged.GetStartPosition();
+	while( nullptr != pos )
+		resultsTagged.GetNextValue( pos ).makeTime( freq );
+#endif
+}
+
+// Calls reset() on every record, including the tagged records when PROFILER_COLLECT_TAGS is enabled
+void GpuProfiler::resultsReset()
+{
+	POSITION pos = results.GetStartPosition();
+	while( nullptr != pos )
+		results.GetNextValue( pos ).reset();
+
+#if PROFILER_COLLECT_TAGS
+	pos = resultsTagged.GetStartPosition();
+	while( nullptr != pos )
+		resultsTagged.GetNextValue( pos ).reset();
+#endif
+}
+
+#if PROFILER_COLLECT_TAGS
+// Obtains the ID of the named tag from the destination, and remembers it for the next call of computeShader.
+// Returns that ID.
+uint16_t __declspec( noinline ) GpuProfiler::setNextTag( const char* name )
+{
+	const uint16_t id = dest.makeTagId( name );
+	m_nextTag = id;
+	return id;
+}
+#endif
+
+// Creates the three queries of the simple profiler: timestamp disjoint, and two timestamps for begin and end.
+// Then starts the measurement: begins the disjoint query, and records the `begin` timestamp.
+// Returns S_OK on success.
+// NOTE(review): CHECK presumably returns the failed HRESULT from this function — confirm against the macro definition.
+HRESULT GpuProfilerSimple::create()
+{
+	ID3D11Device* const dev = device();
+
+	const CD3D11_QUERY_DESC descDisjoint{ D3D11_QUERY_TIMESTAMP_DISJOINT };
+	CHECK( dev->CreateQuery( &descDisjoint, &disjoint ) );
+
+	const CD3D11_QUERY_DESC descTimestamp{ D3D11_QUERY_TIMESTAMP };
+	CHECK( dev->CreateQuery( &descTimestamp, &begin ) );
+	CHECK( dev->CreateQuery( &descTimestamp, &end ) );
+
+	ID3D11DeviceContext* const ctx = context();
+	ctx->Begin( disjoint );
+	ctx->End( begin );
+	return S_OK;
+}
+
+// Completes the measurement started by create(): records the `end` timestamp, then waits for the disjoint data and for both timestamps.
+// Returns S_OK and writes the elapsed time, as converted by makeTime, into `rdi` when the timestamps are reliable.
+// Returns S_FALSE and writes -1 converted to uint64_t into `rdi` when the disjoint flag is set.
+// Returns the HRESULT caught when one of the waits throws; `rdi` is not written in that case.
+HRESULT GpuProfilerSimple::time( uint64_t& rdi ) const
+{
+	context()->End( end );
+
+	try
+	{
+		const D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dtsd = waitForDisjointData( disjoint );
+		const uint64_t tsBegin = getTimestamp( begin );
+		const uint64_t tsEnd = getTimestamp( end );
+
+		if( dtsd.Disjoint )
+		{
+			// Something occurred in between the query's ID3D11DeviceContext::Begin and ID3D11DeviceContext::End calls
+			// that caused the timestamp counter to become discontinuous or disjoint, such as unplugging the AC cord on a laptop, overheating, or throttling up/down due to laptop savings events.
+			// The timestamp returned by ID3D11DeviceContext::GetData for a timestamp query is only reliable if Disjoint is FALSE.
+			rdi = -1;
+			return S_FALSE;
+		}
+
+		rdi = makeTime( tsEnd - tsBegin, dtsd.Frequency );
+		return S_OK;
+	}
+	catch( HRESULT hr )
+	{
+		return hr;
+	}
+}
\ No newline at end of file