summaryrefslogtreecommitdiffstats
path: root/Whisper/Utils/ProfileCollection.cpp
blob: eaf737cecddf25cb431e0d8e81fd492195ec778e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#include "stdafx.h"
#include "ProfileCollection.h"
#include "GpuProfiler.h"
#include "../Whisper/WhisperModel.h"
#include "../D3D/shaderNames.h"
using namespace Whisper;

ProfileCollection::Measure& ProfileCollection::measure( DirectCompute::eProfilerBlock which )
{
	uint32_t key = (uint16_t)which;
	key |= 0x20000;
	return measures[ key ];
}

ProfileCollection::Measure& ProfileCollection::measure( DirectCompute::eComputeShader which )
{
	uint32_t key = (uint16_t)which;
	key |= 0x30000;
	return measures[ key ];
}

ProfileCollection::Measure& ProfileCollection::measure( eCpuBlock which )
{
	uint32_t key = (uint8_t)which;
	key |= 0x10000;
	CComCritSecLock<CComAutoCriticalSection> lock{ critSec };
	return measures[ key ];
}

#if PROFILER_COLLECT_TAGS
ProfileCollection::Measure& ProfileCollection::measure( DirectCompute::eComputeShader which, uint16_t tag )
{
	uint32_t key = (uint8_t)which;
	key = key << 16;
	key |= tag;
	CComCritSecLock<CComAutoCriticalSection> lock{ critSec };
	return taggedShaders[ key ];
}
#endif

namespace
{
	using pfnPrintEnum = const char* ( * )( uint16_t val );

	static const char* printCpuBlock( uint16_t id )
	{
		const eCpuBlock which = (eCpuBlock)id;
		switch( which )
		{
#define V(x) case eCpuBlock::x: return #x
			V( LoadModel );
			V( Run );
			V( Callbacks );
			V( Spectrogram );
			V( Sample );
			V( VAD );
			V( Encode );
			V( Decode );
			V( DecodeStep );
			V( DecodeLayer );
#undef V
		}
		assert( false );
		return nullptr;
	}

	static const char* printGpuBlock( uint16_t id )
	{
		using DirectCompute::eProfilerBlock;
		const eProfilerBlock which = (eProfilerBlock)id;

		switch( which )
		{
#define V(x) case eProfilerBlock::x: return #x
			V( LoadModel );
			V( Run );
			V( Encode );
			V( EncodeLayer );
			V( Decode );
			V( DecodeStep );
			V( DecodeLayer );
#undef V
		}
		assert( false );
		return nullptr;
	}

	static const char* printShader( uint16_t id )
	{
		return DirectCompute::computeShaderName( (DirectCompute::eComputeShader)id );
	}

	static pfnPrintEnum printSectionStart( uint16_t type )
	{
		switch( type )
		{
		case 1:
			logInfo( u8"    CPU Tasks" );
			return &printCpuBlock;
		case 2:
			logInfo( u8"    GPU Tasks" );
			return &printGpuBlock;
		case 3:
			logInfo( u8"    Compute Shaders" );
			return &printShader;
		default:
			return nullptr;
		}
	}

	struct PrintedTime
	{
		double value;
		const char* unit;

		PrintedTime( uint64_t ticks )
		{
			const double dbl = (double)(int64_t)ticks;
			if( ticks >= 10'000'000 )
			{
				value = dbl / 1.0E+7;
				unit = "seconds";
			}
			else if( ticks >= 10'000 )
			{
				value = dbl / 1.0E+4;
				unit = "milliseconds";
			}
			else
			{
				value = dbl / 1.0E+1;
				unit = "microseconds";
			}
		}
		PrintedTime( double dbl )
		{
			if( dbl >= 10'000'000 )
			{
				value = dbl / 1.0E+7;
				unit = "seconds";
			}
			else if( dbl >= 10'000 )
			{
				value = dbl / 1.0E+4;
				unit = "milliseconds";
			}
			else
			{
				value = dbl / 1.0E+1;
				unit = "microseconds";
			}
		}
	};
}

void ProfileCollection::Measure::print( const char* name ) const
{
	PrintedTime total{ totalTicks };
	if( 1 == count )
		logInfo( u8"%s\t%g %s", name, total.value, total.unit );
	else
	{
		PrintedTime avg = (double)totalTicks / (double)(int64_t)count;
		logInfo( u8"%s\t%g %s, %zu calls, %g %s average", name, total.value, total.unit, count, avg.value, avg.unit );
	}
}

#if PROFILER_COLLECT_TAGS
struct TaggedShaderCmp
{
	bool operator()( uint16_t cs, uint32_t key ) const
	{
		return cs < key >> 16;
	}
	bool operator()( uint32_t key, uint16_t cs ) const
	{
		return key >> 16 < cs;
	}
};

void ProfileCollection::TaggedTemp::print() const
{
	PrintedTime total{ ticks };
	if( 1 == count )
		logInfo( u8"  %s\t%g %s", name, total.value, total.unit );
	else
	{
		PrintedTime avg = (double)ticks / (double)(int64_t)count;
		logInfo( u8"  %s\t%g %s, %zu calls, %g %s average", name, total.value, total.unit, count, avg.value, avg.unit );
	}
}
#endif

void ProfileCollection::print()
{
	keysTemp.clear();
	for( POSITION pos = measures.GetStartPosition(); nullptr != pos; )
	{
		auto* p = measures.GetNext( pos );
		if( p->m_value.count == 0 )
			continue;
		keysTemp.push_back( p->m_key );
	}

	std::sort( keysTemp.begin(), keysTemp.end() );
	auto it = std::lower_bound( keysTemp.begin(), keysTemp.end(), 0x30000u );
	if( it != keysTemp.end() )
	{
		auto lambda = [ this ]( uint32_t a, uint32_t b )
		{
			const uint64_t ta = measures.Lookup( a )->m_value.totalTicks;
			const uint64_t tb = measures.Lookup( b )->m_value.totalTicks;
			return ta > tb;
		};
		std::stable_sort( it, keysTemp.end(), lambda );
	}

#if PROFILER_COLLECT_TAGS
	taggedKeysTemp.clear();
	for( POSITION pos = taggedShaders.GetStartPosition(); nullptr != pos; )
	{
		auto* p = taggedShaders.GetNext( pos );
		if( p->m_value.count == 0 )
			continue;
		taggedKeysTemp.push_back( p->m_key );
	}
	std::sort( taggedKeysTemp.begin(), taggedKeysTemp.end() );
#endif

	uint16_t prevKeyType = 0;
	pfnPrintEnum pfn = nullptr;
	for( uint32_t k : keysTemp )
	{
		const uint16_t type = (uint16_t)( k >> 16 );
		if( type != prevKeyType )
		{
			prevKeyType = type;
			pfn = printSectionStart( type );
		}
		if( pfn == nullptr )
			continue;
		const auto* p = measures.Lookup( k );
		assert( nullptr != p );
		p->m_value.print( pfn( (uint16_t)k ) );

#if PROFILER_COLLECT_TAGS
		if( type == 3 )	
		{
			// Compute shader
			auto range = std::equal_range( taggedKeysTemp.begin(), taggedKeysTemp.end(), (uint16_t)k, TaggedShaderCmp{} );
			if( range.first != range.second )
			{
				// We have at least 1 tag for that compute shader
				taggedTimes.clear();
				uint64_t totalTicks = 0;
				size_t totalCount = 0;
				for( auto it = range.first; it != range.second; it++ )
				{
					const uint32_t key = *it;
					const uint16_t tagId = (uint16_t)key;
					assert( 0 != tagId );
					const auto* p = taggedShaders.Lookup( key );
					assert( nullptr != p );

					auto& rdi = taggedTimes.emplace_back();
					rdi.ticks = p->m_value.totalTicks;
					totalTicks += p->m_value.totalTicks;

					rdi.count = p->m_value.count;
					totalCount += p->m_value.count;

					rdi.name = tagNames[ tagId ];
				}

				assert( totalCount <= p->m_value.count );
				if( totalCount < p->m_value.count )
				{
					auto& rdi = taggedTimes.emplace_back();
					rdi.ticks = p->m_value.totalTicks - totalTicks;
					rdi.count = p->m_value.count - totalCount;
					rdi.name = tagNames[ 0 ];
				}
				std::stable_sort( taggedTimes.begin(), taggedTimes.end() );
				for( const auto& e : taggedTimes )
					e.print();
			}
		}
#endif
	}
}

void ProfileCollection::reset()
{
	for( POSITION pos = measures.GetStartPosition(); nullptr != pos; )
		measures.GetNextValue( pos ).reset();
}

ProfileCollection::ProfileCollection( const WhisperModel& model )
{
	const __m128i vals = model.getLoadTimes();

	uint64_t s = (uint64_t)_mm_cvtsi128_si64( vals );
	measure( eCpuBlock::LoadModel ).add( s );

	s = (uint64_t)_mm_extract_epi64( vals, 1 );
	measure( DirectCompute::eProfilerBlock::LoadModel ).add( s );
#if PROFILER_COLLECT_TAGS
	// Tag ID 0 means no tag at all. makeTagId() method returns 0 for nullptr name, and starts numbering with 1 for non-empoty tag names
	// Push the tag name corresponding to ID = 0, this way we can index directly with tag IDs.
	tagNames.push_back( "<untagged>" );
#endif
}

uint16_t ProfileCollection::makeTagId( const char* tag )
{
#if PROFILER_COLLECT_TAGS
	if( nullptr == tag )
		return 0;
	auto p = tagIDs.Lookup( tag );
	if( nullptr != p )
		return p->m_value;
	const size_t newTag = tagIDs.GetCount() + 1;
	if( newTag <= 0xFFFF )
	{
		tagIDs.SetAt( tag, (uint16_t)newTag );
		tagNames.push_back( tag );
		return (uint16_t)newTag;
	}
	throw DISP_E_OVERFLOW;
#else
	return 0;
#endif
}