summaryrefslogtreecommitdiffstats
path: root/Whisper/Utils/GpuProfiler.h
blob: c6dcd749dfa5db324957709eabb432cbcffd6d14 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#pragma once
#include "../D3D/device.h"
#include "ProfileCollection.h"
#include "DelayExecution.h"

namespace DirectCompute
{
	// Identifiers of the profiler blocks.
	// Every value is a multiple of 0x1000: a small index in the top 4 bits, with the low 12 bits left at zero.
	// NOTE(review): presumably this keeps block IDs apart from eComputeShader values which share the uint16_t key space — confirm in the .cpp
	enum class eProfilerBlock : uint16_t
	{
		LoadModel = 0x1000,		// 1 << 12
		Run = 0x2000,			// 2 << 12
		Encode = 0x3000,		// 3 << 12
		EncodeLayer = 0x4000,	// 4 << 12
		Decode = 0x5000,		// 5 << 12
		DecodeStep = 0x6000,	// 6 << 12
		DecodeLayer = 0x7000,	// 7 << 12
	};

	// Opaque declaration of the compute shader IDs: only the name and the uint16_t underlying type are visible here,
	// the enumerators are not listed in this header. GpuProfiler::computeShader takes a value of this type.
	enum struct eComputeShader : uint16_t;

	// Collects GPU timestamps for profiler blocks and compute shaders.
	// The measures are associated with the Whisper::ProfileCollection passed to the constructor.
	class GpuProfiler
	{
		DelayExecution delay;
		// NOTE(review): presumably the timestamp-disjoint query which supplies the GPU clock frequency — confirm in the .cpp
		CComPtr<ID3D11Query> disjoint;

		// Kind of timestamp carried by a queue entry
		enum struct eEvent
		{
			None = 0,
			BlockStart,
			BlockEnd,
			Shader
		};

		struct BlockState;
		// Sentinel shader ID with all 16 bits set: the default of Queue::submit, and the initial value of BlockState::prevShader
		static constexpr uint16_t EmptyShader = ~(uint16_t)0;

		// A circular buffer with in-flight queries which feeds timestamps into the iTimestampSink interface
		// NOTE(review): iTimestampSink is not declared in this header; the declarations below deliver timestamps
		// through BlockState::haveTimestamp — confirm whether that interface name is stale
		class Queue
		{
			static constexpr size_t queueLength = 32;

			// Ring buffer for individual measures
			struct Entry
			{
				CComPtr<ID3D11Query> query;
				// Every field carries a default, so a default-constructed queue never holds indeterminate values
				BlockState* block = nullptr;
				eEvent event = eEvent::None;
				uint16_t shader = EmptyShader;
#if PROFILER_COLLECT_TAGS
				uint16_t tag = 0;
#endif
				void join( GpuProfiler& owner );
			};

			GpuProfiler& owner;
			std::array<Entry, queueLength> queue;
			// Index into the queue array; starts at 0
			size_t nextEntry = 0;

		public:
			// Only stores the reference; the profiler is not used during construction
			Queue( GpuProfiler& gp ) : owner( gp ) {}

			HRESULT create();

			// Begin a next query. Eventually, this will result in the BlockState.haveTimestamp callback
			void submit( BlockState* block, eEvent evt, uint16_t shader = EmptyShader, uint16_t tag = 0 );

			// Wait for all the pending queries, and call their callbacks
			void join();
		};
		Queue queries;

		struct sProfilerData;
		// State of a single profiler block; the members default to "nothing measured yet"
		struct BlockState
		{
			int64_t timeStart = -1;
			sProfilerData* destBlock = nullptr;
			int64_t shaderStart = -1;
			uint16_t prevShader = EmptyShader;
			uint16_t prevShaderTag = 0;
			BlockState* parentBlock = nullptr;
			// Callback for a completed timestamp query, see Queue::submit
			void haveTimestamp( eEvent evt, uint16_t cs, uint16_t tag, uint64_t time, GpuProfiler& profiler );
		private:
			void completePrevShader( uint64_t time, GpuProfiler& profiler );
		};
		CAtlMap<eProfilerBlock, BlockState> blockStates;
		std::vector<BlockState*> stack;

		// Accumulator for one measure
		struct sProfilerData
		{
			// Count of accumulated measures
			size_t callsPending = 0;
			// Total time spent running all instances of that measure, expressed in GPU ticks
			uint64_t timePending = 0;

			Whisper::ProfileCollection::Measure* dest = nullptr;

			inline void makeTime( uint64_t freq );
			inline void addPending( int64_t time );
			inline void reset();
			inline void dropPending();

			// reset() is defined out of line; the in-class defaults above keep the fields determinate regardless of what it touches
			sProfilerData()
			{
				reset();
			}
		};

		CAtlMap<uint16_t, sProfilerData> results;
#if PROFILER_COLLECT_TAGS
		CAtlMap<uint32_t, sProfilerData> resultsTagged;
#endif
		void resultsMakeTime( uint64_t freq );
		void resultsDropPending();
		void resultsReset();

		void blockStart( eProfilerBlock which );
		void blockEnd();

		Whisper::ProfileCollection& dest;
#if PROFILER_COLLECT_TAGS
		uint16_t m_nextTag = 0;
#endif
	public:

		// The initializers are listed in declaration order, which is the order they actually run in:
		// `queries` is declared before `dest`
		GpuProfiler( Whisper::ProfileCollection& pc ) :
			queries( *this ), dest( pc ) { }

		// NOTE(review): maxDepth is presumably the deepest supported nesting of blocks — confirm in the .cpp
		HRESULT create( size_t maxDepth = 3 );

		// Scope guard for a profiler block: the constructor calls blockStart, the destructor calls blockEnd.
		// Move-constructible only; a moved-from guard does nothing on destruction.
		class BlockRaii
		{
			GpuProfiler* profiler;

		public:
			BlockRaii( GpuProfiler& owner, eProfilerBlock which ) :
				profiler( &owner )
			{
				owner.blockStart( which );
			}
			~BlockRaii()
			{
				if( nullptr == profiler )
					return;	// Moved-from guard, the block is owned by another instance
				profiler->blockEnd();
				profiler = nullptr;
			}
			BlockRaii( BlockRaii&& that ) noexcept :
				profiler( that.profiler )
			{
				that.profiler = nullptr;
			}
			BlockRaii( const BlockRaii& ) = delete;
			void operator=( const BlockRaii& ) = delete;
			void operator=( BlockRaii&& ) = delete;
		};

		// Start the specified block; the block ends when the returned guard is destroyed.
		// Keep the result in a local variable, a discarded temporary ends the block immediately.
		BlockRaii block( eProfilerBlock which )
		{
			return BlockRaii{ *this, which };
		}

		void computeShader( eComputeShader cs );

		// NOTE(review): judging by the name, enables per-shader measures in addition to blocks; off by default
		bool profileShaders = false;

		// Forwards to ProfileCollection::cpuBlock of the destination collection, returning whatever it returns
		decltype( auto ) cpuBlock( Whisper::eCpuBlock block )
		{
			return dest.cpuBlock( block );
		}
		// The destination collection passed to the constructor
		Whisper::ProfileCollection& profiler() { return dest; }

		// Set tag string for the next compute shader
		// The string should be read-only: for performance reasons the implementation neither copies nor compares any strings, it only keeps the pointer
		// Without PROFILER_COLLECT_TAGS the argument is ignored, and the method returns 0
#if PROFILER_COLLECT_TAGS
		uint16_t setNextTag( const char* name );
#else
		inline uint16_t setNextTag( const char* ) { return 0; }
#endif

		// Set the numeric tag for the next compute shader; does nothing without PROFILER_COLLECT_TAGS
		void setNextTag( uint16_t tag )
		{
#if PROFILER_COLLECT_TAGS
			m_nextTag = tag;
#endif
		}
	};
}