summary | refs | log | tree | commit | diff | stats
path: root/Whisper/CPU/ParallelForRunner.h
diff options
context:
space:
mode:
author: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
committer: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
commit: 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree: 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/CPU/ParallelForRunner.h
parent: 990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Whisper/CPU/ParallelForRunner.h')
-rw-r--r-- Whisper/CPU/ParallelForRunner.h 52
1 files changed, 52 insertions, 0 deletions
diff --git a/Whisper/CPU/ParallelForRunner.h b/Whisper/CPU/ParallelForRunner.h
new file mode 100644
index 0000000..baef647
--- /dev/null
+++ b/Whisper/CPU/ParallelForRunner.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "LargeBuffer.h"
+
+namespace CpuCompute
+{
+ // Callback interface for the parallel `for`; ParallelForRunner::parallelFor() takes a reference to an implementation.
+ __interface iComputeRange
+ {
+ // The runner calls this method on multiple thread pool threads in parallel, and aggregates the returned status codes.
+ HRESULT __stdcall compute( size_t begin, size_t end ) const; // NOTE(review): presumably processes the half-open range [ begin, end ) - confirm against the runner's implementation
+ };
+
+ // Runs a parallel `for` by handing an iComputeRange callback to thread pool threads. Similar to ThreadPoolWork in parallelFor.h, optimized to be used as a direct replacement of OpenMP pool.
+ class alignas( 64 ) ParallelForRunner // the whole object is aligned to 64 bytes
+ {
+ public:
+ ParallelForRunner( int threads ); // NOTE(review): `threads` presumably initializes maxThreads - the implementation is not in this header
+ ~ParallelForRunner(); // NOTE(review): presumably releases the `work` handle - confirm in the implementation
+
+ HRESULT setThreadsCount( int threads ); // NOTE(review): presumably changes the thread limit after construction; reports failure through the HRESULT
+
+ HRESULT parallelFor( iComputeRange& compute, size_t length, size_t minBatch = 1 ); // NOTE(review): presumably `length` is the total item count and `minBatch` the smallest number of items worth giving to one thread - TODO confirm
+
+ // Allocate a temporary buffer for the calling thread.
+ // The returned pointer is guaranteed to be aligned to the page size, 4 KB.
+ void* threadLocalBuffer( size_t cb ); // NOTE(review): `cb` is presumably the requested size in bytes - TODO confirm
+
+ private:
+
+ int maxThreads; // no in-class initializer, unlike the members below; presumably set by the constructor
+ PTP_WORK work = nullptr; // NOTE(review): raw thread pool work handle, yet the class declares no copy/move operations - confirm instances are never copied
+ iComputeRange* computeRange = nullptr; // NOTE(review): presumably the callback of the parallelFor() call in progress - TODO confirm
+ size_t countItems = 0; // NOTE(review): presumably the `length` of the current parallelFor() call - TODO confirm
+ size_t countThreads = 0; // NOTE(review): presumably the number of threads used by the current call - TODO confirm
+
+ // Each ThreadBuffer is aligned to a 64-byte cache line.
+ // Avoiding cache line sharing between CPU cores improves performance, despite wasting a few bytes of memory.
+ struct alignas( 64 ) ThreadBuffer
+ {
+ LargeBuffer memory; // LargeBuffer is presumably declared in LargeBuffer.h, included by this header
+ size_t cb = 0; // NOTE(review): presumably the current size of `memory` in bytes - TODO confirm
+ };
+ std::vector<ThreadBuffer> threadBuffers; // NOTE(review): presumably one entry per thread; <vector> is not included by this header directly
+
+ alignas( 64 ) volatile long threadIndex = 0; // starts on its own 64-byte boundary; NOTE(review): presumably updated with Interlocked* functions by the worker threads - confirm
+ volatile HRESULT status = S_OK; // NOTE(review): presumably the status aggregated from the compute() calls - TODO confirm
+
+ void runBatch( size_t ith ) noexcept; // NOTE(review): `ith` is presumably the zero-based index of the calling thread - TODO confirm
+
+ static void __stdcall workCallbackStatic( PTP_CALLBACK_INSTANCE Instance, void* pv, PTP_WORK Work ) noexcept; // parameter list follows the Win32 thread pool work callback; NOTE(review): `pv` is presumably the ParallelForRunner pointer - confirm
+ };
+} \ No newline at end of file