diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/CPU/ParallelForRunner.h | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'Whisper/CPU/ParallelForRunner.h')
| -rw-r--r-- | Whisper/CPU/ParallelForRunner.h | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/Whisper/CPU/ParallelForRunner.h b/Whisper/CPU/ParallelForRunner.h new file mode 100644 index 0000000..baef647 --- /dev/null +++ b/Whisper/CPU/ParallelForRunner.h @@ -0,0 +1,52 @@ +#pragma once +#include "LargeBuffer.h" + +namespace CpuCompute +{ + // Callback interface for the parallel `for` + __interface iComputeRange + { + // The implementation calls this method on multiple thread pool threads in parallel, and aggregates status codes. + HRESULT __stdcall compute( size_t begin, size_t end ) const; + }; + + // Similar to ThreadPoolWork in parallelFor.h, optimized to be used as a direct replacement of OpenMP pool. + class alignas( 64 ) ParallelForRunner + { + public: + ParallelForRunner( int threads ); + ~ParallelForRunner(); + + HRESULT setThreadsCount( int threads ); + + HRESULT parallelFor( iComputeRange& compute, size_t length, size_t minBatch = 1 ); + + // Allocate a temporary buffer for the calling thread. + // The pointer is guaranteed to be aligned by page size = 4kb + void* threadLocalBuffer( size_t cb ); + + private: + + int maxThreads; + PTP_WORK work = nullptr; + iComputeRange* computeRange = nullptr; + size_t countItems = 0; + size_t countThreads = 0; + + // Aligning by cache lines. + // Avoiding cache line sharing between CPU cores improves performance, despite wasting a few bytes of memory. + struct alignas( 64 ) ThreadBuffer + { + LargeBuffer memory; + size_t cb = 0; + }; + std::vector<ThreadBuffer> threadBuffers; + + alignas( 64 ) volatile long threadIndex = 0; + volatile HRESULT status = S_OK; + + void runBatch( size_t ith ) noexcept; + + static void __stdcall workCallbackStatic( PTP_CALLBACK_INSTANCE Instance, void* pv, PTP_WORK Work ) noexcept; + }; +}
\ No newline at end of file |
