diff options
| author | yum <yum.food.vr@gmail.com> | 2023-02-26 19:42:33 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-02-26 20:09:15 -0800 |
| commit | 1136acfc365f357d2df13a263714e8ae0614c4f9 (patch) | |
| tree | 6109149806673ade4505d956b09d1996034f7cab | |
| parent | 02c2605454288f7c86023ae700366acf08cd2206 (diff) | |
Add retainDuration option to CaptureParams
This allows users to retain a suffix of the PCM buffer after a VAD
segmentation event, reducing some instances of words being lost at
the start of the next VAD window.
| -rw-r--r-- | Whisper/API/MfStructs.h | 3 | ||||
| -rw-r--r-- | Whisper/MF/AudioBuffer.h | 15 | ||||
| -rw-r--r-- | Whisper/Whisper/ContextImpl.capture.cpp | 5 |
3 files changed, 22 insertions, 1 deletions
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
index 39255de..c23d633 100644
--- a/Whisper/API/MfStructs.h
+++ b/Whisper/API/MfStructs.h
@@ -28,6 +28,9 @@ namespace Whisper
 		float maxDuration = 3.0f;
 		float dropStartSilence = 0.25f;
 		float pauseDuration = 0.333f;
+		// After a VAD segmentation event, up to this many seconds from the end of
+		// the captured audio are kept as the start of the next transcription window.
+		float retainDuration = 0.25f;
 		// Flags for the audio capture
 		uint32_t flags = 0;
 	};
diff --git a/Whisper/MF/AudioBuffer.h b/Whisper/MF/AudioBuffer.h
index 77be1e0..63c4a8c 100644
--- a/Whisper/MF/AudioBuffer.h
+++ b/Whisper/MF/AudioBuffer.h
@@ -48,12 +48,27 @@ namespace Whisper
 		void dropFirst(size_t len)
 		{
 			assert(len <= mono.size());
+			if (len >= mono.size()) { // nothing would remain: just empty the buffer
+				mono.clear();
+				return;
+			}
 			size_t remainder = mono.size() - len;
 			auto tmp = std::vector<float>(remainder);
 			memcpy(tmp.data(), mono.data() + len, remainder);
 			mono = std::move(tmp);
 		}
 
+		void retainLast(size_t len) // keep only the trailing `len` samples of `mono`
+		{
+			if (len >= mono.size()) { // buffer already holds no more than `len` samples
+				return;
+			}
+			size_t prefix_len = mono.size() - len; // number of leading samples to discard
+			auto tmp = std::vector<float>(len);
+			memcpy(tmp.data(), mono.data() + prefix_len, len * sizeof(float)); // memcpy counts bytes, not elements
+			mono = std::move(tmp);
+		}
+
 		void normalize()
 		{
 			const auto &min = *std::min_element(mono.begin(), mono.end());
diff --git a/Whisper/Whisper/ContextImpl.capture.cpp b/Whisper/Whisper/ContextImpl.capture.cpp
index bc88249..0100fcd 100644
--- a/Whisper/Whisper/ContextImpl.capture.cpp
+++ b/Whisper/Whisper/ContextImpl.capture.cpp
@@ -53,6 +53,7 @@ namespace
 	struct CaptureParams
 	{
 		uint32_t minDuration, maxDuration, dropStartSilence, pauseDuration;
+		uint32_t retainDuration; // set from cp.retainDuration (seconds) multiplied by SAMPLE_RATE
 		uint32_t flags;
 
 		CaptureParams( const sCaptureParams& cp )
@@ -64,6 +65,8 @@ namespace
 			__m128i ints = _mm_cvtps_epi32( floats );
 			store16( &minDuration, ints );
 
+			retainDuration = (uint32_t)std::round( cp.retainDuration * SAMPLE_RATE ); // read the caller's value, not the uninitialized member
+
 			flags = cp.flags;
 		}
 	};
@@ -142,7 +145,7 @@ namespace
 				buffer.pcm.normalize();
 				SubmitThreadpoolWork( work );
 				pcmStartTime = nextSampleTime;
-				pcm.clear();
+				pcm.retainLast(captureParams.retainDuration); // retainDuration is in samples here
 				vad.clear();
 				return S_OK;
 			}
