Enforce minimum 5.0 second duration on audio buffer

The new commit logic would reduce the audio buffer to a size smaller than this, causing the STT model to hallucinate phrases such as "See you next time!", "Thanks for watching!", and "Bye!". The hope is that keeping the buffer at least 5.0 seconds long, as described in the Whisper paper, will cut down on these events.
author: yum <yum.food.vr@gmail.com> 2023-07-06 17:36:14 -0700
committer: yum <yum.food.vr@gmail.com> 2023-07-06 17:36:14 -0700
commit: d0d3b18ad0a859e5e7a1cc5b8a569349b505c924 (patch)
tree: 61b121c750a6d9f3050d481e6cea3cc47b7b6bf0 /Scripts
parent: 76ae7c28ea6224b2c919122d5dc71bcc00a0ecaa (diff)
1 files changed, 14 insertions, 0 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 039811c..5d2d893 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -170,6 +170,20 @@ def onAudioFramesAvailable(
         audio_state.frames = audio_state.frames[n_frames_to_drop:]
         audio_state.drop_samples_till_i = -1
 
+    # Now enforce a minimum duration on frames. This reduces cases where the
+    # STT hallucinates random things. In the Whisper paper, they enforce a
+    # minimum audio buffer duration of 5.0 seconds, so I do the same here.
+    empty_chunk = [0] * int(audio_state.CHUNK / keep_every)
+    chunk_duration_s = float(audio_state.CHUNK) / audio_state.RATE
+    cur_duration_s = len(audio_state.frames) * chunk_duration_s
+    desired_min_duration_s = 5.0
+    delta_duration_s = desired_min_duration_s - cur_duration_s
+    if delta_duration_s > 0:
+        delta_chunks = int(ceil(delta_duration_s / chunk_duration_s))
+        if audio_state.enable_debug_mode:
+            print(f"Padding with {delta_duration_s} seconds ({delta_chunks} chunks) of silence")
+        audio_state.frames = empty_chunk * delta_chunks + audio_state.frames
+
     return (frames, pyaudio.paContinue)
 
 def getMicStream(which_mic) -> AudioState:
author	yum <yum.food.vr@gmail.com>	2023-07-06 17:36:14 -0700
committer	yum <yum.food.vr@gmail.com>	2023-07-06 17:36:14 -0700
commit	d0d3b18ad0a859e5e7a1cc5b8a569349b505c924 (patch)
tree	61b121c750a6d9f3050d481e6cea3cc47b7b6bf0 /Scripts
parent	76ae7c28ea6224b2c919122d5dc71bcc00a0ecaa (diff)