Commit logic now drops parts of frames

Previously we could only drop entire frames, which sometimes dropped more audio than desired. Now we drop audio down to the precision of the individual sample requested.
author: yum <yum.food.vr@gmail.com> 2023-07-08 15:57:39 -0700
committer: yum <yum.food.vr@gmail.com> 2023-07-08 15:57:39 -0700
commit: 80f46a7a346e73c94a3bb8ae01099743020ef2a4 (patch)
tree: d253958f84e52c0ae12f7f8670374abd7aed3876
parent: 5db7426bb14b7e51275c14d8173bd67e8addc4ce (diff)
1 files changed, 8 insertions, 6 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index a1b4e8e..f0b0338 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -4,7 +4,7 @@ from datetime import datetime
 from emotes_v2 import EmotesState
 from faster_whisper import WhisperModel
 from functools import partial
-from math import ceil
+from math import ceil, floor
 from playsound import playsound
 from profanity_filter import ProfanityFilter
 from sentence_splitter import split_text_into_sentences
@@ -161,16 +161,18 @@ def onAudioFramesAvailable(
         # mics usually have a higher sample rate than 16 KHz (see decimation
         # code above).
         # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`.
-        #
-        # TODO we should be more careful and avoid dropping an entire chunk
-        # since phonemes can absolutely fit into that amount of time. However
-        # whisper can usually figure it out so it's not a huge deal.
         n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK
         n_frames_to_drop *= keep_every
-        n_frames_to_drop = int(ceil(n_frames_to_drop))
+        n_frames_to_drop = int(floor(n_frames_to_drop))
         if audio_state.enable_debug_mode:
             print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total")
+        # First drop every whole chunk
         audio_state.frames = audio_state.frames[n_frames_to_drop:]
+        # Then drop the leading part of the first remaining chunk that we no longer want
+        if len(audio_state.frames) > 0:
+            n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK))
+            bytes_per_sample = 2
+            audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
         audio_state.drop_samples_till_i = -1
 
     # Now enforce a minimum duration on frames. This reduces cases where the
author	yum <yum.food.vr@gmail.com>	2023-07-08 15:57:39 -0700
committer	yum <yum.food.vr@gmail.com>	2023-07-08 15:57:39 -0700
commit	80f46a7a346e73c94a3bb8ae01099743020ef2a4 (patch)
tree	d253958f84e52c0ae12f7f8670374abd7aed3876
parent	5db7426bb14b7e51275c14d8173bd67e8addc4ce (diff)