Bugfix: commit no longer wipes out audio buffer

Audio data is stored in chunks of frames, not in individual frames. When I commit a transcript, I want to get rid of the portion of the audio data responsible for that particular transcript. I have code that does this, but it was slicing the list as if each sample were stored individually. Extra fun: because we have to decimate mic frames, we have to convert between whisper frames and mic frames to drop the correct amount of audio data.
author: yum <yum.food.vr@gmail.com> 2023-06-28 22:11:46 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-28 22:18:18 -0700
commit: b1efbf5ce1ebd584796d4a57cf9c7b6517f91fac (patch)
tree: c03ed443ebe3fd22ec39db88f186b0e069a18547
parent: bdaeb1911297d7901a12e3ac51b38c3463789279 (diff)
2 files changed, 25 insertions, 12 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 69def6b..236d375 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -2137,9 +2137,6 @@ void Frame::OnAppStart(wxCommandEvent& event) {
         return;
     }
 
-    Log(transcribe_out_, "Commit fuzz threshold str: {}\n", commit_fuzz_threshold_str);
-    Log(transcribe_out_, "Commit fuzz threshold: {}\n", commit_fuzz_threshold);
-
     app_c_->microphone = kMicChoices[which_mic].ToStdString();
     app_c_->language = kLangChoices[which_lang].ToStdString();
     app_c_->language_target = kLangTargetChoices[which_translate_target].ToStdString();
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index d937cb6..4a438b7 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -4,7 +4,7 @@ from datetime import datetime
 from emotes_v2 import EmotesState
 from faster_whisper import WhisperModel
 from functools import partial
-from math import ceil
+from math import floor, ceil
 from playsound import playsound
 from profanity_filter import ProfanityFilter
 from sentence_splitter import split_text_into_sentences
@@ -78,7 +78,7 @@ class AudioState:
         #     Segment start time, end time, and text
         self.ranges_ls = []
         self.frames = []
-        self.drop_frames_till_i = -1
+        self.drop_samples_till_i = -1
 
         # Locks access to `text`.
         self.transcribe_lock = threading.Lock()
@@ -153,10 +153,24 @@ def onAudioFramesAvailable(
             audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
         audio_state.frames = audio_state.frames[-1 * max_frames:]
-    if audio_state.drop_frames_till_i > 0:
-        audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1]
-        audio_state.drop_frames_till_i = -1
-
+    if audio_state.drop_samples_till_i > 0:
+        # Caller wants us to drop this many *whisper* samples, assuming that
+        # we're getting one full frame every (1024 / 16KHz) seconds.
+        # However we really get one full whisper frame a little slower, since
+        # mics usually have a higher sample rate than 16 KHz (see decimation
+        # code above).
+        # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`.
+        #
+        # TODO we should be more careful and avoid dropping an entire chunk
+        # since phonemes can absolutely fit into that amount of time. However
+        # whisper can usually figure it out so it's not a huge deal.
+        n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK
+        n_frames_to_drop *= keep_every
+        n_frames_to_drop = int(ceil(n_frames_to_drop))
+        if audio_state.enable_debug_mode:
+            print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total")
+        audio_state.frames = audio_state.frames[n_frames_to_drop:]
+        audio_state.drop_samples_till_i = -1
 
     return (frames, pyaudio.paContinue)
 
@@ -282,15 +296,15 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             max_edit = audio_state.commit_fuzz_threshold
 
             if audio_state.enable_debug_mode:
-                print(f"c0: {c0}, c1: {c1}, c2: {c2}")
+                print(f"c0: {c0}, c1: {c1}, c2: {c2}, c3: {c3}")
             if c0_c1_d < max_edit and c1_c2_d < max_edit and c2_c3_d < max_edit:
                 # For simplicity, completely reset saved audio ranges.
                 audio_state.ranges_ls = []
                 committed_text = c0[2]
                 if audio_state.enable_debug_mode:
                     print(f"Dropping frames until {c0[1]}")
-                n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
-                audio_state.drop_frames_till_i = n_frames_to_drop
+                n_samples_to_drop = int(ceil(audio_state.RATE * c0[1]))
+                audio_state.drop_samples_till_i = n_samples_to_drop
 
     preview_text = ""
     for seg in ranges:
@@ -357,6 +371,8 @@ def transcribeAudio(audio_state,
                 now - last_transcribe_time,
                 audio_state.preview_text))
             last_transcribe_time = now
+            print(f"Commit text: {text}")
+            print(f"Preview text: {preview_text}")
 
         # Translate if requested.
         translated = audio_state.preview_text
author	yum <yum.food.vr@gmail.com>	2023-06-28 22:11:46 -0700
committer	yum <yum.food.vr@gmail.com>	2023-06-28 22:18:18 -0700
commit	b1efbf5ce1ebd584796d4a57cf9c7b6517f91fac (patch)
tree	c03ed443ebe3fd22ec39db88f186b0e069a18547
parent	bdaeb1911297d7901a12e3ac51b38c3463789279 (diff)