diff options
| author | yum <yum.food.vr@gmail.com> | 2023-06-28 22:11:46 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-06-28 22:18:18 -0700 |
| commit | b1efbf5ce1ebd584796d4a57cf9c7b6517f91fac (patch) | |
| tree | c03ed443ebe3fd22ec39db88f186b0e069a18547 | |
| parent | bdaeb1911297d7901a12e3ac51b38c3463789279 (diff) | |
Bugfix: commit no longer wipes out audio buffer
Audio data is stored as a list of chunks, each holding many samples, not
as a list of individual samples.
When I commit a transcript, I want to get rid of the portion of the
audio data responsible for that particular transcript. I have code that
does this, but it was slicing the chunk list using a sample count, as if
each list element were a single sample, so it dropped far too much audio.
Extra fun: Because we have to decimate mic frames, we have to convert
between whisper frames and mic frames to drop the correct amount of
audio data.
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 3 | ||||
| -rw-r--r-- | Scripts/transcribe.py | 34 |
2 files changed, 25 insertions, 12 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 69def6b..236d375 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -2137,9 +2137,6 @@ void Frame::OnAppStart(wxCommandEvent& event) { return;
}
- Log(transcribe_out_, "Commit fuzz threshold str: {}\n", commit_fuzz_threshold_str);
- Log(transcribe_out_, "Commit fuzz threshold: {}\n", commit_fuzz_threshold);
-
app_c_->microphone = kMicChoices[which_mic].ToStdString();
app_c_->language = kLangChoices[which_lang].ToStdString();
app_c_->language_target = kLangTargetChoices[which_translate_target].ToStdString();
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index d937cb6..4a438b7 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -4,7 +4,7 @@ from datetime import datetime from emotes_v2 import EmotesState from faster_whisper import WhisperModel from functools import partial -from math import ceil +from math import floor, ceil from playsound import playsound from profanity_filter import ProfanityFilter from sentence_splitter import split_text_into_sentences @@ -78,7 +78,7 @@ class AudioState: # Segment start time, end time, and text self.ranges_ls = [] self.frames = [] - self.drop_frames_till_i = -1 + self.drop_samples_till_i = -1 # Locks access to `text`. self.transcribe_lock = threading.Lock() @@ -153,10 +153,24 @@ def onAudioFramesAvailable( audio_state.CHUNK) if len(audio_state.frames) > max_frames: audio_state.frames = audio_state.frames[-1 * max_frames:] - if audio_state.drop_frames_till_i > 0: - audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1] - audio_state.drop_frames_till_i = -1 - + if audio_state.drop_samples_till_i > 0: + # Caller wants us to keep this many *whisper* samples, assuming that + # we're getting one full frame every (1024 / 16KHz) seconds. + # However we really get one full whisper frame a little slower, since + # mics usually have a higher sample rate than 16 KHz (see decimation + # code above). + # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`. + # + # TODO we should be more careful and avoid dropping an entire chunk + # since phonemes can absolutely fit into that amount of time. However + # whisper can usually figure it out so it's not a huge deal. 
+ n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK + n_frames_to_drop *= keep_every + n_frames_to_drop = int(ceil(n_frames_to_drop)) + if audio_state.enable_debug_mode: + print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total") + audio_state.frames = audio_state.frames[n_frames_to_drop:] + audio_state.drop_samples_till_i = -1 return (frames, pyaudio.paContinue) @@ -282,15 +296,15 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st max_edit = audio_state.commit_fuzz_threshold if audio_state.enable_debug_mode: - print(f"c0: {c0}, c1: {c1}, c2: {c2}") + print(f"c0: {c0}, c1: {c1}, c2: {c2}, c3: {c3}") if c0_c1_d < max_edit and c1_c2_d < max_edit and c2_c3_d < max_edit: # For simplicity, completely reset saved audio ranges. audio_state.ranges_ls = [] committed_text = c0[2] if audio_state.enable_debug_mode: print(f"Dropping frames until {c0[1]}") - n_frames_to_drop = int(ceil(audio_state.RATE * c0[1])) - audio_state.drop_frames_till_i = n_frames_to_drop + n_samples_to_drop = int(ceil(audio_state.RATE * c0[1])) + audio_state.drop_samples_till_i = n_samples_to_drop preview_text = "" for seg in ranges: @@ -357,6 +371,8 @@ def transcribeAudio(audio_state, now - last_transcribe_time, audio_state.preview_text)) last_transcribe_time = now + print(f"Commit text: {text}") + print(f"Preview text: {preview_text}") # Translate if requested. translated = audio_state.preview_text |
