diff options
| author | yum <yum.food.vr@gmail.com> | 2023-07-08 15:57:39 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-07-08 15:57:39 -0700 |
| commit | 80f46a7a346e73c94a3bb8ae01099743020ef2a4 (patch) | |
| tree | d253958f84e52c0ae12f7f8670374abd7aed3876 | |
| parent | 5db7426bb14b7e51275c14d8173bd67e8addc4ce (diff) | |
Commit logic now drops parts of frames
We used to drop entire frames only, leading to situations where more
audio is dropped than desired. Now we drop whole frames first, then trim
the next frame down to the individual audio sample requested.
| -rw-r--r-- | Scripts/transcribe.py | 14 |
1 files changed, 8 insertions, 6 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index a1b4e8e..f0b0338 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -4,7 +4,7 @@ from datetime import datetime from emotes_v2 import EmotesState from faster_whisper import WhisperModel from functools import partial -from math import ceil +from math import ceil, floor from playsound import playsound from profanity_filter import ProfanityFilter from sentence_splitter import split_text_into_sentences @@ -161,16 +161,18 @@ def onAudioFramesAvailable( # mics usually have a higher sample rate than 16 KHz (see decimation # code above). # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`. - # - # TODO we should be more careful and avoid dropping an entire chunk - # since phonemes can absolutely fit into that amount of time. However - # whisper can usually figure it out so it's not a huge deal. n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK n_frames_to_drop *= keep_every - n_frames_to_drop = int(ceil(n_frames_to_drop)) + n_frames_to_drop = int(floor(n_frames_to_drop)) if audio_state.enable_debug_mode: print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total") + # First drop every whole chunk audio_state.frames = audio_state.frames[n_frames_to_drop:] + # Then drop the part of the most recent chunk we no longer want + if len(audio_state.frames) > 0: + n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK)) + bytes_per_sample = 2 + audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] audio_state.drop_samples_till_i = -1 # Now enforce a minimum duration on frames. This reduces cases where the |
