From 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 Mon Sep 17 00:00:00 2001 From: yum Date: Wed, 30 Aug 2023 17:45:53 -0700 Subject: Deprecate commit similarity threshold This is now dynamically set inside transcribe.py. As the buffer grows long, the threshold grows exponentially, keeping the buffer short. The threshold starts small so that transcription starts strict (accurate, slow) and gets looser (inaccurate, fast) as needed. --- Scripts/transcribe.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'Scripts') diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 5301b0b..cea2da0 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -66,7 +66,7 @@ class AudioState: # The edit distance under which two consecutive transcripts are # considered to match. This affects how easily `preview_text` # gets appended to `text`. - self.commit_fuzz_threshold = 8 + self.commit_fuzz_threshold = 1 # If set, profanity in transcriptions will have their vowels replaced # with asterisks. Only works in English. @@ -157,6 +157,19 @@ def onAudioFramesAvailable( if not audio_state.audio_paused: audio_state.frames.append(decimated) + # If buffer is getting long, tell the transcription loop to be more ready + # to accept transcripts. 
+ fps = int(input_rate / audio_state.CHUNK) + cur_len_s = len(audio_state.frames) / fps + double_at_s = 3.0 + double_every_s = 1.5 + delta_s = cur_len_s - double_at_s + n_doubles = ceil(delta_s / double_every_s) + if n_doubles >= 1: + audio_state.commit_fuzz_threshold = 2 ** n_doubles + else: + audio_state.commit_fuzz_threshold = 1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: @@ -380,7 +393,12 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() + # Prevent audio buffer from holding more than 1 second of silence + # before real speech. + audio_state.MAX_LENGTH_S = 1 continue + else: + audio_state.MAX_LENGTH_S = 300 if audio_state.drop_transcription: audio_state.drop_transcription = False @@ -720,7 +738,7 @@ def transcribeLoop(mic: str, audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - audio_state.commit_fuzz_threshold = commit_fuzz_threshold + #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter -- cgit v1.2.3