diff options
| author | yum <yum.food.vr@gmail.com> | 2023-08-30 17:45:53 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-08-30 17:45:53 -0700 |
| commit | 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch) | |
| tree | 0750a03f9620fe8b8bf66355feb399efb3ec29da /Scripts | |
| parent | 358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff) | |
Deprecate commit similarity threshold
This is now dynamically set inside transcribe.py.
As the buffer grows long, the threshold grows exponentially, keeping the
buffer short. The threshold starts small so that transcription starts
strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
Diffstat (limited to 'Scripts')
| -rw-r--r-- | Scripts/transcribe.py | 22 |
1 files changed, 20 insertions, 2 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 5301b0b..cea2da0 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -66,7 +66,7 @@ class AudioState: # The edit distance under which two consecutive transcripts are # considered to match. This affects how easily `preview_text` # gets appended to `text`. - self.commit_fuzz_threshold = 8 + self.commit_fuzz_threshold = 1 # If set, profanity in transcriptions will have their vowels replaced # with asterisks. Only works in English. @@ -157,6 +157,19 @@ def onAudioFramesAvailable( if not audio_state.audio_paused: audio_state.frames.append(decimated) + # If buffer is getting long, tell the transcription loop to be more ready + # to accept transcripts. + fps = int(input_rate / audio_state.CHUNK) + cur_len_s = len(audio_state.frames) / fps + double_at_s = 3.0 + double_every_s = 1.5 + delta_s = cur_len_s - double_at_s + n_doubles = ceil(delta_s / double_every_s) + if n_doubles >= 1: + audio_state.commit_fuzz_threshold = 2 ** n_doubles + else: + audio_state.commit_fuzz_threshold = 1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: @@ -380,7 +393,12 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() + # Prevent audio buffer from holding more than 1 second of silence + # before real speech. 
+ audio_state.MAX_LENGTH_S = 1 continue + else: + audio_state.MAX_LENGTH_S = 300 if audio_state.drop_transcription: audio_state.drop_transcription = False @@ -720,7 +738,7 @@ def transcribeLoop(mic: str, audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - audio_state.commit_fuzz_threshold = commit_fuzz_threshold + #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter |
