diff options
| author | yum <yum.food.vr@gmail.com> | 2023-06-27 16:01:16 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-06-27 16:01:16 -0700 |
| commit | 6638993e313773ba6ca8bdb6d7690b798d41f0d4 (patch) | |
| tree | 75ce815ed4cca607ad237a39fd4ee5c313fb8504 /Scripts | |
| parent | 241813a5af11093c6b86e70ada729788c1f0dee6 (diff) | |
Add UI for fuzzy commit threshold
Recap: In the STT (speech-to-text) code there's an algorithm that tries to
determine when a transcript is "stable" enough to commit. If that stability
check is too loose, then accuracy suffers; if it is too strict, then the audio
buffer eventually fills.
To mitigate the problem, I check whether the last N transcripts are
within some Levenshtein edit distance of each other. The
fuzzy matching lets us forgive small instabilities, like differences in
uppercase/lowercase or punctuation, while rejecting large instabilities.
The default threshold of 8 seems to be in the sweet spot between accuracy and
performance, but it will likely be tuned in the future.
Diffstat (limited to 'Scripts')
| -rw-r--r-- | Scripts/transcribe.py | 20 |
1 files changed, 17 insertions, 3 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 6cb78cd..cc1944c 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -59,6 +59,11 @@ class AudioState: # this only applies to keyboard controls. self.reset_on_toggle = True + # The edit distance under which two consecutive transcripts are + # considered to match. This affects how easily `preview_text` + # gets appended to `text`. + self.commit_fuzz_threshold = 8 + # List of: # List of tuples of: # Segment start time, end time, and text @@ -263,7 +268,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st c1_c2_d = editdistance.eval(c1[2], c2[2]) c2_c3_d = editdistance.eval(c2[2], c3[2]) - max_edit = 8 + max_edit = audio_state.commit_fuzz_threshold #print(f"c0: {c0}, c1: {c1}, c2: {c2}") #if c0 == c1 and c1 == c2 and c2 == c3: @@ -607,12 +612,14 @@ def transcribeLoop(mic: str, window_duration_s: int, gpu_idx: int, keyboard_hotkey: str, - reset_on_toggle: bool): + reset_on_toggle: bool, + commit_fuzz_threshold: int): audio_state = getMicStream(mic) audio_state.whisper_language = language audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle + audio_state.commit_fuzz_threshold = commit_fuzz_threshold lang_bits = language_target.split(" | ") if len(lang_bits) == 2: @@ -772,6 +779,7 @@ if __name__ == "__main__": parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. 
For example, ctrl+shift+s") parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") + parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.") args = parser.parse_args() if not args.mic: @@ -809,6 +817,11 @@ if __name__ == "__main__": if not args.gpu_idx: print("--gpu_idx required", file=sys.stderr) sys.exit(1) + + if not args.commit_fuzz_threshold: + print("--commit_fuzz_threshold required", file=sys.stderr) + sys.exit(1) + args.gpu_idx = int(args.gpu_idx) window_duration_s = 120 @@ -875,5 +888,6 @@ if __name__ == "__main__": estate, window_duration_s, args.gpu_idx, args.keybind, - args.reset_on_toggle) + args.reset_on_toggle, + args.commit_fuzz_threshold) |
