Add UI for fuzzy commit threshold

Recap: The speech-to-text (STT) code has an algorithm that tries to determine when a transcript is "stable" enough to commit. If that stability check is too loose, accuracy suffers; if it is too strict, the audio buffer eventually fills up. To mitigate the problem, I check whether the last N transcripts are within some Levenshtein edit distance of each other. The fuzzy matching lets us forgive small instabilities, like differences in uppercase/lowercase or punctuation, while rejecting large instabilities. The default value of 8 seems to be in the sweet spot between accuracy and performance, but it will likely be tuned in the future.
author: yum <yum.food.vr@gmail.com> 2023-06-27 16:01:16 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-27 16:01:16 -0700
commit: 6638993e313773ba6ca8bdb6d7690b798d41f0d4 (patch)
tree: 75ce815ed4cca607ad237a39fd4ee5c313fb8504 /Scripts
parent: 241813a5af11093c6b86e70ada729788c1f0dee6 (diff)
1 files changed, 17 insertions, 3 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 6cb78cd..cc1944c 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -59,6 +59,11 @@ class AudioState:
         # this only applies to keyboard controls.
         self.reset_on_toggle = True
 
+        # The edit distance under which two consecutive transcripts are
+        # considered to match. This affects how easily `preview_text`
+        # gets appended to `text`.
+        self.commit_fuzz_threshold = 8
+
         # List of:
         #   List of tuples of:
         #     Segment start time, end time, and text
@@ -263,7 +268,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             c1_c2_d = editdistance.eval(c1[2], c2[2])
             c2_c3_d = editdistance.eval(c2[2], c3[2])
 
-            max_edit = 8
+            max_edit = audio_state.commit_fuzz_threshold
 
             #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
             #if c0 == c1 and c1 == c2 and c2 == c3:
@@ -607,12 +612,14 @@ def transcribeLoop(mic: str,
         window_duration_s: int,
         gpu_idx: int,
         keyboard_hotkey: str,
-        reset_on_toggle: bool):
+        reset_on_toggle: bool,
+        commit_fuzz_threshold: int):
     audio_state = getMicStream(mic)
     audio_state.whisper_language = language
     audio_state.language = langcodes.find(language).language
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
+    audio_state.commit_fuzz_threshold = commit_fuzz_threshold
 
     lang_bits = language_target.split(" | ")
     if len(lang_bits) == 2:
@@ -772,6 +779,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
     parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
     parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
+    parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
     args = parser.parse_args()
 
     if not args.mic:
@@ -809,6 +817,11 @@ if __name__ == "__main__":
     if not args.gpu_idx:
         print("--gpu_idx required", file=sys.stderr)
         sys.exit(1)
+
+    if not args.commit_fuzz_threshold:
+        print("--commit_fuzz_threshold required", file=sys.stderr)
+        sys.exit(1)
+
     args.gpu_idx = int(args.gpu_idx)
 
     window_duration_s = 120
@@ -875,5 +888,6 @@ if __name__ == "__main__":
             estate, window_duration_s,
             args.gpu_idx,
             args.keybind,
-            args.reset_on_toggle)
+            args.reset_on_toggle,
+            args.commit_fuzz_threshold)
author	yum <yum.food.vr@gmail.com>	2023-06-27 16:01:16 -0700
committer	yum <yum.food.vr@gmail.com>	2023-06-27 16:01:16 -0700
commit	6638993e313773ba6ca8bdb6d7690b798d41f0d4 (patch)
tree	75ce815ed4cca607ad237a39fd4ee5c313fb8504 /Scripts
parent	241813a5af11093c6b86e70ada729788c1f0dee6 (diff)