Deprecate commit similarity threshold

This is now dynamically set inside transcribe.py. As the buffer grows long, the threshold grows exponentially, keeping the buffer short. The threshold starts small so that transcription starts strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
author: yum <yum.food.vr@gmail.com> 2023-08-30 17:45:53 -0700
committer: yum <yum.food.vr@gmail.com> 2023-08-30 17:45:53 -0700
commit: 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch)
tree: 0750a03f9620fe8b8bf66355feb399efb3ec29da /Scripts
parent: 358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff)
1 files changed, 20 insertions, 2 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 5301b0b..cea2da0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -66,7 +66,7 @@ class AudioState:
         # The edit distance under which two consecutive transcripts are
         # considered to match. This affects how easily `preview_text`
         # gets appended to `text`.
-        self.commit_fuzz_threshold = 8
+        self.commit_fuzz_threshold = 1
 
         # If set, profanity in transcriptions will have their vowels replaced
         # with asterisks. Only works in English.
@@ -157,6 +157,19 @@ def onAudioFramesAvailable(
     if not audio_state.audio_paused:
         audio_state.frames.append(decimated)
 
+    # If buffer is getting long, tell the transcription loop to be more ready
+    # to accept transcripts.
+    fps = int(input_rate / audio_state.CHUNK)
+    cur_len_s = len(audio_state.frames) / fps
+    double_at_s = 3.0
+    double_every_s = 1.5
+    delta_s = cur_len_s - double_at_s
+    n_doubles = ceil(delta_s / double_every_s)
+    if n_doubles >= 1:
+        audio_state.commit_fuzz_threshold = 2 ** n_doubles
+    else:
+        audio_state.commit_fuzz_threshold = 1
+
     max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
             audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
@@ -380,7 +393,12 @@ def transcribeAudio(audio_state,
             if audio_state.enable_debug_mode:
                 print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
+            # Prevent audio buffer from holding more than 1 second of silence
+            # before real speech.
+            audio_state.MAX_LENGTH_S = 1
             continue
+        else:
+            audio_state.MAX_LENGTH_S = 300
 
         if audio_state.drop_transcription:
             audio_state.drop_transcription = False
@@ -720,7 +738,7 @@ def transcribeLoop(mic: str,
     audio_state.language = langcodes.find(language).language
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
-    audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+    #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
     audio_state.enable_debug_mode = enable_debug_mode
     audio_state.enable_profanity_filter = enable_profanity_filter
author	yum <yum.food.vr@gmail.com>	2023-08-30 17:45:53 -0700
committer	yum <yum.food.vr@gmail.com>	2023-08-30 17:45:53 -0700
commit	4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch)
tree	0750a03f9620fe8b8bf66355feb399efb3ec29da /Scripts
parent	358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff)