From 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Wed, 30 Aug 2023 17:45:53 -0700
Subject: Deprecate commit similarity threshold

This is now dynamically set inside transcribe.py.

As the buffer grows long, the threshold grows exponentially, keeping the
buffer short. The threshold starts small so that transcription starts
strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
---
 Scripts/transcribe.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'Scripts')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 5301b0b..cea2da0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -66,7 +66,7 @@ class AudioState:
         # The edit distance under which two consecutive transcripts are
         # considered to match. This affects how easily `preview_text`
         # gets appended to `text`.
-        self.commit_fuzz_threshold = 8
+        self.commit_fuzz_threshold = 1
 
         # If set, profanity in transcriptions will have their vowels replaced
         # with asterisks. Only works in English.
@@ -157,6 +157,19 @@ def onAudioFramesAvailable(
     if not audio_state.audio_paused:
         audio_state.frames.append(decimated)
 
+    # If buffer is getting long, tell the transcription loop to be more ready
+    # to accept transcripts.
+    fps = int(input_rate / audio_state.CHUNK)
+    cur_len_s = len(audio_state.frames) / fps
+    double_at_s = 3.0
+    double_every_s = 1.5
+    delta_s = cur_len_s - double_at_s
+    n_doubles = ceil(delta_s / double_every_s)
+    if n_doubles >= 1:
+        audio_state.commit_fuzz_threshold = 2 ** n_doubles
+    else:
+        audio_state.commit_fuzz_threshold = 1
+
     max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
             audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
@@ -380,7 +393,12 @@ def transcribeAudio(audio_state,
             if audio_state.enable_debug_mode:
                 print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
+            # Prevent audio buffer from holding more than 1 second of silence
+            # before real speech.
+            audio_state.MAX_LENGTH_S = 1
             continue
+        else:
+            audio_state.MAX_LENGTH_S = 300
 
         if audio_state.drop_transcription:
             audio_state.drop_transcription = False
@@ -720,7 +738,7 @@ def transcribeLoop(mic: str,
     audio_state.language = langcodes.find(language).language
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
-    audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+    #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
     audio_state.enable_debug_mode = enable_debug_mode
     audio_state.enable_profanity_filter = enable_profanity_filter
 
-- 
cgit v1.2.3