summaryrefslogtreecommitdiffstats
path: root/Scripts/transcribe.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-08-30 17:45:53 -0700
committeryum <yum.food.vr@gmail.com>2023-08-30 17:45:53 -0700
commit4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch)
tree0750a03f9620fe8b8bf66355feb399efb3ec29da /Scripts/transcribe.py
parent358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff)
Deprecate commit similarity threshold
This is now dynamically set inside transcribe.py. As the buffer grows long, the threshold grows exponentially, keeping the buffer short. The threshold starts small so that transcription starts strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
Diffstat (limited to 'Scripts/transcribe.py')
-rw-r--r--Scripts/transcribe.py22
1 files changed, 20 insertions, 2 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 5301b0b..cea2da0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -66,7 +66,7 @@ class AudioState:
# The edit distance under which two consecutive transcripts are
# considered to match. This affects how easily `preview_text`
# gets appended to `text`.
- self.commit_fuzz_threshold = 8
+ self.commit_fuzz_threshold = 1
# If set, profanity in transcriptions will have their vowels replaced
# with asterisks. Only works in English.
@@ -157,6 +157,19 @@ def onAudioFramesAvailable(
if not audio_state.audio_paused:
audio_state.frames.append(decimated)
+ # If buffer is getting long, tell the transcription loop to be more ready
+ # to accept transcripts.
+ fps = int(input_rate / audio_state.CHUNK)
+ cur_len_s = len(audio_state.frames) / fps
+ double_at_s = 3.0
+ double_every_s = 1.5
+ delta_s = cur_len_s - double_at_s
+ n_doubles = ceil(delta_s / double_every_s)
+ if n_doubles >= 1:
+ audio_state.commit_fuzz_threshold = 2 ** n_doubles
+ else:
+ audio_state.commit_fuzz_threshold = 1
+
max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
@@ -380,7 +393,12 @@ def transcribeAudio(audio_state,
if audio_state.enable_debug_mode:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
+ # Prevent audio buffer from holding more than 1 second of silence
+ # before real speech.
+ audio_state.MAX_LENGTH_S = 1
continue
+ else:
+ audio_state.MAX_LENGTH_S = 300
if audio_state.drop_transcription:
audio_state.drop_transcription = False
@@ -720,7 +738,7 @@ def transcribeLoop(mic: str,
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
audio_state.reset_on_toggle = reset_on_toggle
- audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+ #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
audio_state.enable_debug_mode = enable_debug_mode
audio_state.enable_profanity_filter = enable_profanity_filter