summaryrefslogtreecommitdiffstats
path: root/Scripts
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2023-06-27 16:01:16 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-27 16:01:16 -0700
commit: 6638993e313773ba6ca8bdb6d7690b798d41f0d4 (patch)
tree: 75ce815ed4cca607ad237a39fd4ee5c313fb8504 /Scripts
parent: 241813a5af11093c6b86e70ada729788c1f0dee6 (diff)
Add UI for fuzzy commit threshold
Recap: The speech-to-text (STT) pipeline has an algorithm that tries to determine when a transcript is "stable" enough to commit. If that stability check is too loose, then accuracy suffers; if it is too strict, then the audio buffer eventually fills. To mitigate the problem, I check whether the last N transcripts are within some Levenshtein edit distance of each other. The fuzzy matching lets us forgive small instabilities, like differences in uppercase/lowercase or punctuation, while rejecting large instabilities. The default threshold of 8 seems to be in the sweet spot of accuracy & performance, but it will likely be tuned in the future.
Diffstat (limited to 'Scripts')
-rw-r--r--Scripts/transcribe.py20
1 files changed, 17 insertions, 3 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 6cb78cd..cc1944c 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -59,6 +59,11 @@ class AudioState:
# this only applies to keyboard controls.
self.reset_on_toggle = True
+ # The edit distance under which two consecutive transcripts are
+ # considered to match. This affects how easily `preview_text`
+ # gets appended to `text`.
+ self.commit_fuzz_threshold = 8
+
# List of:
# List of tuples of:
# Segment start time, end time, and text
@@ -263,7 +268,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
c1_c2_d = editdistance.eval(c1[2], c2[2])
c2_c3_d = editdistance.eval(c2[2], c3[2])
- max_edit = 8
+ max_edit = audio_state.commit_fuzz_threshold
#print(f"c0: {c0}, c1: {c1}, c2: {c2}")
#if c0 == c1 and c1 == c2 and c2 == c3:
@@ -607,12 +612,14 @@ def transcribeLoop(mic: str,
window_duration_s: int,
gpu_idx: int,
keyboard_hotkey: str,
- reset_on_toggle: bool):
+ reset_on_toggle: bool,
+ commit_fuzz_threshold: int):
audio_state = getMicStream(mic)
audio_state.whisper_language = language
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
audio_state.reset_on_toggle = reset_on_toggle
+ audio_state.commit_fuzz_threshold = commit_fuzz_threshold
lang_bits = language_target.split(" | ")
if len(lang_bits) == 2:
@@ -772,6 +779,7 @@ if __name__ == "__main__":
parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
+ parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
args = parser.parse_args()
if not args.mic:
@@ -809,6 +817,11 @@ if __name__ == "__main__":
if not args.gpu_idx:
print("--gpu_idx required", file=sys.stderr)
sys.exit(1)
+
+ if not args.commit_fuzz_threshold:
+ print("--commit_fuzz_threshold required", file=sys.stderr)
+ sys.exit(1)
+
args.gpu_idx = int(args.gpu_idx)
window_duration_s = 120
@@ -875,5 +888,6 @@ if __name__ == "__main__":
estate, window_duration_s,
args.gpu_idx,
args.keybind,
- args.reset_on_toggle)
+ args.reset_on_toggle,
+ args.commit_fuzz_threshold)