summaryrefslogtreecommitdiffstats
path: root/transcribe.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-10-25 17:46:44 -0700
committeryum <yum.food.vr@gmail.com>2022-10-25 17:46:44 -0700
commiteefa14c431efa4e3bc16cafbcb004e41622c2411 (patch)
tree98885c442474e812c0442c1d2d38b6021c0e7692 /transcribe.py
parentf84d83b611b751d20c9b7b983c4a90e389117a6b (diff)
De-scuff continuous transcription
Transcription stitching now occurs in word space rather than in text space. This avoids problems where we accidentally duplicate or delete letters in the middle of words. Stitching is factored out into its own module (string_matcher), along with a small set of test cases. If we hit problems in production, we can grow this list of test cases and so avoid regressions if we ever reimplement the stitching.
Diffstat (limited to 'transcribe.py')
-rw-r--r--transcribe.py49
1 files changed, 9 insertions, 40 deletions
diff --git a/transcribe.py b/transcribe.py
index 99429c6..474cd59 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,8 +2,7 @@
import argparse
import copy
-# python3 -m pip install python-Levenshtein
-from Levenshtein import distance as levenshtein_distance
+import string_matcher
import os
import osc_ctrl
# python3 -m pip install pydub
@@ -212,7 +211,7 @@ def transcribeAudio(audio_state, model):
continue
words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
- print("words: {}".format(words))
+ #print("words: {}".format(words))
if len(words) > 0 and words[-1] == "clear":
audio_state.text = ""
audio_state.text_candidate = ""
@@ -229,7 +228,7 @@ def transcribeAudio(audio_state, model):
# immediately accept it, since the transcription is obviously
# somewhat stable.
# 3. If the transcription is somewhat long and the
- # first few characters change, we assume this is due to a
+ # first few words change, we assume this is due to a
# trim event and immediately accept the transcription.
candidate_words = ''.join(c for c in audio_state.text_candidate.lower() if (c.isalpha() or c == " ")).split()
@@ -241,47 +240,17 @@ def transcribeAudio(audio_state, model):
commit_transcription = False
if words == candidate_words or candidate_words_are_prefix_of_text:
commit_transcription = True
- elif len(text) > 30 and len(audio_state.text_candidate) >= 10:
- d = levenshtein_distance(text[0:10],
- audio_state.text_candidate[0:10])
- if d > 2:
- commit_transcription = True
+ elif len(words) >= 3 and len(candidate_words) >= 3 and \
+ words[0:3] != candidate_words[0:3]:
+ commit_transcription = True
print("Transcription: {}".format(audio_state.text))
if commit_transcription:
- window_size = 20
old_text = audio_state.text
- if audio_state.text == text:
- pass
- elif len(text) >= window_size and len(old_text) >= window_size:
- old_slice = old_text[len(old_text) - window_size:]
- best_match_i = None
- best_match_d = window_size * 1000
- for i in range(0, 1 + len(text) - window_size):
- new_slice = text[i:i + window_size]
- #print("Consider slice {}".format(new_slice))
- d = levenshtein_distance(old_slice, new_slice)
- if d < best_match_d and d < window_size:
- best_match_i = i
- best_match_d = d
- if best_match_i == None:
- audio_state.text = text
- else:
- #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
- #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
- #print("New suffix: {}".format(text[best_match_i:]))
- new_text = old_text[0:len(old_text) - window_size]
- new_text += text[best_match_i:]
- audio_state.text = new_text
- else:
- audio_state.text = text
-
- if audio_state.text != old_text:
- # We think the user said something, so reset the amount of
- # time we sleep between transcriptions to the minimum.
- audio_state.transcribe_no_change_count = 0
- audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+ old_words = audio_state.text.split()
+ new_words = text.split()
+ audio_state.text = string_matcher.matchStringList(old_words, new_words)
audio_state.text_candidate = text