diff options
| -rw-r--r-- | string_matcher.py | 66 |
| -rw-r--r-- | transcribe.py | 49 |
2 files changed, 75 insertions, 40 deletions
diff --git a/string_matcher.py b/string_matcher.py
new file mode 100644
index 0000000..458244b
--- /dev/null
+++ b/string_matcher.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# python3 -m pip install python-Levenshtein
+from Levenshtein import distance as levenshtein_distance
+
+import typing
+
+# Find the window where the distance between these two transcriptions is
+# minimized and use it to stitch them together.
+def matchStringList(old_words: typing.List[str],
+        new_words: typing.List[str], window_size = 4) -> str:
+    if old_words == new_words:
+        return " ".join(old_words)
+    elif len(old_words) >= window_size and len(new_words) >= window_size:
+        # Find the window where the cumulative string distance
+        # between the words in that window in the old/new transcription
+        # is minimized.
+        old_slice = old_words[len(old_words) - window_size:]
+
+        best_match_i = None
+        best_match_d = window_size * 1000
+
+        for i in range(0, 1 + len(new_words) - window_size):
+            new_slice = new_words[i:i + window_size]
+            cur_d = 0
+            for j in range(0, window_size):
+                cur_d += levenshtein_distance(old_slice[j], new_slice[j])
+            if cur_d < best_match_d:
+                best_match_i = i
+                best_match_d = cur_d
+
+        old_prefix = old_words[0:len(old_words) - window_size]
+        overlap = new_words[best_match_i:best_match_i + window_size]
+        new_suffix = new_words[best_match_i + window_size:]
+
+        #print("Best match i: {}".format(best_match_i))
+        #print("Window size: {}".format(window_size))
+        #print("Old prefix: {}".format(old_prefix))
+        #print("Overlap: {}".format(overlap))
+        #print("New suffix: {}".format(new_suffix))
+        return " ".join(old_prefix + new_words[best_match_i:])
+    else:
+        return " ".join(new_words)
+
+def matchStrings(old_text: str, new_text: str, window_size = 4) -> str:
+    old_words = old_text.split()
+    new_words = new_text.split()
+    return matchStringList(old_words, new_words, window_size)
+
+if __name__ == "__main__":
+    # Identical transcriptions should not be changed.
+    assert(matchStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
+    # A suffix should be detected and ignored.
+    assert(matchStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
+    # A lengthening suffix should be correctly appended.
+    assert(matchStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
+    # A strictly longer transcription should override the old prefix.
+    assert(matchStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
+    # Paranoia: repetitive text broke the older implementation, so I included
+    # some test cases without fully understanding what the old problem was.
+    assert(matchStrings("test test test", "test test test test test test", window_size
+        = 3) == "test test test test test test")
+    assert(matchStrings("test test test test test test", "test test test", window_size
+        = 3) == "test test test test test test")
+    print("Tests passed.")
+
diff --git a/transcribe.py b/transcribe.py
index 99429c6..474cd59 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,8 +2,7 @@
 
 import argparse
 import copy
-# python3 -m pip install python-Levenshtein
-from Levenshtein import distance as levenshtein_distance
+import string_matcher
 import os
 import osc_ctrl
 # python3 -m pip install pydub
@@ -212,7 +211,7 @@ def transcribeAudio(audio_state, model):
             continue
 
         words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
-        print("words: {}".format(words))
+        #print("words: {}".format(words))
         if len(words) > 0 and words[-1] == "clear":
             audio_state.text = ""
             audio_state.text_candidate = ""
@@ -229,7 +228,7 @@ def transcribeAudio(audio_state, model):
         # immediately accept it, since the transcription is obviously
         # somewhat stable.
         # 3. If the transcription is somewhat long and the
-        # first few characters change, we assume this is due to a
+        # first few words change, we assume this is due to a
         # trim event and immediately accept the transcription.
         candidate_words = ''.join(c for c in audio_state.text_candidate.lower() if (c.isalpha() or c == " ")).split()
 
@@ -241,47 +240,17 @@ def transcribeAudio(audio_state, model):
         commit_transcription = False
         if words == candidate_words or candidate_words_are_prefix_of_text:
             commit_transcription = True
-        elif len(text) > 30 and len(audio_state.text_candidate) >= 10:
-            d = levenshtein_distance(text[0:10],
-                    audio_state.text_candidate[0:10])
-            if d > 2:
-                commit_transcription = True
+        elif len(words) >= 3 and len(candidate_words) >= 3 and \
+                words[0:3] != candidate_words[0:3]:
+            commit_transcription = True
 
         print("Transcription: {}".format(audio_state.text))
 
         if commit_transcription:
-            window_size = 20
             old_text = audio_state.text
-            if audio_state.text == text:
-                pass
-            elif len(text) >= window_size and len(old_text) >= window_size:
-                old_slice = old_text[len(old_text) - window_size:]
-                best_match_i = None
-                best_match_d = window_size * 1000
-                for i in range(0, 1 + len(text) - window_size):
-                    new_slice = text[i:i + window_size]
-                    #print("Consider slice {}".format(new_slice))
-                    d = levenshtein_distance(old_slice, new_slice)
-                    if d < best_match_d and d < window_size:
-                        best_match_i = i
-                        best_match_d = d
-                if best_match_i == None:
-                    audio_state.text = text
-                else:
-                    #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
-                    #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
-                    #print("New suffix: {}".format(text[best_match_i:]))
-                    new_text = old_text[0:len(old_text) - window_size]
-                    new_text += text[best_match_i:]
-                    audio_state.text = new_text
-            else:
-                audio_state.text = text
-
-            if audio_state.text != old_text:
-                # We think the user said something, so reset the amount of
-                # time we sleep between transcriptions to the minimum.
-                audio_state.transcribe_no_change_count = 0
-                audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+            old_words = audio_state.text.split()
+            new_words = text.split()
+            audio_state.text = string_matcher.matchStringList(old_words, new_words)
 
         audio_state.text_candidate = text
 
