2 files changed, 75 insertions, 40 deletions
diff --git a/string_matcher.py b/string_matcher.py
new file mode 100644
index 0000000..458244b
--- /dev/null
+++ b/string_matcher.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# python3 -m pip install python-Levenshtein
+from Levenshtein import distance as levenshtein_distance
+
+import typing
+
+# Find the window where the distance between these two transcriptions is
+# minimized and use it to stitch them together.
+def matchStringList(old_words: typing.List[str],
+        new_words: typing.List[str], window_size = 4) -> str:
+    """Stitch a newer transcription onto an older one and return the text.
+
+    The last `window_size` words of `old_words` are compared against every
+    `window_size`-word window of `new_words`; the window with the smallest
+    summed per-word Levenshtein distance is taken as the overlap. The result
+    is the old words before that tail followed by `new_words` from the start
+    of the best window, joined with single spaces.
+
+    Identical lists are returned unchanged. If either list is shorter than
+    `window_size`, or `window_size` is less than 1, no overlap can be
+    located and the new transcription is returned on its own.
+    """
+    if old_words == new_words:
+        return " ".join(old_words)
+    if window_size < 1 or len(old_words) < window_size \
+            or len(new_words) < window_size:
+        return " ".join(new_words)
+
+    old_tail = old_words[len(old_words) - window_size:]
+
+    def window_distance(start: int) -> int:
+        # Summed edit distance between the old tail and the new window
+        # beginning at `start`; zip() stops after `window_size` word pairs.
+        return sum(levenshtein_distance(old_word, new_word) for
+                old_word, new_word in zip(old_tail, new_words[start:]))
+
+    # min() keeps the first minimal index (earliest window wins ties) and
+    # needs no "huge distance" sentinel that very long words could exceed.
+    best_start = min(range(1 + len(new_words) - window_size),
+            key=window_distance)
+    return " ".join(old_words[:len(old_words) - window_size] +
+            new_words[best_start:])
+
+def matchStrings(old_text: str, new_text: str, window_size = 4) -> str:
+    """Whitespace-split both texts and stitch them with matchStringList."""
+    return matchStringList(old_text.split(), new_text.split(),
+            window_size)
+
+if __name__ == "__main__":
+    for old_text, new_text, expected in [
+        # Identical transcriptions should not be changed.
+        ("This is a test case.", "This is a test case.", "This is a test case."),
+        # A suffix should be detected and ignored.
+        ("This is a test case.", "is a test case.", "This is a test case."),
+        # A lengthening suffix should be correctly appended.
+        ("This is a test", "is a test case.", "This is a test case."),
+        # A strictly longer transcription should override the old prefix.
+        ("This is a test", "This is a test case.", "This is a test case."),
+        # Paranoia: repetitive text broke the older implementation (cause unclear).
+        ("test test test", "test test test test test test", "test test test test test test"),
+        ("test test test test test test", "test test test", "test test test test test test"),
+    ]:
+        assert matchStrings(old_text, new_text, window_size = 3) == expected
+    print("Tests passed.")
+
diff --git a/transcribe.py b/transcribe.py
index 99429c6..474cd59 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,8 +2,7 @@
 
 import argparse
 import copy
-# python3 -m pip install python-Levenshtein
-from Levenshtein import distance as levenshtein_distance
+import string_matcher
 import os
 import osc_ctrl
 # python3 -m pip install pydub
@@ -212,7 +211,7 @@ def transcribeAudio(audio_state, model):
             continue
 
         words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
-        print("words: {}".format(words))
+        #print("words: {}".format(words))
         if len(words) > 0 and words[-1] == "clear":
             audio_state.text = ""
             audio_state.text_candidate = ""
@@ -229,7 +228,7 @@ def transcribeAudio(audio_state, model):
         #       immediately accept it, since the transcription is obviously
         #       somewhat stable.
         #   3. If the transcription is somewhat long and the
-        #       first few characters change, we assume this is due to a
+        #       first few words change, we assume this is due to a
         #       trim event and immediately accept the transcription.
         candidate_words = ''.join(c for c in audio_state.text_candidate.lower() if (c.isalpha() or c == " ")).split()
 
@@ -241,47 +240,17 @@ def transcribeAudio(audio_state, model):
         commit_transcription = False
         if words == candidate_words or candidate_words_are_prefix_of_text:
             commit_transcription = True
-        elif len(text) > 30 and len(audio_state.text_candidate) >= 10:
-            d = levenshtein_distance(text[0:10],
-                    audio_state.text_candidate[0:10])
-            if d > 2:
-                commit_transcription = True
+        elif len(words) >= 3 and len(candidate_words) >= 3 and \
+                words[0:3] != candidate_words[0:3]:
+            commit_transcription = True
 
         print("Transcription: {}".format(audio_state.text))
 
         if commit_transcription:
-            window_size = 20
             old_text = audio_state.text
-            if audio_state.text == text:
-                pass
-            elif len(text) >= window_size and len(old_text) >= window_size:
-                old_slice = old_text[len(old_text) - window_size:]
-                best_match_i = None
-                best_match_d = window_size * 1000
-                for i in range(0, 1 + len(text) - window_size):
-                    new_slice = text[i:i + window_size]
-                    #print("Consider slice {}".format(new_slice))
-                    d = levenshtein_distance(old_slice, new_slice)
-                    if d < best_match_d and d < window_size:
-                        best_match_i = i
-                        best_match_d = d
-                if best_match_i == None:
-                    audio_state.text = text
-                else:
-                    #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
-                    #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
-                    #print("New suffix: {}".format(text[best_match_i:]))
-                    new_text = old_text[0:len(old_text) - window_size]
-                    new_text += text[best_match_i:]
-                    audio_state.text = new_text
-            else:
-                audio_state.text = text
-
-            if audio_state.text != old_text:
-                # We think the user said something, so  reset the amount of
-                # time we sleep between transcriptions to the minimum.
-                audio_state.transcribe_no_change_count = 0
-                audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+            old_words = audio_state.text.split()
+            new_words = text.split()
+            audio_state.text = string_matcher.matchStringList(old_words, new_words)
 
         audio_state.text_candidate = text