De-scuff continuous transcription

Transcription stitching now occurs in word space (comparing whole words) rather than in character space. This avoids problems where we accidentally duplicate or delete letters in the middle of words. Factor out stitching into its own module and add a handful of test cases. If we hit problems in production, we can grow this list of test cases, which should also guard against regressions if we reimplement the stitching.
author: yum <yum.food.vr@gmail.com> 2022-10-25 17:46:44 -0700
committer: yum <yum.food.vr@gmail.com> 2022-10-25 17:46:44 -0700
commit: eefa14c431efa4e3bc16cafbcb004e41622c2411 (patch)
tree: 98885c442474e812c0442c1d2d38b6021c0e7692
parent: f84d83b611b751d20c9b7b983c4a90e389117a6b (diff)
2 files changed, 75 insertions, 40 deletions
diff --git a/string_matcher.py b/string_matcher.py
new file mode 100644
index 0000000..458244b
--- /dev/null
+++ b/string_matcher.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# python3 -m pip install python-Levenshtein
+from Levenshtein import distance as levenshtein_distance
+
+import typing
+
+# Find the window of new_words whose summed per-word Levenshtein distance to the
+# last window_size words of old_words is smallest, and stitch the two there.
+def matchStringList(old_words: typing.List[str],
+        new_words: typing.List[str], window_size = 4) -> str:
+    if old_words == new_words:
+        return " ".join(old_words)
+    elif len(old_words) >= window_size and len(new_words) >= window_size:
+        # Find the window where the cumulative string distance
+        # between the words in that window in the old/new transcription
+        # is minimized.
+        old_slice = old_words[len(old_words) - window_size:]
+
+        best_match_i = None
+        best_match_d = window_size * 1000
+
+        for i in range(0, 1 + len(new_words) - window_size):
+            new_slice = new_words[i:i + window_size]
+            cur_d = 0
+            for j in range(0, window_size):
+                cur_d += levenshtein_distance(old_slice[j], new_slice[j])
+            if cur_d < best_match_d:
+                best_match_i = i
+                best_match_d = cur_d
+
+        old_prefix = old_words[0:len(old_words) - window_size]
+        overlap = new_words[best_match_i:best_match_i + window_size]
+        new_suffix = new_words[best_match_i + window_size:]
+
+        #print("Best match i:    {}".format(best_match_i))
+        #print("Window size:     {}".format(window_size))
+        #print("Old prefix:      {}".format(old_prefix))
+        #print("Overlap:         {}".format(overlap))
+        #print("New suffix:      {}".format(new_suffix))
+        return " ".join(old_prefix + new_words[best_match_i:])
+    else:
+        return " ".join(new_words)
+
+def matchStrings(old_text: str, new_text: str, window_size = 4) -> str:
+    old_words = old_text.split()
+    new_words = new_text.split()
+    return matchStringList(old_words, new_words, window_size)
+
+if __name__ == "__main__":
+    # Identical transcriptions should not be changed.
+    assert(matchStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
+    # A suffix should be detected and ignored.
+    assert(matchStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
+    # A lengthening suffix should be correctly appended.
+    assert(matchStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
+    # A strictly longer transcription should override the old prefix.
+    assert(matchStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
+    # Paranoia: repetitive text broke the older implementation, so I included
+    # some test cases without fully understanding what the old problem was.
+    assert(matchStrings("test test test", "test test test test test test", window_size
+        = 3) == "test test test test test test")
+    assert(matchStrings("test test test test test test", "test test test", window_size
+        = 3) == "test test test test test test")
+    print("Tests passed.")
+
diff --git a/transcribe.py b/transcribe.py
index 99429c6..474cd59 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,8 +2,7 @@
 
 import argparse
 import copy
-# python3 -m pip install python-Levenshtein
-from Levenshtein import distance as levenshtein_distance
+import string_matcher
 import os
 import osc_ctrl
 # python3 -m pip install pydub
@@ -212,7 +211,7 @@ def transcribeAudio(audio_state, model):
             continue
 
         words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
-        print("words: {}".format(words))
+        #print("words: {}".format(words))
         if len(words) > 0 and words[-1] == "clear":
             audio_state.text = ""
             audio_state.text_candidate = ""
@@ -229,7 +228,7 @@ def transcribeAudio(audio_state, model):
         #       immediately accept it, since the transcription is obviously
         #       somewhat stable.
         #   3. If the transcription is somewhat long and the
-        #       first few characters change, we assume this is due to a
+        #       first few words change, we assume this is due to a
         #       trim event and immediately accept the transcription.
         candidate_words = ''.join(c for c in audio_state.text_candidate.lower() if (c.isalpha() or c == " ")).split()
 
@@ -241,47 +240,17 @@ def transcribeAudio(audio_state, model):
         commit_transcription = False
         if words == candidate_words or candidate_words_are_prefix_of_text:
             commit_transcription = True
-        elif len(text) > 30 and len(audio_state.text_candidate) >= 10:
-            d = levenshtein_distance(text[0:10],
-                    audio_state.text_candidate[0:10])
-            if d > 2:
-                commit_transcription = True
+        elif len(words) >= 3 and len(candidate_words) >= 3 and \
+                words[0:3] != candidate_words[0:3]:
+            commit_transcription = True
 
         print("Transcription: {}".format(audio_state.text))
 
         if commit_transcription:
-            window_size = 20
             old_text = audio_state.text
-            if audio_state.text == text:
-                pass
-            elif len(text) >= window_size and len(old_text) >= window_size:
-                old_slice = old_text[len(old_text) - window_size:]
-                best_match_i = None
-                best_match_d = window_size * 1000
-                for i in range(0, 1 + len(text) - window_size):
-                    new_slice = text[i:i + window_size]
-                    #print("Consider slice {}".format(new_slice))
-                    d = levenshtein_distance(old_slice, new_slice)
-                    if d < best_match_d and d < window_size:
-                        best_match_i = i
-                        best_match_d = d
-                if best_match_i == None:
-                    audio_state.text = text
-                else:
-                    #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
-                    #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
-                    #print("New suffix: {}".format(text[best_match_i:]))
-                    new_text = old_text[0:len(old_text) - window_size]
-                    new_text += text[best_match_i:]
-                    audio_state.text = new_text
-            else:
-                audio_state.text = text
-
-            if audio_state.text != old_text:
-                # We think the user said something, so  reset the amount of
-                # time we sleep between transcriptions to the minimum.
-                audio_state.transcribe_no_change_count = 0
-                audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+            old_words = audio_state.text.split()
+            new_words = text.split()
+            audio_state.text = string_matcher.matchStringList(old_words, new_words)
 
         audio_state.text_candidate = text
author	yum <yum.food.vr@gmail.com>	2022-10-25 17:46:44 -0700
committer	yum <yum.food.vr@gmail.com>	2022-10-25 17:46:44 -0700
commit	eefa14c431efa4e3bc16cafbcb004e41622c2411 (patch)
tree	98885c442474e812c0442c1d2d38b6021c0e7692
parent	f84d83b611b751d20c9b7b983c4a90e389117a6b (diff)