Fix matchStrings O(n^2) loop

This slides 2 windows across input strings, looking for a region where they are most similar. It then uses that region to stitch the strings together. Since transcribe.py passes in a continuous transcription as the `old_text` argument, we can wind up spending a lot of time here. Constrain the area of the `old_text` argument that we look at to the most recent 50 characters. This should be good enough. Also fix how we calculate levenshtein_distance. Uh... yeah, let's not talk about how it was before.
author: yum <yum.food.vr@gmail.com> 2022-11-07 20:31:16 -0800
committer: yum <yum.food.vr@gmail.com> 2022-11-07 20:31:16 -0800
commit: 77c6f366b2f81c60ed67e2fa6dc92df451e4229c (patch)
tree: fafe57211324c624cb40b2d3fa252fe3e163e633 /string_matcher.py
parent: 564a3abbf4247bd0658b8d4eb8a4881fa274e309 (diff)
1 files changed, 19 insertions, 6 deletions
diff --git a/string_matcher.py b/string_matcher.py
index f529e0c..9060b26 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -61,14 +61,22 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
         best_match_j = None
         best_match_d = window_size * 1000
 
-        for i in range(0, 1 + len(old_text) - window_size):
+        # The number of old slices to look at. Since the old text can grow
+        # unboundedly, it's crucial that we don't compare to every possible
+        # slice in the old and new transcriptions (O(N^2) time complexity).
+        # This is still wildly inefficient, but good enough for continuous
+        # transcription in a game bound by a single CPU core, like VRChat.
+        max_old_slices = 50
+        old_n_slices = min(max_old_slices, len(old_text))
+        last_old_window = len(old_text) - window_size
+        first_old_window = max(last_old_window - old_n_slices, 0)
+
+        for i in range(first_old_window, last_old_window + 1):
             old_slice = old_text[i:i + window_size]
 
             for j in range(0, 1 + len(new_text) - window_size):
                 new_slice = new_text[j:j + window_size]
-                cur_d = 0
-                for k in range(0, window_size):
-                    cur_d += levenshtein_distance(old_slice[k], new_slice[k])
+                cur_d = levenshtein_distance(old_slice, new_slice)
                 if cur_d <= best_match_d:
                     best_match_i = i
                     best_match_j = j
@@ -120,13 +128,18 @@ if __name__ == "__main__":
     in2 = "okay what about now looks like it sort of works key word being sort of looks"
     bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
     good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
-    print(matchStrings(in1, in2))
+    assert(matchStrings(in1, in2) == good_out)
 
     in1 = "This repository can take"
     in2 = "This repository contains the code for"
     bad_out  = "This repository can tode for"
     good_out = "This repository contains the code for"
-    print(matchStrings(in1, in2))
+    assert(matchStrings(in1, in2) == good_out)
+
+    in1 = "a" * 10 * 1000
+    in2 = "b" * 10 * 1000
+    # This should be fast (< 1 second)
+    matchStrings(in1, in2)
 
     print("Tests passed.")
author	yum <yum.food.vr@gmail.com>	2022-11-07 20:31:16 -0800
committer	yum <yum.food.vr@gmail.com>	2022-11-07 20:31:16 -0800
commit	77c6f366b2f81c60ed67e2fa6dc92df451e4229c (patch)
tree	fafe57211324c624cb40b2d3fa252fe3e163e633 /string_matcher.py
parent	564a3abbf4247bd0658b8d4eb8a4881fa274e309 (diff)