String matching no longer relies on spaces

Add a `matchStrings` which does basically the same thing as `matchStringList` except it doesn't split the input at space boundaries. I think this should work better for Japanese and Chinese, since they don't use spaces. Doesn't seem to cause any accuracy regressions for English. Also update the README.
author: yum <yum.food.vr@gmail.com> 2022-11-06 12:50:38 -0800
committer: yum <yum.food.vr@gmail.com> 2022-11-06 12:50:38 -0800
commit: 7146acb9d4ad751fc5ced411a2990d0aad17d08f (patch)
tree: 30d5f9f9a7f47bc4272fa9e9fff5c0226c376686 /string_matcher.py
parent: 3a123fb5cabdbdef4f1b98031ec90c42e1d6e911 (diff)
1 files changed, 73 insertions, 7 deletions
diff --git a/string_matcher.py b/string_matcher.py
index 17bfaac..f529e0c 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -5,6 +5,8 @@ from Levenshtein import distance as levenshtein_distance
 
 import typing
 
+DEBUG = False
+
 # Find the window where the distance between these two transcriptions is
 # minimized and use it to stitch them together.
 def matchStringList(old_words: typing.List[str],
@@ -42,25 +44,89 @@ def matchStringList(old_words: typing.List[str],
     else:
         return " ".join(new_words)
 
-def matchStrings(old_text: str, new_text: str, window_size = 4) -> str:
+def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) -> str:
     old_words = old_text.split()
     new_words = new_text.split()
     return matchStringList(old_words, new_words, window_size)
 
+def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
+    if old_text == new_text:
+        return old_text
+    elif len(old_text) >= window_size and len(new_text) >= window_size:
+        # Find the window where the cumulative string distance
+        # between the text in that window in the old/new transcription
+        # is minimized.
+
+        best_match_i = None
+        best_match_j = None
+        best_match_d = window_size * 1000
+
+        for i in range(0, 1 + len(old_text) - window_size):
+            old_slice = old_text[i:i + window_size]
+
+            for j in range(0, 1 + len(new_text) - window_size):
+                new_slice = new_text[j:j + window_size]
+                cur_d = 0
+                for k in range(0, window_size):
+                    cur_d += levenshtein_distance(old_slice[k], new_slice[k])
+                if cur_d <= best_match_d:
+                    best_match_i = i
+                    best_match_j = j
+                    best_match_d = cur_d
+
+                    if DEBUG:
+                        print("optimum at old '{}'/{} new '{}'/{} d={}".format(
+                            old_slice, i, new_slice, j, cur_d))
+
+        old_prefix = old_text[0:best_match_i]
+        overlap = new_text[best_match_j:best_match_j + window_size]
+        new_suffix = new_text[best_match_j + window_size:]
+
+        if DEBUG:
+            print("Best match i:    {}".format(best_match_i))
+            print("Best match j:    {}".format(best_match_j))
+            print("Window size:     {}".format(window_size))
+            print("Old prefix:      {}".format(old_prefix))
+            print("Overlap:         {}".format(overlap))
+            print("New suffix:      {}".format(new_suffix))
+            print("Input 1:         {}".format(old_text))
+            print("Input 2:         {}".format(new_text))
+            print("Output:          {}".format(old_prefix +
+                new_text[best_match_j:]))
+        return old_prefix + new_text[best_match_j:]
+    else:
+        return new_text
+
 if __name__ == "__main__":
     # Identical transcriptions should not be changed.
-    assert(matchStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
+    assert(matchSpaceDelimitedStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
     # A suffix should be detected and ignored.
-    assert(matchStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
+    assert(matchSpaceDelimitedStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
     # A lengthening suffix should be correctly appended.
-    assert(matchStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
+    assert(matchSpaceDelimitedStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
     # A strictly longer transcription should override the old prefix.
-    assert(matchStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
+    assert(matchSpaceDelimitedStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
     # Paranoia: repetitive text broke the older implementation, so I included
     # some test cases without fully understanding what the old problem was.
-    assert(matchStrings("test test test", "test test test test test test", window_size
+    assert(matchSpaceDelimitedStrings("test test test", "test test test test test test", window_size
         = 3) == "test test test test test test")
-    assert(matchStrings("test test test test test test", "test test test", window_size
+    assert(matchSpaceDelimitedStrings("test test test test test test", "test test test", window_size
         = 3) == "test test test test test test")
+
+    print(matchStrings("foo bar", "bar baz"))
+    print(matchStrings("alpha beta", "beta gamma"))
+
+    in1 = "Okay, what about now? Looks like it sort of works. Key word being sort of."
+    in2 = "okay what about now looks like it sort of works key word being sort of looks"
+    bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
+    good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
+    print(matchStrings(in1, in2))
+
+    in1 = "This repository can take"
+    in2 = "This repository contains the code for"
+    bad_out  = "This repository can tode for"
+    good_out = "This repository contains the code for"
+    print(matchStrings(in1, in2))
+
     print("Tests passed.")
author	yum <yum.food.vr@gmail.com>	2022-11-06 12:50:38 -0800
committer	yum <yum.food.vr@gmail.com>	2022-11-06 12:50:38 -0800
commit	7146acb9d4ad751fc5ced411a2990d0aad17d08f (patch)
tree	30d5f9f9a7f47bc4272fa9e9fff5c0226c376686 /string_matcher.py
parent	3a123fb5cabdbdef4f1b98031ec90c42e1d6e911 (diff)