summaryrefslogtreecommitdiffstats
path: root/string_matcher.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-11-06 12:50:38 -0800
committeryum <yum.food.vr@gmail.com>2022-11-06 12:50:38 -0800
commit7146acb9d4ad751fc5ced411a2990d0aad17d08f (patch)
tree30d5f9f9a7f47bc4272fa9e9fff5c0226c376686 /string_matcher.py
parent3a123fb5cabdbdef4f1b98031ec90c42e1d6e911 (diff)
String matching no longer relies on spaces
Add a `matchStrings` which does basically the same thing as `matchStringList` except it doesn't split the input at space boundaries. I think this should work better for Japanese and Chinese, since they don't use spaces. Doesn't seem to cause any accuracy regressions for English. Also update the README.
Diffstat (limited to 'string_matcher.py')
-rw-r--r--string_matcher.py80
1 files changed, 73 insertions, 7 deletions
diff --git a/string_matcher.py b/string_matcher.py
index 17bfaac..f529e0c 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -5,6 +5,8 @@ from Levenshtein import distance as levenshtein_distance
import typing
+DEBUG = False
+
# Find the window where the distance between these two transcriptions is
# minimized and use it to stitch them together.
def matchStringList(old_words: typing.List[str],
@@ -42,25 +44,89 @@ def matchStringList(old_words: typing.List[str],
else:
return " ".join(new_words)
-def matchStrings(old_text: str, new_text: str, window_size = 4) -> str:
+def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) -> str:
old_words = old_text.split()
new_words = new_text.split()
return matchStringList(old_words, new_words, window_size)
+def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
+ if old_text == new_text:
+ return old_text
+ elif len(old_text) >= window_size and len(new_text) >= window_size:
+ # Find the window where the cumulative string distance
+ # between the text in that window in the old/new transcription
+ # is minimized.
+
+ best_match_i = None
+ best_match_j = None
+ best_match_d = window_size * 1000
+
+ for i in range(0, 1 + len(old_text) - window_size):
+ old_slice = old_text[i:i + window_size]
+
+ for j in range(0, 1 + len(new_text) - window_size):
+ new_slice = new_text[j:j + window_size]
+ cur_d = 0
+ for k in range(0, window_size):
+ cur_d += levenshtein_distance(old_slice[k], new_slice[k])
+ if cur_d <= best_match_d:
+ best_match_i = i
+ best_match_j = j
+ best_match_d = cur_d
+
+ if DEBUG:
+ print("optimum at old '{}'/{} new '{}'/{} d={}".format(
+ old_slice, i, new_slice, j, cur_d))
+
+ old_prefix = old_text[0:best_match_i]
+ overlap = new_text[best_match_j:best_match_j + window_size]
+ new_suffix = new_text[best_match_j + window_size:]
+
+ if DEBUG:
+ print("Best match i: {}".format(best_match_i))
+ print("Best match j: {}".format(best_match_j))
+ print("Window size: {}".format(window_size))
+ print("Old prefix: {}".format(old_prefix))
+ print("Overlap: {}".format(overlap))
+ print("New suffix: {}".format(new_suffix))
+ print("Input 1: {}".format(old_text))
+ print("Input 2: {}".format(new_text))
+ print("Output: {}".format(old_prefix +
+ new_text[best_match_j:]))
+ return old_prefix + new_text[best_match_j:]
+ else:
+ return new_text
+
if __name__ == "__main__":
# Identical transcriptions should not be changed.
- assert(matchStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
+ assert(matchSpaceDelimitedStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.")
# A suffix should be detected and ignored.
- assert(matchStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
+ assert(matchSpaceDelimitedStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.")
# A lengthening suffix should be correctly appended.
- assert(matchStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
+ assert(matchSpaceDelimitedStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.")
# A strictly longer transcription should override the old prefix.
- assert(matchStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
+ assert(matchSpaceDelimitedStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.")
# Paranoia: repetitive text broke the older implementation, so I included
# some test cases without fully understanding what the old problem was.
- assert(matchStrings("test test test", "test test test test test test", window_size
+ assert(matchSpaceDelimitedStrings("test test test", "test test test test test test", window_size
= 3) == "test test test test test test")
- assert(matchStrings("test test test test test test", "test test test", window_size
+ assert(matchSpaceDelimitedStrings("test test test test test test", "test test test", window_size
= 3) == "test test test test test test")
+
+ print(matchStrings("foo bar", "bar baz"))
+ print(matchStrings("alpha beta", "beta gamma"))
+
+ in1 = "Okay, what about now? Looks like it sort of works. Key word being sort of."
+ in2 = "okay what about now looks like it sort of works key word being sort of looks"
+ bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
+ good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
+ print(matchStrings(in1, in2))
+
+ in1 = "This repository can take"
+ in2 = "This repository contains the code for"
+ bad_out = "This repository can tode for"
+ good_out = "This repository contains the code for"
+ print(matchStrings(in1, in2))
+
print("Tests passed.")