diff options
Diffstat (limited to 'string_matcher.py')
| -rw-r--r-- | string_matcher.py | 80 |
1 files changed, 73 insertions, 7 deletions
diff --git a/string_matcher.py b/string_matcher.py index 17bfaac..f529e0c 100644 --- a/string_matcher.py +++ b/string_matcher.py @@ -5,6 +5,8 @@ from Levenshtein import distance as levenshtein_distance import typing +DEBUG = False + # Find the window where the distance between these two transcriptions is # minimized and use it to stitch them together. def matchStringList(old_words: typing.List[str], @@ -42,25 +44,89 @@ def matchStringList(old_words: typing.List[str], else: return " ".join(new_words) -def matchStrings(old_text: str, new_text: str, window_size = 4) -> str: +def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) -> str: old_words = old_text.split() new_words = new_text.split() return matchStringList(old_words, new_words, window_size) +def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: + if old_text == new_text: + return old_text + elif len(old_text) >= window_size and len(new_text) >= window_size: + # Find the window where the cumulative string distance + # between the text in that window in the old/new transcription + # is minimized. + + best_match_i = None + best_match_j = None + best_match_d = window_size * 1000 + + for i in range(0, 1 + len(old_text) - window_size): + old_slice = old_text[i:i + window_size] + + for j in range(0, 1 + len(new_text) - window_size): + new_slice = new_text[j:j + window_size] + cur_d = 0 + for k in range(0, window_size): + cur_d += levenshtein_distance(old_slice[k], new_slice[k]) + if cur_d <= best_match_d: + best_match_i = i + best_match_j = j + best_match_d = cur_d + + if DEBUG: + print("optimum at old '{}'/{} new '{}'/{} d={}".format( + old_slice, i, new_slice, j, cur_d)) + + old_prefix = old_text[0:best_match_i] + overlap = new_text[best_match_j:best_match_j + window_size] + new_suffix = new_text[best_match_j + window_size:] + + if DEBUG: + print("Best match i: {}".format(best_match_i)) + print("Best match j: {}".format(best_match_j)) + print("Window size: {}".format(window_size)) + print("Old prefix: {}".format(old_prefix)) + print("Overlap: {}".format(overlap)) + print("New suffix: {}".format(new_suffix)) + print("Input 1: {}".format(old_text)) + print("Input 2: {}".format(new_text)) + print("Output: {}".format(old_prefix + + new_text[best_match_j:])) + return old_prefix + new_text[best_match_j:] + else: + return new_text + if __name__ == "__main__": # Identical transcriptions should not be changed. - assert(matchStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.") + assert(matchSpaceDelimitedStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.") # A suffix should be detected and ignored. - assert(matchStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.") + assert(matchSpaceDelimitedStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.") # A lengthening suffix should be correctly appended. - assert(matchStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.") + assert(matchSpaceDelimitedStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.") # A strictly longer transcription should override the old prefix. - assert(matchStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.") + assert(matchSpaceDelimitedStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.") # Paranoia: repetitive text broke the older implementation, so I included # some test cases without fully understanding what the old problem was. - assert(matchStrings("test test test", "test test test test test test", window_size + assert(matchSpaceDelimitedStrings("test test test", "test test test test test test", window_size = 3) == "test test test test test test") - assert(matchStrings("test test test test test test", "test test test", window_size + assert(matchSpaceDelimitedStrings("test test test test test test", "test test test", window_size = 3) == "test test test test test test") + + print(matchStrings("foo bar", "bar baz")) + print(matchStrings("alpha beta", "beta gamma")) + + in1 = "Okay, what about now? Looks like it sort of works. Key word being sort of." + in2 = "okay what about now looks like it sort of works key word being sort of looks" + bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks" + good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks" + print(matchStrings(in1, in2)) + + in1 = "This repository can take" + in2 = "This repository contains the code for" + bad_out = "This repository can tode for" + good_out = "This repository contains the code for" + print(matchStrings(in1, in2)) + print("Tests passed.") |
