#!/usr/bin/env python3 # python3 -m pip install editdistance # License: MIT. import editdistance import typing DEBUG = False # Find the window where the distance between these two transcriptions is # minimized and use it to stitch them together. def matchStringList(old_words: typing.List[str], new_words: typing.List[str], window_size = 6) -> str: if old_words == new_words: return " ".join(old_words) elif len(old_words) >= window_size and len(new_words) >= window_size: # Find the window where the cumulative string distance # between the words in that window in the old/new transcription # is minimized. old_slice = old_words[len(old_words) - window_size:] best_match_i = None best_match_d = window_size * 1000 for i in range(0, 1 + len(new_words) - window_size): new_slice = new_words[i:i + window_size] cur_d = 0 for j in range(0, window_size): cur_d += editdistance.eval(old_slice[j], new_slice[j]) if cur_d < best_match_d: best_match_i = i best_match_d = cur_d old_prefix = old_words[0:len(old_words) - window_size] overlap = new_words[best_match_i:best_match_i + window_size] new_suffix = new_words[best_match_i + window_size:] #print("Best match i: {}".format(best_match_i)) #print("Window size: {}".format(window_size)) #print("Old prefix: {}".format(old_prefix)) #print("Overlap: {}".format(overlap)) #print("New suffix: {}".format(new_suffix)) return " ".join(old_prefix + new_words[best_match_i:]) else: return " ".join(new_words) def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) -> str: old_words = old_text.split() new_words = new_text.split() return matchStringList(old_words, new_words, window_size) def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: if old_text == new_text: if DEBUG: print("STRING MATCH exception path 1") return old_text elif len(new_text) == 0: return old_text elif len(old_text) == 0: return new_text elif len(old_text) >= window_size and len(new_text) >= window_size: # Find the window where the cumulative string distance # between the text in that window in the old/new transcription # is minimized. best_match_i = None best_match_j = None best_match_d = window_size * 1000 # The number of old slices to look at. Since the old text can grow # unboundedly, it's crucial that we don't compare to every possible # slice in the old and new transcriptions (O(N^2) time complexity). # This is still wildly inefficient, but good enough for continuous # transcription in a game bound by a single CPU core, like VRChat. max_old_slices = 150 old_n_slices = min(max_old_slices, len(old_text)) last_old_window = len(old_text) - window_size first_old_window = max(last_old_window - old_n_slices, 0) for i in range(first_old_window, last_old_window + 1): old_slice = old_text[i:i + window_size] for j in range(0, 1 + len(new_text) - window_size): new_slice = new_text[j:j + window_size] cur_d = editdistance.eval(old_slice, new_slice) if cur_d < best_match_d: best_match_i = i best_match_j = j best_match_d = cur_d if DEBUG: print("optimum at old '{}' i={} new '{}' j={} d={}".format( old_slice, i, new_slice, j, cur_d)) old_prefix = old_text[0:best_match_i] overlap = new_text[best_match_j:best_match_j + window_size] new_suffix = new_text[best_match_j + window_size:] if DEBUG: print("Best match i: {}".format(best_match_i)) print("Best match j: {}".format(best_match_j)) print("Window size: {}".format(window_size)) print("Old prefix: {}".format(old_prefix)) print("Overlap: {}".format(overlap)) print("New suffix: {}".format(new_suffix)) print("Input 1: {}".format(old_text)) print("Input 2: {}".format(new_text)) print("Output: {}".format(old_prefix + new_text[best_match_j:])) return old_prefix + new_text[best_match_j:] else: if DEBUG: print("STRING MATCH exception path 2") print(" OLD: {}".format(old_text)) print(" NEW: {}".format(new_text)) return new_text if __name__ == "__main__": # Identical transcriptions should not be changed. assert(matchSpaceDelimitedStrings("This is a test case.", "This is a test case.", window_size = 3) == "This is a test case.") # A suffix should be detected and ignored. assert(matchSpaceDelimitedStrings("This is a test case.", "is a test case.", window_size = 3) == "This is a test case.") # A lengthening suffix should be correctly appended. assert(matchSpaceDelimitedStrings("This is a test", "is a test case.", window_size = 3) == "This is a test case.") # A strictly longer transcription should override the old prefix. assert(matchSpaceDelimitedStrings("This is a test", "This is a test case.", window_size = 3) == "This is a test case.") # Paranoia: repetitive text broke the older implementation, so I included # some test cases without fully understanding what the old problem was. assert(matchSpaceDelimitedStrings("test test test", "test test test test test test", window_size = 3) == "test test test test test test") assert(matchSpaceDelimitedStrings("test test test test test test", "test test test", window_size = 3) == "test test test test test test") print(matchStrings("foo bar", "bar baz")) print(matchStrings("alpha beta", "beta gamma")) in1 = "Okay, what about now? Looks like it sort of works. Key word being sort of." in2 = "okay what about now looks like it sort of works key word being sort of looks" bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks" good_out = "Okay what about now looks like it sort of works key word being sort of looks" good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks" print(matchStrings(in1, in2)) assert(matchStrings(in1, in2) == good_out) in1 = "This repository can take" in2 = "This repository contains the code for" bad_out = "This repository can tode for" good_out = "This repository contains the code for" assert(matchStrings(in1, in2) == good_out) in1 = "See something." in2 = "See something. Say something." bad_out = in1 good_out = in2 print(matchStrings(in1, in2)) assert(matchStrings(in1, in2) == bad_out) in1 = "a" * 1000 in2 = "b" * 10 * 1000 # This should be fast (< 1 second) #matchStrings(in1, in2) print("Tests passed.")