diff options
| author | yum <yum.food.vr@gmail.com> | 2022-11-07 20:31:16 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-11-07 20:31:16 -0800 |
| commit | 77c6f366b2f81c60ed67e2fa6dc92df451e4229c (patch) | |
| tree | fafe57211324c624cb40b2d3fa252fe3e163e633 /string_matcher.py | |
| parent | 564a3abbf4247bd0658b8d4eb8a4881fa274e309 (diff) | |
Fix matchStrings O(n^2) loop
This slides 2 windows across input strings, looking for a region where
they are most similar. It then uses that region to stitch the strings
together. Since transcribe.py passes in a continuous transcription as
the `old_text` argument, we can wind up spending a lot of time here.
Constrain the area of the `old_text` argument that we look at to the
most recent 50 characters. This should be good enough.
Also fix how we calculate levenshtein_distance. Uh... yeah, let's not
talk about how it was before.
Diffstat (limited to 'string_matcher.py')
| -rw-r--r-- | string_matcher.py | 25 |
1 files changed, 19 insertions, 6 deletions
diff --git a/string_matcher.py b/string_matcher.py index f529e0c..9060b26 100644 --- a/string_matcher.py +++ b/string_matcher.py @@ -61,14 +61,22 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: best_match_j = None best_match_d = window_size * 1000 - for i in range(0, 1 + len(old_text) - window_size): + # The number of old slices to look at. Since the old text can grow + # unboundedly, it's crucial that we don't compare to every possible + # slice in the old and new transcriptions (O(N^2) time complexity). + # This is still wildly inefficient, but good enough for continuous + # transcription in a game bound by a single CPU core, like VRChat. + max_old_slices = 50 + old_n_slices = min(max_old_slices, len(old_text)) + last_old_window = len(old_text) - window_size + first_old_window = max(last_old_window - old_n_slices, 0) + + for i in range(first_old_window, last_old_window + 1): old_slice = old_text[i:i + window_size] for j in range(0, 1 + len(new_text) - window_size): new_slice = new_text[j:j + window_size] - cur_d = 0 - for k in range(0, window_size): - cur_d += levenshtein_distance(old_slice[k], new_slice[k]) + cur_d = levenshtein_distance(old_slice, new_slice) if cur_d <= best_match_d: best_match_i = i best_match_j = j @@ -120,13 +128,18 @@ if __name__ == "__main__": in2 = "okay what about now looks like it sort of works key word being sort of looks" bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks" good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks" - print(matchStrings(in1, in2)) + assert(matchStrings(in1, in2) == good_out) in1 = "This repository can take" in2 = "This repository contains the code for" bad_out = "This repository can tode for" good_out = "This repository contains the code for" - print(matchStrings(in1, in2)) + assert(matchStrings(in1, in2) == good_out) + + in1 = "a" * 10 * 1000 + in2 = "b" * 10 * 1000 + # This should be fast (< 1 second) + matchStrings(in1, in2) print("Tests passed.") |
