summaryrefslogtreecommitdiffstats
path: root/string_matcher.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-11-07 20:31:16 -0800
committeryum <yum.food.vr@gmail.com>2022-11-07 20:31:16 -0800
commit77c6f366b2f81c60ed67e2fa6dc92df451e4229c (patch)
treefafe57211324c624cb40b2d3fa252fe3e163e633 /string_matcher.py
parent564a3abbf4247bd0658b8d4eb8a4881fa274e309 (diff)
Fix matchStrings O(n^2) loop
This slides 2 windows across input strings, looking for a region where they are most similar. It then uses that region to stitch the strings together. Since transcribe.py passes in a continuous transcription as the `old_text` argument, we can wind up spending a lot of time here. Constrain the area of the `old_text` argument that we look at to the most recent 50 characters. This should be good enough. Also fix how we calculate levenshtein_distance. Uh... yeah, let's not talk about how it was before.
Diffstat (limited to 'string_matcher.py')
-rw-r--r--string_matcher.py25
1 files changed, 19 insertions, 6 deletions
diff --git a/string_matcher.py b/string_matcher.py
index f529e0c..9060b26 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -61,14 +61,22 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
best_match_j = None
best_match_d = window_size * 1000
- for i in range(0, 1 + len(old_text) - window_size):
+ # The number of old slices to look at. Since the old text can grow
+ # unboundedly, it's crucial that we don't compare to every possible
+ # slice in the old and new transcriptions (O(N^2) time complexity).
+ # This is still wildly inefficient, but good enough for continuous
+ # transcription in a game bound by a single CPU core, like VRChat.
+ max_old_slices = 50
+ old_n_slices = min(max_old_slices, len(old_text))
+ last_old_window = len(old_text) - window_size
+ first_old_window = max(last_old_window - old_n_slices, 0)
+
+ for i in range(first_old_window, last_old_window + 1):
old_slice = old_text[i:i + window_size]
for j in range(0, 1 + len(new_text) - window_size):
new_slice = new_text[j:j + window_size]
- cur_d = 0
- for k in range(0, window_size):
- cur_d += levenshtein_distance(old_slice[k], new_slice[k])
+ cur_d = levenshtein_distance(old_slice, new_slice)
if cur_d <= best_match_d:
best_match_i = i
best_match_j = j
@@ -120,13 +128,18 @@ if __name__ == "__main__":
in2 = "okay what about now looks like it sort of works key word being sort of looks"
bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
- print(matchStrings(in1, in2))
+ assert(matchStrings(in1, in2) == good_out)
in1 = "This repository can take"
in2 = "This repository contains the code for"
bad_out = "This repository can tode for"
good_out = "This repository contains the code for"
- print(matchStrings(in1, in2))
+ assert(matchStrings(in1, in2) == good_out)
+
+ in1 = "a" * 10 * 1000
+ in2 = "b" * 10 * 1000
+ # This should be fast (< 1 second)
+ matchStrings(in1, in2)
print("Tests passed.")