Fine-tune transcription

Bump up the recording window to 28 seconds. This helps a lot with long-form transcription tasks, such as transcribing an audiobook. We should expose the window length as a parameter, since at 10 s the transcription delay is typically 300 ms, while at 28 s it's typically 1.1-1.2 s.
author: yum <yum.food.vr@gmail.com> 2022-12-30 00:01:28 -0800
committer: yum <yum.food.vr@gmail.com> 2022-12-30 00:01:28 -0800
commit: abdaa7ce215086bf1070d6093731cd35df866cbb (patch)
tree: 5b17cf5f699e9e21dd479dd17d69a65727fa28c7
parent: 12bcd1d40df21cb9bad6ae353ea4f6697e1275a6 (diff)
3 files changed, 15 insertions, 11 deletions
diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py
index 7efd435..422d854 100644
--- a/Scripts/osc_ctrl.py
+++ b/Scripts/osc_ctrl.py
@@ -90,8 +90,8 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
     msg_slice, slice_idx = osc_state.pager.getNextSlice(msg)
     if slice_idx == -1:
         return True
-    print("sending page {}: {} ({})".format(slice_idx, msg_slice,
-        len(msg_slice)))
+    #print("sending page {}: {} ({})".format(slice_idx, msg_slice,
+    #    len(msg_slice)))
 
     empty_slice = " " * len(msg_slice)
     if msg_slice != empty_slice:
@@ -100,20 +100,20 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
 
     # Really long messages just wrap back around.
     which_region = (slice_idx % generate_utils.config.numRegions(0))
-    print("send to region {}".format(which_region))
+    #print("send to region {}".format(which_region))
 
     # if in last region:
     #   how long is it
     num_cells = generate_utils.config.BOARD_ROWS * generate_utils.config.BOARD_COLS
     num_regions = ceil(num_cells / generate_utils.config.CHARS_PER_SYNC)
-    print("num regions: {}".format(num_regions))
+    #print("num regions: {}".format(num_regions))
     if which_region == num_regions - 1:
         layers_in_last_region = num_cells % generate_utils.config.CHARS_PER_SYNC
-        print("layers in last region: {}".format(layers_in_last_region))
+        #print("layers in last region: {}".format(layers_in_last_region))
         old_len = len(msg_slice)
         msg_slice = msg_slice[0:layers_in_last_region]
-        print("truncate msg_slice from length {} to length {}".format(old_len,
-            len(msg_slice)))
+        #print("truncate msg_slice from length {} to length {}".format(old_len,
+        #    len(msg_slice)))
 
     enable(osc_state.client)
 
@@ -123,7 +123,7 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
 
     # Update each letter.
     encoded = encodeMessage(osc_state.encoding, msg_slice)
-    print("len encoded: {}".format(len(encoded)))
+    #print("len encoded: {}".format(len(encoded)))
     for i in range(0, len(encoded)):
         updateRegion(osc_state.client, i, encoded[i])
 
diff --git a/Scripts/string_matcher.py b/Scripts/string_matcher.py
index 461f180..686056c 100644
--- a/Scripts/string_matcher.py
+++ b/Scripts/string_matcher.py
@@ -52,6 +52,7 @@ def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) ->
 
 def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
     if old_text == new_text:
+        print("STRING MATCH exception path 1")
         return old_text
     elif len(old_text) >= window_size and len(new_text) >= window_size:
         # Find the window where the cumulative string distance
@@ -67,7 +68,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
         # slice in the old and new transcriptions (O(N^2) time complexity).
         # This is still wildly inefficient, but good enough for continuous
         # transcription in a game bound by a single CPU core, like VRChat.
-        max_old_slices = 300
+        max_old_slices = 150
         old_n_slices = min(max_old_slices, len(old_text))
         last_old_window = len(old_text) - window_size
         first_old_window = max(last_old_window - old_n_slices, 0)
@@ -104,6 +105,9 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
                 new_text[best_match_j:]))
         return old_prefix + new_text[best_match_j:]
     else:
+        print("STRING MATCH exception path 2")
+        print("  OLD: {}".format(old_text))
+        print("  NEW: {}".format(new_text))
         return new_text
 
 if __name__ == "__main__":
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 37a8407..21bb4ba 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -35,7 +35,7 @@ class AudioState:
 
         # The maximum length that recordAudio() will put into frames before it
         # starts dropping from the start.
-        self.MAX_LENGTH_S = 10
+        self.MAX_LENGTH_S = 28
         self.MAX_LENGTH_S_WHISPER = 30
         # The minimum length that recordAudio() will wait for before saving audio.
         self.MIN_LENGTH_S = 1
@@ -283,7 +283,7 @@ def transcribeAudio(audio_state, model):
         old_text = audio_state.text
 
         audio_state.text = string_matcher.matchStrings(audio_state.text,
-                text, window_size = 20)
+                text, window_size = 25)
         if old_text != audio_state.text:
             # We think the user said something, so  reset the amount of
             # time we sleep between transcriptions to the minimum.
author	yum <yum.food.vr@gmail.com>	2022-12-30 00:01:28 -0800
committer	yum <yum.food.vr@gmail.com>	2022-12-30 00:01:28 -0800
commit	abdaa7ce215086bf1070d6093731cd35df866cbb (patch)
tree	5b17cf5f699e9e21dd479dd17d69a65727fa28c7
parent	12bcd1d40df21cb9bad6ae353ea4f6697e1275a6 (diff)