diff options
| author | yum <yum.food.vr@gmail.com> | 2022-12-30 00:01:28 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-12-30 00:01:28 -0800 |
| commit | abdaa7ce215086bf1070d6093731cd35df866cbb (patch) | |
| tree | 5b17cf5f699e9e21dd479dd17d69a65727fa28c7 | |
| parent | 12bcd1d40df21cb9bad6ae353ea4f6697e1275a6 (diff) | |
Fine-tune transcription
Bump up recording window to 28 seconds. This helps a lot with long-form
transcription tasks, such as transcribing an audiobook.
We should expose this as a parameter, since at 10s the transcription delay is
typically 300ms, while at 28s it's typically 1.1-1.2s.
| -rw-r--r-- | Scripts/osc_ctrl.py | 16 |
| -rw-r--r-- | Scripts/string_matcher.py | 6 |
| -rw-r--r-- | Scripts/transcribe.py | 4 |
3 files changed, 15 insertions, 11 deletions
diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py index 7efd435..422d854 100644 --- a/Scripts/osc_ctrl.py +++ b/Scripts/osc_ctrl.py @@ -90,8 +90,8 @@ def pageMessage(osc_state: OscState, msg: str) -> bool: msg_slice, slice_idx = osc_state.pager.getNextSlice(msg) if slice_idx == -1: return True - print("sending page {}: {} ({})".format(slice_idx, msg_slice, - len(msg_slice))) + #print("sending page {}: {} ({})".format(slice_idx, msg_slice, + # len(msg_slice))) empty_slice = " " * len(msg_slice) if msg_slice != empty_slice: @@ -100,20 +100,20 @@ def pageMessage(osc_state: OscState, msg: str) -> bool: # Really long messages just wrap back around. which_region = (slice_idx % generate_utils.config.numRegions(0)) - print("send to region {}".format(which_region)) + #print("send to region {}".format(which_region)) # if in last region: # how long is it num_cells = generate_utils.config.BOARD_ROWS * generate_utils.config.BOARD_COLS num_regions = ceil(num_cells / generate_utils.config.CHARS_PER_SYNC) - print("num regions: {}".format(num_regions)) + #print("num regions: {}".format(num_regions)) if which_region == num_regions - 1: layers_in_last_region = num_cells % generate_utils.config.CHARS_PER_SYNC - print("layers in last region: {}".format(layers_in_last_region)) + #print("layers in last region: {}".format(layers_in_last_region)) old_len = len(msg_slice) msg_slice = msg_slice[0:layers_in_last_region] - print("truncate msg_slice from length {} to length {}".format(old_len, - len(msg_slice))) + #print("truncate msg_slice from length {} to length {}".format(old_len, + # len(msg_slice))) enable(osc_state.client) @@ -123,7 +123,7 @@ def pageMessage(osc_state: OscState, msg: str) -> bool: # Update each letter. 
encoded = encodeMessage(osc_state.encoding, msg_slice) - print("len encoded: {}".format(len(encoded))) + #print("len encoded: {}".format(len(encoded))) for i in range(0, len(encoded)): updateRegion(osc_state.client, i, encoded[i]) diff --git a/Scripts/string_matcher.py b/Scripts/string_matcher.py index 461f180..686056c 100644 --- a/Scripts/string_matcher.py +++ b/Scripts/string_matcher.py @@ -52,6 +52,7 @@ def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) -> def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: if old_text == new_text: + print("STRING MATCH exception path 1") return old_text elif len(old_text) >= window_size and len(new_text) >= window_size: # Find the window where the cumulative string distance @@ -67,7 +68,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: # slice in the old and new transcriptions (O(N^2) time complexity). # This is still wildly inefficient, but good enough for continuous # transcription in a game bound by a single CPU core, like VRChat. - max_old_slices = 300 + max_old_slices = 150 old_n_slices = min(max_old_slices, len(old_text)) last_old_window = len(old_text) - window_size first_old_window = max(last_old_window - old_n_slices, 0) @@ -104,6 +105,9 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: new_text[best_match_j:])) return old_prefix + new_text[best_match_j:] else: + print("STRING MATCH exception path 2") + print(" OLD: {}".format(old_text)) + print(" NEW: {}".format(new_text)) return new_text if __name__ == "__main__": diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 37a8407..21bb4ba 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -35,7 +35,7 @@ class AudioState: # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. 
- self.MAX_LENGTH_S = 10 + self.MAX_LENGTH_S = 28 self.MAX_LENGTH_S_WHISPER = 30 # The minimum length that recordAudio() will wait for before saving audio. self.MIN_LENGTH_S = 1 @@ -283,7 +283,7 @@ def transcribeAudio(audio_state, model): old_text = audio_state.text audio_state.text = string_matcher.matchStrings(audio_state.text, - text, window_size = 20) + text, window_size = 25) if old_text != audio_state.text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. |
