summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2022-12-30 00:01:28 -0800
committer yum <yum.food.vr@gmail.com> 2022-12-30 00:01:28 -0800
commit abdaa7ce215086bf1070d6093731cd35df866cbb (patch)
tree 5b17cf5f699e9e21dd479dd17d69a65727fa28c7
parent 12bcd1d40df21cb9bad6ae353ea4f6697e1275a6 (diff)
Fine-tune transcription
Bump up the recording window to 28 seconds. This helps a lot with long-form transcription tasks, such as transcribing an audiobook. We should expose this as a parameter, since a longer window costs latency: at 10s the transcription delay is typically 300ms, while at 28s it's typically 1.1-1.2s.
-rw-r--r-- Scripts/osc_ctrl.py 16
-rw-r--r-- Scripts/string_matcher.py 6
-rw-r--r-- Scripts/transcribe.py 4
3 files changed, 15 insertions, 11 deletions
diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py
index 7efd435..422d854 100644
--- a/Scripts/osc_ctrl.py
+++ b/Scripts/osc_ctrl.py
@@ -90,8 +90,8 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
msg_slice, slice_idx = osc_state.pager.getNextSlice(msg)
if slice_idx == -1:
return True
- print("sending page {}: {} ({})".format(slice_idx, msg_slice,
- len(msg_slice)))
+ #print("sending page {}: {} ({})".format(slice_idx, msg_slice,
+ # len(msg_slice)))
empty_slice = " " * len(msg_slice)
if msg_slice != empty_slice:
@@ -100,20 +100,20 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
# Really long messages just wrap back around.
which_region = (slice_idx % generate_utils.config.numRegions(0))
- print("send to region {}".format(which_region))
+ #print("send to region {}".format(which_region))
# if in last region:
# how long is it
num_cells = generate_utils.config.BOARD_ROWS * generate_utils.config.BOARD_COLS
num_regions = ceil(num_cells / generate_utils.config.CHARS_PER_SYNC)
- print("num regions: {}".format(num_regions))
+ #print("num regions: {}".format(num_regions))
if which_region == num_regions - 1:
layers_in_last_region = num_cells % generate_utils.config.CHARS_PER_SYNC
- print("layers in last region: {}".format(layers_in_last_region))
+ #print("layers in last region: {}".format(layers_in_last_region))
old_len = len(msg_slice)
msg_slice = msg_slice[0:layers_in_last_region]
- print("truncate msg_slice from length {} to length {}".format(old_len,
- len(msg_slice)))
+ #print("truncate msg_slice from length {} to length {}".format(old_len,
+ # len(msg_slice)))
enable(osc_state.client)
@@ -123,7 +123,7 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
# Update each letter.
encoded = encodeMessage(osc_state.encoding, msg_slice)
- print("len encoded: {}".format(len(encoded)))
+ #print("len encoded: {}".format(len(encoded)))
for i in range(0, len(encoded)):
updateRegion(osc_state.client, i, encoded[i])
diff --git a/Scripts/string_matcher.py b/Scripts/string_matcher.py
index 461f180..686056c 100644
--- a/Scripts/string_matcher.py
+++ b/Scripts/string_matcher.py
@@ -52,6 +52,7 @@ def matchSpaceDelimitedStrings(old_text: str, new_text: str, window_size = 4) ->
def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
if old_text == new_text:
+ print("STRING MATCH exception path 1")
return old_text
elif len(old_text) >= window_size and len(new_text) >= window_size:
# Find the window where the cumulative string distance
@@ -67,7 +68,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
# slice in the old and new transcriptions (O(N^2) time complexity).
# This is still wildly inefficient, but good enough for continuous
# transcription in a game bound by a single CPU core, like VRChat.
- max_old_slices = 300
+ max_old_slices = 150
old_n_slices = min(max_old_slices, len(old_text))
last_old_window = len(old_text) - window_size
first_old_window = max(last_old_window - old_n_slices, 0)
@@ -104,6 +105,9 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
new_text[best_match_j:]))
return old_prefix + new_text[best_match_j:]
else:
+ print("STRING MATCH exception path 2")
+ print(" OLD: {}".format(old_text))
+ print(" NEW: {}".format(new_text))
return new_text
if __name__ == "__main__":
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 37a8407..21bb4ba 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -35,7 +35,7 @@ class AudioState:
# The maximum length that recordAudio() will put into frames before it
# starts dropping from the start.
- self.MAX_LENGTH_S = 10
+ self.MAX_LENGTH_S = 28
self.MAX_LENGTH_S_WHISPER = 30
# The minimum length that recordAudio() will wait for before saving audio.
self.MIN_LENGTH_S = 1
@@ -283,7 +283,7 @@ def transcribeAudio(audio_state, model):
old_text = audio_state.text
audio_state.text = string_matcher.matchStrings(audio_state.text,
- text, window_size = 20)
+ text, window_size = 25)
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.