summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2022-11-22 19:01:01 -0800
committer: yum <yum.food.vr@gmail.com> 2022-11-22 19:01:01 -0800
commit: 9f87674d1b484a2e61e87ad53d8ebcf9985dce6b (patch)
tree: b3e8958a5a28901f1ecb9409db22ba15dba00a77
parent: a7a52b884061b154eaae6b0a8c0d5b443cbb8abe (diff)
Shorten audio window to 10 seconds
This helps with temporal stability in long-running transcriptions, and lets us get rid of that hack where we refuse to update old pages.
-rw-r--r-- Images/speech_to_text_demo.gif | bin 6505549 -> 5410888 bytes
-rw-r--r-- osc_ctrl.py | 2
-rw-r--r-- transcribe.py | 5
3 files changed, 4 insertions, 3 deletions
diff --git a/Images/speech_to_text_demo.gif b/Images/speech_to_text_demo.gif
index 509f26b..661d80b 100644
--- a/Images/speech_to_text_demo.gif
+++ b/Images/speech_to_text_demo.gif
Binary files differ
diff --git a/osc_ctrl.py b/osc_ctrl.py
index ea0c145..c72aed6 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -250,7 +250,7 @@ def sendMessageLazy(client, msg, tx_state):
last_cell = (len(tx_state.last_msg_encoded) / NUM_LAYERS) - 1
last_page = floor(last_cell / (2 ** generate_utils.INDEX_BITS))
if page < last_page:
- continue
+ pass
if cell_msg == [state.encoding[' ']] * NUM_LAYERS:
if empty_cells_sent >= tx_state.empty_cells_to_send_per_call:
diff --git a/transcribe.py b/transcribe.py
index 91fcd54..b316014 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -33,7 +33,8 @@ class AudioState:
# The maximum length that recordAudio() will put into frames before it
# starts dropping from the start.
- MAX_LENGTH_S = 30
+ MAX_LENGTH_S = 10
+ MAX_LENGTH_S_WHISPER = 30
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
@@ -202,7 +203,7 @@ def transcribe(audio_state, model, filename):
audio_state.transcribe_lock.release()
audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
- audio_state.MAX_LENGTH_S)
+ audio_state.MAX_LENGTH_S_WHISPER)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
result = None