summary | refs | log | tree | commit | diff | stats
path: root/Scripts
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-07-07 01:27:02 -0700
committer yum <yum.food.vr@gmail.com> 2023-07-07 01:27:02 -0700
commit cdc4889cb5e752d00f7f8933a5486f4f3441f6e9 (patch)
tree 82ddaabdb73e402e5ea399dde2062b24c41bc64a /Scripts
parent d0d3b18ad0a859e5e7a1cc5b8a569349b505c924 (diff)
Fix performance regression
Each chunk of audio samples should be encoded as a binary string, not as a list.
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/transcribe.py 5
1 file changed, 3 insertions, 2 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 5d2d893..91cdd06 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -173,7 +173,7 @@ def onAudioFramesAvailable(
# Now enforce a minimum duration on frames. This reduces cases where the
# STT hallucinates random things. In the Whisper paper, they enforce a
# minimum audio buffer duration of 5.0 seconds, so I do the same here.
- empty_chunk = [0] * int(audio_state.CHUNK / keep_every)
+ empty_chunk = b'0' * int(ceil(audio_state.CHUNK / keep_every))
chunk_duration_s = float(audio_state.CHUNK) / audio_state.RATE
cur_duration_s = len(audio_state.frames) * chunk_duration_s
desired_min_duration_s = 5.0
@@ -182,7 +182,8 @@ def onAudioFramesAvailable(
delta_chunks = int(ceil(delta_duration_s / chunk_duration_s))
if audio_state.enable_debug_mode:
print(f"Padding with {delta_duration_s} seconds ({delta_chunks} chunks) of silence")
- audio_state.frames = empty_chunk * delta_chunks + audio_state.frames
+ print(f"Each chunk has {len(empty_chunk)} samples")
+ audio_state.frames = [empty_chunk] * delta_chunks + audio_state.frames
return (frames, pyaudio.paContinue)