diff options
| author | yum <yum.food.vr@gmail.com> | 2023-07-07 01:27:02 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-07-07 01:27:02 -0700 |
| commit | cdc4889cb5e752d00f7f8933a5486f4f3441f6e9 (patch) | |
| tree | 82ddaabdb73e402e5ea399dde2062b24c41bc64a /Scripts | |
| parent | d0d3b18ad0a859e5e7a1cc5b8a569349b505c924 (diff) | |
Fix performance regression
Each chunk of audio samples, including the silence padding, should be
encoded as a binary string (bytes), not as a list of ints.
Diffstat (limited to 'Scripts')
| -rw-r--r-- | Scripts/transcribe.py | 5 |
1 file changed, 3 insertions, 2 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 5d2d893..91cdd06 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -173,7 +173,7 @@ def onAudioFramesAvailable( # Now enforce a minimum duration on frames. This reduces cases where the # STT hallucinates random things. In the Whisper paper, they enforce a # minimum audio buffer duration of 5.0 seconds, so I do the same here. - empty_chunk = [0] * int(audio_state.CHUNK / keep_every) + empty_chunk = b'0' * int(ceil(audio_state.CHUNK / keep_every)) chunk_duration_s = float(audio_state.CHUNK) / audio_state.RATE cur_duration_s = len(audio_state.frames) * chunk_duration_s desired_min_duration_s = 5.0 @@ -182,7 +182,8 @@ def onAudioFramesAvailable( delta_chunks = int(ceil(delta_duration_s / chunk_duration_s)) if audio_state.enable_debug_mode: print(f"Padding with {delta_duration_s} seconds ({delta_chunks} chunks) of silence") - audio_state.frames = empty_chunk * delta_chunks + audio_state.frames + print(f"Each chunk has {len(empty_chunk)} samples") + audio_state.frames = [empty_chunk] * delta_chunks + audio_state.frames return (frames, pyaudio.paContinue) |
