From 742eb86d652d7689bbf3ae8b286bf0a6b1c2380d Mon Sep 17 00:00:00 2001
From: yum
Date: Fri, 7 Jul 2023 01:44:28 -0700
Subject: Use 16-bit ints with generated silence

Each sample of audio data is a 16-bit int, not an 8-bit int.
---
 Scripts/transcribe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'Scripts')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 91cdd06..39c6563 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -137,6 +137,9 @@ def onAudioFramesAvailable(
     frame_len = int(len(frames) / frame_count)
     next_frame = 0.0
     keep_every = float(input_rate) / audio_state.RATE
+    #print(f"Keep every {keep_every}th frame")
+    #print(f"len frames: {len(frames)}")
+    #print(f"len decimated: {len(decimated)}")
     i = 0
     for i in range(0, frame_count):
         if i >= next_frame:
@@ -173,7 +176,7 @@ def onAudioFramesAvailable(
     # Now enforce a minimum duration on frames. This reduces cases where the
     # STT hallucinates random things. In the Whisper paper, they enforce a
     # minimum audio buffer duration of 5.0 seconds, so I do the same here.
-    empty_chunk = b'0' * int(ceil(audio_state.CHUNK / keep_every))
+    empty_chunk = b'00' * int(ceil(audio_state.CHUNK / keep_every))
     chunk_duration_s = float(audio_state.CHUNK) / audio_state.RATE
     cur_duration_s = len(audio_state.frames) * chunk_duration_s
     desired_min_duration_s = 5.0
-- 
cgit v1.2.3