Use 16-bit ints with generated silence

Each sample of audio data is a 16-bit int, not an 8-bit int, so each sample in the generated silence chunk must occupy two bytes instead of one.
author: yum <yum.food.vr@gmail.com> 2023-07-07 01:44:28 -0700
committer: yum <yum.food.vr@gmail.com> 2023-07-07 01:44:28 -0700
commit: 742eb86d652d7689bbf3ae8b286bf0a6b1c2380d (patch)
tree: 4a0c1451622d86daa103e9112720ca277798b593 /Scripts
parent: cdc4889cb5e752d00f7f8933a5486f4f3441f6e9 (diff)
1 files changed, 4 insertions, 1 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 91cdd06..39c6563 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -137,6 +137,9 @@ def onAudioFramesAvailable(
     frame_len = int(len(frames) / frame_count)
     next_frame = 0.0
     keep_every = float(input_rate) / audio_state.RATE
+    #print(f"Keep every {keep_every}th frame")
+    #print(f"len frames: {len(frames)}")
+    #print(f"len decimated: {len(decimated)}")
     i = 0
     for i in range(0, frame_count):
         if i >= next_frame:
@@ -173,7 +176,7 @@ def onAudioFramesAvailable(
     # Now enforce a minimum duration on frames. This reduces cases where the
     # STT hallucinates random things. In the Whisper paper, they enforce a
     # minimum audio buffer duration of 5.0 seconds, so I do the same here.
-    empty_chunk = b'0' * int(ceil(audio_state.CHUNK / keep_every))
+    empty_chunk = b'00' * int(ceil(audio_state.CHUNK / keep_every))
     chunk_duration_s = float(audio_state.CHUNK) / audio_state.RATE
     cur_duration_s = len(audio_state.frames) * chunk_duration_s
     desired_min_duration_s = 5.0
author	yum <yum.food.vr@gmail.com>	2023-07-07 01:44:28 -0700
committer	yum <yum.food.vr@gmail.com>	2023-07-07 01:44:28 -0700
commit	742eb86d652d7689bbf3ae8b286bf0a6b1c2380d (patch)
tree	4a0c1451622d86daa103e9112720ca277798b593 /Scripts
parent	cdc4889cb5e752d00f7f8933a5486f4f3441f6e9 (diff)