Preserve audio chunk length when dropping samples

When we commit a transcription, we drop the corresponding audio data. Audio data is represented as a list of chunks. Each chunk contains a few hundred samples of audio data, representing O(10ms) of audio. If we want to drop a few seconds of data, this mostly means simply deleting many whole chunks of audio. There's usually one chunk at the boundary, though, where we want to drop only a portion of its audio data. Instead of slicing away that part of the chunk, which would change its length, this change zeroes it out. This preserves the assumption that each chunk has the same temporal length.
author: yum <yum.food.vr@gmail.com> 2023-07-08 18:55:41 -0700
committer: yum <yum.food.vr@gmail.com> 2023-07-08 19:06:45 -0700
commit: a602bfb95665697b15a2de58694c6ac064af2916 (patch)
tree: 79cba0aeda47cd5829202d82f2644364d27b655d /Scripts
parent: 80f46a7a346e73c94a3bb8ae01099743020ef2a4 (diff)
1 files changed, 9 insertions, 6 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index f0b0338..03fb11b 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -161,18 +161,21 @@ def onAudioFramesAvailable(
         # mics usually have a higher sample rate than 16 KHz (see decimation
         # code above).
         # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`.
-        n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK
+        n_frames_to_drop = float(audio_state.drop_samples_till_i) / audio_state.CHUNK
         n_frames_to_drop *= keep_every
-        n_frames_to_drop = int(floor(n_frames_to_drop))
+        n_frames_to_drop_int = int(floor(n_frames_to_drop))
         if audio_state.enable_debug_mode:
-            print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total")
+            print(f"Dropping {n_frames_to_drop_int} frames, buffer has {len(audio_state.frames)} frames total")
         # First drop every whole chunk
-        audio_state.frames = audio_state.frames[n_frames_to_drop:]
+        audio_state.frames = audio_state.frames[n_frames_to_drop_int:]
         # Then drop the part of the most recent chunk we no longer want
         if len(audio_state.frames) > 0:
-            n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK))
+            n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK / keep_every))
+            if audio_state.enable_debug_mode:
+                print(f"Zeroing {n_samples_to_drop} samples in frame 0")
+                print(f"Frame 0 has length {len(audio_state.frames[0])}")
             bytes_per_sample = 2
-            audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
+            audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
         audio_state.drop_samples_till_i = -1
 
     # Now enforce a minimum duration on frames. This reduces cases where the
author	yum <yum.food.vr@gmail.com>	2023-07-08 18:55:41 -0700
committer	yum <yum.food.vr@gmail.com>	2023-07-08 19:06:45 -0700
commit	a602bfb95665697b15a2de58694c6ac064af2916 (patch)
tree	79cba0aeda47cd5829202d82f2644364d27b655d /Scripts
parent	80f46a7a346e73c94a3bb8ae01099743020ef2a4 (diff)