From a602bfb95665697b15a2de58694c6ac064af2916 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Sat, 8 Jul 2023 18:55:41 -0700
Subject: Preserve audio chunk length when dropping samples

When we commit a transcription, we drop the corresponding audio data.
Audio data is represented as a list of chunks. Each chunk contains a few
hundred samples, i.e. on the order of 10 ms of audio.

Dropping a few seconds of data therefore mostly means deleting many
whole chunks. The cut point usually falls inside a chunk, though, so
there is typically one chunk whose leading portion must also be dropped.

Instead of slicing away that part of the chunk, which would change its
length, this change zeroes it out. This preserves the assumption that
each chunk has the same temporal length.
---
 Scripts/transcribe.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'Scripts')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index f0b0338..03fb11b 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -161,18 +161,21 @@ def onAudioFramesAvailable(
         # mics usually have a higher sample rate than 16 KHz (see decimation
         # code above).
         # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`.
-        n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK
+        n_frames_to_drop = float(audio_state.drop_samples_till_i) / audio_state.CHUNK
         n_frames_to_drop *= keep_every
-        n_frames_to_drop = int(floor(n_frames_to_drop))
+        n_frames_to_drop_int = int(floor(n_frames_to_drop))
         if audio_state.enable_debug_mode:
-            print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total")
+            print(f"Dropping {n_frames_to_drop_int} frames, buffer has {len(audio_state.frames)} frames total")
         # First drop every whole chunk
-        audio_state.frames = audio_state.frames[n_frames_to_drop:]
+        audio_state.frames = audio_state.frames[n_frames_to_drop_int:]
         # Then drop the part of the most recent chunk we no longer want
         if len(audio_state.frames) > 0:
-            n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK))
+            n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK / keep_every))
+            if audio_state.enable_debug_mode:
+                print(f"Zeroing {n_samples_to_drop} samples in frame 0")
+                print(f"Frame 0 has length {len(audio_state.frames[0])}")
             bytes_per_sample = 2
-            audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
+            audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
         audio_state.drop_samples_till_i = -1
 
     # Now enforce a minimum duration on frames. This reduces cases where the
-- 
cgit v1.2.3