From a602bfb95665697b15a2de58694c6ac064af2916 Mon Sep 17 00:00:00 2001 From: yum Date: Sat, 8 Jul 2023 18:55:41 -0700 Subject: Preserve audio chunk length when dropping samples When we commit a transcription, we drop the corresponding audio data. Audio data is represented as a list of chunks. Each chunk contains a few hundred samples of audio data, representing O(10ms) of audio. If we want to drop a few seconds of data, this means simply deleting many chunks of audio. There's usually a chunk where we want to drop some portion of audio data. Instead of slicing away that part of the chunk, which would change its length, this change zeroes it out. This preserves the assumption that each chunk has the same temporal length. --- Scripts/transcribe.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'Scripts') diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index f0b0338..03fb11b 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -161,18 +161,21 @@ def onAudioFramesAvailable( # mics usually have a higher sample rate than 16 KHz (see decimation # code above). # The ratio of (mic sample rate) / (16KHz) is simply `keep_every`. 
- n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK + n_frames_to_drop = float(audio_state.drop_samples_till_i) / audio_state.CHUNK n_frames_to_drop *= keep_every - n_frames_to_drop = int(floor(n_frames_to_drop)) + n_frames_to_drop_int = int(floor(n_frames_to_drop)) if audio_state.enable_debug_mode: - print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total") + print(f"Dropping {n_frames_to_drop_int} frames, buffer has {len(audio_state.frames)} frames total") # First drop every whole chunk - audio_state.frames = audio_state.frames[n_frames_to_drop:] + audio_state.frames = audio_state.frames[n_frames_to_drop_int:] # Then drop the part of the most recent chunk we no longer want if len(audio_state.frames) > 0: - n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK)) + n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK / keep_every)) + if audio_state.enable_debug_mode: + print(f"Zeroing {n_samples_to_drop} samples in frame 0") + print(f"Frame 0 has length {len(audio_state.frames[0])}") bytes_per_sample = 2 - audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] + audio_state.frames[0] = b'\x00' * (n_samples_to_drop * bytes_per_sample) + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] audio_state.drop_samples_till_i = -1 # Now enforce a minimum duration on frames. This reduces cases where the -- cgit v1.2.3