summary refs log tree commit diff stats
path: root/Scripts
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-07-08 18:55:41 -0700
committer yum <yum.food.vr@gmail.com> 2023-07-08 19:06:45 -0700
commit a602bfb95665697b15a2de58694c6ac064af2916 (patch)
tree 79cba0aeda47cd5829202d82f2644364d27b655d /Scripts
parent 80f46a7a346e73c94a3bb8ae01099743020ef2a4 (diff)
Preserve audio chunk length when dropping samples
When we commit a transcription, we drop the corresponding audio data. Audio data is represented as a list of chunks. Each chunk contains a few hundred samples of audio data, representing O(10ms) of audio. If we want to drop a few seconds of data, this mostly means deleting many whole chunks of audio. However, the drop boundary usually falls partway through a chunk, so only the leading portion of that chunk should be dropped. Instead of slicing away that part of the chunk, which would change its length, this change zeroes it out. This preserves the assumption that each chunk has the same temporal length.
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/transcribe.py 15
1 files changed, 9 insertions, 6 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index f0b0338..03fb11b 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -161,18 +161,21 @@ def onAudioFramesAvailable(
# mics usually have a higher sample rate than 16 KHz (see decimation
# code above).
# The ratio of (mic sample rate) / (16KHz) is simply `keep_every`.
- n_frames_to_drop = audio_state.drop_samples_till_i / audio_state.CHUNK
+ n_frames_to_drop = float(audio_state.drop_samples_till_i) / audio_state.CHUNK
n_frames_to_drop *= keep_every
- n_frames_to_drop = int(floor(n_frames_to_drop))
+ n_frames_to_drop_int = int(floor(n_frames_to_drop))
if audio_state.enable_debug_mode:
- print(f"Dropping {n_frames_to_drop} frames, buffer has {len(audio_state.frames)} frames total")
+ print(f"Dropping {n_frames_to_drop_int} frames, buffer has {len(audio_state.frames)} frames total")
# First drop every whole chunk
- audio_state.frames = audio_state.frames[n_frames_to_drop:]
+ audio_state.frames = audio_state.frames[n_frames_to_drop_int:]
# Then drop the part of the most recent chunk we no longer want
if len(audio_state.frames) > 0:
- n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK))
+ n_samples_to_drop = int(ceil((n_frames_to_drop % 1.0) * audio_state.CHUNK / keep_every))
+ if audio_state.enable_debug_mode:
+ print(f"Zeroing {n_samples_to_drop} samples in frame 0")
+ print(f"Frame 0 has length {len(audio_state.frames[0])}")
bytes_per_sample = 2
- audio_state.frames[0] = audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
+ audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
audio_state.drop_samples_till_i = -1
# Now enforce a minimum duration on frames. This reduces cases where the