From acccf8ebcff0f7cc2b26e45e497f8b12ab73d8e1 Mon Sep 17 00:00:00 2001 From: yum Date: Mon, 5 Feb 2024 17:01:12 -0800 Subject: Verify that audio is clean after VAD segmentation Indeed it is. Bumped up the default max segment length to decrease error. Also add mic presets for beyond (the vr headset) and motu (my mic interface). --- Scripts/generate_fonts.py | 6 +++--- Scripts/transcribe_v2.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'Scripts') diff --git a/Scripts/generate_fonts.py b/Scripts/generate_fonts.py index cf73e6a..8dc8a89 100644 --- a/Scripts/generate_fonts.py +++ b/Scripts/generate_fonts.py @@ -155,7 +155,7 @@ def genUnicode(): def genASCII(): # Create an 8k grayscale image. 16 glyphs wide, 8 glyphs tall. # Only characters on the range [0, 128). - image = Image.new(mode="L", size=(8192,8192), color=0) + image = Image.new(mode="RGBA", size=(8192,8192), color=0) draw = ImageDraw.Draw(image) n_rows = 8 n_cols = 16 @@ -174,11 +174,11 @@ def genASCII(): else: char = " " draw.text((col * font_pixels * 8 / 2, row * font_pixels * 8 - 20), - char, font=font, fill=255) + char, font=font, fill=(255,255,255)) image.save("Fonts/Bitmaps/font-ascii.png") if __name__ == "__main__": print("Generating unicode fonts") - genUnicode() + #genUnicode() print("Generating ASCII fonts") genASCII() diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 491bc35..65a0cf8 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -337,7 +337,7 @@ class AudioSegmenter: # Returns the stable cutoff (if any) and whether there are any segments. def getStableCutoff(self, audio: bytes) -> typing.Tuple[int, bool]: min_delta_frames = int((self.vad_options.min_silence_duration_ms * - AudioStream.FPS) / 1000) + AudioStream.FPS) / 1000.0) cutoff = None last_end = None @@ -515,9 +515,10 @@ class VadCommitter: print(f"commit segment: {s}", file=sys.stderr) print(f"delta get: {delta}", file=sys.stderr) - #ts = datetime.fromtimestamp(self.collector.now() - latency_s) - #filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav" - #saveAudio(commit_audio, filename) + if True: + ts = datetime.fromtimestamp(self.collector.now() - latency_s) + filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav" + saveAudio(commit_audio, filename) preview = "" if self.cfg["enable_previews"] and has_audio: -- cgit v1.2.3