Verify that audio is clean after VAD segmentation

Indeed it is. Bumped up the default max segment length to decrease error. Also added mic presets for beyond (the VR headset) and motu (my mic interface).
author: yum <yum.food.vr@gmail.com> 2024-02-05 17:01:12 -0800
committer: yum <yum.food.vr@gmail.com> 2024-02-05 17:02:23 -0800
commit: acccf8ebcff0f7cc2b26e45e497f8b12ab73d8e1 (patch)
tree: 8c095c115b4b3447863a304d9a0f32f4e46e577f
parent: 33db3dcc23a45cae611bcf839c33d6615ccbf59e (diff)
5 files changed, 11 insertions, 8 deletions
diff --git a/Fonts/Bitmaps/font-ascii.png b/Fonts/Bitmaps/font-ascii.png
index 1a8ba99..098d651 100644
--- a/Fonts/Bitmaps/font-ascii.png
+++ b/Fonts/Bitmaps/font-ascii.png
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 2abe5b2..a92502d 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -87,7 +87,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	enable_lock_at_spawn(true),
 	gpu_idx(0),
 	min_silence_duration_ms(250),
-	max_speech_duration_s(5),
+	max_speech_duration_s(15),
 	reset_after_silence_s(10),
 	transcription_loop_delay_ms(100),
 	keybind("ctrl+x"),
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 0c83742..8e4eb6d 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -119,7 +119,9 @@ namespace {
 
     const wxString kMicChoices[] = {
         "index",
+        "beyond",
         "focusrite",
+        "motu",
         // ok now this is epic
         "0",
         "1",
diff --git a/Scripts/generate_fonts.py b/Scripts/generate_fonts.py
index cf73e6a..8dc8a89 100644
--- a/Scripts/generate_fonts.py
+++ b/Scripts/generate_fonts.py
@@ -155,7 +155,7 @@ def genUnicode():
 def genASCII():
     # Create an 8k grayscale image. 16 glyphs wide, 8 glyphs tall.
     # Only characters on the range [0, 128).
-    image = Image.new(mode="L", size=(8192,8192), color=0)
+    image = Image.new(mode="RGBA", size=(8192,8192), color=0)
     draw = ImageDraw.Draw(image)
     n_rows = 8
     n_cols = 16
@@ -174,11 +174,11 @@ def genASCII():
             else:
                 char = " "
             draw.text((col * font_pixels * 8 / 2, row * font_pixels * 8 - 20),
-                    char, font=font, fill=255)
+                    char, font=font, fill=(255,255,255))
     image.save("Fonts/Bitmaps/font-ascii.png")
 
 if __name__ == "__main__":
     print("Generating unicode fonts")
-    genUnicode()
+    #genUnicode()
     print("Generating ASCII fonts")
     genASCII()
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 491bc35..65a0cf8 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -337,7 +337,7 @@ class AudioSegmenter:
     # Returns the stable cutoff (if any) and whether there are any segments.
     def getStableCutoff(self, audio: bytes) -> typing.Tuple[int, bool]:
         min_delta_frames = int((self.vad_options.min_silence_duration_ms *
-                AudioStream.FPS) / 1000)
+                AudioStream.FPS) / 1000.0)
         cutoff = None
 
         last_end = None
@@ -515,9 +515,10 @@ class VadCommitter:
                     print(f"commit segment: {s}", file=sys.stderr)
                 print(f"delta get: {delta}", file=sys.stderr)
 
-            #ts = datetime.fromtimestamp(self.collector.now() - latency_s)
-            #filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
-            #saveAudio(commit_audio, filename)
+            if True:
+                ts = datetime.fromtimestamp(self.collector.now() - latency_s)
+                filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
+                saveAudio(commit_audio, filename)
 
         preview = ""
         if self.cfg["enable_previews"] and has_audio:
author	yum <yum.food.vr@gmail.com>	2024-02-05 17:01:12 -0800
committer	yum <yum.food.vr@gmail.com>	2024-02-05 17:02:23 -0800
commit	acccf8ebcff0f7cc2b26e45e497f8b12ab73d8e1 (patch)
tree	8c095c115b4b3447863a304d9a0f32f4e46e577f
parent	33db3dcc23a45cae611bcf839c33d6615ccbf59e (diff)