From c8ae9847aa8038a4cdd72281806e61a6f9092957 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Fri, 22 Dec 2023 14:15:07 -0800
Subject: Add gradio webui

---
 README.md        |  15 +++-----
 app.py           | 106 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 requirements.txt |   3 +-
 3 files changed, 101 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index afc1131..ada8b5e 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,7 @@
 A black box for your yapping.
 
 This app records your mic. It uses silero-vad to split audio into contiguous
-segments of speech, and saves them to disk as .wav files. Metadata is
-saved to a corresponding .yaml file.
+segments of speech, and saves them to disk as .wav files.
 
 What's a black box? Wikipedia says this:
 ```
@@ -15,10 +14,6 @@ an outdated name which has become a misnomer—they are now required to be
 painted bright orange, to aid in their recovery after accidents.
 ```
 
-This is a CLI app. It is not polished and requires a little elbow grease to
-use properly. The intent is to assist people who want to gather high-quality
-training data of human voices. Use responsibly.
-
 ## Compatibility
 
 This application is designed for Windows 10. Functionality on any other
@@ -26,10 +21,10 @@ platform is purely coincidental.
 
 ## Running
 
-Download the latest release and double click `app.bat` in File Explorer.
-
-Read the output and change the mic to whatever you're using. To change mics,
-edit app.py. Any text editor works, including Notepad.
+Download the latest release and double-click `app.bat` in File Explorer. It
+should automatically open a tab in your browser showing the UI. If it
+doesn't, type "localhost:7860" into your browser's address bar (leave out the
+quotes).
 
 ## Building from source
 
diff --git a/app.py b/app.py
index 3cb3816..f6e3894 100644
--- a/app.py
+++ b/app.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from pydub import AudioSegment
 
+import gradio as gr
 import math
 import numpy as np
 import os
@@ -11,6 +12,22 @@ import typing
 import vad
 import wave
 
+class Logger:  # File-like object that tees writes to stdout and a log file.
+    def __init__(self, filename):  # filename: path of the log file to write
+        self.terminal = sys.stdout  # whatever sys.stdout is at construction time
+        self.log = open(filename, "w")  # "w" truncates; this class never closes it
+
+    def write(self, message):  # forward message to both sinks
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):  # flush both sinks
+        self.terminal.flush()
+        self.log.flush()
+
+    def isatty(self):  # always reports "not a terminal"
+        return False
+
 class AudioStream():
     FORMAT = pyaudio.paInt16
     # Size of each frame (audio sample), in bytes. If you change FORMAT, make
@@ -102,6 +119,18 @@ class MicStream(AudioStream):
                 device_name = self.p.get_device_info_by_host_api_device_index(0, i).get('name')
                 print("Input Device id ", i, " - ", device_name)
 
+    def getMicDevices() -> str:  # Lists input devices; no self, used as MicStream.getMicDevices
+        p = pyaudio.PyAudio()  # NOTE(review): never terminate()d here - confirm that is OK
+        info = p.get_host_api_info_by_index(0)  # only host API 0 is queried
+        numdevices = info.get('deviceCount')
+
+        result = []  # one "Input Device id N - name" entry per input-capable device
+        for i in range(0, numdevices):
+            if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
+                device_name = p.get_device_info_by_host_api_device_index(0, i).get('name')
+                result.append(f"Input Device id {i} - {device_name}")
+        return '\n'.join(result)  # newline-joined listing; '' if no device qualified
+
     def onAudioFramesAvailable(self,
             frames,
             frame_count,
@@ -320,7 +349,7 @@ def saveAudio(audio: bytes, path: str, stream: AudioStream):
         wf.setframerate(stream.fps)
         wf.writeframes(audio)
 
-def concatenate_wav_files(output_path):
+def concatenateWavFiles(output_path):
     # List all .wav files in the CWD
     wav_files = [f for f in os.listdir('.') if f.endswith('.wav')]
 
@@ -330,6 +359,9 @@ def concatenate_wav_files(output_path):
     # Open the output file
     with wave.open(output_path, 'wb') as output_wav:
         for wav_file in wav_files:
+            if os.path.abspath(wav_file) == os.path.abspath(output_path):
+                print(f"Skip adding output file ({wav_file}) to itself")
+                continue
             print(f"Processing {wav_file}")
             with wave.open(wav_file, 'rb') as input_wav:
                 # Check if parameters are the same for each file
@@ -341,17 +373,19 @@ def concatenate_wav_files(output_path):
                 frames = input_wav.readframes(input_wav.getnframes())
                 output_wav.writeframes(frames)
 
-if __name__ == "__main__":
-    abspath = os.path.abspath(__file__)
-    dname = os.path.dirname(abspath)
-    os.chdir(dname)
-    print(f"Set cwd to {os.getcwd()}", file=sys.stderr)
+class AppControl:  # Holds the flag that the recording loop polls.
+    run = True  # recordMeDaddy loops while this is true; stopApp sets it False
+app_ctrl = AppControl()  # module-level instance shared by the UI callbacks
 
-    concatenate_wav_files("concatenated.wav")
-    sys.exit(0)
+def recordMeDaddy(
+        mic_index: int,
+        min_volume: float = -1.3,
+        max_volume: float = -0.8
+        ):
+    app_ctrl.run = True
 
-    stream = MicStream("index")
-    stream_hd = MicStream("index", fps=44100)
+    stream = MicStream(str(mic_index))
+    stream_hd = MicStream(str(mic_index), fps=44100)
 
     collector = AudioCollector(stream)
     #collector = NormalizingAudioCollector(collector)
@@ -368,7 +402,7 @@ if __name__ == "__main__":
             max_speech_s=max_speech_s,
             stream=stream)
 
-    while True:
+    while app_ctrl.run:
         audio = collector.getAudio()
         collector_hd.getAudio()
         stable_cutoff, has_audio = segmenter.getStableCutoff(audio)
@@ -398,7 +432,7 @@ if __name__ == "__main__":
             print(f"volume: {audio_v}")
             # cutoff is a fine-tuned value based on volumes seen while in vr
             # (index mic)
-            if audio_v < -1.3 or audio_v > -0.8:
+            if audio_v < min_volume or audio_v > max_volume:
                 # Discard sample
                 print("Discarding too-quiet/too-loud segment")
                 collector.keepLast(1.0)
@@ -414,4 +448,52 @@ if __name__ == "__main__":
             #print("VAD detects no audio, skip transcription", file=sys.stderr)
             collector.keepLast(1.0)
             collector_hd.keepLast(1.0)
+    print("Stopped recording")
+
+def getOutput() -> str:  # Returns the entire current contents of output.log.
+    sys.stdout.flush()  # flush first so pending prints reach the log before the read
+    with open("output.log", "r") as f:  # same path handed to Logger in __main__
+        return f.read()
+
+def stopApp():  # Requests that the recording loop exit; returns None.
+    print("Requesting app stop")
+    app_ctrl.run = False  # observed by `while app_ctrl.run` on its next check
+
+if __name__ == "__main__":  # Script entry point: builds and launches the gradio UI.
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)  # work from the script's own directory
+
+    sys.stdout = Logger("output.log")  # later prints also land in output.log
+
+    print(f"Set cwd to {os.getcwd()}", file=sys.stderr)  # stderr, so not routed via Logger
+
+    with gr.Blocks() as demo:
+        dump_mics = gr.Button("Dump mics")
+        mics_output = gr.Text(label="Microphones")
+
+        mic_device = gr.Number(label="Mic device")  # NOTE(review): may be a float; recordMeDaddy str()s it - confirm
+        min_volume = gr.Number(label="Minimum volume", value=-1.3)
+        max_volume = gr.Number(label="Maximum volume", value=-0.8)
+        record_audio = gr.Button("Record audio")
+        stop_recording = gr.Button("Stop recording")
+        concatenated_path = gr.Text(label="Combined audio filename", value="combined.wav")
+        concatenate_audio = gr.Button("Combine audio files")
+
+        dbg_output = gr.Text(label="Output")  # target of the callbacks wired below
+
+        dump_mics.click(MicStream.getMicDevices, [], mics_output)
+
+        record_audio.click(recordMeDaddy, [mic_device, min_volume, max_volume],
+                dbg_output)
+        stop_recording.click(stopApp, [], dbg_output)
+        concatenate_audio.click(concatenateWavFiles, [concatenated_path],
+                dbg_output)
+
+        demo.load(getOutput, None, dbg_output, every=0.5)  # presumably re-polls the log every 0.5 s
+    demo.launch()
+    sys.exit(0)
+
+    concatenateWavFiles("concatenated.wav")  # unreachable: follows the sys.exit(0) above
+    sys.exit(0)
 
diff --git a/requirements.txt b/requirements.txt
index 79d8212..8dd8afc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 ctranslate2
+gradio
 numpy
+onnxruntime
 pyaudio
 pydub
-onnxruntime
 
-- 
cgit v1.2.3