From c8ae9847aa8038a4cdd72281806e61a6f9092957 Mon Sep 17 00:00:00 2001 From: yum Date: Fri, 22 Dec 2023 14:15:07 -0800 Subject: Add gradio webui --- README.md | 15 +++----- app.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++------- requirements.txt | 3 +- 3 files changed, 101 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index afc1131..ada8b5e 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ A black box for your yapping. This app records your mic. It uses silero-vad to split audio into contiguous -segments of speech, and saves them to disk as .wav files. Metadata is -saved to a corresponding .yaml file. +segments of speech, and saves them to disk as .wav files. What's a black box? Wikipedia says this: ``` @@ -15,10 +14,6 @@ an outdated name which has become a misnomer—they are now required to be painted bright orange, to aid in their recovery after accidents. ``` -This is a CLI app. It is not polished and requires a little elbow grease to -use properly. The intent is to assist people who want to gather high-quality -training data of human voices. Use responsibly. - ## Compatibility This application is designed for Windows 10. Functionality on any other @@ -26,10 +21,10 @@ platform is purely coincidental. ## Running -Download the latest release and double click `app.bat` in File Explorer. - -Read the output and change the mic to whatever you're using. To change mics, -edit app.py. Any text editor works, including Notepad. +Download the latest release and double click `app.bat` in File Explorer. It +should automatically open a tab in your browser showing you the UI. If it +doesn't, type "localhost:7860" in your browser URL field (leave out the +quotes). ## Building from source diff --git a/app.py b/app.py index 3cb3816..f6e3894 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ from datetime import datetime from pydub import AudioSegment +import gradio as gr import math import numpy as np import os @@ -11,6 +12,22 @@ import typing import vad import wave +class Logger: + def __init__(self, filename): + self.terminal = sys.stdout + self.log = open(filename, "w") + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + self.terminal.flush() + self.log.flush() + + def isatty(self): + return False + class AudioStream(): FORMAT = pyaudio.paInt16 # Size of each frame (audio sample), in bytes. If you change FORMAT, make @@ -102,6 +119,18 @@ class MicStream(AudioStream): device_name = self.p.get_device_info_by_host_api_device_index(0, i).get('name') print("Input Device id ", i, " - ", device_name) + def getMicDevices() -> str: + p = pyaudio.PyAudio() + info = p.get_host_api_info_by_index(0) + numdevices = info.get('deviceCount') + + result = [] + for i in range(0, numdevices): + if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0: + device_name = p.get_device_info_by_host_api_device_index(0, i).get('name') + result.append(f"Input Device id {i} - {device_name}") + return '\n'.join(result) + def onAudioFramesAvailable(self, frames, frame_count, @@ -320,7 +349,7 @@ def saveAudio(audio: bytes, path: str, stream: AudioStream): wf.setframerate(stream.fps) wf.writeframes(audio) -def concatenate_wav_files(output_path): +def concatenateWavFiles(output_path): # List all .wav files in the CWD wav_files = [f for f in os.listdir('.') if f.endswith('.wav')] @@ -330,6 +359,9 @@ def concatenate_wav_files(output_path): # Open the output file with wave.open(output_path, 'wb') as output_wav: for wav_file in wav_files: + if os.path.abspath(wav_file) == os.path.abspath(output_path): + print(f"Skip adding output file ({wav_file}) to itself") + continue print(f"Processing {wav_file}") with wave.open(wav_file, 'rb') as input_wav: # Check if parameters are the same for each file @@ -341,17 +373,19 @@ def concatenate_wav_files(output_path): frames = input_wav.readframes(input_wav.getnframes()) output_wav.writeframes(frames) -if __name__ == "__main__": - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - os.chdir(dname) - print(f"Set cwd to {os.getcwd()}", file=sys.stderr) +class AppControl: + run = True +app_ctrl = AppControl() - concatenate_wav_files("concatenated.wav") - sys.exit(0) +def recordMeDaddy( + mic_index: int, + min_volume: float = -1.3, + max_volume: float = -0.8 + ): + app_ctrl.run = True - stream = MicStream("index") - stream_hd = MicStream("index", fps=44100) + stream = MicStream(str(mic_index)) + stream_hd = MicStream(str(mic_index), fps=44100) collector = AudioCollector(stream) #collector = NormalizingAudioCollector(collector) @@ -368,7 +402,7 @@ if __name__ == "__main__": max_speech_s=max_speech_s, stream=stream) - while True: + while app_ctrl.run: audio = collector.getAudio() collector_hd.getAudio() stable_cutoff, has_audio = segmenter.getStableCutoff(audio) @@ -398,7 +432,7 @@ if __name__ == "__main__": print(f"volume: {audio_v}") # cutoff is a fine-tuned value based on volumes seen while in vr # (index mic) - if audio_v < -1.3 or audio_v > -0.8: + if audio_v < min_volume or audio_v > max_volume: # Discard sample print("Discarding too-quiet/too-loud segment") collector.keepLast(1.0) @@ -414,4 +448,52 @@ if __name__ == "__main__": #print("VAD detects no audio, skip transcription", file=sys.stderr) collector.keepLast(1.0) collector_hd.keepLast(1.0) + print("Stopped recording") + +def getOutput() -> str: + sys.stdout.flush() + with open("output.log", "r") as f: + return f.read() + +def stopApp(): + print("Requesting app stop") + app_ctrl.run = False + +if __name__ == "__main__": + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + sys.stdout = Logger("output.log") + + print(f"Set cwd to {os.getcwd()}", file=sys.stderr) + + with gr.Blocks() as demo: + dump_mics = gr.Button("Dump mics") + mics_output = gr.Text(label="Microphones") + + mic_device = gr.Number(label="Mic device") + min_volume = gr.Number(label="Minimum volume", value=-1.3) + max_volume = gr.Number(label="Maximum volume", value=-0.8) + record_audio = gr.Button("Record audio") + stop_recording = gr.Button("Stop recording") + concatenated_path = gr.Text(label="Combined audio filename", value="combined.wav") + concatenate_audio = gr.Button("Combine audio files") + + dbg_output = gr.Text(label="Output") + + dump_mics.click(MicStream.getMicDevices, [], mics_output) + + record_audio.click(recordMeDaddy, [mic_device, min_volume, max_volume], + dbg_output) + stop_recording.click(stopApp, [], dbg_output) + concatenate_audio.click(concatenateWavFiles, [concatenated_path], + dbg_output) + + demo.load(getOutput, None, dbg_output, every=0.5) + demo.launch() + sys.exit(0) + + concatenateWavFiles("concatenated.wav") + sys.exit(0) diff --git a/requirements.txt b/requirements.txt index 79d8212..8dd8afc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ ctranslate2 +gradio numpy +onnxruntime pyaudio pydub -onnxruntime -- cgit v1.2.3