From f97cef182de55b6dbae8d2bc0477acfca6cc1f66 Mon Sep 17 00:00:00 2001 From: yum Date: Thu, 29 May 2025 19:45:48 -0700 Subject: More UI work 1. main STT app works in new project structure 2. UI dumps mics on startup to populate mic list 3. add missing deps (hf-xet, wave) 4. normalize audio volume when transcribing. Probably still wrong tbqh. 5. add checkbox to save audio segments & improve logic so only segments with speech get saved. 6. add default config settings --- app/hi.py | 7 +- app/list_microphones.py | 24 ++++++ app/requirements.txt | 3 +- app/stt.py | 55 ++++++++++++-- app/vad.py | 3 +- config.yaml | 7 +- ui/index.html | 27 ++++--- ui/index.js | 196 ++++++++++++++++++++++++++++++++++++++---------- ui/preload.js | 5 +- ui/renderer.js | 87 ++++++++++++++++++++- ui/src/components.css | 4 + ui_design.md | 3 + 12 files changed, 355 insertions(+), 66 deletions(-) create mode 100644 app/list_microphones.py diff --git a/app/hi.py b/app/hi.py index 0129958..0d80b9d 100644 --- a/app/hi.py +++ b/app/hi.py @@ -2,6 +2,7 @@ import app_config import argparse from math import floor, ceil import msvcrt +import os from pythonosc import udp_client import sentencepiece as spm from shared_thread_data import SharedThreadData @@ -15,8 +16,11 @@ TESTS_ENABLED = True # 0 = quiet, 1 = verbose, 2 = very verbose LOG_LEVEL = 0 +APP_ROOT = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.dirname(APP_ROOT) + def get_tokenizer(): - model_path = "./custom_unigram_tokenizer_65k/unigram.model" + model_path = os.path.join(PROJECT_ROOT, "custom_unigram_tokenizer_65k", "unigram.model") print(f"Loading SentencePiece tokenizer from: {model_path}") sp = spm.SentencePieceProcessor() sp.load(model_path) @@ -346,7 +350,6 @@ if __name__ == "__main__": time.sleep(0.1) continue - try: char = char_bytes.decode('utf-8') if char == '\r' or char == '\n': diff --git a/app/list_microphones.py b/app/list_microphones.py new file mode 100644 index 0000000..a6b1f36 --- /dev/null +++ b/app/list_microphones.py @@ -0,0 +1,24 @@ +import pyaudio +import json +import sys + +try: + p = pyaudio.PyAudio() + info = p.get_host_api_info_by_index(0) + numdevices = info.get('deviceCount') + + microphones = [] + for i in range(0, numdevices): + device_info = p.get_device_info_by_host_api_device_index(0, i) + if device_info.get('maxInputChannels') > 0: + microphones.append({ + 'index': i, + 'name': device_info.get('name'), + 'defaultSampleRate': device_info.get('defaultSampleRate') + }) + + print(json.dumps(microphones)) + p.terminate() +except Exception as e: + print(json.dumps({'error': str(e)}), file=sys.stderr) + sys.exit(1) \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt index 4e79312..07f94cd 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,7 +1,8 @@ faster-whisper +hf-xet langcodes pyaudio pydub python-osc sentencepiece - +wave diff --git a/app/stt.py b/app/stt.py index 34ef2e9..c157f6d 100644 --- a/app/stt.py +++ b/app/stt.py @@ -1,3 +1,4 @@ +from datetime import datetime from faster_whisper import WhisperModel import langcodes import numpy as np @@ -9,6 +10,11 @@ import sys import time import typing import vad +import wave + + +APP_ROOT = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.dirname(APP_ROOT) class AudioStream(): FORMAT = pyaudio.paInt16 @@ -242,6 +248,26 @@ class NormalizingAudioCollector(AudioCollectorFilter): return frames +class BoostingAudioCollector(AudioCollectorFilter): + def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict): + AudioCollectorFilter.__init__(self, parent) + self.target_dBFS = target_dBFS + self.cfg = cfg + + def getAudio(self) -> bytes: + audio = self.parent.getAudio() + + audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ, + frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS) + if self.cfg["enable_debug_mode"]: + print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr) + audio = audio.apply_gain(self.target_dBFS - audio.dBFS) + + frames = np.array(audio.get_array_of_samples()) + frames = np.int16(frames).tobytes() + + return frames + class CompressingAudioCollector(AudioCollectorFilter): def __init__(self, parent: AudioCollector): AudioCollectorFilter.__init__(self, parent) @@ -441,6 +467,16 @@ class TranscriptCommit: self.duration_s = duration_s +def saveAudio(audio: bytes, path: str, cfg: typing.Dict): + with wave.open(path, 'wb') as wf: + if cfg["enable_debug_mode"]: + print(f"Saving audio to {path}", file=sys.stderr) + wf.setnchannels(AudioStream.CHANNELS) + wf.setsampwidth(AudioStream.FRAME_SZ) + wf.setframerate(AudioStream.FPS) + wf.writeframes(audio) + + class VadCommitter: def __init__(self, cfg: typing.Dict, @@ -463,7 +499,6 @@ class VadCommitter: start_ts = self.collector.begin() if has_audio and stable_cutoff: - #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() duration_s = stable_cutoff / AudioStream.FPS start_ts = self.collector.begin() @@ -475,12 +510,16 @@ class VadCommitter: if self.cfg["enable_debug_mode"]: for s in segments: print(f"commit segment: {s}", file=sys.stderr) - print(f"delta get: {delta}", file=sys.stderr) + if len(delta) > 0: + print(f"delta get: {delta}", file=sys.stderr) - if False: + if self.cfg["save_audio"] and len(delta) > 0: ts = datetime.fromtimestamp(self.collector.now() - latency_s) filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav" - saveAudio(commit_audio, filename) + audio_dir = os.path.join(PROJECT_ROOT, "audio") + if not os.path.exists(audio_dir): + os.makedirs(audio_dir) + saveAudio(commit_audio, os.path.join(audio_dir, filename), self.cfg) preview = "" if self.cfg["enable_previews"] and has_audio: @@ -488,7 +527,6 @@ class VadCommitter: preview = "".join(s.transcript for s in segments) if not has_audio: - #print("VAD detects no audio, skip transcription", file=sys.stderr) self.collector.keepLast(1.0) return TranscriptCommit( @@ -504,8 +542,9 @@ def transcriptionThread(shared_data: SharedThreadData): stream = MicStream(shared_data.cfg["microphone"]) collector = AudioCollector(stream) - collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) + collector = BoostingAudioCollector(collector, 0.0, shared_data.cfg) whisper = Whisper(collector, shared_data.cfg) segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"], max_speech_s=shared_data.cfg["max_speech_duration_s"]) @@ -552,13 +591,13 @@ def transcriptionThread(shared_data: SharedThreadData): preview = commit.preview try: - print(f"Transcript: {transcript}") + print(f"Transcript: {transcript}", flush=True) except UnicodeEncodeError: print("Failed to encode transcript - discarding delta", file=sys.stderr) continue try: - print(f"Preview: {preview}") + print(f"Preview: {preview}", flush=True) except UnicodeEncodeError: print("Failed to encode preview - discarding", file=sys.stderr) diff --git a/app/vad.py b/app/vad.py index 10a72d3..1dea765 100644 --- a/app/vad.py +++ b/app/vad.py @@ -259,7 +259,8 @@ def get_vad_model(): """Returns the VAD model instance.""" abspath = os.path.abspath(__file__) my_dir = os.path.dirname(abspath) - path = os.path.join(my_dir, "Models/silero_vad.onnx") + parent_dir = os.path.dirname(my_dir) + path = os.path.join(parent_dir, "Models", "silero_vad.onnx") return SileroVADModel(path) diff --git a/config.yaml b/config.yaml index 164b4e6..34d88f1 100644 --- a/config.yaml +++ b/config.yaml @@ -1,18 +1,17 @@ -compute_type: int8 +compute_type: float16 enable_debug_mode: 0 enable_previews: 1 +save_audio: 0 language: english gpu_idx: 0 max_speech_duration_s: 10 min_silence_duration_ms: 250 -microphone: motu +microphone: 0 model: turbo reset_after_silence_s: 15 transcription_loop_delay_ms: 100 use_cpu: 0 - block_width: 2 num_blocks: 40 rows: 10 cols: 24 - diff --git a/ui/index.html b/ui/index.html index 14cc354..b06e56b 100644 --- a/ui/index.html +++ b/ui/index.html @@ -8,11 +8,9 @@
-

TaSTT

-
- -
+ +
@@ -127,6 +125,10 @@ Enable Previews +
@@ -156,9 +158,17 @@
- +
+ + + +
@@ -167,9 +177,8 @@
-
+
-

Python Output

diff --git a/ui/index.js b/ui/index.js index 0a7fdf9..a056156 100644 --- a/ui/index.js +++ b/ui/index.js @@ -4,14 +4,16 @@ const fs = require('node:fs').promises; const yaml = require('js-yaml'); const { spawn } = require('child_process'); +const APP_ROOT = path.join(__dirname, '..'); +const CONFIG_PATH = path.join(APP_ROOT, 'config.yaml'); + let mainWindow; +let runningProcess = null; // Track the running Python process // Helper function to get the correct Python executable from venv function getVenvPython() { - const venvPath = path.join(__dirname, '..', 'venv'); - const isWindows = process.platform === 'win32'; - const pythonExecutable = isWindows ? 'python.exe' : 'python'; - const pythonPath = path.join(venvPath, isWindows ? 'Scripts' : 'bin', pythonExecutable); + const venvPath = path.join(APP_ROOT, 'venv'); + const pythonPath = path.join(venvPath, 'Scripts', 'python.exe'); return pythonPath; } @@ -29,7 +31,17 @@ function executePythonCommand(args, options = {}) { const commandStr = `${path.basename(pythonPath)} ${args.join(' ')}`; sendPythonOutput(`> ${commandStr}`, 'info'); - const pythonProcess = spawn(pythonPath, args, options); + // Add dll directory to PATH for Windows DLL loading + const dllPath = path.join(APP_ROOT, 'dll'); + const env = { ...process.env }; + env.PATH = `${dllPath};${env.PATH}`; + + const spawnOptions = { + ...options, + env + }; + + const pythonProcess = spawn(pythonPath, args, spawnOptions); let stdout = ''; let stderr = ''; @@ -76,15 +88,47 @@ function createWindow () { mainWindow.loadFile('index.html'); } -// Path to config.yaml (one level up from ui directory) -const configPath = path.join(__dirname, '..', 'config.yaml'); +// Default configuration based on user's current config.yaml +const DEFAULT_CONFIG = { + compute_type: 'float16', + enable_debug_mode: 0, + enable_previews: 1, + save_audio: 0, + language: 'english', + gpu_idx: 0, + max_speech_duration_s: 10, + min_silence_duration_ms: 250, + microphone: 0, + model: 'turbo', + reset_after_silence_s: 15, + transcription_loop_delay_ms: 100, + use_cpu: 0, + block_width: 2, + num_blocks: 40, + rows: 10, + cols: 24 +}; // IPC handlers ipcMain.handle('load-config', async () => { try { - const fileContent = await fs.readFile(configPath, 'utf8'); + const fileContent = await fs.readFile(CONFIG_PATH, 'utf8'); return yaml.load(fileContent); } catch (error) { + if (error.code === 'ENOENT') { + // Config file doesn't exist, create it with defaults + console.log('Config file not found, creating with defaults...'); + try { + const yamlContent = yaml.dump(DEFAULT_CONFIG, { lineWidth: -1 }); + await fs.writeFile(CONFIG_PATH, yamlContent, 'utf8'); + console.log('Created config.yaml with default values'); + return DEFAULT_CONFIG; + } catch (writeError) { + console.error('Error creating default config:', writeError); + // Return defaults even if we can't write the file + return DEFAULT_CONFIG; + } + } console.error('Error loading config:', error); throw error; } @@ -93,7 +137,7 @@ ipcMain.handle('load-config', async () => { ipcMain.handle('save-config', async (event, config) => { try { const yamlContent = yaml.dump(config, { lineWidth: -1 }); - await fs.writeFile(configPath, yamlContent, 'utf8'); + await fs.writeFile(CONFIG_PATH, yamlContent, 'utf8'); return { success: true }; } catch (error) { console.error('Error saving config:', error); @@ -107,7 +151,7 @@ ipcMain.handle('restart-app', () => { }); ipcMain.handle('install-requirements', async (event) => { - const requirementsPath = path.join(__dirname, '..', 'app', 'requirements.txt'); + const requirementsPath = path.join(APP_ROOT, 'app', 'requirements.txt'); try { // Check if requirements.txt exists @@ -126,35 +170,10 @@ ipcMain.handle('install-requirements', async (event) => { }); ipcMain.handle('get-microphones', async () => { - const pythonScript = ` -import pyaudio -import json -import sys - -try: - p = pyaudio.PyAudio() - info = p.get_host_api_info_by_index(0) - numdevices = info.get('deviceCount') - - microphones = [] - for i in range(0, numdevices): - device_info = p.get_device_info_by_host_api_device_index(0, i) - if device_info.get('maxInputChannels') > 0: - microphones.append({ - 'index': i, - 'name': device_info.get('name'), - 'defaultSampleRate': device_info.get('defaultSampleRate') - }) - - print(json.dumps(microphones)) - p.terminate() -except Exception as e: - print(json.dumps({'error': str(e)}), file=sys.stderr) - sys.exit(1) -`; - + const scriptPath = path.join(APP_ROOT, 'app', 'list_microphones.py'); + try { - const result = await executePythonCommand(['-c', pythonScript]); + const result = await executePythonCommand([scriptPath]); const microphones = JSON.parse(result.stdout.trim()); console.log('Successfully retrieved microphones:', microphones); return microphones; @@ -164,6 +183,105 @@ except Exception as e: } }); +// Add handlers for starting and stopping the process +ipcMain.handle('start-process', async () => { + if (runningProcess) { + throw new Error('Process is already running'); + } + + const scriptPath = path.join(APP_ROOT, 'app', 'hi.py'); + const configPath = CONFIG_PATH; + + try { + const pythonPath = getVenvPython(); + const args = [scriptPath, '--config', configPath]; + + sendPythonOutput(`Starting process: ${path.basename(pythonPath)} ${args.join(' ')}`, 'info'); + + // Add dll directory to PATH for Windows DLL loading + const dllPath = path.join(APP_ROOT, 'dll'); + const env = { ...process.env }; + env.PATH = `${dllPath};${env.PATH}`; + + runningProcess = spawn(pythonPath, args, { env }); + + runningProcess.stdout.on('data', (data) => { + const text = data.toString(); + sendPythonOutput(text.trimEnd(), 'stdout'); + }); + + runningProcess.stderr.on('data', (data) => { + const text = data.toString(); + sendPythonOutput(text.trimEnd(), 'stderr'); + }); + + runningProcess.on('error', (error) => { + sendPythonOutput(`Process error: ${error.message}`, 'stderr'); + runningProcess = null; + if (mainWindow && !mainWindow.isDestroyed()) { + mainWindow.webContents.send('process-stopped'); + } + }); + + runningProcess.on('close', (code) => { + sendPythonOutput(`Process exited with code ${code}`, 'info'); + runningProcess = null; + if (mainWindow && !mainWindow.isDestroyed()) { + mainWindow.webContents.send('process-stopped'); + } + }); + + return { success: true }; + } catch (error) { + runningProcess = null; + throw error; + } +}); + +ipcMain.handle('stop-process', async () => { + if (!runningProcess) { + throw new Error('No process is running'); + } + + return new Promise((resolve, reject) => { + let forcefullyKilled = false; + + // Set up a timeout to force kill after 10 seconds + const killTimeout = setTimeout(() => { + if (runningProcess) { + sendPythonOutput('Process did not stop gracefully, forcing termination...', 'stderr'); + forcefullyKilled = true; + runningProcess.kill(); + } + }, 10000); + + // Listen for the process to exit + runningProcess.once('exit', (code, signal) => { + clearTimeout(killTimeout); + runningProcess = null; + + if (forcefullyKilled) { + sendPythonOutput('Process forcefully terminated', 'info'); + } else { + sendPythonOutput('Process stopped gracefully', 'info'); + } + + resolve({ success: true, forcefullyKilled }); + }); + + // Send termination signal + sendPythonOutput('Stopping process gracefully...', 'info'); + runningProcess.kill(); + }); +}); + +// Clean up on app quit +app.on('before-quit', () => { + if (runningProcess) { + runningProcess.kill(); + } +}); + app.whenReady().then(() => { createWindow(); @@ -173,6 +291,6 @@ app.whenReady().then(() => { }); app.on('window-all-closed', function () { - if (process.platform !== 'darwin') app.quit(); + app.quit(); }); diff --git a/ui/preload.js b/ui/preload.js index 108bffe..e6c0623 100644 --- a/ui/preload.js +++ b/ui/preload.js @@ -6,7 +6,10 @@ contextBridge.exposeInMainWorld('electronAPI', { restartApp: () => ipcRenderer.invoke('restart-app'), getMicrophones: () => ipcRenderer.invoke('get-microphones'), installRequirements: () => ipcRenderer.invoke('install-requirements'), - onPythonOutput: (callback) => ipcRenderer.on('python-output', (event, data) => callback(data)) + startProcess: () => ipcRenderer.invoke('start-process'), + stopProcess: () => ipcRenderer.invoke('stop-process'), + onPythonOutput: (callback) => ipcRenderer.on('python-output', (event, data) => callback(data)), + onProcessStopped: (callback) => ipcRenderer.on('process-stopped', (event) => callback()) }); console.log('Preload script loaded.'); diff --git a/ui/renderer.js b/ui/renderer.js index 83c652c..b3f05a6 100644 --- a/ui/renderer.js +++ b/ui/renderer.js @@ -22,15 +22,20 @@ function showStatus(message, type = 'info') { // Get form values function getFormValues() { + const microphoneValue = document.getElementById('microphone').value; + // Convert to number if it's a numeric string (device index) + const microphoneForConfig = /^\d+$/.test(microphoneValue) ? parseInt(microphoneValue) : microphoneValue; + return { compute_type: document.getElementById('compute_type').value, enable_debug_mode: document.getElementById('enable_debug_mode').checked ? 1 : 0, enable_previews: document.getElementById('enable_previews').checked ? 1 : 0, + save_audio: document.getElementById('save_audio').checked ? 1 : 0, language: document.getElementById('language').value, gpu_idx: parseInt(document.getElementById('gpu_idx').value), max_speech_duration_s: parseInt(document.getElementById('max_speech_duration_s').value), min_silence_duration_ms: parseInt(document.getElementById('min_silence_duration_ms').value), - microphone: document.getElementById('microphone').value, + microphone: microphoneForConfig, model: document.getElementById('model').value, reset_after_silence_s: parseInt(document.getElementById('reset_after_silence_s').value), transcription_loop_delay_ms: parseInt(document.getElementById('transcription_loop_delay_ms').value), @@ -52,6 +57,7 @@ function setFormValues(config) { document.getElementById('compute_type').value = config.compute_type || 'int8'; document.getElementById('enable_debug_mode').checked = config.enable_debug_mode === 1; document.getElementById('enable_previews').checked = config.enable_previews === 1; + document.getElementById('save_audio').checked = config.save_audio === 1; document.getElementById('language').value = config.language || 'english'; document.getElementById('gpu_idx').value = config.gpu_idx || 0; document.getElementById('max_speech_duration_s').value = config.max_speech_duration_s || 10; @@ -97,6 +103,30 @@ async function handleAsyncAction(actionName, actionFn) { } } +// Process control buttons +const startButton = document.getElementById('start-process'); +const stopButton = document.getElementById('stop-process'); + +// Helper functions for button state management +function setButtonState(button, disabled) { + button.disabled = disabled; + if (disabled) { + button.classList.add('opacity-50', 'cursor-not-allowed'); + } else { + button.classList.remove('opacity-50', 'cursor-not-allowed'); + } +} + +function setProcessRunningState() { + setButtonState(startButton, true); + setButtonState(stopButton, false); +} + +function setProcessStoppedState() { + setButtonState(startButton, false); + setButtonState(stopButton, true); +} + // Auto-save functionality with debouncing let saveTimeout; const SAVE_DELAY = 500; // milliseconds @@ -110,6 +140,31 @@ async function autoSaveConfig() { const config = getFormValues(); await window.electronAPI.saveConfig(config); showStatus('Configuration saved', 'success'); + + // Check if process is running (stop button is enabled means process is running) + const stopButton = document.getElementById('stop-process'); + + if (!stopButton.disabled) { + // Process is running, restart it with new config + appendToConsole('Restarting process with new configuration...', 'info'); + + try { + await window.electronAPI.stopProcess(); + + await new Promise(resolve => setTimeout(resolve, 1000)); + + await window.electronAPI.startProcess(); + + // Update button states to reflect running process + setProcessRunningState(); + + appendToConsole('Process restarted with new configuration', 'info'); + } catch (error) { + appendToConsole(`Failed to restart process: ${error.message}`, 'stderr'); + // Process is stopped, update button states + setProcessStoppedState(); + } + } } catch (error) { showStatus(`Failed to save configuration: ${error.message}`, 'error'); } @@ -246,4 +301,34 @@ document.getElementById('clear-console').addEventListener('click', () => { // Listen for Python output window.electronAPI.onPythonOutput((data) => { appendToConsole(data.message, data.type); +}); + +document.getElementById('start-process').addEventListener('click', async () => { + setButtonState(startButton, true); + + try { + await window.electronAPI.startProcess(); + setProcessRunningState(); + appendToConsole('Process started successfully', 'info'); + } catch (error) { + appendToConsole(`Failed to start process: ${error.message}`, 'stderr'); + setButtonState(startButton, false); + } +}); + +document.getElementById('stop-process').addEventListener('click', async () => { + setButtonState(stopButton, true); + + try { + const result = await window.electronAPI.stopProcess(); + appendToConsole('Process stop initiated', 'info'); + } catch (error) { + appendToConsole(`Failed to stop process: ${error.message}`, 'stderr'); + setButtonState(stopButton, false); + } +}); + +// Listen for process stopped event +window.electronAPI.onProcessStopped(() => { + setProcessStoppedState(); }); \ No newline at end of file diff --git a/ui/src/components.css b/ui/src/components.css index be046ea..d8d909d 100644 --- a/ui/src/components.css +++ b/ui/src/components.css @@ -42,6 +42,10 @@ .btn-gray { @apply bg-gray-600 text-white hover:bg-gray-700 focus:ring-gray-500; } + + .btn-red { + @apply bg-red-600 text-white hover:bg-red-700 focus:ring-red-500; + } } /* Console styling */ diff --git a/ui_design.md b/ui_design.md index e38c632..06eee65 100644 --- a/ui_design.md +++ b/ui_design.md @@ -26,4 +26,7 @@ npm install --save-dev electron # Get tailwind and deps npm install --save-dev tailwindcss@3 postcss autoprefixer concurrently cross-env npx tailwindcss init -p +# Install vue.js +npm install --save-dev vue@3 @vitejs/plugin-vue vite yaml +npm install --save-dev js-yaml ``` -- cgit v1.2.3