- Hello World! -

- Welcome to your Electron app with Tailwind CSS! -

+ TaSTT + + + +

TaSTT

+ +

+ Model + +

+ Language + +

+ Microphone + +

+ + + + + +

+ +

Compute Settings

+ Compute Type + +

+ GPU Index + +

+ + + Use CPU + +

+ + +

Audio Settings

+ Max Speech Duration (seconds) + +

+ Min Silence Duration (ms) + +

+ Reset After Silence (seconds) + +

+ + +

Performance Settings

+ Transcription Loop Delay (ms) + +

+ + +

Debug/Preview Settings

+ + + Enable Debug Mode + + + + Enable Previews + +

+ + +

Display Settings

+ Block Width + +

+ Number of Blocks + +

+ Rows + +

+ Columns + +

+ + +

+ +

+ + + +

+ + +

Python Output

+ +

- + + diff --git a/ui/index.js b/ui/index.js index 9751fb2..0a7fdf9 100644 --- a/ui/index.js +++ b/ui/index.js @@ -1,10 +1,71 @@ const { app, BrowserWindow, ipcMain } = require('electron'); const path = require('node:path'); +const fs = require('node:fs').promises; +const yaml = require('js-yaml'); +const { spawn } = require('child_process'); + +let mainWindow; + +// Helper function to get the correct Python executable from venv +function getVenvPython() { + const venvPath = path.join(__dirname, '..', 'venv'); + const isWindows = process.platform === 'win32'; + const pythonExecutable = isWindows ? 'python.exe' : 'python'; + const pythonPath = path.join(venvPath, isWindows ? 'Scripts' : 'bin', pythonExecutable); + return pythonPath; +} + +// Helper function to send Python output to renderer +function sendPythonOutput(message, type = 'stdout') { + if (mainWindow && !mainWindow.isDestroyed()) { + mainWindow.webContents.send('python-output', { message, type }); + } +} + +// Helper function to execute Python commands using venv +function executePythonCommand(args, options = {}) { + return new Promise((resolve, reject) => { + const pythonPath = getVenvPython(); + const commandStr = `${path.basename(pythonPath)} ${args.join(' ')}`; + sendPythonOutput(`> ${commandStr}`, 'info'); + + const pythonProcess = spawn(pythonPath, args, options); + + let stdout = ''; + let stderr = ''; + + pythonProcess.stdout.on('data', (data) => { + const text = data.toString(); + stdout += text; + sendPythonOutput(text.trimEnd(), 'stdout'); + }); + + pythonProcess.stderr.on('data', (data) => { + const text = data.toString(); + stderr += text; + sendPythonOutput(text.trimEnd(), 'stderr'); + }); + + pythonProcess.on('error', (error) => { + sendPythonOutput(`Failed to start Python process: ${error.message}`, 'stderr'); + reject({ error: error.message, stdout, stderr }); + }); + + pythonProcess.on('close', (code) => { + if (code !== 0) { + sendPythonOutput(`Process exited with code ${code}`, 'stderr'); + reject({ code, stdout, stderr }); + } else { + resolve({ stdout, stderr }); + } + }); + }); +} function createWindow () { - const mainWindow = new BrowserWindow({ - width: 800, - height: 600, + mainWindow = new BrowserWindow({ + width: 1000, + height: 800, webPreferences: { preload: path.join(__dirname, 'preload.js'), contextIsolation: true, @@ -15,6 +76,94 @@ function createWindow () { mainWindow.loadFile('index.html'); } +// Path to config.yaml (one level up from ui directory) +const configPath = path.join(__dirname, '..', 'config.yaml'); + +// IPC handlers +ipcMain.handle('load-config', async () => { + try { + const fileContent = await fs.readFile(configPath, 'utf8'); + return yaml.load(fileContent); + } catch (error) { + console.error('Error loading config:', error); + throw error; + } +}); + +ipcMain.handle('save-config', async (event, config) => { + try { + const yamlContent = yaml.dump(config, { lineWidth: -1 }); + await fs.writeFile(configPath, yamlContent, 'utf8'); + return { success: true }; + } catch (error) { + console.error('Error saving config:', error); + throw error; + } +}); + +ipcMain.handle('restart-app', () => { + app.relaunch(); + app.exit(); +}); + +ipcMain.handle('install-requirements', async (event) => { + const requirementsPath = path.join(__dirname, '..', 'app', 'requirements.txt'); + + try { + // Check if requirements.txt exists + await fs.access(requirementsPath); + + const result = await executePythonCommand(['-m', 'pip', 'install', '-r', requirementsPath]); + + return { success: true, message: 'Requirements installed successfully' }; + } catch (error) { + console.error('Error installing requirements:', error); + if (error.code === 'ENOENT') { + throw new Error('requirements.txt not found'); + } + throw new Error(`Installation failed: ${error.stderr || error.error || 'Unknown error'}`); + } +}); + +ipcMain.handle('get-microphones', async () => { + const pythonScript = ` +import pyaudio +import json +import sys + +try: + p = pyaudio.PyAudio() + info = p.get_host_api_info_by_index(0) + numdevices = info.get('deviceCount') + + microphones = [] + for i in range(0, numdevices): + device_info = p.get_device_info_by_host_api_device_index(0, i) + if device_info.get('maxInputChannels') > 0: + microphones.append({ + 'index': i, + 'name': device_info.get('name'), + 'defaultSampleRate': device_info.get('defaultSampleRate') + }) + + print(json.dumps(microphones)) + p.terminate() +except Exception as e: + print(json.dumps({'error': str(e)}), file=sys.stderr) + sys.exit(1) +`; + + try { + const result = await executePythonCommand(['-c', pythonScript]); + const microphones = JSON.parse(result.stdout.trim()); + console.log('Successfully retrieved microphones:', microphones); + return microphones; + } catch (error) { + console.error('Failed to get microphones:', error); + throw new Error(`Failed to get microphones: ${error.stderr || error.error || 'Unknown error'}`); + } +}); + app.whenReady().then(() => { createWindow(); diff --git a/ui/package.json b/ui/package.json index 1c56341..fee2d67 100644 --- a/ui/package.json +++ b/ui/package.json @@ -5,20 +5,26 @@ "main": "index.js", "scripts": { "start": "npm run build:css && electron .", - "build:css": "tailwindcss -i ./src/input.css -o ./build/output.css", - "watch:css": "tailwindcss -i ./src/input.css -o ./build/output.css --watch", - "dev": "npm run watch:css & electron .", + "build:css": "tailwindcss -i ./src/components.css -o ./build/output.css", + "watch:css": "tailwindcss -i ./src/components.css -o ./build/output.css --watch", + "dev": "concurrently \"npm run watch:css\" \"electron .\"", "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": [], "author": "yum_food", "license": "MIT", + "dependencies": { + "js-yaml": "^4.1.0" + }, "devDependencies": { + "@vitejs/plugin-vue": "^5.2.4", "autoprefixer": "^10.4.21", "concurrently": "^9.1.2", "cross-env": "^7.0.3", "electron": "^36.3.2", "postcss": "^8.5.4", - "tailwindcss": "^3.4.17" + "tailwindcss": "^3.4.17", + "vite": "^6.3.5", + "vue": "^3.5.16" } } diff --git a/ui/preload.js b/ui/preload.js index 9f87d19..108bffe 100644 --- a/ui/preload.js +++ b/ui/preload.js @@ -1,6 +1,12 @@ const { contextBridge, ipcRenderer } = require('electron'); contextBridge.exposeInMainWorld('electronAPI', { + loadConfig: () => ipcRenderer.invoke('load-config'), + saveConfig: (config) => ipcRenderer.invoke('save-config', config), + restartApp: () => ipcRenderer.invoke('restart-app'), + getMicrophones: () => ipcRenderer.invoke('get-microphones'), + installRequirements: () => ipcRenderer.invoke('install-requirements'), + onPythonOutput: (callback) => ipcRenderer.on('python-output', (event, data) => callback(data)) }); console.log('Preload script loaded.'); diff --git a/ui/renderer.js b/ui/renderer.js new file mode 100644 index 0000000..83c652c --- /dev/null +++ b/ui/renderer.js @@ -0,0 +1,249 @@ +// Handle status messages +function showStatus(message, type = 'info') { + const statusEl = document.getElementById('status-message'); + statusEl.textContent = message; + statusEl.classList.remove('hidden', 'bg-green-100', 'bg-red-100', 'bg-blue-100', 'text-green-800', 'text-red-800', 'text-blue-800'); + + if (type === 'success') { + statusEl.classList.add('bg-green-100', 'text-green-800'); + } else if (type === 'error') { + statusEl.classList.add('bg-red-100', 'text-red-800'); + } else { + statusEl.classList.add('bg-blue-100', 'text-blue-800'); + } + + // Also log to console + appendToConsole(message, type === 'error' ? 'stderr' : 'info'); + + setTimeout(() => { + statusEl.classList.add('hidden'); + }, 5000); +} + +// Get form values +function getFormValues() { + return { + compute_type: document.getElementById('compute_type').value, + enable_debug_mode: document.getElementById('enable_debug_mode').checked ? 1 : 0, + enable_previews: document.getElementById('enable_previews').checked ? 1 : 0, + language: document.getElementById('language').value, + gpu_idx: parseInt(document.getElementById('gpu_idx').value), + max_speech_duration_s: parseInt(document.getElementById('max_speech_duration_s').value), + min_silence_duration_ms: parseInt(document.getElementById('min_silence_duration_ms').value), + microphone: document.getElementById('microphone').value, + model: document.getElementById('model').value, + reset_after_silence_s: parseInt(document.getElementById('reset_after_silence_s').value), + transcription_loop_delay_ms: parseInt(document.getElementById('transcription_loop_delay_ms').value), + use_cpu: document.getElementById('use_cpu').checked ? 1 : 0, + block_width: parseInt(document.getElementById('block_width').value), + num_blocks: parseInt(document.getElementById('num_blocks').value), + rows: parseInt(document.getElementById('rows').value), + cols: parseInt(document.getElementById('cols').value) + }; +} + +// Add a flag to prevent auto-save during programmatic updates +let isSettingValues = false; + +// Set form values +function setFormValues(config) { + isSettingValues = true; // Disable auto-save temporarily + + document.getElementById('compute_type').value = config.compute_type || 'int8'; + document.getElementById('enable_debug_mode').checked = config.enable_debug_mode === 1; + document.getElementById('enable_previews').checked = config.enable_previews === 1; + document.getElementById('language').value = config.language || 'english'; + document.getElementById('gpu_idx').value = config.gpu_idx || 0; + document.getElementById('max_speech_duration_s').value = config.max_speech_duration_s || 10; + document.getElementById('min_silence_duration_ms').value = config.min_silence_duration_ms || 250; + document.getElementById('microphone').value = config.microphone || 'motu'; + document.getElementById('model').value = config.model || 'turbo'; + document.getElementById('reset_after_silence_s').value = config.reset_after_silence_s || 15; + document.getElementById('transcription_loop_delay_ms').value = config.transcription_loop_delay_ms || 100; + document.getElementById('use_cpu').checked = config.use_cpu === 1; + document.getElementById('block_width').value = config.block_width || 2; + document.getElementById('num_blocks').value = config.num_blocks || 40; + document.getElementById('rows').value = config.rows || 10; + document.getElementById('cols').value = config.cols || 24; + + isSettingValues = false; // Re-enable auto-save +} + +// Toggle advanced settings +document.getElementById('toggle-advanced').addEventListener('click', () => { + const advancedSettings = document.getElementById('advanced-settings'); + const chevron = document.getElementById('chevron'); + + if (advancedSettings.classList.contains('hidden')) { + advancedSettings.classList.remove('hidden'); + chevron.classList.add('rotate-90'); + } else { + advancedSettings.classList.add('hidden'); + chevron.classList.remove('rotate-90'); + } +}); + +// Simplify button handlers by extracting common patterns +async function handleAsyncAction(actionName, actionFn) { + try { + const result = await actionFn(); + if (result && result.message) { + showStatus(result.message, 'success'); + } + return result; + } catch (error) { + showStatus(`${actionName} failed: ${error.message}`, 'error'); + throw error; + } +} + +// Auto-save functionality with debouncing +let saveTimeout; +const SAVE_DELAY = 500; // milliseconds + +async function autoSaveConfig() { + if (isSettingValues) return; // Don't save during programmatic updates + + clearTimeout(saveTimeout); + saveTimeout = setTimeout(async () => { + try { + const config = getFormValues(); + await window.electronAPI.saveConfig(config); + showStatus('Configuration saved', 'success'); + } catch (error) { + showStatus(`Failed to save configuration: ${error.message}`, 'error'); + } + }, SAVE_DELAY); +} + +// Add event listeners to all form inputs for auto-save +function setupAutoSave() { + // Get all form inputs + const form = document.getElementById('config-form'); + const inputs = form.querySelectorAll('input, select'); + + // Add change listener to each input + inputs.forEach(input => { + if (input.type === 'checkbox') { + input.addEventListener('change', autoSaveConfig); + } else if (input.type === 'number' || input.type === 'text') { + input.addEventListener('input', autoSaveConfig); + } else if (input.tagName === 'SELECT') { + input.addEventListener('change', autoSaveConfig); + } + }); +} + +// Update the setup-venv handler +document.getElementById('setup-venv').addEventListener('click', async () => { + const setupButton = document.getElementById('setup-venv'); + setupButton.disabled = true; + setupButton.classList.add('opacity-50', 'cursor-not-allowed'); + + try { + await handleAsyncAction('Install requirements', async () => { + return await window.electronAPI.installRequirements(); + }); + // Reload microphones after successful installation + await loadMicrophones(); + } finally { + setupButton.disabled = false; + setupButton.classList.remove('opacity-50', 'cursor-not-allowed'); + } +}); + +// Simplified microphone loading +async function loadMicrophones() { + const microphoneSelect = document.getElementById('microphone'); + + try { + appendToConsole('Loading available microphones...', 'info'); + const microphones = await window.electronAPI.getMicrophones(); + + microphoneSelect.innerHTML = ''; + + if (microphones.length === 0) { + microphoneSelect.innerHTML = ''; + appendToConsole('No microphones found', 'stderr'); + return; + } + + appendToConsole(`Found ${microphones.length} microphone(s)`, 'info'); + microphones.forEach(mic => { + const option = document.createElement('option'); + option.value = mic.index.toString(); + option.textContent = mic.name; + microphoneSelect.appendChild(option); + appendToConsole(` - ${mic.name} (Device ${mic.index})`, 'stdout'); + }); + + // Restore previously selected microphone if possible + try { + const config = await window.electronAPI.loadConfig(); + if (config.microphone) { + microphoneSelect.value = config.microphone; + } + } catch (error) { + // Ignore config load errors here + } + + } catch (error) { + appendToConsole(`Failed to load microphones: ${error.message}`, 'stderr'); + microphoneSelect.innerHTML = ''; + } +} + +// Update window load to include auto-save setup +window.addEventListener('load', async () => { + appendToConsole('TaSTT Configuration UI initialized', 'info'); + + // Load config first + try { + const config = await window.electronAPI.loadConfig(); + setFormValues(config); + appendToConsole('Configuration loaded', 'info'); + } catch (error) { + appendToConsole(`Failed to load configuration: ${error.message}`, 'stderr'); + } + + // Load microphones + await loadMicrophones(); + + // Set up auto-save after everything is loaded + setupAutoSave(); +}); + +// Console management +const consoleContent = document.getElementById('console-content'); + +function appendToConsole(message, type = 'stdout') { + const timestamp = new Date().toLocaleTimeString(); + const timestampSpan = document.createElement('span'); + timestampSpan.className = 'console-timestamp'; + timestampSpan.textContent = `[${timestamp}] `; + + const messageSpan = document.createElement('span'); + messageSpan.className = `console-${type}`; + messageSpan.textContent = message; + + const lineDiv = document.createElement('div'); + lineDiv.appendChild(timestampSpan); + lineDiv.appendChild(messageSpan); + + consoleContent.appendChild(lineDiv); + + // Auto-scroll to bottom + const pythonConsole = document.getElementById('python-console'); + pythonConsole.scrollTop = pythonConsole.scrollHeight; +} + +// Clear console button +document.getElementById('clear-console').addEventListener('click', () => { + consoleContent.innerHTML = ''; + appendToConsole('Console cleared', 'info'); +}); + +// Listen for Python output +window.electronAPI.onPythonOutput((data) => { + appendToConsole(data.message, data.type); +}); \ No newline at end of file diff --git a/ui/src/components.css b/ui/src/components.css new file mode 100644 index 0000000..be046ea --- /dev/null +++ b/ui/src/components.css @@ -0,0 +1,110 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +@layer components { + .config-section { + @apply bg-white rounded-lg shadow-md p-6; + } + + .section-title { + @apply text-xl font-semibold text-gray-700 mb-4; + } + + .form-label { + @apply block text-sm font-medium text-gray-700 mb-2; + } + + .form-input { + @apply w-full px-3 py-2 border border-gray-300 rounded-md shadow-sm focus:outline-none focus:ring-blue-500 focus:border-blue-500 sm:text-sm; + } + + .checkbox-label { + @apply flex items-center cursor-pointer hover:bg-gray-50 p-2 rounded; + } + + .checkbox-text { + @apply text-sm text-gray-700; + } + + .btn { + @apply px-4 py-2 font-medium text-sm rounded-md transition-colors focus:outline-none focus:ring-2 focus:ring-offset-2; + } + + .btn-blue { + @apply bg-blue-600 text-white hover:bg-blue-700 focus:ring-blue-500; + } + + .btn-green { + @apply bg-green-600 text-white hover:bg-green-700 focus:ring-green-500; + } + + .btn-gray { + @apply bg-gray-600 text-white hover:bg-gray-700 focus:ring-gray-500; + } +} + +/* Console styling */ +#python-console { + background-color: #1a1a1a; + font-family: 'Consolas', 'Monaco', 'Courier New', monospace; + line-height: 1.4; +} + +#console-content { + word-wrap: break-word; +} + +/* Console text colors */ +.console-stdout { + color: #a8cc8c; +} + +.console-stderr { + color: #e88388; +} + +.console-info { + color: #66c2cd; +} + +.console-timestamp { + color: #6c7986; + font-size: 0.875rem; +} + +/* Ensure full height layout */ +html, body { + height: 100%; + margin: 0; + padding: 0; +} + +.container-fluid { + max-width: 100%; + height: 100vh; +} + +/* Scrollbar styling for console */ +#python-console::-webkit-scrollbar { + width: 8px; +} + +#python-console::-webkit-scrollbar-track { + background: #2a2a2a; +} + +#python-console::-webkit-scrollbar-thumb { + background: #4a4a4a; + border-radius: 4px; +} + +#python-console::-webkit-scrollbar-thumb:hover { + background: #5a5a5a; +} + +/* Ensure buttons have proper disabled states */ +button:disabled { + cursor: not-allowed; + opacity: 0.5; +} diff --git a/ui/tailwind.config.js b/ui/tailwind.config.js index fa93053..804b7f0 100644 --- a/ui/tailwind.config.js +++ b/ui/tailwind.config.js @@ -1,8 +1,9 @@ /** @type {import('tailwindcss').Config} */ module.exports = { content: [ - "./index.html", - "./src/**/*.{html,js}", + "./*.html", + "./*.js", + "./src/**/*.{html,js}" ], theme: { extend: {}, -- cgit v1.2.3 From f97cef182de55b6dbae8d2bc0477acfca6cc1f66 Mon Sep 17 00:00:00 2001 From: yum Date: Thu, 29 May 2025 19:45:48 -0700 Subject: More UI work 1. main STT app works in new project structure 2. UI dumps mics on startup to populate mic list 3. add missing deps (hf-xet, wave) 4. normalize audio volume when transcribing. Probably still wrong tbqh. 5. add checkbox to save audio segments & improve logic so only segments with speech get saved. 6. add default config settings --- app/hi.py | 7 +- app/list_microphones.py | 24 ++++++ app/requirements.txt | 3 +- app/stt.py | 55 ++++++++++++-- app/vad.py | 3 +- config.yaml | 7 +- ui/index.html | 27 ++++--- ui/index.js | 196 ++++++++++++++++++++++++++++++++++++++---------- ui/preload.js | 5 +- ui/renderer.js | 87 ++++++++++++++++++++- ui/src/components.css | 4 + ui_design.md | 3 + 12 files changed, 355 insertions(+), 66 deletions(-) create mode 100644 app/list_microphones.py (limited to 'ui') diff --git a/app/hi.py b/app/hi.py index 0129958..0d80b9d 100644 --- a/app/hi.py +++ b/app/hi.py @@ -2,6 +2,7 @@ import app_config import argparse from math import floor, ceil import msvcrt +import os from pythonosc import udp_client import sentencepiece as spm from shared_thread_data import SharedThreadData @@ -15,8 +16,11 @@ TESTS_ENABLED = True # 0 = quiet, 1 = verbose, 2 = very verbose LOG_LEVEL = 0 +APP_ROOT = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.dirname(APP_ROOT) + def get_tokenizer(): - model_path = "./custom_unigram_tokenizer_65k/unigram.model" + model_path = os.path.join(PROJECT_ROOT, "custom_unigram_tokenizer_65k", "unigram.model") print(f"Loading SentencePiece tokenizer from: {model_path}") sp = spm.SentencePieceProcessor() sp.load(model_path) @@ -346,7 +350,6 @@ if __name__ == "__main__": time.sleep(0.1) continue - try: char = char_bytes.decode('utf-8') if char == '\r' or char == '\n': diff --git a/app/list_microphones.py b/app/list_microphones.py new file mode 100644 index 0000000..a6b1f36 --- /dev/null +++ b/app/list_microphones.py @@ -0,0 +1,24 @@ +import pyaudio +import json +import sys + +try: + p = pyaudio.PyAudio() + info = p.get_host_api_info_by_index(0) + numdevices = info.get('deviceCount') + + microphones = [] + for i in range(0, numdevices): + device_info = p.get_device_info_by_host_api_device_index(0, i) + if device_info.get('maxInputChannels') > 0: + microphones.append({ + 'index': i, + 'name': device_info.get('name'), + 'defaultSampleRate': device_info.get('defaultSampleRate') + }) + + print(json.dumps(microphones)) + p.terminate() +except Exception as e: + print(json.dumps({'error': str(e)}), file=sys.stderr) + sys.exit(1) \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt index 4e79312..07f94cd 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,7 +1,8 @@ faster-whisper +hf-xet langcodes pyaudio pydub python-osc sentencepiece - +wave diff --git a/app/stt.py b/app/stt.py index 34ef2e9..c157f6d 100644 --- a/app/stt.py +++ b/app/stt.py @@ -1,3 +1,4 @@ +from datetime import datetime from faster_whisper import WhisperModel import langcodes import numpy as np @@ -9,6 +10,11 @@ import sys import time import typing import vad +import wave + + +APP_ROOT = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.dirname(APP_ROOT) class AudioStream(): FORMAT = pyaudio.paInt16 @@ -242,6 +248,26 @@ class NormalizingAudioCollector(AudioCollectorFilter): return frames +class BoostingAudioCollector(AudioCollectorFilter): + def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict): + AudioCollectorFilter.__init__(self, parent) + self.target_dBFS = target_dBFS + self.cfg = cfg + + def getAudio(self) -> bytes: + audio = self.parent.getAudio() + + audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ, + frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS) + if self.cfg["enable_debug_mode"]: + print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr) + audio = audio.apply_gain(self.target_dBFS - audio.dBFS) + + frames = np.array(audio.get_array_of_samples()) + frames = np.int16(frames).tobytes() + + return frames + class CompressingAudioCollector(AudioCollectorFilter): def __init__(self, parent: AudioCollector): AudioCollectorFilter.__init__(self, parent) @@ -441,6 +467,16 @@ class TranscriptCommit: self.duration_s = duration_s +def saveAudio(audio: bytes, path: str, cfg: typing.Dict): + with wave.open(path, 'wb') as wf: + if cfg["enable_debug_mode"]: + print(f"Saving audio to {path}", file=sys.stderr) + wf.setnchannels(AudioStream.CHANNELS) + wf.setsampwidth(AudioStream.FRAME_SZ) + wf.setframerate(AudioStream.FPS) + wf.writeframes(audio) + + class VadCommitter: def __init__(self, cfg: typing.Dict, @@ -463,7 +499,6 @@ class VadCommitter: start_ts = self.collector.begin() if has_audio and stable_cutoff: - #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() duration_s = stable_cutoff / AudioStream.FPS start_ts = self.collector.begin() @@ -475,12 +510,16 @@ class VadCommitter: if self.cfg["enable_debug_mode"]: for s in segments: print(f"commit segment: {s}", file=sys.stderr) - print(f"delta get: {delta}", file=sys.stderr) + if len(delta) > 0: + print(f"delta get: {delta}", file=sys.stderr) - if False: + if self.cfg["save_audio"] and len(delta) > 0: ts = datetime.fromtimestamp(self.collector.now() - latency_s) filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav" - saveAudio(commit_audio, filename) + audio_dir = os.path.join(PROJECT_ROOT, "audio") + if not os.path.exists(audio_dir): + os.makedirs(audio_dir) + saveAudio(commit_audio, os.path.join(audio_dir, filename), self.cfg) preview = "" if self.cfg["enable_previews"] and has_audio: @@ -488,7 +527,6 @@ class VadCommitter: preview = "".join(s.transcript for s in segments) if not has_audio: - #print("VAD detects no audio, skip transcription", file=sys.stderr) self.collector.keepLast(1.0) return TranscriptCommit( @@ -504,8 +542,9 @@ def transcriptionThread(shared_data: SharedThreadData): stream = MicStream(shared_data.cfg["microphone"]) collector = AudioCollector(stream) - collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) + collector = BoostingAudioCollector(collector, 0.0, shared_data.cfg) whisper = Whisper(collector, shared_data.cfg) segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"], max_speech_s=shared_data.cfg["max_speech_duration_s"]) @@ -552,13 +591,13 @@ def transcriptionThread(shared_data: SharedThreadData): preview = commit.preview try: - print(f"Transcript: {transcript}") + print(f"Transcript: {transcript}", flush=True) except UnicodeEncodeError: print("Failed to encode transcript - discarding delta", file=sys.stderr) continue try: - print(f"Preview: {preview}") + print(f"Preview: {preview}", flush=True) except UnicodeEncodeError: print("Failed to encode preview - discarding", file=sys.stderr) diff --git a/app/vad.py b/app/vad.py index 10a72d3..1dea765 100644 --- a/app/vad.py +++ b/app/vad.py @@ -259,7 +259,8 @@ def get_vad_model(): """Returns the VAD model instance.""" abspath = os.path.abspath(__file__) my_dir = os.path.dirname(abspath) - path = os.path.join(my_dir, "Models/silero_vad.onnx") + parent_dir = os.path.dirname(my_dir) + path = os.path.join(parent_dir, "Models", "silero_vad.onnx") return SileroVADModel(path) diff --git a/config.yaml b/config.yaml index 164b4e6..34d88f1 100644 --- a/config.yaml +++ b/config.yaml @@ -1,18 +1,17 @@ -compute_type: int8 +compute_type: float16 enable_debug_mode: 0 enable_previews: 1 +save_audio: 0 language: english gpu_idx: 0 max_speech_duration_s: 10 min_silence_duration_ms: 250 -microphone: motu +microphone: 0 model: turbo reset_after_silence_s: 15 transcription_loop_delay_ms: 100 use_cpu: 0 - block_width: 2 num_blocks: 40 rows: 10 cols: 24 - diff --git a/ui/index.html b/ui/index.html index 14cc354..b06e56b 100644 --- a/ui/index.html +++ b/ui/index.html @@ -8,11 +8,9 @@

TaSTT

- -

+ +

@@ -156,9 +158,17 @@

- +

+ + + +

@@ -167,9 +177,8 @@

Python Output

diff --git a/ui/index.js b/ui/index.js index 0a7fdf9..a056156 100644 --- a/ui/index.js +++ b/ui/index.js @@ -4,14 +4,16 @@ const fs = require('node:fs').promises; const yaml = require('js-yaml'); const { spawn } = require('child_process'); +const APP_ROOT = path.join(__dirname, '..'); +const CONFIG_PATH = path.join(APP_ROOT, 'config.yaml'); + let mainWindow; +let runningProcess = null; // Track the running Python process // Helper function to get the correct Python executable from venv function getVenvPython() { - const venvPath = path.join(__dirname, '..', 'venv'); - const isWindows = process.platform === 'win32'; - const pythonExecutable = isWindows ? 'python.exe' : 'python'; - const pythonPath = path.join(venvPath, isWindows ? 'Scripts' : 'bin', pythonExecutable); + const venvPath = path.join(APP_ROOT, 'venv'); + const pythonPath = path.join(venvPath, 'Scripts', 'python.exe'); return pythonPath; } @@ -29,7 +31,17 @@ function executePythonCommand(args, options = {}) { const commandStr = `${path.basename(pythonPath)} ${args.join(' ')}`; sendPythonOutput(`> ${commandStr}`, 'info'); - const pythonProcess = spawn(pythonPath, args, options); + // Add dll directory to PATH for Windows DLL loading + const dllPath = path.join(APP_ROOT, 'dll'); + const env = { ...process.env }; + env.PATH = `${dllPath};${env.PATH}`; + + const spawnOptions = { + ...options, + env + }; + + const pythonProcess = spawn(pythonPath, args, spawnOptions); let stdout = ''; let stderr = ''; @@ -76,15 +88,47 @@ function createWindow () { mainWindow.loadFile('index.html'); } -// Path to config.yaml (one level up from ui directory) -const configPath = path.join(__dirname, '..', 'config.yaml'); +// Default configuration based on user's current config.yaml +const DEFAULT_CONFIG = { + compute_type: 'float16', + enable_debug_mode: 0, + enable_previews: 1, + save_audio: 0, + language: 'english', + gpu_idx: 0, + max_speech_duration_s: 10, + min_silence_duration_ms: 250, + microphone: 0, + model: 'turbo', + reset_after_silence_s: 15, + transcription_loop_delay_ms: 100, + use_cpu: 0, + block_width: 2, + num_blocks: 40, + rows: 10, + cols: 24 +}; // IPC handlers ipcMain.handle('load-config', async () => { try { - const fileContent = await fs.readFile(configPath, 'utf8'); + const fileContent = await fs.readFile(CONFIG_PATH, 'utf8'); return yaml.load(fileContent); } catch (error) { + if (error.code === 'ENOENT') { + // Config file doesn't exist, create it with defaults + console.log('Config file not found, creating with defaults...'); + try { + const yamlContent = yaml.dump(DEFAULT_CONFIG, { lineWidth: -1 }); + await fs.writeFile(CONFIG_PATH, yamlContent, 'utf8'); + console.log('Created config.yaml with default values'); + return DEFAULT_CONFIG; + } catch (writeError) { + console.error('Error creating default config:', writeError); + // Return defaults even if we can't write the file + return DEFAULT_CONFIG; + } + } console.error('Error loading config:', error); throw error; } @@ -93,7 +137,7 @@ ipcMain.handle('load-config', async () => { ipcMain.handle('save-config', async (event, config) => { try { const yamlContent = yaml.dump(config, { lineWidth: -1 }); - await fs.writeFile(configPath, yamlContent, 'utf8'); + await fs.writeFile(CONFIG_PATH, yamlContent, 'utf8'); return { success: true }; } catch (error) { console.error('Error saving config:', error); @@ -107,7 +151,7 @@ ipcMain.handle('restart-app', () => { }); ipcMain.handle('install-requirements', async (event) => { - const requirementsPath = path.join(__dirname, '..', 'app', 'requirements.txt'); + const requirementsPath = path.join(APP_ROOT, 'app', 'requirements.txt'); try { // Check if requirements.txt exists @@ -126,35 +170,10 @@ ipcMain.handle('install-requirements', async (event) => { }); ipcMain.handle('get-microphones', async () => { - const pythonScript = ` -import pyaudio -import json -import sys - -try: - p = pyaudio.PyAudio() - info = p.get_host_api_info_by_index(0) - numdevices = info.get('deviceCount') - - microphones = [] - for i in range(0, numdevices): - device_info = p.get_device_info_by_host_api_device_index(0, i) - if device_info.get('maxInputChannels') > 0: - microphones.append({ - 'index': i, - 'name': device_info.get('name'), - 'defaultSampleRate': device_info.get('defaultSampleRate') - }) - - print(json.dumps(microphones)) - p.terminate() -except Exception as e: - print(json.dumps({'error': str(e)}), file=sys.stderr) - sys.exit(1) -`; - + const scriptPath = path.join(APP_ROOT, 'app', 'list_microphones.py'); + try { - const result = await executePythonCommand(['-c', pythonScript]); + const result = await executePythonCommand([scriptPath]); const microphones = JSON.parse(result.stdout.trim()); console.log('Successfully retrieved microphones:', microphones); return microphones; @@ -164,6 +183,105 @@ except Exception as e: } }); +// Add handlers for starting and stopping the process +ipcMain.handle('start-process', async () => { + if (runningProcess) { + throw new Error('Process is already running'); + } + + const scriptPath = path.join(APP_ROOT, 'app', 'hi.py'); + const configPath = CONFIG_PATH; + + try { + const pythonPath = getVenvPython(); + const args = [scriptPath, '--config', configPath]; + + sendPythonOutput(`Starting process: ${path.basename(pythonPath)} ${args.join(' ')}`, 'info'); + + // Add dll directory to PATH for Windows DLL loading + const dllPath = path.join(APP_ROOT, 'dll'); + const env = { ...process.env }; + env.PATH = `${dllPath};${env.PATH}`; + + runningProcess = spawn(pythonPath, args, { env }); + + runningProcess.stdout.on('data', (data) => { + const text = data.toString(); + sendPythonOutput(text.trimEnd(), 'stdout'); + }); + + runningProcess.stderr.on('data', (data) => { + const text = data.toString(); + sendPythonOutput(text.trimEnd(), 'stderr'); + }); + + runningProcess.on('error', (error) => { + sendPythonOutput(`Process error: ${error.message}`, 'stderr'); + runningProcess = null; + if (mainWindow && !mainWindow.isDestroyed()) { + mainWindow.webContents.send('process-stopped'); + } + }); + + runningProcess.on('close', (code) => { + sendPythonOutput(`Process exited with code ${code}`, 'info'); + runningProcess = null; + if (mainWindow && !mainWindow.isDestroyed()) { + mainWindow.webContents.send('process-stopped'); + } + }); + + return { success: true }; + } catch (error) { + runningProcess = null; + throw error; + } +}); + +ipcMain.handle('stop-process', async () => { + if (!runningProcess) { + throw new Error('No process is running'); + } + + return new Promise((resolve, reject) => { + let forcefullyKilled = false; + + // Set up a timeout to force kill after 10 seconds + const killTimeout = setTimeout(() => { + if (runningProcess) { + sendPythonOutput('Process did not stop gracefully, forcing termination...', 'stderr'); + forcefullyKilled = true; + runningProcess.kill(); + } + }, 10000); + + // Listen for the process to exit + runningProcess.once('exit', (code, signal) => { + clearTimeout(killTimeout); + runningProcess = null; + + if (forcefullyKilled) { + sendPythonOutput('Process forcefully terminated', 'info'); + } else { + sendPythonOutput('Process stopped gracefully', 'info'); + } + + resolve({ success: true, forcefullyKilled }); + }); + + // Send termination signal + sendPythonOutput('Stopping process gracefully...', 'info'); + runningProcess.kill(); + }); +}); + +// Clean up on app quit +app.on('before-quit', () => { + if (runningProcess) { + runningProcess.kill(); + } +}); + app.whenReady().then(() => { createWindow(); @@ -173,6 +291,6 @@ app.whenReady().then(() => { }); app.on('window-all-closed', function () { - if (process.platform !== 'darwin') app.quit(); + app.quit(); }); diff --git a/ui/preload.js b/ui/preload.js index 108bffe..e6c0623 100644 --- a/ui/preload.js +++ b/ui/preload.js @@ -6,7 +6,10 @@ contextBridge.exposeInMainWorld('electronAPI', { restartApp: () => ipcRenderer.invoke('restart-app'), getMicrophones: () => ipcRenderer.invoke('get-microphones'), installRequirements: () => ipcRenderer.invoke('install-requirements'), - onPythonOutput: (callback) => ipcRenderer.on('python-output', (event, data) => callback(data)) + startProcess: () => ipcRenderer.invoke('start-process'), + stopProcess: () => ipcRenderer.invoke('stop-process'), + onPythonOutput: (callback) => ipcRenderer.on('python-output', (event, data) => callback(data)), + onProcessStopped: (callback) => ipcRenderer.on('process-stopped', (event) => callback()) }); console.log('Preload script loaded.'); diff --git a/ui/renderer.js b/ui/renderer.js index 83c652c..b3f05a6 100644 --- a/ui/renderer.js +++ b/ui/renderer.js @@ -22,15 +22,20 @@ function showStatus(message, type = 'info') { // Get form values function getFormValues() { + const microphoneValue = document.getElementById('microphone').value; + // Convert to number if it's a numeric string (device index) + const microphoneForConfig = /^\d+$/.test(microphoneValue) ? parseInt(microphoneValue) : microphoneValue; + return { compute_type: document.getElementById('compute_type').value, enable_debug_mode: document.getElementById('enable_debug_mode').checked ? 1 : 0, enable_previews: document.getElementById('enable_previews').checked ? 1 : 0, + save_audio: document.getElementById('save_audio').checked ? 1 : 0, language: document.getElementById('language').value, gpu_idx: parseInt(document.getElementById('gpu_idx').value), max_speech_duration_s: parseInt(document.getElementById('max_speech_duration_s').value), min_silence_duration_ms: parseInt(document.getElementById('min_silence_duration_ms').value), - microphone: document.getElementById('microphone').value, + microphone: microphoneForConfig, model: document.getElementById('model').value, reset_after_silence_s: parseInt(document.getElementById('reset_after_silence_s').value), transcription_loop_delay_ms: parseInt(document.getElementById('transcription_loop_delay_ms').value), @@ -52,6 +57,7 @@ function setFormValues(config) { document.getElementById('compute_type').value = config.compute_type || 'int8'; document.getElementById('enable_debug_mode').checked = config.enable_debug_mode === 1; document.getElementById('enable_previews').checked = config.enable_previews === 1; + document.getElementById('save_audio').checked = config.save_audio === 1; document.getElementById('language').value = config.language || 'english'; document.getElementById('gpu_idx').value = config.gpu_idx || 0; document.getElementById('max_speech_duration_s').value = config.max_speech_duration_s || 10; @@ -97,6 +103,30 @@ async function handleAsyncAction(actionName, actionFn) { } } +// Process control buttons +const startButton = document.getElementById('start-process'); +const stopButton = document.getElementById('stop-process'); + +// Helper functions for button state management +function setButtonState(button, disabled) { + button.disabled = disabled; + if (disabled) { + button.classList.add('opacity-50', 'cursor-not-allowed'); + } else { + button.classList.remove('opacity-50', 'cursor-not-allowed'); + } +} + +function setProcessRunningState() { + setButtonState(startButton, true); + setButtonState(stopButton, false); +} + +function setProcessStoppedState() { + setButtonState(startButton, false); + setButtonState(stopButton, true); +} + // Auto-save functionality with debouncing let saveTimeout; const SAVE_DELAY = 500; // milliseconds @@ -110,6 +140,31 @@ async function autoSaveConfig() { const config = getFormValues(); await window.electronAPI.saveConfig(config); showStatus('Configuration saved', 'success'); + + // Check if process is running (stop button is enabled means process is running) + const stopButton = document.getElementById('stop-process'); + + if (!stopButton.disabled) { + // Process is running, restart it with new config + appendToConsole('Restarting process with new configuration...', 'info'); + + try { + await window.electronAPI.stopProcess(); + + await new Promise(resolve => setTimeout(resolve, 1000)); + + await window.electronAPI.startProcess(); + + // Update button states to reflect running process + setProcessRunningState(); + + appendToConsole('Process restarted with new configuration', 'info'); + } catch (error) { + appendToConsole(`Failed to restart process: ${error.message}`, 'stderr'); + // Process is stopped, update button states + setProcessStoppedState(); + } + } } catch (error) { showStatus(`Failed to save configuration: ${error.message}`, 'error'); } @@ -246,4 +301,34 @@ document.getElementById('clear-console').addEventListener('click', () => { // Listen for Python output window.electronAPI.onPythonOutput((data) => { appendToConsole(data.message, data.type); +}); + +document.getElementById('start-process').addEventListener('click', async () => { + setButtonState(startButton, true); + + try { + await window.electronAPI.startProcess(); + setProcessRunningState(); + appendToConsole('Process started successfully', 'info'); + } catch (error) { + appendToConsole(`Failed to start process: ${error.message}`, 'stderr'); + setButtonState(startButton, false); + } +}); + +document.getElementById('stop-process').addEventListener('click', async () => { + setButtonState(stopButton, true); + + try { + const result = await window.electronAPI.stopProcess(); + appendToConsole('Process stop initiated', 'info'); + } catch (error) { + appendToConsole(`Failed to stop process: ${error.message}`, 'stderr'); + setButtonState(stopButton, false); + } +}); + +// Listen for process stopped event +window.electronAPI.onProcessStopped(() => { + setProcessStoppedState(); }); \ No newline at end of file diff --git a/ui/src/components.css b/ui/src/components.css index be046ea..d8d909d 100644 --- a/ui/src/components.css +++ b/ui/src/components.css @@ -42,6 +42,10 @@ .btn-gray { @apply bg-gray-600 text-white hover:bg-gray-700 focus:ring-gray-500; } + + .btn-red { + @apply bg-red-600 text-white hover:bg-red-700 focus:ring-red-500; + } } /* Console styling */ diff --git a/ui_design.md b/ui_design.md index e38c632..06eee65 100644 --- a/ui_design.md +++ b/ui_design.md @@ -26,4 +26,7 @@ npm install --save-dev electron # Get tailwind and deps npm install --save-dev tailwindcss@3 postcss autoprefixer concurrently cross-env npx tailwindcss init -p +# Install vue.js +npm install --save-dev vue@3 @vitejs/plugin-vue vite yaml +npm install --save-dev js-yaml ``` -- cgit v1.2.3 From e1b3f638a1ea448de9691f69eb62ebf4c3944c9f Mon Sep 17 00:00:00 2001 From: yum Date: Fri, 30 May 2025 02:50:55 -0700 Subject: More polish - Filters actually get applied now, huge accuracy boost - Use silero-vad python library instead of rolling our own - Expose prompt parameter - Auto setup venv on launch - Clean up python output - Auto acquire all dependencies on launch - Add icon --- .cursorignore | 2 + .gitignore | 2 +- Images/favicon.ico | Bin 0 -> 92015 bytes app/hi.py | 12 +- app/requirements.txt | 2 +- app/stt.py | 128 +++++++++--- app/vad.py | 314 ---------------------------- config.yaml | 1 + ui/index.html | 336 +++++++++++++++++------------- ui/index.js | 382 +++++++++++++++++++++++++++++----- ui/package.json | 76 ++++++- ui/preload.js | 7 +- ui/renderer.js | 564 +++++++++++++++++++++++++++++++------------------- ui/src/components.css | 8 + ui_design.md | 9 +- 15 files changed, 1085 insertions(+), 758 deletions(-) create mode 100644 .cursorignore create mode 100644 Images/favicon.ico delete mode 100644 app/vad.py (limited to 'ui') diff --git a/.cursorignore b/.cursorignore new file mode 100644 index 0000000..a8f4624 --- /dev/null +++ b/.cursorignore @@ -0,0 +1,2 @@ +**/node_modules +**/site-packages \ No newline at end of file diff --git a/.gitignore b/.gitignore index a102cf0..d3886ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ .*.sw[po] *.meta - +.venv_is_set_up diff --git a/Images/favicon.ico b/Images/favicon.ico new file mode 100644 index 0000000..25ea9ac Binary files /dev/null and b/Images/favicon.ico differ diff --git a/app/hi.py b/app/hi.py index 0d80b9d..e6877ff 100644 --- a/app/hi.py +++ b/app/hi.py @@ -330,10 +330,11 @@ if __name__ == "__main__": cli_args = parse_args() cfg = app_config.getConfig(cli_args.config) shared_data = SharedThreadData(cfg) - osc_thread = threading.Thread( - target=osc_thread, - args=(shared_data,)) - osc_thread.start() + if False: + osc_thread = threading.Thread( + target=osc_thread, + args=(shared_data,)) + osc_thread.start() transcribe_thread = threading.Thread( target=stt.transcriptionThread, @@ -382,6 +383,7 @@ if __name__ == "__main__": local_word = shared_data.word print(local_word + "_") shared_data.exit_event.set() - osc_thread.join() + if False: + osc_thread.join() transcribe_thread.join() diff --git a/app/requirements.txt b/app/requirements.txt index 07f94cd..f8b7069 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -5,4 +5,4 @@ pyaudio pydub python-osc sentencepiece -wave +silero-vad diff --git a/app/stt.py b/app/stt.py index c157f6d..7d76333 100644 --- a/app/stt.py +++ b/app/stt.py @@ -6,10 +6,10 @@ import os import pyaudio from pydub import AudioSegment from shared_thread_data import SharedThreadData +from silero_vad import load_silero_vad, get_speech_timestamps import sys import time import typing -import vad import wave @@ -33,7 +33,7 @@ class AudioStream(): class MicStream(AudioStream): CHUNK_SZ = 1024 - def __init__(self, which_mic: str): + def __init__(self, cfg: typing.Dict): self.p = pyaudio.PyAudio() self.stream = None self.sample_rate = None @@ -45,8 +45,11 @@ class MicStream(AudioStream): # If set, incoming frames are simply discarded. self.paused = False - print(f"Finding mic {which_mic}", file=sys.stderr) - self.dumpMicDevices() + which_mic = cfg["microphone"] + + if cfg["enable_debug_mode"]: + print(f"Finding mic {which_mic}", file=sys.stderr) + self.dumpMicDevices() got_match = False device_index = -1 @@ -59,8 +62,9 @@ class MicStream(AudioStream): elif which_mic == "beyond": target_str = "Microphone (Beyond)" else: - print(f"Mic {which_mic} requested, treating it as a numerical " + - "device ID", file=sys.stderr) + if cfg["enable_debug_mode"]: + print(f"Mic {which_mic} requested, treating it as a numerical " + + "device ID", file=sys.stderr) device_index = int(which_mic) got_match = True if not got_match: @@ -79,9 +83,11 @@ class MicStream(AudioStream): raise KeyError(f"Mic {which_mic} not found") info = self.p.get_device_info_by_host_api_device_index(0, device_index) - print(f"Found mic {which_mic}: {info['name']}", file=sys.stderr) + if cfg["enable_debug_mode"]: + print(f"Found mic {which_mic}: {info['name']}", file=sys.stderr) self.sample_rate = int(info['defaultSampleRate']) - print(f"Mic sample rate: {self.sample_rate}", file=sys.stderr) + if cfg["enable_debug_mode"]: + print(f"Mic sample rate: {self.sample_rate}", file=sys.stderr) self.stream = self.p.open( rate=self.sample_rate, @@ -289,19 +295,40 @@ class AudioSegmenter: def __init__(self, min_silence_ms=250, max_speech_s=5): - self.vad_options = vad.VadOptions( - min_silence_duration_ms=min_silence_ms, - max_speech_duration_s=max_speech_s) - pass + self.min_silence_ms = min_silence_ms + self.max_speech_s = max_speech_s + + # Load Silero VAD model + self.model = load_silero_vad() + + self.vad_threshold = 0.3 + self.min_silence_duration_ms = min_silence_ms + self.max_speech_duration_s = max_speech_s + + self.speech_pad_ms = 300 def segmentAudio(self, audio: bytes): - audio = np.frombuffer(audio, + # Convert audio bytes to numpy array expected by silero-vad + audio_array = np.frombuffer(audio, dtype=np.int16).flatten().astype(np.float32) / 32768.0 - return vad.get_speech_timestamps(audio, vad_options=self.vad_options) + + # Get speech timestamps using silero-vad + # Note: silero-vad expects sample rate of 16000 Hz which matches AudioStream.FPS + speech_timestamps = get_speech_timestamps( + audio_array, + self.model, + sampling_rate=AudioStream.FPS, + threshold=self.vad_threshold, + min_silence_duration_ms=self.min_silence_duration_ms, + max_speech_duration_s=self.max_speech_duration_s, + return_seconds=False # We want frame indices, not seconds + ) + + return speech_timestamps # Returns the stable cutoff (if any) and whether there are any segments. def getStableCutoff(self, audio: bytes) -> typing.Tuple[int, bool]: - min_delta_frames = int((self.vad_options.min_silence_duration_ms * + min_delta_frames = int((self.min_silence_duration_ms * AudioStream.FPS) / 1000.0) cutoff = None @@ -379,8 +406,9 @@ class Whisper: model_str = cfg["model"] model_root = os.path.join(parent_dir, "Models", os.path.normpath(model_str)) - print(f"Model {cfg['model']} will be saved to {model_root}", - file=sys.stderr) + if cfg["enable_debug_mode"]: + print(f"Model {cfg['model']} will be saved to {model_root}", + file=sys.stderr) model_device = "cuda" if cfg["use_cpu"]: @@ -395,21 +423,42 @@ class Whisper: download_root = model_root, local_files_only = already_downloaded) + self.context_window_chars = 200 # Keep last 200 chars of context + self.recent_context = "" # Store recent committed text + + def update_context(self, committed_text: str): + """Update the context with recently committed text.""" + self.recent_context = (self.recent_context + " " + committed_text).strip() + # Keep only the last N characters to avoid prompt getting too long + if len(self.recent_context) > self.context_window_chars: + self.recent_context = self.recent_context[-self.context_window_chars:] + def transcribe(self, frames: bytes = None) -> typing.List[Segment]: if frames is None: frames = self.collector.getAudio() - # Convert from signed 16-bit int [-32768, 32767] to signed 32-bit float on - # [-1, 1]. + + # Convert audio to float32 audio = np.frombuffer(frames, dtype=np.int16).flatten().astype(np.float32) / 32768.0 + # Build context-aware prompt + prompt = self._build_prompt() + t0 = time.time() segments, info = self.model.transcribe( audio, language = langcodes.find(self.cfg["language"]).language, vad_filter = True, temperature=0.0, - without_timestamps = False) + without_timestamps = False, + initial_prompt=prompt, + beam_size=5, + best_of=5, + condition_on_previous_text=True, + compression_ratio_threshold=2.4, + log_prob_threshold=-1.0, + no_speech_threshold=0.6 + ) res = [] for s in segments: # Manual touchup. I see a decent number of hallucinations sneaking @@ -445,6 +494,17 @@ class Whisper: print(f"Transcription latency (s): {t1 - t0}") return res + def _build_prompt(self) -> str: + """Build a context-aware prompt for Whisper.""" + user_prompt = self.cfg["user_prompt"] + context_prompt = "" + if self.recent_context and len(self.recent_context) > 0: + context_prompt = f"Here is the context so far: {self.recent_context}" + + prompts = [user_prompt, context_prompt] + prompts = [p for p in prompts if p and len(p) > 0] + return " ".join(prompts) + class TranscriptCommit: def __init__(self, delta: str, @@ -502,10 +562,21 @@ class VadCommitter: latency_s = self.collector.now() - self.collector.begin() duration_s = stable_cutoff / AudioStream.FPS start_ts = self.collector.begin() - commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) + + # Get the filtered audio first, then extract the portion we need + filtered_audio = self.collector.getAudio() + commit_audio = filtered_audio[:stable_cutoff * AudioStream.FRAME_SZ] + + # Now drop the prefix from the collector + self.collector.dropAudioPrefixByFrames(stable_cutoff) segments = self.whisper.transcribe(commit_audio) delta = ''.join(s.transcript for s in segments) + + # Update whisper's context with the committed text + if delta.strip(): + self.whisper.update_context(delta.strip()) + audio = self.collector.getAudio() if self.cfg["enable_debug_mode"]: for s in segments: @@ -540,11 +611,11 @@ class VadCommitter: def transcriptionThread(shared_data: SharedThreadData): last_stable_commit = None - stream = MicStream(shared_data.cfg["microphone"]) + stream = MicStream(shared_data.cfg) collector = AudioCollector(stream) collector = CompressingAudioCollector(collector) + collector = BoostingAudioCollector(collector, -12.0, shared_data.cfg) collector = NormalizingAudioCollector(collector) - collector = BoostingAudioCollector(collector, 0.0, shared_data.cfg) whisper = Whisper(collector, shared_data.cfg) segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"], max_speech_s=shared_data.cfg["max_speech_duration_s"]) @@ -553,6 +624,8 @@ def transcriptionThread(shared_data: SharedThreadData): transcript = "" preview = "" + print(f"Ready to go!", flush=True) + while not shared_data.exit_event.is_set(): time.sleep(shared_data.cfg["transcription_loop_delay_ms"] / 1000.0); @@ -561,8 +634,7 @@ def transcriptionThread(shared_data: SharedThreadData): commit = committer.getDelta() if len(commit.delta) > 0 or len(commit.preview) > 0: - # Avoid re-sending text after long pauses. User controls the length - # of the pause in the UI. + # Avoid re-sending text after long pauses if shared_data.cfg["reset_after_silence_s"] > 0: silence_duration = 0 if last_stable_commit: @@ -571,10 +643,12 @@ def transcriptionThread(shared_data: SharedThreadData): last_stable_commit.duration_s silence_duration = commit.start_ts - last_commit_end_ts if silence_duration > shared_data.cfg["reset_after_silence_s"]: - print(f"Resetting transcript after {silence_duration}-second " - "silence", file=sys.stderr) + if shared_data.cfg["enable_debug_mode"]: + print(f"Resetting transcript after {silence_duration}-second " + "silence", file=sys.stderr) transcript = "" preview = "" + whisper.recent_context = "" # Reset context too if commit.delta: last_stable_commit = commit diff --git a/app/vad.py b/app/vad.py deleted file mode 100644 index 1dea765..0000000 --- a/app/vad.py +++ /dev/null @@ -1,314 +0,0 @@ -# MIT License -# -# Copyright (c) 2023 Guillaume Klein -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import bisect -import functools -import os -import warnings - -from typing import List, NamedTuple, Optional - -import numpy as np - - -# The code below is adapted from https://github.com/snakers4/silero-vad. -class VadOptions(NamedTuple): - """VAD options. - - Attributes: - threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, - probabilities ABOVE this value are considered as SPEECH. It is better to tune this - parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. - min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. - max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer - than max_speech_duration_s will be split at the timestamp of the last silence that - lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be - split aggressively just before max_speech_duration_s. - min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms - before separating it - window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model. - WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate. - Values other than these may affect model performance!! - speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side - """ - - threshold: float = 0.5 - min_speech_duration_ms: int = 250 - max_speech_duration_s: float = float("inf") - min_silence_duration_ms: int = 2000 - window_size_samples: int = 1024 - speech_pad_ms: int = 400 - - -def get_speech_timestamps( - audio: np.ndarray, - vad_options: Optional[VadOptions] = None, - **kwargs, -) -> List[dict]: - """This method is used for splitting long audios into speech chunks using silero VAD. - - Args: - audio: One dimensional float array. - vad_options: Options for VAD processing. - kwargs: VAD options passed as keyword arguments for backward compatibility. - - Returns: - List of dicts containing begin and end samples of each speech chunk. - """ - if vad_options is None: - vad_options = VadOptions(**kwargs) - - threshold = vad_options.threshold - min_speech_duration_ms = vad_options.min_speech_duration_ms - max_speech_duration_s = vad_options.max_speech_duration_s - min_silence_duration_ms = vad_options.min_silence_duration_ms - window_size_samples = vad_options.window_size_samples - speech_pad_ms = vad_options.speech_pad_ms - - if window_size_samples not in [512, 1024, 1536]: - warnings.warn( - "Unusual window_size_samples! Supported window_size_samples:\n" - " - [512, 1024, 1536] for 16000 sampling_rate" - ) - - sampling_rate = 16000 - min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 - speech_pad_samples = sampling_rate * speech_pad_ms / 1000 - max_speech_samples = ( - sampling_rate * max_speech_duration_s - - window_size_samples - - 2 * speech_pad_samples - ) - min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 - min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 - - audio_length_samples = len(audio) - - model = get_vad_model() - state = model.get_initial_state(batch_size=1) - - speech_probs = [] - for current_start_sample in range(0, audio_length_samples, window_size_samples): - chunk = audio[current_start_sample : current_start_sample + window_size_samples] - if len(chunk) < window_size_samples: - chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk)))) - speech_prob, state = model(chunk, state, sampling_rate) - speech_probs.append(speech_prob) - - triggered = False - speeches = [] - current_speech = {} - neg_threshold = threshold - 0.15 - - # to save potential segment end (and tolerate some silence) - temp_end = 0 - # to save potential segment limits in case of maximum segment size reached - prev_end = next_start = 0 - - for i, speech_prob in enumerate(speech_probs): - if (speech_prob >= threshold) and temp_end: - temp_end = 0 - if next_start < prev_end: - next_start = window_size_samples * i - - if (speech_prob >= threshold) and not triggered: - triggered = True - current_speech["start"] = window_size_samples * i - continue - - if ( - triggered - and (window_size_samples * i) - current_speech["start"] > max_speech_samples - ): - if prev_end: - current_speech["end"] = prev_end - speeches.append(current_speech) - current_speech = {} - # previously reached silence (< neg_thres) and is still not speech (< thres) - if next_start < prev_end: - triggered = False - else: - current_speech["start"] = next_start - prev_end = next_start = temp_end = 0 - else: - current_speech["end"] = window_size_samples * i - speeches.append(current_speech) - current_speech = {} - prev_end = next_start = temp_end = 0 - triggered = False - continue - - if (speech_prob < neg_threshold) and triggered: - if not temp_end: - temp_end = window_size_samples * i - # condition to avoid cutting in very short silence - if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech: - prev_end = temp_end - if (window_size_samples * i) - temp_end < min_silence_samples: - continue - else: - current_speech["end"] = temp_end - if ( - current_speech["end"] - current_speech["start"] - ) > min_speech_samples: - speeches.append(current_speech) - current_speech = {} - prev_end = next_start = temp_end = 0 - triggered = False - continue - - if ( - current_speech - and (audio_length_samples - current_speech["start"]) > min_speech_samples - ): - current_speech["end"] = audio_length_samples - speeches.append(current_speech) - - for i, speech in enumerate(speeches): - if i == 0: - speech["start"] = int(max(0, speech["start"] - speech_pad_samples)) - if i != len(speeches) - 1: - silence_duration = speeches[i + 1]["start"] - speech["end"] - if silence_duration < 2 * speech_pad_samples: - speech["end"] += int(silence_duration // 2) - speeches[i + 1]["start"] = int( - max(0, speeches[i + 1]["start"] - silence_duration // 2) - ) - else: - speech["end"] = int( - min(audio_length_samples, speech["end"] + speech_pad_samples) - ) - speeches[i + 1]["start"] = int( - max(0, speeches[i + 1]["start"] - speech_pad_samples) - ) - else: - speech["end"] = int( - min(audio_length_samples, speech["end"] + speech_pad_samples) - ) - - return speeches - - -def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray: - """Collects and concatenates audio chunks.""" - if not chunks: - return np.array([], dtype=np.float32) - - return np.concatenate([audio[chunk["start"] : chunk["end"]] for chunk in chunks]) - - -class SpeechTimestampsMap: - """Helper class to restore original speech timestamps.""" - - def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2): - self.sampling_rate = sampling_rate - self.time_precision = time_precision - self.chunk_end_sample = [] - self.total_silence_before = [] - - previous_end = 0 - silent_samples = 0 - - for chunk in chunks: - silent_samples += chunk["start"] - previous_end - previous_end = chunk["end"] - - self.chunk_end_sample.append(chunk["end"] - silent_samples) - self.total_silence_before.append(silent_samples / sampling_rate) - - def get_original_time( - self, - time: float, - chunk_index: Optional[int] = None, - ) -> float: - if chunk_index is None: - chunk_index = self.get_chunk_index(time) - - total_silence_before = self.total_silence_before[chunk_index] - return round(total_silence_before + time, self.time_precision) - - def get_chunk_index(self, time: float) -> int: - sample = int(time * self.sampling_rate) - return min( - bisect.bisect(self.chunk_end_sample, sample), - len(self.chunk_end_sample) - 1, - ) - - -@functools.lru_cache -def get_vad_model(): - """Returns the VAD model instance.""" - abspath = os.path.abspath(__file__) - my_dir = os.path.dirname(abspath) - parent_dir = os.path.dirname(my_dir) - path = os.path.join(parent_dir, "Models", "silero_vad.onnx") - return SileroVADModel(path) - - -class SileroVADModel: - def __init__(self, path): - try: - import onnxruntime - except ImportError as e: - raise RuntimeError( - "Applying the VAD filter requires the onnxruntime package" - ) from e - - opts = onnxruntime.SessionOptions() - opts.inter_op_num_threads = 1 - opts.intra_op_num_threads = 1 - opts.log_severity_level = 4 - - self.session = onnxruntime.InferenceSession( - path, - providers=["CPUExecutionProvider"], - sess_options=opts, - ) - - def get_initial_state(self, batch_size: int): - h = np.zeros((2, batch_size, 64), dtype=np.float32) - c = np.zeros((2, batch_size, 64), dtype=np.float32) - return h, c - - def __call__(self, x, state, sr: int): - if len(x.shape) == 1: - x = np.expand_dims(x, 0) - if len(x.shape) > 2: - raise ValueError( - f"Too many dimensions for input audio chunk {len(x.shape)}" - ) - if sr / x.shape[1] > 31.25: - raise ValueError("Input audio chunk is too short") - - h, c = state - - ort_inputs = { - "input": x, - "h": h, - "c": c, - "sr": np.array(sr, dtype="int64"), - } - - out, h, c = self.session.run(None, ort_inputs) - state = (h, c) - - return out, state diff --git a/config.yaml b/config.yaml index 34d88f1..5eec7a2 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,7 @@ compute_type: float16 enable_debug_mode: 0 enable_previews: 1 +user_prompt: Use proper punctuation and grammar. Prefer spelled out numbers like one, eleven, twenty, etc. save_audio: 0 language: english gpu_idx: 0 diff --git a/ui/index.html b/ui/index.html index b06e56b..90f78c1 100644 --- a/ui/index.html +++ b/ui/index.html @@ -10,179 +10,229 @@

+ Hello World! +

- Hello World! -

TaSTT

Compute Settings

Audio Settings

Performance Settings

Debug/Preview Settings

Display Settings

Python Output

TaSTT

Python Output

Compute Settings

Audio Settings

Compute Settings

Voice Activity Detection

Transcription Settings

Performance Settings

Debug/Preview Settings

Display Settings

Performance Settings

Debug/Preview Settings

Custom Chatbox Settings

Configuration

Virtual Environment

Transcription Settings

Text Filters

Custom Chatbox Settings

Input Settings

Custom Chatbox Settings