diff options
| author | yum <yum.food.vr@gmail.com> | 2025-07-23 17:41:49 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2025-07-23 17:41:49 -0700 |
| commit | 790c91d7ad515c3c0a22ca1341316265b8f0d779 (patch) | |
| tree | 28527bbcf87e8fab1d27eb76a1f5ea325b94d599 /app | |
| parent | 73de7cb2d8fb964e7f76ab55420e9bc331bf7bea (diff) | |
bugfixes
* fix model acquisition
* fix local beepsnd
* fix volume control
Diffstat (limited to 'app')
| -rw-r--r-- | app/hi.py | 45 | ||||
| -rw-r--r-- | app/requirements.txt | 1 | ||||
| -rw-r--r-- | app/stt.py | 62 |
3 files changed, 63 insertions, 45 deletions
@@ -26,9 +26,6 @@ TESTS_ENABLED = True # 0 = quiet, 1 = verbose, 2 = very verbose LOG_LEVEL = 0 -# Global volume control (0.0 to 1.0) -VOLUME = 0.3 - APP_ROOT = os.path.dirname(os.path.abspath(__file__)) PROJECT_ROOT = os.path.dirname(APP_ROOT) @@ -347,7 +344,8 @@ def osc_thread(shared_data: SharedThreadData): if time.time() - last_change < 1.5: continue addr = "/chatbox/input" - print(f"Send {local_word}", flush=True) + if shared_data.cfg["enable_debug_mode"]: + print(f"Send {local_word}", flush=True) osc_client.send_message(addr, (local_word, True, False)) last_change = time.time() remote_word = local_word @@ -420,20 +418,16 @@ def vrInputThread(shared_data: SharedThreadData): if last_rising - last_medium_press_end < 1.0: # Type transcription - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform3) + play_sound_with_volume(waveform3, shared_data.cfg) else: - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform1) + play_sound_with_volume(waveform1, shared_data.cfg) elif now - last_rising > 0.5: # Medium press print("CLEARING", file=sys.stderr) last_medium_press_end = now state = PAUSE_STATE - - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform2) + play_sound_with_volume(waveform2, shared_data.cfg) # Flush the *entire* pipeline. shared_data.stream.pause(True) @@ -449,9 +443,7 @@ def vrInputThread(shared_data: SharedThreadData): state = PAUSE_STATE shared_data.stream.pause(True) - - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform1) + play_sound_with_volume(waveform1, shared_data.cfg) elif state == PAUSE_STATE: print("RECORDING", file=sys.stderr) state = RECORD_STATE @@ -469,9 +461,7 @@ def vrInputThread(shared_data: SharedThreadData): #audio_state.text += audio_state.preview_text shared_data.stream.pause(False) - - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform0) + play_sound_with_volume(waveform0, shared_data.cfg) def kbInputThread(shared_data: SharedThreadData): @@ -514,9 +504,7 @@ def kbInputThread(shared_data: SharedThreadData): if event == EVENT_DOUBLE_PRESS: print("CLEARING", file=sys.stderr) state = PAUSE_STATE - - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform2) + play_sound_with_volume(waveform2, shared_data.cfg) # Flush the *entire* pipeline. shared_data.stream.pause(True) @@ -530,11 +518,8 @@ def kbInputThread(shared_data: SharedThreadData): if state == RECORD_STATE: print("PAUSED", file=sys.stderr) state = PAUSE_STATE - shared_data.stream.pause(True) - - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform1) + play_sound_with_volume(waveform1, shared_data.cfg) elif state == PAUSE_STATE: print("RECORDING", file=sys.stderr) state = RECORD_STATE @@ -548,20 +533,16 @@ def kbInputThread(shared_data: SharedThreadData): if shared_data.cfg["enable_debug_mode"]: print("Toggle detected, committing preview text (2)", file=sys.stderr) - #audio_state.text += audio_state.preview_text - shared_data.stream.pause(False) + play_sound_with_volume(waveform0, shared_data.cfg) - if shared_data.cfg["enable_local_beep"]: - play_sound_with_volume(waveform0) - -def play_sound_with_volume(filepath): +def play_sound_with_volume(filepath, cfg): """Play a WAV file with adjusted volume""" - volume = VOLUME + volume = cfg.get("volume", 30) try: sound = pygame.mixer.Sound(filepath) - sound.set_volume(volume) + sound.set_volume(volume * 0.01) sound.play() except Exception as e: print(f"Error playing sound {filepath}: {e}", file=sys.stderr) diff --git a/app/requirements.txt b/app/requirements.txt index e68a16c..c8d69df 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -2,6 +2,7 @@ faster-whisper hf-xet keyboard langcodes +noisereduce pyaudio pygame pydub @@ -3,6 +3,7 @@ from faster_whisper import WhisperModel import langcodes import numpy as np import os +import noisereduce as nr try: from profanity_filter import ProfanityFilter PROFANITY_FILTER_AVAILABLE = True @@ -260,9 +261,13 @@ class NormalizingAudioCollector(AudioCollectorFilter): return frames class BoostingAudioCollector(AudioCollectorFilter): - def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict): + def __init__(self, parent: AudioCollector, + target_dBFS: float, + max_gain_dB: float, + cfg: typing.Dict): AudioCollectorFilter.__init__(self, parent) self.target_dBFS = target_dBFS + self.max_gain_dB = max_gain_dB self.cfg = cfg def getAudio(self) -> bytes: @@ -270,9 +275,10 @@ class BoostingAudioCollector(AudioCollectorFilter): audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ, frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS) + gain = min(self.target_dBFS - audio.dBFS, self.max_gain_dB) if self.cfg["enable_debug_mode"]: - print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr) - audio = audio.apply_gain(self.target_dBFS - audio.dBFS) + print(f"Boosting audio by {gain} dB (from {audio.dBFS} to {audio.dBFS + gain})", flush=True) + audio = audio.apply_gain(gain) frames = np.array(audio.get_array_of_samples()) frames = np.int16(frames).tobytes() @@ -296,6 +302,26 @@ class CompressingAudioCollector(AudioCollectorFilter): return frames +class NoiseReducingAudioCollector(AudioCollectorFilter): + def __init__(self, parent: AudioCollector, cfg: typing.Dict): + AudioCollectorFilter.__init__(self, parent) + self.cfg = cfg + + def getAudio(self) -> bytes: + audio = self.parent.getAudio() + audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) + + reduced_audio = nr.reduce_noise( + y=audio_array, + sr=AudioStream.FPS, + ) + + # Convert back to int16 + reduced_audio = np.clip(reduced_audio, -32768, 32767) + frames = np.int16(reduced_audio).tobytes() + + return frames + class AudioSegmenter: def __init__(self, min_silence_ms=250, @@ -398,6 +424,12 @@ class Segment: avg_logprob = f"(avg_logprob: {self.avg_logprob}) " return f"{self.transcript} " + ts + wall_ts + no_speech + avg_logprob +def join_segments(a, b): + if len(a) > 0 and a[-1] != ' ': + return a + ' ' + b + else: + return a + b + class Whisper: def __init__(self, collector: AudioCollector, @@ -421,6 +453,9 @@ class Whisper: already_downloaded = os.path.exists(model_root) + if not already_downloaded: + print(f"Model {model_str} not already downloaded, downloading now...", flush=True) + self.model = WhisperModel(model_str, device = model_device, device_index = cfg["gpu_idx"], @@ -433,10 +468,12 @@ class Whisper: def update_context(self, committed_text: str): """Update the context with recently committed text.""" - self.recent_context = (self.recent_context + " " + committed_text).strip() - # Keep only the last N characters to avoid prompt getting too long + self.recent_context = join_segments(self.recent_context, committed_text).strip() + # Drop half of the context window. if len(self.recent_context) > self.context_window_chars: - self.recent_context = self.recent_context[-self.context_window_chars:] + words = self.recent_context.split() + words = words[len(words)//2:] + self.recent_context = ' '.join(words) def transcribe(self, frames: bytes = None) -> typing.List[Segment]: if frames is None: @@ -449,6 +486,8 @@ class Whisper: # Build context-aware prompt prompt = self._build_prompt() + print(f"Prompt: {prompt}", flush=True) + t0 = time.time() segments, info = self.model.transcribe( audio, @@ -698,8 +737,10 @@ def transcriptionThread(shared_data: SharedThreadData): stream = MicStream(shared_data.cfg) collector = AudioCollector(stream) collector = CompressingAudioCollector(collector) - collector = BoostingAudioCollector(collector, -12.0, shared_data.cfg) - collector = NormalizingAudioCollector(collector) + collector = BoostingAudioCollector(collector, -24.0, 24.0, + shared_data.cfg) + collector = NoiseReducingAudioCollector(collector, shared_data.cfg) + #collector = NormalizingAudioCollector(collector) whisper = Whisper(collector, shared_data.cfg) segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"], max_speech_s=shared_data.cfg["max_speech_duration_s"], @@ -761,11 +802,6 @@ def transcriptionThread(shared_data: SharedThreadData): # breaking OSC pager. if len(shared_data.transcript) >= 1024: shared_data.transcript = shared_data.transcript[-512:] - def join_segments(a, b): - if len(a) > 0 and a[-1] != ' ': - return a + ' ' + b - else: - return a + b shared_data.transcript = \ join_segments(shared_data.transcript, commit.delta) shared_data.preview = commit.preview |
