bugfixes

* fix model acquisition
* fix local beepsnd
* fix volume control
author: yum <yum.food.vr@gmail.com> 2025-07-23 17:41:49 -0700
committer: yum <yum.food.vr@gmail.com> 2025-07-23 17:41:49 -0700
commit: 790c91d7ad515c3c0a22ca1341316265b8f0d779 (patch)
tree: 28527bbcf87e8fab1d27eb76a1f5ea325b94d599 /app
parent: 73de7cb2d8fb964e7f76ab55420e9bc331bf7bea (diff)
3 files changed, 63 insertions, 45 deletions
diff --git a/app/hi.py b/app/hi.py
index 1297b37..bb09418 100644
--- a/app/hi.py
+++ b/app/hi.py
@@ -26,9 +26,6 @@ TESTS_ENABLED = True
 # 0 = quiet, 1 = verbose, 2 = very verbose
 LOG_LEVEL = 0
 
-# Global volume control (0.0 to 1.0)
-VOLUME = 0.3
-
 APP_ROOT = os.path.dirname(os.path.abspath(__file__))
 PROJECT_ROOT = os.path.dirname(APP_ROOT)
 
@@ -347,7 +344,8 @@ def osc_thread(shared_data: SharedThreadData):
             if time.time() - last_change < 1.5:
                 continue
             addr = "/chatbox/input"
-            print(f"Send {local_word}", flush=True)
+            if shared_data.cfg["enable_debug_mode"]:
+                print(f"Send {local_word}", flush=True)
             osc_client.send_message(addr, (local_word, True, False))
             last_change = time.time()
             remote_word = local_word
@@ -420,20 +418,16 @@ def vrInputThread(shared_data: SharedThreadData):
 
                     if last_rising - last_medium_press_end < 1.0:
                         # Type transcription
-                        if shared_data.cfg["enable_local_beep"]:
-                            play_sound_with_volume(waveform3)
+                        play_sound_with_volume(waveform3, shared_data.cfg)
                     else:
-                        if shared_data.cfg["enable_local_beep"]:
-                            play_sound_with_volume(waveform1)
+                        play_sound_with_volume(waveform1, shared_data.cfg)
 
                 elif now - last_rising > 0.5:
                     # Medium press
                     print("CLEARING", file=sys.stderr)
                     last_medium_press_end = now
                     state = PAUSE_STATE
-
-                    if shared_data.cfg["enable_local_beep"]:
-                        play_sound_with_volume(waveform2)
+                    play_sound_with_volume(waveform2, shared_data.cfg)
 
                     # Flush the *entire* pipeline.
                     shared_data.stream.pause(True)
@@ -449,9 +443,7 @@ def vrInputThread(shared_data: SharedThreadData):
                     state = PAUSE_STATE
 
                     shared_data.stream.pause(True)
-
-                    if shared_data.cfg["enable_local_beep"]:
-                        play_sound_with_volume(waveform1)
+                    play_sound_with_volume(waveform1, shared_data.cfg)
                 elif state == PAUSE_STATE:
                     print("RECORDING", file=sys.stderr)
                     state = RECORD_STATE
@@ -469,9 +461,7 @@ def vrInputThread(shared_data: SharedThreadData):
                         #audio_state.text += audio_state.preview_text
 
                     shared_data.stream.pause(False)
-
-                    if shared_data.cfg["enable_local_beep"]:
-                        play_sound_with_volume(waveform0)
+                    play_sound_with_volume(waveform0, shared_data.cfg)
 
 
 def kbInputThread(shared_data: SharedThreadData):
@@ -514,9 +504,7 @@ def kbInputThread(shared_data: SharedThreadData):
             if event == EVENT_DOUBLE_PRESS:
                 print("CLEARING", file=sys.stderr)
                 state = PAUSE_STATE
-
-                if shared_data.cfg["enable_local_beep"]:
-                    play_sound_with_volume(waveform2)
+                play_sound_with_volume(waveform2, shared_data.cfg)
 
                 # Flush the *entire* pipeline.
                 shared_data.stream.pause(True)
@@ -530,11 +518,8 @@ def kbInputThread(shared_data: SharedThreadData):
             if state == RECORD_STATE:
                 print("PAUSED", file=sys.stderr)
                 state = PAUSE_STATE
-
                 shared_data.stream.pause(True)
-
-                if shared_data.cfg["enable_local_beep"]:
-                    play_sound_with_volume(waveform1)
+                play_sound_with_volume(waveform1, shared_data.cfg)
             elif state == PAUSE_STATE:
                 print("RECORDING", file=sys.stderr)
                 state = RECORD_STATE
@@ -548,20 +533,16 @@ def kbInputThread(shared_data: SharedThreadData):
                     if shared_data.cfg["enable_debug_mode"]:
                         print("Toggle detected, committing preview text (2)",
                                 file=sys.stderr)
-                    #audio_state.text += audio_state.preview_text
-
                 shared_data.stream.pause(False)
+                play_sound_with_volume(waveform0, shared_data.cfg)
 
-                if shared_data.cfg["enable_local_beep"]:
-                    play_sound_with_volume(waveform0)
-
-def play_sound_with_volume(filepath):
+def play_sound_with_volume(filepath, cfg):
     """Play a WAV file with adjusted volume"""
-    volume = VOLUME
+    volume = cfg.get("volume", 30)
     
     try:
         sound = pygame.mixer.Sound(filepath)
-        sound.set_volume(volume)
+        sound.set_volume(volume * 0.01)
         sound.play()
     except Exception as e:
         print(f"Error playing sound {filepath}: {e}", file=sys.stderr)
diff --git a/app/requirements.txt b/app/requirements.txt
index e68a16c..c8d69df 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -2,6 +2,7 @@ faster-whisper
 hf-xet
 keyboard
 langcodes
+noisereduce
 pyaudio
 pygame
 pydub
diff --git a/app/stt.py b/app/stt.py
index c1f4836..79ab0d1 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -3,6 +3,7 @@ from faster_whisper import WhisperModel
 import langcodes
 import numpy as np
 import os
+import noisereduce as nr
 try:
     from profanity_filter import ProfanityFilter
     PROFANITY_FILTER_AVAILABLE = True
@@ -260,9 +261,13 @@ class NormalizingAudioCollector(AudioCollectorFilter):
         return frames
 
 class BoostingAudioCollector(AudioCollectorFilter):
-    def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict):
+    def __init__(self, parent: AudioCollector,
+                 target_dBFS: float,
+                 max_gain_dB: float,
+                 cfg: typing.Dict):
         AudioCollectorFilter.__init__(self, parent)
         self.target_dBFS = target_dBFS
+        self.max_gain_dB = max_gain_dB
         self.cfg = cfg
 
     def getAudio(self) -> bytes:
@@ -270,9 +275,10 @@ class BoostingAudioCollector(AudioCollectorFilter):
 
         audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ,
                 frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS)
+        gain = min(self.target_dBFS - audio.dBFS, self.max_gain_dB)
         if self.cfg["enable_debug_mode"]:
-            print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr)
-        audio = audio.apply_gain(self.target_dBFS - audio.dBFS)
+            print(f"Boosting audio by {gain} dB (from {audio.dBFS} to {audio.dBFS + gain})", flush=True)
+        audio = audio.apply_gain(gain)
 
         frames = np.array(audio.get_array_of_samples())
         frames = np.int16(frames).tobytes()
@@ -296,6 +302,26 @@ class CompressingAudioCollector(AudioCollectorFilter):
 
         return frames
 
+class NoiseReducingAudioCollector(AudioCollectorFilter):
+    def __init__(self, parent: AudioCollector, cfg: typing.Dict):
+        AudioCollectorFilter.__init__(self, parent)
+        self.cfg = cfg
+
+    def getAudio(self) -> bytes:
+        audio = self.parent.getAudio()
+        audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32)
+
+        reduced_audio = nr.reduce_noise(
+            y=audio_array,
+            sr=AudioStream.FPS,
+        )
+
+        # Convert back to int16
+        reduced_audio = np.clip(reduced_audio, -32768, 32767)
+        frames = np.int16(reduced_audio).tobytes()
+
+        return frames
+
 class AudioSegmenter:
     def __init__(self,
             min_silence_ms=250,
@@ -398,6 +424,12 @@ class Segment:
         avg_logprob = f"(avg_logprob: {self.avg_logprob}) "
         return f"{self.transcript} " + ts + wall_ts + no_speech + avg_logprob
 
+def join_segments(a, b):
+    if len(a) > 0 and a[-1] != ' ':
+        return a + ' ' + b
+    else:
+        return a + b
+
 class Whisper:
     def __init__(self,
             collector: AudioCollector,
@@ -421,6 +453,9 @@ class Whisper:
 
         already_downloaded = os.path.exists(model_root)
 
+        if not already_downloaded:
+            print(f"Model {model_str} not already downloaded, downloading now...", flush=True)
+
         self.model = WhisperModel(model_str,
                 device = model_device,
                 device_index = cfg["gpu_idx"],
@@ -433,10 +468,12 @@ class Whisper:
 
     def update_context(self, committed_text: str):
         """Update the context with recently committed text."""
-        self.recent_context = (self.recent_context + " " + committed_text).strip()
-        # Keep only the last N characters to avoid prompt getting too long
+        self.recent_context = join_segments(self.recent_context, committed_text).strip()
+        # Drop half of the context window.
         if len(self.recent_context) > self.context_window_chars:
-            self.recent_context = self.recent_context[-self.context_window_chars:]
+            words = self.recent_context.split()
+            words = words[len(words)//2:]
+            self.recent_context = ' '.join(words)
 
     def transcribe(self, frames: bytes = None) -> typing.List[Segment]:
         if frames is None:
@@ -449,6 +486,8 @@ class Whisper:
         # Build context-aware prompt
         prompt = self._build_prompt()
 
+        print(f"Prompt: {prompt}", flush=True)
+
         t0 = time.time()
         segments, info = self.model.transcribe(
                 audio,
@@ -698,8 +737,10 @@ def transcriptionThread(shared_data: SharedThreadData):
     stream = MicStream(shared_data.cfg)
     collector = AudioCollector(stream)
     collector = CompressingAudioCollector(collector)
-    collector = BoostingAudioCollector(collector, -12.0, shared_data.cfg)
-    collector = NormalizingAudioCollector(collector)
+    collector = BoostingAudioCollector(collector, -24.0, 24.0,
+                                       shared_data.cfg)
+    collector = NoiseReducingAudioCollector(collector, shared_data.cfg)
+    #collector = NormalizingAudioCollector(collector)
     whisper = Whisper(collector, shared_data.cfg)
     segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"],
             max_speech_s=shared_data.cfg["max_speech_duration_s"],
@@ -761,11 +802,6 @@ def transcriptionThread(shared_data: SharedThreadData):
                 # breaking OSC pager.
                 if len(shared_data.transcript) >= 1024:
                     shared_data.transcript = shared_data.transcript[-512:]
-                def join_segments(a, b):
-                    if len(a) > 0 and a[-1] != ' ':
-                        return a + ' ' + b
-                    else:
-                        return a + b
                 shared_data.transcript = \
                         join_segments(shared_data.transcript, commit.delta)
                 shared_data.preview = commit.preview
author	yum <yum.food.vr@gmail.com>	2025-07-23 17:41:49 -0700
committer	yum <yum.food.vr@gmail.com>	2025-07-23 17:41:49 -0700
commit	790c91d7ad515c3c0a22ca1341316265b8f0d779 (patch)
tree	28527bbcf87e8fab1d27eb76a1f5ea325b94d599 /app
parent	73de7cb2d8fb964e7f76ab55420e9bc331bf7bea (diff)