summary | refs | log | tree | commit | diff | stats
path: root/app
diff options
context:
space:
mode:
Diffstat (limited to 'app')
-rw-r--r-- app/hi.py | 45
-rw-r--r-- app/requirements.txt | 1
-rw-r--r-- app/stt.py | 62
3 files changed, 63 insertions, 45 deletions
diff --git a/app/hi.py b/app/hi.py
index 1297b37..bb09418 100644
--- a/app/hi.py
+++ b/app/hi.py
@@ -26,9 +26,6 @@ TESTS_ENABLED = True
# 0 = quiet, 1 = verbose, 2 = very verbose
LOG_LEVEL = 0
-# Global volume control (0.0 to 1.0)
-VOLUME = 0.3
-
APP_ROOT = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(APP_ROOT)
@@ -347,7 +344,8 @@ def osc_thread(shared_data: SharedThreadData):
if time.time() - last_change < 1.5:
continue
addr = "/chatbox/input"
- print(f"Send {local_word}", flush=True)
+ if shared_data.cfg["enable_debug_mode"]:
+ print(f"Send {local_word}", flush=True)
osc_client.send_message(addr, (local_word, True, False))
last_change = time.time()
remote_word = local_word
@@ -420,20 +418,16 @@ def vrInputThread(shared_data: SharedThreadData):
if last_rising - last_medium_press_end < 1.0:
# Type transcription
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform3)
+ play_sound_with_volume(waveform3, shared_data.cfg)
else:
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform1)
+ play_sound_with_volume(waveform1, shared_data.cfg)
elif now - last_rising > 0.5:
# Medium press
print("CLEARING", file=sys.stderr)
last_medium_press_end = now
state = PAUSE_STATE
-
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform2)
+ play_sound_with_volume(waveform2, shared_data.cfg)
# Flush the *entire* pipeline.
shared_data.stream.pause(True)
@@ -449,9 +443,7 @@ def vrInputThread(shared_data: SharedThreadData):
state = PAUSE_STATE
shared_data.stream.pause(True)
-
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform1)
+ play_sound_with_volume(waveform1, shared_data.cfg)
elif state == PAUSE_STATE:
print("RECORDING", file=sys.stderr)
state = RECORD_STATE
@@ -469,9 +461,7 @@ def vrInputThread(shared_data: SharedThreadData):
#audio_state.text += audio_state.preview_text
shared_data.stream.pause(False)
-
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform0)
+ play_sound_with_volume(waveform0, shared_data.cfg)
def kbInputThread(shared_data: SharedThreadData):
@@ -514,9 +504,7 @@ def kbInputThread(shared_data: SharedThreadData):
if event == EVENT_DOUBLE_PRESS:
print("CLEARING", file=sys.stderr)
state = PAUSE_STATE
-
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform2)
+ play_sound_with_volume(waveform2, shared_data.cfg)
# Flush the *entire* pipeline.
shared_data.stream.pause(True)
@@ -530,11 +518,8 @@ def kbInputThread(shared_data: SharedThreadData):
if state == RECORD_STATE:
print("PAUSED", file=sys.stderr)
state = PAUSE_STATE
-
shared_data.stream.pause(True)
-
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform1)
+ play_sound_with_volume(waveform1, shared_data.cfg)
elif state == PAUSE_STATE:
print("RECORDING", file=sys.stderr)
state = RECORD_STATE
@@ -548,20 +533,16 @@ def kbInputThread(shared_data: SharedThreadData):
if shared_data.cfg["enable_debug_mode"]:
print("Toggle detected, committing preview text (2)",
file=sys.stderr)
- #audio_state.text += audio_state.preview_text
-
shared_data.stream.pause(False)
+ play_sound_with_volume(waveform0, shared_data.cfg)
- if shared_data.cfg["enable_local_beep"]:
- play_sound_with_volume(waveform0)
-
-def play_sound_with_volume(filepath):
+def play_sound_with_volume(filepath, cfg):
"""Play a WAV file with adjusted volume"""
- volume = VOLUME
+ volume = cfg.get("volume", 30)
try:
sound = pygame.mixer.Sound(filepath)
- sound.set_volume(volume)
+ sound.set_volume(volume * 0.01)
sound.play()
except Exception as e:
print(f"Error playing sound {filepath}: {e}", file=sys.stderr)
diff --git a/app/requirements.txt b/app/requirements.txt
index e68a16c..c8d69df 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -2,6 +2,7 @@ faster-whisper
hf-xet
keyboard
langcodes
+noisereduce
pyaudio
pygame
pydub
diff --git a/app/stt.py b/app/stt.py
index c1f4836..79ab0d1 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -3,6 +3,7 @@ from faster_whisper import WhisperModel
import langcodes
import numpy as np
import os
+import noisereduce as nr
try:
from profanity_filter import ProfanityFilter
PROFANITY_FILTER_AVAILABLE = True
@@ -260,9 +261,13 @@ class NormalizingAudioCollector(AudioCollectorFilter):
return frames
class BoostingAudioCollector(AudioCollectorFilter):
- def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict):
+ def __init__(self, parent: AudioCollector,
+ target_dBFS: float,
+ max_gain_dB: float,
+ cfg: typing.Dict):
AudioCollectorFilter.__init__(self, parent)
self.target_dBFS = target_dBFS
+ self.max_gain_dB = max_gain_dB
self.cfg = cfg
def getAudio(self) -> bytes:
@@ -270,9 +275,10 @@ class BoostingAudioCollector(AudioCollectorFilter):
audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ,
frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS)
+ gain = min(self.target_dBFS - audio.dBFS, self.max_gain_dB)
if self.cfg["enable_debug_mode"]:
- print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr)
- audio = audio.apply_gain(self.target_dBFS - audio.dBFS)
+ print(f"Boosting audio by {gain} dB (from {audio.dBFS} to {audio.dBFS + gain})", flush=True)
+ audio = audio.apply_gain(gain)
frames = np.array(audio.get_array_of_samples())
frames = np.int16(frames).tobytes()
@@ -296,6 +302,26 @@ class CompressingAudioCollector(AudioCollectorFilter):
return frames
+class NoiseReducingAudioCollector(AudioCollectorFilter):
+ def __init__(self, parent: AudioCollector, cfg: typing.Dict):
+ AudioCollectorFilter.__init__(self, parent)
+ self.cfg = cfg
+
+ def getAudio(self) -> bytes:
+ audio = self.parent.getAudio()
+ audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32)
+
+ reduced_audio = nr.reduce_noise(
+ y=audio_array,
+ sr=AudioStream.FPS,
+ )
+
+ # Convert back to int16
+ reduced_audio = np.clip(reduced_audio, -32768, 32767)
+ frames = np.int16(reduced_audio).tobytes()
+
+ return frames
+
class AudioSegmenter:
def __init__(self,
min_silence_ms=250,
@@ -398,6 +424,12 @@ class Segment:
avg_logprob = f"(avg_logprob: {self.avg_logprob}) "
return f"{self.transcript} " + ts + wall_ts + no_speech + avg_logprob
+def join_segments(a, b):
+ if len(a) > 0 and a[-1] != ' ':
+ return a + ' ' + b
+ else:
+ return a + b
+
class Whisper:
def __init__(self,
collector: AudioCollector,
@@ -421,6 +453,9 @@ class Whisper:
already_downloaded = os.path.exists(model_root)
+ if not already_downloaded:
+ print(f"Model {model_str} not already downloaded, downloading now...", flush=True)
+
self.model = WhisperModel(model_str,
device = model_device,
device_index = cfg["gpu_idx"],
@@ -433,10 +468,12 @@ class Whisper:
def update_context(self, committed_text: str):
"""Update the context with recently committed text."""
- self.recent_context = (self.recent_context + " " + committed_text).strip()
- # Keep only the last N characters to avoid prompt getting too long
+ self.recent_context = join_segments(self.recent_context, committed_text).strip()
+ # Drop half of the context window.
if len(self.recent_context) > self.context_window_chars:
- self.recent_context = self.recent_context[-self.context_window_chars:]
+ words = self.recent_context.split()
+ words = words[len(words)//2:]
+ self.recent_context = ' '.join(words)
def transcribe(self, frames: bytes = None) -> typing.List[Segment]:
if frames is None:
@@ -449,6 +486,8 @@ class Whisper:
# Build context-aware prompt
prompt = self._build_prompt()
+ print(f"Prompt: {prompt}", flush=True)
+
t0 = time.time()
segments, info = self.model.transcribe(
audio,
@@ -698,8 +737,10 @@ def transcriptionThread(shared_data: SharedThreadData):
stream = MicStream(shared_data.cfg)
collector = AudioCollector(stream)
collector = CompressingAudioCollector(collector)
- collector = BoostingAudioCollector(collector, -12.0, shared_data.cfg)
- collector = NormalizingAudioCollector(collector)
+ collector = BoostingAudioCollector(collector, -24.0, 24.0,
+ shared_data.cfg)
+ collector = NoiseReducingAudioCollector(collector, shared_data.cfg)
+ #collector = NormalizingAudioCollector(collector)
whisper = Whisper(collector, shared_data.cfg)
segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"],
max_speech_s=shared_data.cfg["max_speech_duration_s"],
@@ -761,11 +802,6 @@ def transcriptionThread(shared_data: SharedThreadData):
# breaking OSC pager.
if len(shared_data.transcript) >= 1024:
shared_data.transcript = shared_data.transcript[-512:]
- def join_segments(a, b):
- if len(a) > 0 and a[-1] != ' ':
- return a + ' ' + b
- else:
- return a + b
shared_data.transcript = \
join_segments(shared_data.transcript, commit.delta)
shared_data.preview = commit.preview