summaryrefslogtreecommitdiffstats
path: root/app
diff options
context:
space:
mode:
Diffstat (limited to 'app')
-rw-r--r--app/hi.py7
-rw-r--r--app/list_microphones.py24
-rw-r--r--app/requirements.txt3
-rw-r--r--app/stt.py55
-rw-r--r--app/vad.py3
5 files changed, 80 insertions, 12 deletions
diff --git a/app/hi.py b/app/hi.py
index 0129958..0d80b9d 100644
--- a/app/hi.py
+++ b/app/hi.py
@@ -2,6 +2,7 @@ import app_config
import argparse
from math import floor, ceil
import msvcrt
+import os
from pythonosc import udp_client
import sentencepiece as spm
from shared_thread_data import SharedThreadData
@@ -15,8 +16,11 @@ TESTS_ENABLED = True
# 0 = quiet, 1 = verbose, 2 = very verbose
LOG_LEVEL = 0
+APP_ROOT = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(APP_ROOT)
+
def get_tokenizer():
- model_path = "./custom_unigram_tokenizer_65k/unigram.model"
+ model_path = os.path.join(PROJECT_ROOT, "custom_unigram_tokenizer_65k", "unigram.model")
print(f"Loading SentencePiece tokenizer from: {model_path}")
sp = spm.SentencePieceProcessor()
sp.load(model_path)
@@ -346,7 +350,6 @@ if __name__ == "__main__":
time.sleep(0.1)
continue
-
try:
char = char_bytes.decode('utf-8')
if char == '\r' or char == '\n':
diff --git a/app/list_microphones.py b/app/list_microphones.py
new file mode 100644
index 0000000..a6b1f36
--- /dev/null
+++ b/app/list_microphones.py
@@ -0,0 +1,24 @@
+import pyaudio
+import json
+import sys
+
+try:
+ p = pyaudio.PyAudio()
+ info = p.get_host_api_info_by_index(0)
+ numdevices = info.get('deviceCount')
+
+ microphones = []
+ for i in range(0, numdevices):
+ device_info = p.get_device_info_by_host_api_device_index(0, i)
+ if device_info.get('maxInputChannels') > 0:
+ microphones.append({
+ 'index': i,
+ 'name': device_info.get('name'),
+ 'defaultSampleRate': device_info.get('defaultSampleRate')
+ })
+
+ print(json.dumps(microphones))
+ p.terminate()
+except Exception as e:
+ print(json.dumps({'error': str(e)}), file=sys.stderr)
+ sys.exit(1) \ No newline at end of file
diff --git a/app/requirements.txt b/app/requirements.txt
index 4e79312..07f94cd 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,7 +1,8 @@
faster-whisper
+hf-xet
langcodes
pyaudio
pydub
python-osc
sentencepiece
-
+wave
diff --git a/app/stt.py b/app/stt.py
index 34ef2e9..c157f6d 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -1,3 +1,4 @@
+from datetime import datetime
from faster_whisper import WhisperModel
import langcodes
import numpy as np
@@ -9,6 +10,11 @@ import sys
import time
import typing
import vad
+import wave
+
+
+APP_ROOT = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(APP_ROOT)
class AudioStream():
FORMAT = pyaudio.paInt16
@@ -242,6 +248,26 @@ class NormalizingAudioCollector(AudioCollectorFilter):
return frames
+class BoostingAudioCollector(AudioCollectorFilter):
+ def __init__(self, parent: AudioCollector, target_dBFS: float, cfg: typing.Dict):
+ AudioCollectorFilter.__init__(self, parent)
+ self.target_dBFS = target_dBFS
+ self.cfg = cfg
+
+ def getAudio(self) -> bytes:
+ audio = self.parent.getAudio()
+
+ audio = AudioSegment(audio, sample_width=AudioStream.FRAME_SZ,
+ frame_rate=AudioStream.FPS, channels=AudioStream.CHANNELS)
+ if self.cfg["enable_debug_mode"]:
+ print(f"Boosting audio from {audio.dBFS}dB to {self.target_dBFS}dB", file=sys.stderr)
+ audio = audio.apply_gain(self.target_dBFS - audio.dBFS)
+
+ frames = np.array(audio.get_array_of_samples())
+ frames = np.int16(frames).tobytes()
+
+ return frames
+
class CompressingAudioCollector(AudioCollectorFilter):
def __init__(self, parent: AudioCollector):
AudioCollectorFilter.__init__(self, parent)
@@ -441,6 +467,16 @@ class TranscriptCommit:
self.duration_s = duration_s
+def saveAudio(audio: bytes, path: str, cfg: typing.Dict):
+ with wave.open(path, 'wb') as wf:
+ if cfg["enable_debug_mode"]:
+ print(f"Saving audio to {path}", file=sys.stderr)
+ wf.setnchannels(AudioStream.CHANNELS)
+ wf.setsampwidth(AudioStream.FRAME_SZ)
+ wf.setframerate(AudioStream.FPS)
+ wf.writeframes(audio)
+
+
class VadCommitter:
def __init__(self,
cfg: typing.Dict,
@@ -463,7 +499,6 @@ class VadCommitter:
start_ts = self.collector.begin()
if has_audio and stable_cutoff:
- #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
latency_s = self.collector.now() - self.collector.begin()
duration_s = stable_cutoff / AudioStream.FPS
start_ts = self.collector.begin()
@@ -475,12 +510,16 @@ class VadCommitter:
if self.cfg["enable_debug_mode"]:
for s in segments:
print(f"commit segment: {s}", file=sys.stderr)
- print(f"delta get: {delta}", file=sys.stderr)
+ if len(delta) > 0:
+ print(f"delta get: {delta}", file=sys.stderr)
- if False:
+ if self.cfg["save_audio"] and len(delta) > 0:
ts = datetime.fromtimestamp(self.collector.now() - latency_s)
filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
- saveAudio(commit_audio, filename)
+ audio_dir = os.path.join(PROJECT_ROOT, "audio")
+ if not os.path.exists(audio_dir):
+ os.makedirs(audio_dir)
+ saveAudio(commit_audio, os.path.join(audio_dir, filename), self.cfg)
preview = ""
if self.cfg["enable_previews"] and has_audio:
@@ -488,7 +527,6 @@ class VadCommitter:
preview = "".join(s.transcript for s in segments)
if not has_audio:
- #print("VAD detects no audio, skip transcription", file=sys.stderr)
self.collector.keepLast(1.0)
return TranscriptCommit(
@@ -504,8 +542,9 @@ def transcriptionThread(shared_data: SharedThreadData):
stream = MicStream(shared_data.cfg["microphone"])
collector = AudioCollector(stream)
- collector = NormalizingAudioCollector(collector)
collector = CompressingAudioCollector(collector)
+ collector = NormalizingAudioCollector(collector)
+ collector = BoostingAudioCollector(collector, 0.0, shared_data.cfg)
whisper = Whisper(collector, shared_data.cfg)
segmenter = AudioSegmenter(min_silence_ms=shared_data.cfg["min_silence_duration_ms"],
max_speech_s=shared_data.cfg["max_speech_duration_s"])
@@ -552,13 +591,13 @@ def transcriptionThread(shared_data: SharedThreadData):
preview = commit.preview
try:
- print(f"Transcript: {transcript}")
+ print(f"Transcript: {transcript}", flush=True)
except UnicodeEncodeError:
print("Failed to encode transcript - discarding delta",
file=sys.stderr)
continue
try:
- print(f"Preview: {preview}")
+ print(f"Preview: {preview}", flush=True)
except UnicodeEncodeError:
print("Failed to encode preview - discarding", file=sys.stderr)
diff --git a/app/vad.py b/app/vad.py
index 10a72d3..1dea765 100644
--- a/app/vad.py
+++ b/app/vad.py
@@ -259,7 +259,8 @@ def get_vad_model():
"""Returns the VAD model instance."""
abspath = os.path.abspath(__file__)
my_dir = os.path.dirname(abspath)
- path = os.path.join(my_dir, "Models/silero_vad.onnx")
+ parent_dir = os.path.dirname(my_dir)
+ path = os.path.join(parent_dir, "Models", "silero_vad.onnx")
return SileroVADModel(path)