summaryrefslogtreecommitdiffstats
path: root/Scripts/transcribe_v2.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-09-08 15:28:40 -0700
committeryum <yum.food.vr@gmail.com>2023-09-08 15:28:40 -0700
commit31d84a0cfdd3c30c2fdf1dbfe4e855ae78555707 (patch)
tree100553fc91e9e8a83854c053454516a56f38e7c5 /Scripts/transcribe_v2.py
parenta8a0d2ce9188f6f8917ac5e5b4dfc289badaf858 (diff)
Add keyboard controls to transcribe_v2.py
Also parameterize `min_silence_duration_ms` in AudioSegmenter. I suspect that for conversational speech, segmenting closer to 500 ms (rather than the 2000ms default) is a better tradeoff between accuracy and compute efficiency.
Diffstat (limited to 'Scripts/transcribe_v2.py')
-rw-r--r--Scripts/transcribe_v2.py131
1 files changed, 103 insertions, 28 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 0a43b5e..f13a5aa 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -9,6 +9,7 @@ from emotes_v2 import EmotesState
import app_config
import argparse
import editdistance
+import keybind_event_machine
import langcodes
import math
import numpy as np
@@ -307,16 +308,20 @@ class CompressingAudioCollector(AudioCollectorFilter):
return frames
class AudioSegmenter:
- def __init__(self):
+ def __init__(self,
+ min_silence_ms=500):
+ self.vad_options = vad.VadOptions(
+ min_silence_duration_ms=min_silence_ms)
pass
def segmentAudio(self, audio: bytes):
audio = np.frombuffer(audio,
dtype=np.int16).flatten().astype(np.float32) / 32768.0
- return vad.get_speech_timestamps(audio)
+ return vad.get_speech_timestamps(audio, vad_options=self.vad_options)
- def getStableCutoff(self, audio: bytes, min_delta_s = 1.5) -> int:
- min_delta_frames = min_delta_s * AudioStream.FPS
+ def getStableCutoff(self, audio: bytes) -> int:
+ min_delta_frames = int((self.vad_options.min_silence_duration_ms *
+ AudioStream.FPS) / 1000)
cutoff = None
last_end = None
@@ -330,7 +335,7 @@ class AudioSegmenter:
delta_frames = s['start'] - last_end
#print(f"delta frames: {delta_frames}")
if delta_frames > min_delta_frames:
- cutoff = s['start'] - int(min_delta_frames / 2)
+ cutoff = s['start']
else:
last_end = s['end']
@@ -444,10 +449,11 @@ class TranscriptCommit:
class VadCommitter:
def __init__(self,
collector: AudioCollector,
- whisper: Whisper):
+ whisper: Whisper,
+ segmenter: AudioSegmenter):
self.collector = collector
self.whisper = whisper
- self.segmenter = AudioSegmenter()
+ self.segmenter = segmenter
def getDelta(self) -> TranscriptCommit:
audio = self.collector.getAudio()
@@ -515,13 +521,13 @@ def evaluate(cfg,
collector = AudioCollector(stream)
collector = CompressingAudioCollector(collector)
whisper = Whisper(collector, cfg)
- com = None
- com = VadCommitter(collector, whisper)
+ segmenter = AudioSegmenter(min_silence_ms=500)
+ committer = VadCommitter(collector, whisper, segmenter)
transcript = ""
commits = []
while len(stream.frames) > 0:
- commit = com.getDelta()
+ commit = committer.getDelta()
if len(stream.frames) == 0:
commit.delta = commit.preview
@@ -614,10 +620,10 @@ def transcriptionThread(ctrl: ThreadControl):
commit = ctrl.committer.getDelta()
- if False:
- print(f"Transcript: {ctrl.transcript}{commit.delta}{commit.preview}")
+ if True:
+ print(f"Transcript: {ctrl.transcript}{commit.delta}[{commit.preview}]")
- if len(commit.delta):
+ if False and len(commit.delta):
print(f"Transcript: {ctrl.transcript}{commit.delta}{commit.preview}")
if cfg["enable_debug_mode"]:
print(f"commit latency: {commit.latency_s}", file=sys.stderr)
@@ -678,7 +684,7 @@ def vrInputThread(ctrl: ThreadControl):
elif now - last_rising > 0.5:
# Medium press
- print("CLEARING")
+ print("CLEARING", file=sys.stderr)
last_medium_press_end = now
state = PAUSE_STATE
@@ -697,7 +703,7 @@ def vrInputThread(ctrl: ThreadControl):
else:
# Short hold
if state == RECORD_STATE:
- print("PAUSED")
+ print("PAUSED", file=sys.stderr)
state = PAUSE_STATE
if not ctrl.cfg["use_builtin"]:
ctrl.pager.lockWorld(True)
@@ -734,23 +740,92 @@ def vrInputThread(ctrl: ThreadControl):
pass
def kbInputThread(ctrl: ThreadControl):
- while ctrl.run_app:
+ machine = keybind_event_machine.KeybindEventMachine(ctrl.cfg["keybind"])
+ last_press_time = 0
+
+ # double pressing the keybind
+ double_press_timeout = 0.5
+
+ RECORD_STATE = 0
+ PAUSE_STATE = 1
+ state = PAUSE_STATE
+
+ while ctrl.run_app == True:
time.sleep(0.01)
+ cur_press_time = machine.getNextPressTime()
+ if cur_press_time == 0:
+ continue
+
+ EVENT_SINGLE_PRESS = 0
+ EVENT_DOUBLE_PRESS = 1
+ if last_press_time == 0:
+ event = EVENT_SINGLE_PRESS
+ elif cur_press_time - last_press_time < double_press_timeout:
+ event = EVENT_DOUBLE_PRESS
+ else:
+ event = EVENT_SINGLE_PRESS
+ last_press_time = cur_press_time
+
+ if event == EVENT_DOUBLE_PRESS:
+ print("CLEARING", file=sys.stderr)
+ state = PAUSE_STATE
+
+ if ctrl.cfg["enable_local_beep"]:
+ #audio_state.audio_events.append(audio_state.AUDIO_EVENT_DISMISS)
+ pass
+
+ if not ctrl.cfg["use_builtin"]:
+ ctrl.pager.toggleBoard(False)
+
+ # Flush the *entire* pipeline.
+ ctrl.stream.pause(True)
+ ctrl.stream.getSamples()
+ ctrl.collector.dropAudio()
+ ctrl.pager.clear()
+ continue
+
+ # Short hold
+ if state == RECORD_STATE:
+ print("PAUSED", file=sys.stderr)
+ state = PAUSE_STATE
+ if not ctrl.cfg["use_builtin"]:
+ ctrl.pager.lockWorld(True)
+
+ ctrl.stream.pause(True)
+
+ if ctrl.cfg["enable_local_beep"]:
+ #audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF)
+ pass
+ elif state == PAUSE_STATE:
+ print("RECORDING", file=sys.stderr)
+ state = RECORD_STATE
+ if not ctrl.cfg["use_builtin"]:
+ ctrl.pager.toggleBoard(True)
+ ctrl.pager.lockWorld(False)
+ ctrl.pager.ellipsis(True)
+ if ctrl.cfg["reset_on_toggle"]:
+ if ctrl.cfg["enable_debug_mode"]:
+ print("Toggle detected, dropping transcript (2)")
+ ctrl.transcript = ""
+ ctrl.preview = ""
+ else:
+ if ctrl.cfg["enable_debug_mode"]:
+ print("Toggle detected, committing preview text (2)")
+ #audio_state.text += audio_state.preview_text
+
+ ctrl.stream.pause(False)
+ ctrl.pager.clear()
+
+ if ctrl.cfg["enable_local_beep"]:
+ #audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_ON)
+ pass
+
def oscThread(ctrl: ThreadControl):
while ctrl.run_app:
ctrl.pager.page(ctrl.preview)
time.sleep(0.01)
-def dev_run(cfg):
- stream = MicStream(cfg["microphone"])
- collector = AudioCollector(stream)
- segmenter = AudioSegmenter()
- while True:
- audio = collector.getAudio()
- cutoff = segmenter.getStableCutoff(audio)
- print(f"audio cutoff: {cutoff}")
-
def run(cfg):
stream = MicStream(cfg["microphone"])
@@ -759,7 +834,8 @@ def run(cfg):
#collector = NormalizingAudioCollector(collector)
collector = CompressingAudioCollector(collector)
whisper = Whisper(collector, cfg)
- committer = VadCommitter(collector, whisper)
+ segmenter = AudioSegmenter(min_silence_ms=500)
+ committer = VadCommitter(collector, whisper, segmenter)
pager = OscPager(cfg)
ctrl = ThreadControl(cfg)
@@ -813,7 +889,7 @@ if __name__ == "__main__":
"Evaluate/vei/control.txt"),
]
- if False:
+ if True:
sum = 0
for audio, control in experiments:
sum += evaluate(cfg, audio, control)
@@ -821,5 +897,4 @@ if __name__ == "__main__":
else:
#optimize(cfg, experiments)
run(cfg)
- #dev_run(cfg)