From 78de7f02bc364606b0939d66903f02b2f91c141b Mon Sep 17 00:00:00 2001 From: yum Date: Tue, 23 May 2023 15:12:04 -0700 Subject: Add ability to type using STT To use it, do a medium hold + long hold. Keep the long hold depressed until you're done speaking. The transcription will be typed into the currently selected input field. * Add more audio feedback * Make audio feedback play asynchronously so it doesn't slow down the controller input state machine as much. --- README.md | 6 +++-- Scripts/transcribe.py | 53 +++++++++++++++++++++++++++++++---------- Sounds/Dismiss_Noise.wav | Bin 0 -> 192078 bytes Sounds/Dismiss_Noise_Quiet.wav | Bin 0 -> 192078 bytes Sounds/KB_Noise_Off.wav | Bin 0 -> 192078 bytes Sounds/KB_Noise_Off_Quiet.wav | Bin 0 -> 192078 bytes Sounds/KB_Noise_On.wav | Bin 0 -> 266318 bytes Sounds/KB_Noise_On_Quiet.wav | Bin 0 -> 266318 bytes 8 files changed, 45 insertions(+), 14 deletions(-) create mode 100644 Sounds/Dismiss_Noise.wav create mode 100644 Sounds/Dismiss_Noise_Quiet.wav create mode 100644 Sounds/KB_Noise_Off.wav create mode 100644 Sounds/KB_Noise_Off_Quiet.wav create mode 100644 Sounds/KB_Noise_On.wav create mode 100644 Sounds/KB_Noise_On_Quiet.wav diff --git a/README.md b/README.md index 057acb4..6b75998 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,10 @@ To build your own package from source, see GUI/README.md. Basic controls: * Short click to toggle transcription. -* Long click to hide the text box. -* Scale it up/down in the radial menu. +* Medium click to hide the text box. +* Hold to update text box without unlocking from worldspace. +* Medium click + hold to type using STT. +* Scale up/down in the radial menu. ## Design philosophy diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 422e9c0..b4b7198 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -8,13 +8,14 @@ from playsound import playsound import argparse import copy -import os -import osc_ctrl import generate_utils import keybind_event_machine +import keyboard import langcodes -import pyaudio import numpy as np +import os +import osc_ctrl +import pyaudio import steamvr import string_matcher import sys @@ -334,7 +335,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, audio_state.audio_paused = True if enable_local_beep == 1: - playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav")) + playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"), + block=False) elif state == PAUSE_STATE: state = RECORD_STATE if not use_builtin: @@ -348,7 +350,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, audio_state.audio_paused = False if enable_local_beep == 1: - playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav")) + playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"), + block=False) def readControllerInput(audio_state, enable_local_beep: bool, use_builtin: bool, button: str): @@ -371,7 +374,14 @@ def readControllerInput(audio_state, enable_local_beep: bool, hand_id = steamvr.hands[button.split()[0]] button_id = steamvr.buttons[button.split()[1]] + # Rough description of state machine: + # Single short press: toggle transcription + # Medium press: dismiss custom chatbox + # Long press: update chatbox in place + # Medium press + long press: type transcription + last_rising = time.time() + last_medium_press_end = 0 while audio_state.run_app == True: time.sleep(0.05) @@ -390,7 +400,7 @@ def readControllerInput(audio_state, enable_local_beep: bool, elif event == steamvr.EVENT_FALLING_EDGE: now = time.time() if now - last_rising > 1.5: - # Very long hold: treat as the end of transcription. + # Long press: treat as the end of transcription. state = PAUSE_STATE if not use_builtin: osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) @@ -398,13 +408,29 @@ def readControllerInput(audio_state, enable_local_beep: bool, audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s audio_state.audio_paused = True + if last_rising - last_medium_press_end < 1.0: + # Type transcription + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/KB_Noise_Off_Quiet.wav"), + block=False) + keyboard.write(audio_state.text) + else: + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"), + block=False) + elif now - last_rising > 0.5: - # Long hold + # Medium press + last_medium_press_end = now state = PAUSE_STATE + + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/Dismiss_Noise_Quiet.wav"), + block=False) + if not use_builtin: osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) osc_ctrl.toggleBoard(audio_state.osc_state.client, False) - #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav")) resetAudioLocked(audio_state) resetDisplayLocked(audio_state) @@ -422,17 +448,20 @@ def readControllerInput(audio_state, enable_local_beep: bool, audio_state.audio_paused = True if enable_local_beep == 1: - playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav")) + playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"), + block=False) elif state == PAUSE_STATE: state = RECORD_STATE + + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"), + block=False) + if not use_builtin: osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) osc_ctrl.toggleBoard(audio_state.osc_state.client, True) osc_ctrl.lockWorld(audio_state.osc_state.client, False) - if enable_local_beep == 1: - playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav")) - # model should correspond to one of the Whisper models defined in # whisper/__init__.py. Examples: tiny, base, small, medium. def transcribeLoop(mic: str, language: str, model: str, diff --git a/Sounds/Dismiss_Noise.wav b/Sounds/Dismiss_Noise.wav new file mode 100644 index 0000000..fe60f21 Binary files /dev/null and b/Sounds/Dismiss_Noise.wav differ diff --git a/Sounds/Dismiss_Noise_Quiet.wav b/Sounds/Dismiss_Noise_Quiet.wav new file mode 100644 index 0000000..5c3b1cb Binary files /dev/null and b/Sounds/Dismiss_Noise_Quiet.wav differ diff --git a/Sounds/KB_Noise_Off.wav b/Sounds/KB_Noise_Off.wav new file mode 100644 index 0000000..64d9c6f Binary files /dev/null and b/Sounds/KB_Noise_Off.wav differ diff --git a/Sounds/KB_Noise_Off_Quiet.wav b/Sounds/KB_Noise_Off_Quiet.wav new file mode 100644 index 0000000..b965e6a Binary files /dev/null and b/Sounds/KB_Noise_Off_Quiet.wav differ diff --git a/Sounds/KB_Noise_On.wav b/Sounds/KB_Noise_On.wav new file mode 100644 index 0000000..a959041 Binary files /dev/null and b/Sounds/KB_Noise_On.wav differ diff --git a/Sounds/KB_Noise_On_Quiet.wav b/Sounds/KB_Noise_On_Quiet.wav new file mode 100644 index 0000000..e49513e Binary files /dev/null and b/Sounds/KB_Noise_On_Quiet.wav differ -- cgit v1.2.3