summaryrefslogtreecommitdiffstats
path: root/Scripts
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-05-22 03:59:45 -0700
committer yum <yum.food.vr@gmail.com> 2023-05-22 04:04:09 -0700
commit 8fafea9d026b2b65599456e70d3f5aa61ef073d1 (patch)
tree e0a759534c0042797e07ae8b18e2841c64c303c9 /Scripts
parent b2626a798768066e9611a8ec9c103b003a6debb8 (diff)
Add keyboard toggle (v0.11.4)
Users can now configure a keybind to start/stop/dismiss the STT when in desktop mode. The default keybind is ctrl+x, since by default VRC doesn't use 'x' for anything.
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/keybind_event_machine.py 21
-rw-r--r-- Scripts/requirements.txt 1
-rw-r--r-- Scripts/transcribe.py 85
3 files changed, 104 insertions, 3 deletions
diff --git a/Scripts/keybind_event_machine.py b/Scripts/keybind_event_machine.py
new file mode 100644
index 0000000..3ce6794
--- /dev/null
+++ b/Scripts/keybind_event_machine.py
@@ -0,0 +1,21 @@
+import keyboard
+import time
+
class KeybindEventMachine:
    """FIFO queue of timestamps, one per press of a keyboard hotkey.

    `onPress` is registered with the `keyboard` library as the hotkey
    callback and enqueues a timestamp; a consumer polls `getNextPressTime`
    to drain the presses oldest-first.
    """

    def __init__(self, keybind: str):
        """Register `keybind` (e.g. "ctrl+x") so each press is enqueued."""
        self.keybind = keybind
        # Pending press timestamps (seconds since the epoch), oldest first.
        self.events = []
        keyboard.add_hotkey(keybind, self.onPress)

    def onPress(self) -> None:
        """Hotkey callback: record the current wall-clock time."""
        self.events.append(time.time())

    # Returns the timestamp when the keybind was pressed, or 0 if no keypresses
    # are queued.
    def getNextPressTime(self) -> float:
        """Pop and return the oldest queued press time, or 0.0 if none."""
        # Remove the head with a single in-place list operation. Slicing and
        # rebinding (`self.events = self.events[1:]`) would drop any press
        # that the hotkey callback appends between the slice and the
        # assignment, because that append lands on the discarded list.
        # NOTE(review): the callback presumably runs on the keyboard
        # library's listener thread - confirm against its documentation.
        try:
            return self.events.pop(0)
        except IndexError:
            return 0.0
+
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index c3b91d8..c887808 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -1,6 +1,7 @@
editdistance
faster-whisper@https://github.com/guillaumekln/faster-whisper/archive/358d373691c95205021bd4bbf28cde7ce4d10030.tar.gz
future==0.18.2
+keyboard
langcodes
language-data
openvr
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index c066957..366f3a2 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -11,6 +11,7 @@ import copy
import os
import osc_ctrl
import generate_utils
+import keybind_event_machine
import langcodes
import pyaudio
import numpy as np
@@ -280,6 +281,75 @@ def sendAudio(audio_state, use_builtin: bool, estate: EmotesState):
# Pace this out
time.sleep(0.01)
+# Polls a keyboard hotkey and drives the record/pause state of `audio_state`.
+#
+# Loops while `audio_state.run_app == True`, sleeping 50 ms per iteration and
+# taking at most one queued press from a KeybindEventMachine each time.
+#  - Single press: toggles between the paused and recording states.
+#  - Double press (a press less than `double_press_timeout` seconds after the
+#    previous one): forces the paused state, resets audio and display state
+#    and marks the current transcription to be dropped. The first press of
+#    the pair has already been handled as a single press by then.
+# OSC messages are only sent when `use_builtin` is falsy, and the local beep
+# is only played when `enable_local_beep == 1`.
+# NOTE(review): indentation is not visible in this view, so the exact extent
+# of each `if not use_builtin:` body must be confirmed against the real file.
+# NOTE(review): the *Locked helpers are called without this function taking
+# any lock - confirm whether a lock is expected to be held here.
+def readKeyboardInput(audio_state, enable_local_beep: bool,
+ use_builtin: bool, keybind: str):
+ machine = keybind_event_machine.KeybindEventMachine(keybind)
+ # 0 means "no press seen yet"; getNextPressTime() also uses 0 for "empty".
+ last_press_time = 0
+
+ # Maximum gap, in seconds, between two presses of the keybind for the second
+ # one to count as a double press.
+ double_press_timeout = 0.25
+
+ RECORD_STATE = 0
+ PAUSE_STATE = 1
+ # Start paused: nothing is recorded until the first press.
+ state = PAUSE_STATE
+
+ while audio_state.run_app == True:
+ time.sleep(0.05)
+
+ cur_press_time = machine.getNextPressTime()
+ # No press queued; poll again.
+ if cur_press_time == 0:
+ continue
+
+ # Classify this press by its distance from the previous one.
+ EVENT_SINGLE_PRESS = 0
+ EVENT_DOUBLE_PRESS = 1
+ if last_press_time == 0:
+ event = EVENT_SINGLE_PRESS
+ elif cur_press_time - last_press_time < double_press_timeout:
+ event = EVENT_DOUBLE_PRESS
+ else:
+ event = EVENT_SINGLE_PRESS
+ last_press_time = cur_press_time
+
+ # Double press: stop and discard the current message.
+ if event == EVENT_DOUBLE_PRESS:
+ state = PAUSE_STATE
+ if not use_builtin:
+ osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
+ osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
+ #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
+
+ resetAudioLocked(audio_state)
+ resetDisplayLocked(audio_state)
+ audio_state.drop_transcription = True
+ audio_state.audio_paused = True
+ continue
+
+ # Single press: toggle between recording and paused.
+ if state == RECORD_STATE:
+ # Recording -> paused.
+ state = PAUSE_STATE
+ if not use_builtin:
+ osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
+ osc_ctrl.lockWorld(audio_state.osc_state.client, True)
+ audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+
+ audio_state.audio_paused = True
+
+ # The sound path is relative, so abspath resolves it against the current
+ # working directory.
+ if enable_local_beep == 1:
+ playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"))
+ elif state == PAUSE_STATE:
+ # Paused -> recording: start a fresh message.
+ state = RECORD_STATE
+ if not use_builtin:
+ osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
+ osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
+ osc_ctrl.lockWorld(audio_state.osc_state.client, False)
+ resetAudioLocked(audio_state)
+ resetDisplayLocked(audio_state)
+
+ audio_state.drop_transcription = True
+ audio_state.audio_paused = False
+
+ if enable_local_beep == 1:
+ playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"))
+
def readControllerInput(audio_state, enable_local_beep: bool,
use_builtin: bool, button: str):
session = None
@@ -357,7 +427,8 @@ def readControllerInput(audio_state, enable_local_beep: bool,
def transcribeLoop(mic: str, language: str, model: str,
enable_local_beep: bool, use_cpu: bool, use_builtin: bool,
button: str, estate: EmotesState,
- window_duration_s: int, gpu_idx: int):
+ window_duration_s: int, gpu_idx: int,
+ keyboard_hotkey: str):
audio_state = getMicStream(mic)
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
@@ -391,10 +462,16 @@ def transcribeLoop(mic: str, language: str, model: str,
send_audio_thd.daemon = True
send_audio_thd.start()
- controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep, use_builtin, button])
+ controller_input_thd = threading.Thread(target = readControllerInput, args
+ = [audio_state, enable_local_beep, use_builtin, button])
controller_input_thd.daemon = True
controller_input_thd.start()
+ keyboard_input_thd = threading.Thread(target = readKeyboardInput, args
+ = [audio_state, enable_local_beep, use_builtin, keyboard_hotkey])
+ keyboard_input_thd.daemon = True
+ keyboard_input_thd.start()
+
print("Press enter to start a new message.")
for line in sys.stdin:
audio_state.transcribe_lock.acquire()
@@ -412,6 +489,7 @@ def transcribeLoop(mic: str, language: str, model: str,
audio_state.run_app = False
transcribe_audio_thd.join()
controller_input_thd.join()
+ keyboard_input_thd.join()
if __name__ == "__main__":
sys.stdout.reconfigure(encoding="utf-8")
@@ -443,6 +521,7 @@ if __name__ == "__main__":
parser.add_argument("--button", type=str, help="The controller button used to start/stop transcription. E.g. \"left joystick\"")
parser.add_argument("--emotes_pickle", type=str, help="The path to emotes pickle. See emotes_v2.py for details.")
parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
+ parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
args = parser.parse_args()
if not args.mic:
@@ -501,5 +580,5 @@ if __name__ == "__main__":
transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep,
args.cpu, args.use_builtin, args.button, estate, window_duration_s,
- args.gpu_idx)
+ args.gpu_idx, args.keybind)