From 8fafea9d026b2b65599456e70d3f5aa61ef073d1 Mon Sep 17 00:00:00 2001 From: yum Date: Mon, 22 May 2023 03:59:45 -0700 Subject: Add keyboard toggle Users can now configure a keybind to start/stop/dismiss the STT when in desktop mode. The default keybind is ctrl+x, since by default VRC doesn't use 'x' for anything. --- GUI/GUI/GUI/Config.cpp | 3 ++ GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 19 +++++++++ GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.cpp | 1 + Scripts/keybind_event_machine.py | 21 ++++++++++ Scripts/requirements.txt | 1 + Scripts/transcribe.py | 85 ++++++++++++++++++++++++++++++++++++++-- 8 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 Scripts/keybind_event_machine.py diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 02646ab..73b28bc 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -72,6 +72,7 @@ AppConfig::AppConfig(wxTextCtrl* out) use_cpu(false), use_builtin(false), gpu_idx(0), + keybind("ctrl+x"), chars_per_sync(8), bytes_per_char(1), @@ -115,6 +116,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("use_cpu", use_cpu); cm.Set("use_builtin", use_builtin); cm.Set("gpu_idx", gpu_idx); + cm.Set("keybind", keybind); cm.Set("chars_per_sync", chars_per_sync); cm.Set("bytes_per_char", bytes_per_char); @@ -171,6 +173,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("use_cpu", c.use_cpu); cm.Get("use_builtin", c.use_builtin); cm.Get("gpu_idx", c.gpu_idx); + cm.Get("keybind", c.keybind); cm.Get("chars_per_sync", c.chars_per_sync); cm.Get("bytes_per_char", c.bytes_per_char); diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index d86c8d8..f53b700 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -58,6 +58,7 @@ public: bool use_cpu; bool use_builtin; int gpu_idx; + std::string keybind; // Unity and transcription shared settings. int chars_per_sync; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index ae07ad9..3bf1545 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -43,6 +43,7 @@ namespace { ID_PY_APP_COLS, ID_PY_APP_WINDOW_DURATION, ID_PY_APP_GPU_IDX, + ID_PY_APP_KEYBIND, ID_UNITY_PANEL, ID_UNITY_CONFIG_PANEL, ID_UNITY_OUT, @@ -537,6 +538,16 @@ Frame::Frame() "discrete GPU."); py_app_gpu_idx_ = py_app_gpu_idx; + auto* py_app_keybind = new wxTextCtrl( + py_app_config_panel_pairs, ID_PY_APP_KEYBIND, + app_c_->keybind, wxDefaultPosition, + wxDefaultSize, /*style=*/0); + py_app_keybind->SetToolTip( + "The keybind to use to toggle the STT when in desktop " + "mode. To dismiss the STT, double press the keybind " + "quickly."); + py_app_keybind_ = py_app_keybind; + auto* sizer = new wxFlexGridSizer(/*cols=*/2); py_app_config_panel_pairs->SetSizer(sizer); @@ -570,6 +581,11 @@ Frame::Frame() sizer->Add(py_app_button, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(new wxStaticText(py_app_config_panel_pairs, + wxID_ANY, /*label=*/"Desktop keybind:")); + sizer->Add(py_app_keybind, /*proportion=*/0, + /*flags=*/wxEXPAND); + sizer->Add(new wxStaticText(py_app_config_panel_pairs, wxID_ANY, /*label=*/"Text box rows:")); sizer->Add(py_app_rows, /*proportion=*/0, @@ -2118,6 +2134,8 @@ void Frame::OnAppStart(wxCommandEvent& event) { py_app_window_duration_->GetValue().ToStdString(); std::string gpu_idx_str = py_app_gpu_idx_->GetValue().ToStdString(); + std::string keybind = + py_app_keybind_->GetValue().ToStdString(); int rows, cols, chars_per_sync, bytes_per_char, window_duration, gpu_idx; try { rows = std::stoi(rows_str); @@ -2175,6 +2193,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->use_cpu = use_cpu; app_c_->use_builtin = use_builtin; app_c_->gpu_idx = gpu_idx; + app_c_->keybind = keybind; app_c_->Serialize(AppConfig::kConfigPath); auto out_cb = [&](const std::string& out, const std::string& err) { diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 1252542..7e55347 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -41,6 +41,7 @@ private: wxTextCtrl* py_app_cols_; wxTextCtrl* py_app_window_duration_; wxTextCtrl* py_app_gpu_idx_; + wxTextCtrl* py_app_keybind_; wxTextCtrl* unity_rows_; wxTextCtrl* unity_cols_; wxTextCtrl* whisper_rows_; diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index c5f355a..c875874 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -473,6 +473,7 @@ std::future PythonWrapper::StartApp( "--use_builtin", config.use_builtin ? "1" : "0", "--emotes_pickle", kEmotesPickle, "--gpu_idx", std::to_string(config.gpu_idx), + "--keybind", Quote(config.keybind), }, std::move(out_cb), std::move(in_cb), diff --git a/Scripts/keybind_event_machine.py b/Scripts/keybind_event_machine.py new file mode 100644 index 0000000..3ce6794 --- /dev/null +++ b/Scripts/keybind_event_machine.py @@ -0,0 +1,21 @@ +import keyboard +import time + +class KeybindEventMachine: + def __init__(self, keybind: str): + self.keybind = keybind + self.events = [] + keyboard.add_hotkey(keybind, self.onPress) + + def onPress(self) -> None: + self.events.append(time.time()) + + # Returns the timestamp when the keybind was pressed, or 0 if no keypresses + # are queued. + def getNextPressTime(self) -> int: + if len(self.events) == 0: + return 0 + ret = self.events[0] + self.events = self.events[1:] + return ret + diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt index c3b91d8..c887808 100644 --- a/Scripts/requirements.txt +++ b/Scripts/requirements.txt @@ -1,6 +1,7 @@ editdistance faster-whisper@https://github.com/guillaumekln/faster-whisper/archive/358d373691c95205021bd4bbf28cde7ce4d10030.tar.gz future==0.18.2 +keyboard langcodes language-data openvr diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index c066957..366f3a2 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -11,6 +11,7 @@ import copy import os import osc_ctrl import generate_utils +import keybind_event_machine import langcodes import pyaudio import numpy as np @@ -280,6 +281,75 @@ def sendAudio(audio_state, use_builtin: bool, estate: EmotesState): # Pace this out time.sleep(0.01) +def readKeyboardInput(audio_state, enable_local_beep: bool, + use_builtin: bool, keybind: str): + machine = keybind_event_machine.KeybindEventMachine(keybind) + last_press_time = 0 + + # double pressing the keybind + double_press_timeout = 0.25 + + RECORD_STATE = 0 + PAUSE_STATE = 1 + state = PAUSE_STATE + + while audio_state.run_app == True: + time.sleep(0.05) + + cur_press_time = machine.getNextPressTime() + if cur_press_time == 0: + continue + + EVENT_SINGLE_PRESS = 0 + EVENT_DOUBLE_PRESS = 1 + if last_press_time == 0: + event = EVENT_SINGLE_PRESS + elif cur_press_time - last_press_time < double_press_timeout: + event = EVENT_DOUBLE_PRESS + else: + event = EVENT_SINGLE_PRESS + last_press_time = cur_press_time + + if event == EVENT_DOUBLE_PRESS: + state = PAUSE_STATE + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.toggleBoard(audio_state.osc_state.client, False) + #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav")) + + resetAudioLocked(audio_state) + resetDisplayLocked(audio_state) + audio_state.drop_transcription = True + audio_state.audio_paused = True + continue + + # Short hold + if state == RECORD_STATE: + state = PAUSE_STATE + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.lockWorld(audio_state.osc_state.client, True) + audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s + + audio_state.audio_paused = True + + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav")) + elif state == PAUSE_STATE: + state = RECORD_STATE + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) + osc_ctrl.toggleBoard(audio_state.osc_state.client, True) + osc_ctrl.lockWorld(audio_state.osc_state.client, False) + resetAudioLocked(audio_state) + resetDisplayLocked(audio_state) + + audio_state.drop_transcription = True + audio_state.audio_paused = False + + if enable_local_beep == 1: + playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav")) + def readControllerInput(audio_state, enable_local_beep: bool, use_builtin: bool, button: str): session = None @@ -357,7 +427,8 @@ def readControllerInput(audio_state, enable_local_beep: bool, def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, use_cpu: bool, use_builtin: bool, button: str, estate: EmotesState, - window_duration_s: int, gpu_idx: int): + window_duration_s: int, gpu_idx: int, + keyboard_hotkey: str): audio_state = getMicStream(mic) audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s @@ -391,10 +462,16 @@ def transcribeLoop(mic: str, language: str, model: str, send_audio_thd.daemon = True send_audio_thd.start() - controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep, use_builtin, button]) + controller_input_thd = threading.Thread(target = readControllerInput, args + = [audio_state, enable_local_beep, use_builtin, button]) controller_input_thd.daemon = True controller_input_thd.start() + keyboard_input_thd = threading.Thread(target = readKeyboardInput, args + = [audio_state, enable_local_beep, use_builtin, keyboard_hotkey]) + keyboard_input_thd.daemon = True + keyboard_input_thd.start() + print("Press enter to start a new message.") for line in sys.stdin: audio_state.transcribe_lock.acquire() @@ -412,6 +489,7 @@ def transcribeLoop(mic: str, language: str, model: str, audio_state.run_app = False transcribe_audio_thd.join() controller_input_thd.join() + keyboard_input_thd.join() if __name__ == "__main__": sys.stdout.reconfigure(encoding="utf-8") @@ -443,6 +521,7 @@ if __name__ == "__main__": parser.add_argument("--button", type=str, help="The controller button used to start/stop transcription. E.g. \"left joystick\"") parser.add_argument("--emotes_pickle", type=str, help="The path to emotes pickle. See emotes_v2.py for details.") parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") + parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") args = parser.parse_args() if not args.mic: @@ -501,5 +580,5 @@ if __name__ == "__main__": transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep, args.cpu, args.use_builtin, args.button, estate, window_duration_s, - args.gpu_idx) + args.gpu_idx, args.keybind) -- cgit v1.2.3