From 1c056bf385d2c48f6e4f77da513060c04415252c Mon Sep 17 00:00:00 2001 From: yum Date: Sun, 22 Jan 2023 15:05:54 -0800 Subject: Enable using built-in chatbox VRChat exposes a built-in chatbox which can be seen by anyone who has it enabled. This was not the case when I started this project: the chatbox would only be visible to friends. Since this is clearly useful, enabling the STT on public models, let's enable sending data to it. Caveats: * The built-in chatbox has anti-spam tech which limits us to updating about once every 2 seconds. The custom chatbox has no such limitation and is thus typically much faster. --- GUI/GUI/GUI/Config.cpp | 8 ++++--- GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 13 +++++++++++ GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.cpp | 1 + Scripts/osc_ctrl.py | 18 ++++++++++++++++ Scripts/transcribe.py | 50 +++++++++++++++++++++++++++---------------- 7 files changed, 71 insertions(+), 21 deletions(-) diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 9ce498a..4e6eb48 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -77,7 +77,8 @@ TranscriptionAppConfig::TranscriptionAppConfig() cols("48"), window_duration("15"), enable_local_beep(true), - use_cpu(false) + use_cpu(false), + use_builtin(false) {} bool TranscriptionAppConfig::Serialize(const std::filesystem::path& path) { @@ -94,8 +95,8 @@ bool TranscriptionAppConfig::Serialize(const std::filesystem::path& path) { root["window_duration"] << ryml::to_substr(window_duration); root["enable_local_beep"] << enable_local_beep; root["use_cpu"] << use_cpu; - - return Config::Serialize(path, &t); + root["use_builtin"] << use_builtin; + return Config::Serialize(path, &t); } bool TranscriptionAppConfig::Deserialize(const std::filesystem::path& path) { @@ -123,6 +124,7 @@ bool TranscriptionAppConfig::Deserialize(const std::filesystem::path& path) { root.get_if("window_duration", &c.window_duration); root.get_if("enable_local_beep", &c.enable_local_beep); 
root.get_if("use_cpu", &c.use_cpu); + root.get_if("use_builtin", &c.use_builtin); *this = std::move(c); return true; diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index 985380b..fe7b862 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -46,6 +46,7 @@ public: std::string window_duration; bool enable_local_beep; bool use_cpu; + bool use_builtin; }; // Represents the configurable fields for the Unity app. diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index f26cbec..f9c7998 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -35,6 +35,7 @@ namespace { ID_PY_APP_MODEL_PANEL, ID_PY_APP_ENABLE_LOCAL_BEEP, ID_PY_APP_USE_CPU, + ID_PY_APP_USE_BUILTIN, ID_PY_APP_ROWS, ID_PY_APP_COLS, ID_PY_APP_WINDOW_DURATION, @@ -433,6 +434,15 @@ Frame::Frame() ); py_app_use_cpu_ = py_app_use_cpu; + auto* py_app_use_builtin = new wxCheckBox(py_config_panel, + ID_PY_APP_USE_BUILTIN, "Use built-in chatbox"); + py_app_use_builtin->SetValue(py_c.use_builtin); + py_app_use_builtin->SetToolTip( + "If checked, text will be sent to the built-in text box " + "instead of one attached to the current avatar." 
+ ); + py_app_use_builtin_ = py_app_use_builtin; + auto* py_app_start_button = new wxButton(py_config_panel, ID_PY_APP_START_BUTTON, "Begin transcribing"); auto* py_app_stop_button = new wxButton(py_config_panel, ID_PY_APP_STOP_BUTTON, "Stop transcribing"); @@ -443,6 +453,7 @@ Frame::Frame() sizer->Add(py_app_config_panel_pairs, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_enable_local_beep, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_use_cpu, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(py_app_use_builtin, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_start_button, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_stop_button, /*proportion=*/0, /*flags=*/wxEXPAND); } @@ -935,6 +946,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { } const bool enable_local_beep = py_app_enable_local_beep_->GetValue(); const bool use_cpu = py_app_use_cpu_->GetValue(); + const bool use_builtin = py_app_use_builtin_->GetValue(); std::string rows_str = py_app_rows_->GetValue().ToStdString(); std::string cols_str = py_app_cols_->GetValue().ToStdString(); std::string window_duration_str = py_app_window_duration_->GetValue().ToStdString(); @@ -978,6 +990,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { py_c.window_duration = std::to_string(window_duration); py_c.enable_local_beep = enable_local_beep; py_c.use_cpu = use_cpu; + py_c.use_builtin = use_builtin; py_c.Serialize(TranscriptionAppConfig::kConfigPath); wxProcess* p = PythonWrapper::StartApp(std::move(cb), py_c); diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index cd62127..621715b 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -51,6 +51,7 @@ private: wxCheckBox* py_app_enable_local_beep_; wxCheckBox* py_app_use_cpu_; + wxCheckBox* py_app_use_builtin_; wxProcess* py_app_; wxTimer py_app_drain_; diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index 5581739..f26072a 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ 
b/GUI/GUI/GUI/PythonWrapper.cpp @@ -159,6 +159,7 @@ wxProcess* PythonWrapper::StartApp( "--cols", config.cols, "--window_duration_s", config.window_duration, "--cpu", config.use_cpu ? "1" : "0", + "--use_builtin", config.use_builtin ? "1" : "0", }, std::move(exit_callback)); } diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py index 7c7d0ae..e57a843 100644 --- a/Scripts/osc_ctrl.py +++ b/Scripts/osc_ctrl.py @@ -30,6 +30,7 @@ class OscState: self.client = getClient(ip, port) self.pager = MultiLinePager(chars_per_sync, rows, cols) self.encoding= generateEncoding() + self.builtin_msg = "" # The last message sent to the built-in chatbox def reset(self): self.pager.reset() @@ -137,6 +138,23 @@ def pageMessage(osc_state: OscState, msg: str) -> bool: addr="/avatar/parameters/" + generate_utils.getSpeechNoiseToggleParam() osc_state.client.send_message(addr, False) +# Like `pageMessage` but uses the built-in chatbox. The built-in chatbox +# truncates data at about 150 chars, so just send the suffix of the message for +# now. 
+def pageMessageBuiltin(osc_state: OscState, msg: str) -> bool: + msg_begin = max(len(msg) - 140, 0) + msg_suffix = msg[msg_begin:len(msg)] + + if osc_state.builtin_msg != msg: + addr="/chatbox/typing" + osc_state.client.send_message(addr, False) + + addr="/chatbox/input" + osc_state.client.send_message(addr, (msg_suffix, True)) + osc_state.builtin_msg = msg + + return False # Not paging + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-i", default="127.0.0.1", help="OSC server IP") diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 7f07efe..1237334 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -303,17 +303,21 @@ def transcribeAudio(audio_state, model, use_cpu: bool): audio_state.transcribe_no_change_count = 0 audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s -def sendAudio(audio_state): +def sendAudio(audio_state, use_builtin: bool): while audio_state.run_app == True: text = audio_state.committed_text + " " + audio_state.text - ret = osc_ctrl.pageMessage(audio_state.osc_state, text) - is_paging = (ret == False) - osc_ctrl.indicatePaging(audio_state.osc_state.client, is_paging) + if use_builtin: + ret = osc_ctrl.pageMessageBuiltin(audio_state.osc_state, text) + time.sleep(1.5) + else: + ret = osc_ctrl.pageMessage(audio_state.osc_state, text) + is_paging = (ret == False) + osc_ctrl.indicatePaging(audio_state.osc_state.client, is_paging) - # Pace this out - time.sleep(0.01) + # Pace this out + time.sleep(0.01) -def readControllerInput(audio_state, enable_local_beep): +def readControllerInput(audio_state, enable_local_beep, use_builtin): session = None first = True while session == None and audio_state.run_app == True: @@ -343,8 +347,9 @@ def readControllerInput(audio_state, enable_local_beep): if now - last_rising > 0.5: # Long hold state = PAUSE_STATE - osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) - 
osc_ctrl.toggleBoard(audio_state.osc_state.client, False) + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.toggleBoard(audio_state.osc_state.client, False) #playsound(os.path.abspath("../Sounds/Noise_Off.wav")) resetAudioLocked(audio_state) @@ -355,8 +360,9 @@ def readControllerInput(audio_state, enable_local_beep): # Short hold if state == RECORD_STATE: state = PAUSE_STATE - osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) - osc_ctrl.lockWorld(audio_state.osc_state.client, True) + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.lockWorld(audio_state.osc_state.client, True) audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s audio_state.audio_paused = True @@ -365,9 +371,10 @@ def readControllerInput(audio_state, enable_local_beep): playsound(os.path.abspath("../Sounds/Noise_Off.wav")) elif state == PAUSE_STATE: state = RECORD_STATE - osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) - osc_ctrl.toggleBoard(audio_state.osc_state.client, True) - osc_ctrl.lockWorld(audio_state.osc_state.client, False) + if not use_builtin: + osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) + osc_ctrl.toggleBoard(audio_state.osc_state.client, True) + osc_ctrl.lockWorld(audio_state.osc_state.client, False) resetAudioLocked(audio_state) resetDisplayLocked(audio_state) @@ -379,7 +386,8 @@ def readControllerInput(audio_state, enable_local_beep): # model should correspond to one of the Whisper models defined in # whisper/__init__.py. Examples: tiny, base, small, medium. 
-def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, use_cpu: bool): +def transcribeLoop(mic: str, language: str, model: str, + enable_local_beep: bool, use_cpu: bool, use_builtin: bool): audio_state = getMicStream(mic) audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language] @@ -396,11 +404,11 @@ def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, transcribe_audio_thd.daemon = True transcribe_audio_thd.start() - send_audio_thd = threading.Thread(target = sendAudio, args = [audio_state]) + send_audio_thd = threading.Thread(target = sendAudio, args = [audio_state, use_builtin]) send_audio_thd.daemon = True send_audio_thd.start() - controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep]) + controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep, use_builtin]) controller_input_thd.daemon = True controller_input_thd.start() @@ -443,6 +451,7 @@ if __name__ == "__main__": parser.add_argument("--cols", type=int, help="The number of columns on the board") parser.add_argument("--window_duration_s", type=int, help="The length in seconds of the audio recording handed to the transcription algorithm") parser.add_argument("--cpu", type=int, help="If set to 1, use CPU instead of GPU") + parser.add_argument("--use_builtin", type=int, help="If set to 1, use the text box built into the game.") args = parser.parse_args() if not args.mic: @@ -470,11 +479,16 @@ if __name__ == "__main__": else: args.cpu = False + if args.use_builtin == 1: + args.use_builtin = True + else: + args.use_builtin = False + generate_utils.config.BYTES_PER_CHAR = int(args.bytes_per_char) generate_utils.config.CHARS_PER_SYNC = int(args.chars_per_sync) generate_utils.config.BOARD_ROWS = int(args.rows) generate_utils.config.BOARD_COLS = int(args.cols) transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep, - args.cpu) + 
args.cpu, args.use_builtin) -- cgit v1.2.3