From 1c056bf385d2c48f6e4f77da513060c04415252c Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Sun, 22 Jan 2023 15:05:54 -0800
Subject: Enable using built-in chatbox

VRChat exposes a built-in chatbox which can be seen by anyone who has
it enabled. This was not the case when I started this project: the
chatbox would only be visible to friends. Since this is clearly useful
(it enables the STT on public models), let's enable sending data to it.

Caveats:

* The built-in chatbox has anti-spam tech which limits us to updating
  about once every 2 seconds. The custom chatbox has no such limitation
  and is thus typically much faster.
---
 GUI/GUI/GUI/Config.cpp        |  8 ++++---
 GUI/GUI/GUI/Config.h          |  1 +
 GUI/GUI/GUI/Frame.cpp         | 13 +++++++++++
 GUI/GUI/GUI/Frame.h           |  1 +
 GUI/GUI/GUI/PythonWrapper.cpp |  1 +
 Scripts/osc_ctrl.py           | 18 ++++++++++++++++
 Scripts/transcribe.py         | 50 +++++++++++++++++++++++++++----------------
 7 files changed, 71 insertions(+), 21 deletions(-)

diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 9ce498a..4e6eb48 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -77,7 +77,8 @@ TranscriptionAppConfig::TranscriptionAppConfig()
 	cols("48"),
 	window_duration("15"),
 	enable_local_beep(true),
-	use_cpu(false)
+	use_cpu(false),
+	use_builtin(false)
 {}
 
 bool TranscriptionAppConfig::Serialize(const std::filesystem::path& path) {
@@ -94,8 +95,8 @@ bool TranscriptionAppConfig::Serialize(const std::filesystem::path& path) {
 	root["window_duration"] << ryml::to_substr(window_duration);
 	root["enable_local_beep"] << enable_local_beep;
 	root["use_cpu"] << use_cpu;
-
-	return Config::Serialize(path, &t);
+	root["use_builtin"] << use_builtin;
+  return Config::Serialize(path, &t);
 }
 
 bool TranscriptionAppConfig::Deserialize(const std::filesystem::path& path) {
@@ -123,6 +124,7 @@ bool TranscriptionAppConfig::Deserialize(const std::filesystem::path& path) {
 	root.get_if("window_duration", &c.window_duration);
 	root.get_if("enable_local_beep", &c.enable_local_beep);
 	root.get_if("use_cpu", &c.use_cpu);
+	root.get_if("use_builtin", &c.use_builtin);
 
 	*this = std::move(c);
 	return true;
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index 985380b..fe7b862 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -46,6 +46,7 @@ public:
 	std::string window_duration;
 	bool enable_local_beep;
 	bool use_cpu;
+	bool use_builtin;
 };
 
 // Represents the configurable fields for the Unity app.
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index f26cbec..f9c7998 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -35,6 +35,7 @@ namespace {
         ID_PY_APP_MODEL_PANEL,
         ID_PY_APP_ENABLE_LOCAL_BEEP,
         ID_PY_APP_USE_CPU,
+        ID_PY_APP_USE_BUILTIN,
         ID_PY_APP_ROWS,
         ID_PY_APP_COLS,
         ID_PY_APP_WINDOW_DURATION,
@@ -433,6 +434,15 @@ Frame::Frame()
                 );
                 py_app_use_cpu_ = py_app_use_cpu;
 
+                // Uses its own control ID so it stays distinct from the use-CPU checkbox.
+                auto* py_app_use_builtin = new wxCheckBox(py_config_panel,
+                    ID_PY_APP_USE_BUILTIN, "Use built-in chatbox");
+                py_app_use_builtin->SetValue(py_c.use_builtin);
+                py_app_use_builtin->SetToolTip(
+                    "If checked, text will be sent to the built-in text box "
+                    "instead of one attached to the current avatar.");
+                py_app_use_builtin_ = py_app_use_builtin;
+
                 auto* py_app_start_button = new wxButton(py_config_panel, ID_PY_APP_START_BUTTON, "Begin transcribing");
                 auto* py_app_stop_button = new wxButton(py_config_panel, ID_PY_APP_STOP_BUTTON, "Stop transcribing");
 
@@ -443,6 +453,7 @@ Frame::Frame()
                 sizer->Add(py_app_config_panel_pairs, /*proportion=*/0, /*flags=*/wxEXPAND);
                 sizer->Add(py_app_enable_local_beep, /*proportion=*/0, /*flags=*/wxEXPAND);
                 sizer->Add(py_app_use_cpu, /*proportion=*/0, /*flags=*/wxEXPAND);
+                sizer->Add(py_app_use_builtin, /*proportion=*/0, /*flags=*/wxEXPAND);
                 sizer->Add(py_app_start_button, /*proportion=*/0, /*flags=*/wxEXPAND);
                 sizer->Add(py_app_stop_button, /*proportion=*/0, /*flags=*/wxEXPAND);
             }
@@ -935,6 +946,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     }
     const bool enable_local_beep = py_app_enable_local_beep_->GetValue();
     const bool use_cpu = py_app_use_cpu_->GetValue();
+    const bool use_builtin = py_app_use_builtin_->GetValue();
     std::string rows_str = py_app_rows_->GetValue().ToStdString();
     std::string cols_str = py_app_cols_->GetValue().ToStdString();
     std::string window_duration_str = py_app_window_duration_->GetValue().ToStdString();
@@ -978,6 +990,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     py_c.window_duration = std::to_string(window_duration);
     py_c.enable_local_beep = enable_local_beep;
     py_c.use_cpu = use_cpu;
+    py_c.use_builtin = use_builtin;
     py_c.Serialize(TranscriptionAppConfig::kConfigPath);
 
     wxProcess* p = PythonWrapper::StartApp(std::move(cb), py_c);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index cd62127..621715b 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -51,6 +51,7 @@ private:
 
     wxCheckBox* py_app_enable_local_beep_;
     wxCheckBox* py_app_use_cpu_;
+    wxCheckBox* py_app_use_builtin_;
 
     wxProcess* py_app_;
     wxTimer py_app_drain_;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index 5581739..f26072a 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -159,6 +159,7 @@ wxProcess* PythonWrapper::StartApp(
 		"--cols", config.cols,
 		"--window_duration_s", config.window_duration,
 		"--cpu", config.use_cpu ? "1" : "0",
+		"--use_builtin", config.use_builtin ? "1" : "0",
 		},
 		std::move(exit_callback));
 }
diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py
index 7c7d0ae..e57a843 100644
--- a/Scripts/osc_ctrl.py
+++ b/Scripts/osc_ctrl.py
@@ -30,6 +30,7 @@ class OscState:
         self.client = getClient(ip, port)
         self.pager = MultiLinePager(chars_per_sync, rows, cols)
         self.encoding= generateEncoding()
+        self.builtin_msg = ""  # The last message sent to the built-in chatbox
 
     def reset(self):
         self.pager.reset()
@@ -137,6 +138,23 @@ def pageMessage(osc_state: OscState, msg: str) -> bool:
         addr="/avatar/parameters/" + generate_utils.getSpeechNoiseToggleParam()
         osc_state.client.send_message(addr, False)
 
+# Like `pageMessage` but uses the built-in chatbox. The built-in chatbox
+# truncates data at about 150 chars, so only the last 140 chars of `msg` are
+# sent. Always returns False because the built-in chatbox has no paging.
+def pageMessageBuiltin(osc_state: OscState, msg: str) -> bool:
+    msg_suffix = msg[-140:]
+
+    # Compare against what was actually sent (the suffix), so an unchanged
+    # payload is never re-sent into the chatbox's anti-spam rate limit.
+    if osc_state.builtin_msg != msg_suffix:
+        # Turn the typing indicator off before submitting the text.
+        osc_state.client.send_message("/chatbox/typing", False)
+        # True = post the text immediately instead of opening the keyboard.
+        osc_state.client.send_message("/chatbox/input", (msg_suffix, True))
+        osc_state.builtin_msg = msg_suffix
+
+    return False  # Not paging
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", default="127.0.0.1", help="OSC server IP")
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 7f07efe..1237334 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -303,17 +303,21 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
             audio_state.transcribe_no_change_count = 0
             audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
-def sendAudio(audio_state):
+def sendAudio(audio_state, use_builtin: bool):
     while audio_state.run_app == True:
         text = audio_state.committed_text + " " + audio_state.text
-        ret = osc_ctrl.pageMessage(audio_state.osc_state, text)
-        is_paging = (ret == False)
-        osc_ctrl.indicatePaging(audio_state.osc_state.client, is_paging)
+        if use_builtin:
+            ret = osc_ctrl.pageMessageBuiltin(audio_state.osc_state, text)
+            time.sleep(1.5)
+        else:
+            ret = osc_ctrl.pageMessage(audio_state.osc_state, text)
+            is_paging = (ret == False)
+            osc_ctrl.indicatePaging(audio_state.osc_state.client, is_paging)
 
-        # Pace this out
-        time.sleep(0.01)
+            # Pace this out
+            time.sleep(0.01)
 
-def readControllerInput(audio_state, enable_local_beep):
+def readControllerInput(audio_state, enable_local_beep, use_builtin):
     session = None
     first = True
     while session == None and audio_state.run_app == True:
@@ -343,8 +347,9 @@ def readControllerInput(audio_state, enable_local_beep):
             if now - last_rising > 0.5:
                 # Long hold
                 state = PAUSE_STATE
-                osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
-                osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
+                if not use_builtin:
+                    osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
+                    osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
                 #playsound(os.path.abspath("../Sounds/Noise_Off.wav"))
 
                 resetAudioLocked(audio_state)
@@ -355,8 +360,9 @@ def readControllerInput(audio_state, enable_local_beep):
                 # Short hold
                 if state == RECORD_STATE:
                     state = PAUSE_STATE
-                    osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
-                    osc_ctrl.lockWorld(audio_state.osc_state.client, True)
+                    if not use_builtin:
+                        osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
+                        osc_ctrl.lockWorld(audio_state.osc_state.client, True)
                     audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
                     audio_state.audio_paused = True
@@ -365,9 +371,10 @@ def readControllerInput(audio_state, enable_local_beep):
                         playsound(os.path.abspath("../Sounds/Noise_Off.wav"))
                 elif state == PAUSE_STATE:
                     state = RECORD_STATE
-                    osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
-                    osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
-                    osc_ctrl.lockWorld(audio_state.osc_state.client, False)
+                    if not use_builtin:
+                        osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
+                        osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
+                        osc_ctrl.lockWorld(audio_state.osc_state.client, False)
                     resetAudioLocked(audio_state)
                     resetDisplayLocked(audio_state)
 
@@ -379,7 +386,8 @@ def readControllerInput(audio_state, enable_local_beep):
 
 # model should correspond to one of the Whisper models defined in
 # whisper/__init__.py. Examples: tiny, base, small, medium.
-def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, use_cpu: bool):
+def transcribeLoop(mic: str, language: str, model: str,
+        enable_local_beep: bool, use_cpu: bool, use_builtin: bool):
     audio_state = getMicStream(mic)
     audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
 
@@ -396,11 +404,11 @@ def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool,
     transcribe_audio_thd.daemon = True
     transcribe_audio_thd.start()
 
-    send_audio_thd = threading.Thread(target = sendAudio, args = [audio_state])
+    send_audio_thd = threading.Thread(target = sendAudio, args = [audio_state, use_builtin])
     send_audio_thd.daemon = True
     send_audio_thd.start()
 
-    controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep])
+    controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state, enable_local_beep, use_builtin])
     controller_input_thd.daemon = True
     controller_input_thd.start()
 
@@ -443,6 +451,7 @@ if __name__ == "__main__":
     parser.add_argument("--cols", type=int, help="The number of columns on the board")
     parser.add_argument("--window_duration_s", type=int, help="The length in seconds of the audio recording handed to the transcription algorithm")
     parser.add_argument("--cpu", type=int, help="If set to 1, use CPU instead of GPU")
+    parser.add_argument("--use_builtin", type=int, help="If set to 1, use the text box built into the game.")
     args = parser.parse_args()
 
     if not args.mic:
@@ -470,11 +479,16 @@ if __name__ == "__main__":
     else:
         args.cpu = False
 
+    if args.use_builtin == 1:
+        args.use_builtin = True
+    else:
+        args.use_builtin = False
+
     generate_utils.config.BYTES_PER_CHAR = int(args.bytes_per_char)
     generate_utils.config.CHARS_PER_SYNC = int(args.chars_per_sync)
     generate_utils.config.BOARD_ROWS = int(args.rows)
     generate_utils.config.BOARD_COLS = int(args.cols)
 
     transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep,
-            args.cpu)
+            args.cpu, args.use_builtin)
 
-- 
cgit v1.2.3