From ff7eb3c212195af71cd0ce4a3cd0c9a081d6ebda Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Wed, 28 Jun 2023 20:35:10 -0700
Subject: Add toggle for debug mode

Most transcription output is now suppressed by default. Users can enable
more verbose output by toggling `Enable debug mode`.

Bugfix: Toggling off transcription no longer resets audio state, which
frequently resulted in the loss of the last few words spoken.
---
 GUI/GUI/GUI/Config.cpp        |  3 +++
 GUI/GUI/GUI/Config.h          |  1 +
 GUI/GUI/GUI/Frame.cpp         | 18 ++++++++++++++
 GUI/GUI/GUI/Frame.h           |  1 +
 GUI/GUI/GUI/PythonWrapper.cpp |  1 +
 Scripts/transcribe.py         | 55 ++++++++++++++++++++++++++-----------------
 6 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 4f730b5..d337f77 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -79,6 +79,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	remove_trailing_period(false),
 	enable_uppercase_filter(false),
 	enable_lowercase_filter(false),
+	enable_debug_mode(false),
 	reset_on_toggle(true),
 	gpu_idx(0),
 	keybind("ctrl+x"),
@@ -131,6 +132,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
 	cm.Set("remove_trailing_period", remove_trailing_period);
 	cm.Set("enable_uppercase_filter", enable_uppercase_filter);
 	cm.Set("enable_lowercase_filter", enable_lowercase_filter);
+	cm.Set("enable_debug_mode", enable_debug_mode);
 	cm.Set("reset_on_toggle", reset_on_toggle);
 	cm.Set("gpu_idx", gpu_idx);
 	cm.Set("keybind", keybind);
@@ -196,6 +198,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
 	cm.Get("remove_trailing_period", c.remove_trailing_period);
 	cm.Get("enable_uppercase_filter", c.enable_uppercase_filter);
 	cm.Get("enable_lowercase_filter", c.enable_lowercase_filter);
+	cm.Get("enable_debug_mode", c.enable_debug_mode);
 	cm.Get("reset_on_toggle", c.reset_on_toggle);
 	cm.Get("gpu_idx", c.gpu_idx);
 	cm.Get("keybind", c.keybind);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index 6711d79..e4a9bf4 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -65,6 +65,7 @@ public:
 	bool remove_trailing_period;
 	bool enable_uppercase_filter;
 	bool enable_lowercase_filter;
+	bool enable_debug_mode;
 	bool reset_on_toggle;
 	int gpu_idx;
 	std::string keybind;
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 1df60e7..706165b 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -47,6 +47,7 @@ namespace {
         ID_PY_APP_REMOVE_TRAILING_PERIOD,
         ID_PY_APP_ENABLE_UPPERCASE_FILTER,
         ID_PY_APP_ENABLE_LOWERCASE_FILTER,
+        ID_PY_APP_ENABLE_DEBUG_MODE,
         ID_PY_APP_RESET_ON_TOGGLE,
         ID_PY_APP_ROWS,
         ID_PY_APP_COLS,
@@ -881,6 +882,16 @@ Frame::Frame()
                 );
                 py_app_enable_lowercase_filter_ = py_app_enable_lowercase_filter;
 
+                auto* py_app_enable_debug_mode = new wxCheckBox(py_config_panel,
+                    ID_PY_APP_ENABLE_DEBUG_MODE, "Enable debug mode");
+                py_app_enable_debug_mode->SetValue(app_c_->enable_debug_mode);
+                py_app_enable_debug_mode->SetToolTip(
+                    "If checked, the transcription engine will print out "
+                    "additional information. Use this if you're debugging a "
+                    "technical issue."
+                );
+                py_app_enable_debug_mode_ = py_app_enable_debug_mode;
+
                 auto* py_app_reset_on_toggle = new wxCheckBox(py_config_panel,
                     ID_PY_APP_RESET_ON_TOGGLE, "Reset transcript on toggle");
                 py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle);
@@ -922,6 +933,8 @@ Frame::Frame()
                     /*flags=*/wxEXPAND);
                 sizer->Add(py_app_enable_lowercase_filter, /*proportion=*/0,
                     /*flags=*/wxEXPAND);
+                sizer->Add(py_app_enable_debug_mode, /*proportion=*/0,
+                    /*flags=*/wxEXPAND);
                 sizer->Add(py_app_start_button, /*proportion=*/0,
                     /*flags=*/wxEXPAND);
                 sizer->Add(py_app_stop_button, /*proportion=*/0,
@@ -1440,6 +1453,9 @@ void Frame::ApplyConfigToInputFields()
     auto* py_app_enable_lowercase_filter = static_cast<wxCheckBox*>(FindWindowById(ID_PY_APP_ENABLE_LOWERCASE_FILTER));
     py_app_enable_lowercase_filter->SetValue(app_c_->enable_lowercase_filter);
 
+    auto* py_app_enable_debug_mode = static_cast<wxCheckBox*>(FindWindowById(ID_PY_APP_ENABLE_DEBUG_MODE));
+    py_app_enable_debug_mode->SetValue(app_c_->enable_debug_mode);
+
     auto* py_app_reset_on_toggle = static_cast<wxCheckBox*>(FindWindowById(ID_PY_APP_RESET_ON_TOGGLE));
     py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle);
 
@@ -2028,6 +2044,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     const bool remove_trailing_period = py_app_remove_trailing_period_->GetValue();
     const bool enable_uppercase_filter = py_app_enable_uppercase_filter_->GetValue();
     const bool enable_lowercase_filter = py_app_enable_lowercase_filter_->GetValue();
+    const bool enable_debug_mode = py_app_enable_debug_mode_->GetValue();
     const bool reset_on_toggle = py_app_reset_on_toggle_->GetValue();
     std::string rows_str = py_app_rows_->GetValue().ToStdString();
     std::string cols_str = py_app_cols_->GetValue().ToStdString();
@@ -2126,6 +2143,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     app_c_->remove_trailing_period = remove_trailing_period;
     app_c_->enable_uppercase_filter = enable_uppercase_filter;
     app_c_->enable_lowercase_filter = enable_lowercase_filter;
+    app_c_->enable_debug_mode = enable_debug_mode;
     app_c_->reset_on_toggle = reset_on_toggle;
     app_c_->gpu_idx = gpu_idx;
     app_c_->keybind = keybind;
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 7afc005..46f5bcd 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -69,6 +69,7 @@ private:
     wxCheckBox* py_app_remove_trailing_period_;
     wxCheckBox* py_app_enable_uppercase_filter_;
     wxCheckBox* py_app_enable_lowercase_filter_;
+    wxCheckBox* py_app_enable_debug_mode_;
     wxCheckBox* py_app_reset_on_toggle_;
     wxCheckBox* unity_clear_osc_;
 
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index e6f10c2..c5421e8 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -493,6 +493,7 @@ std::future<bool> PythonWrapper::StartApp(
 					"--remove_trailing_period", config.remove_trailing_period ? "1" : "0",
 					"--enable_uppercase_filter", config.enable_uppercase_filter ? "1" : "0",
 					"--enable_lowercase_filter", config.enable_lowercase_filter ? "1" : "0",
+					"--enable_debug_mode", config.enable_debug_mode ? "1" : "0",
 					"--emotes_pickle", kEmotesPickle,
 					"--gpu_idx", std::to_string(config.gpu_idx),
 					"--keybind", Quote(config.keybind),
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index cc1944c..28b6ca0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -38,6 +38,9 @@ class AudioState:
         # This matches the framerate expected by whisper.
         self.RATE = 16000
 
+        # If set, print additional information to stdout while transcribing.
+        self.enable_debug_mode = False
+
         # The maximum length that recordAudio() will put into frames before it
         # starts dropping from the start.
         self.MAX_LENGTH_S = 300
@@ -211,7 +214,8 @@ def resetAudioLocked(audio_state):
             audio_state.transcribe_sleep_duration_min_s
 
     if audio_state.reset_on_toggle:
-        print("resetAudioLocked resetting text")
+        if audio_state.enable_debug_mode:
+            print("resetAudioLocked resetting text")
         audio_state.text = ""
         audio_state.preview_text = ""
         audio_state.filtered_text = ""
@@ -244,7 +248,8 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             without_timestamps = False)
     ranges = []
     for s in segments:
-        #print(f"Segment: {s}")
+        if audio_state.enable_debug_mode:
+            print(f"Segment: {s}")
         ranges.append((s.start, s.end, s.text))
     audio_state.ranges_ls.append(ranges)
 
@@ -270,13 +275,14 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
 
             max_edit = audio_state.commit_fuzz_threshold
 
-            #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
-            #if c0 == c1 and c1 == c2 and c2 == c3:
+            if audio_state.enable_debug_mode:
+                print(f"c0: {c0}, c1: {c1}, c2: {c2}")
             if c0_c1_d < max_edit and c1_c2_d < max_edit and c2_c3_d < max_edit:
                 # For simplicity, completely reset saved audio ranges.
                 audio_state.ranges_ls = []
                 committed_text = c0[2]
-                print(f"Dropping frames until {c0[1]}")
+                if audio_state.enable_debug_mode:
+                    print(f"Dropping frames until {c0[1]}")
                 n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
                 audio_state.drop_frames_till_i = n_frames_to_drop
 
@@ -320,7 +326,8 @@ def transcribeAudio(audio_state,
 
         text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu)
         if len(text) == 0 and len(preview_text) == 0:
-            print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
+            if audio_state.enable_debug_mode:
+                print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
 
@@ -329,7 +336,8 @@ def transcribeAudio(audio_state,
             audio_state.text = ""
             audio_state.preview_text = ""
             audio_state.filtered_text = ""
-            print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
+            if audio_state.enable_debug_mode:
+                print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
 
@@ -436,13 +444,14 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             if not use_builtin:
                 osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
-            #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
 
             if audio_state.reset_on_toggle:
-                print("Toggle detected, dropping transcript (-2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, dropping transcript (1)")
                 audio_state.drop_transcription = True
             else:
-                print("Toggle detected, committing preview text (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, committing preview text (1)")
                 audio_state.text += audio_state.preview_text
             audio_state.audio_paused = True
             resetAudioLocked(audio_state)
@@ -458,7 +467,6 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
             audio_state.audio_paused = True
-            resetAudioLocked(audio_state)
 
             if enable_local_beep == 1:
                 playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
@@ -470,10 +478,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
                 osc_ctrl.lockWorld(audio_state.osc_state.client, False)
             if audio_state.reset_on_toggle:
-                print("Toggle detected, dropping transcript (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, dropping transcript (2)")
                 audio_state.drop_transcription = True
             else:
-                print("Toggle detected, committing preview text (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, committing preview text (2)")
                 audio_state.text += audio_state.preview_text
             audio_state.audio_paused = False
 
@@ -492,7 +502,8 @@ def readControllerInput(audio_state, enable_local_beep: bool,
         try:
             session = steamvr.SessionState()
         except:
-            print("steamvr is off, no controller input")
+            if audio_state.enable_debug_mode:
+                print("steamvr is off, no controller input")
             session = None
             time.sleep(5)
 
@@ -607,6 +618,7 @@ def transcribeLoop(mic: str,
         remove_trailing_period: bool,
         enable_uppercase_filter: bool,
         enable_lowercase_filter: bool,
+        enable_debug_mode: bool,
         button: str,
         estate: EmotesState,
         window_duration_s: int,
@@ -620,6 +632,7 @@ def transcribeLoop(mic: str,
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
     audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+    audio_state.enable_debug_mode = enable_debug_mode
 
     lang_bits = language_target.split(" | ")
     if len(lang_bits) == 2:
@@ -721,7 +734,6 @@ def transcribeLoop(mic: str,
     keyboard_input_thd.daemon = True
     keyboard_input_thd.start()
 
-    print("Press enter to start a new message.")
     for line in sys.stdin:
         audio_state.transcribe_lock.acquire()
         audio_state.audio_lock.acquire()
@@ -745,12 +757,6 @@ if __name__ == "__main__":
 
     print("args: {}".format(" ".join(sys.argv)))
 
-    # Set cwd to TaSTT/
-    abspath = os.path.abspath(__file__)
-    dname = os.path.dirname(abspath)
-    dname = os.path.dirname(dname)
-    dname = os.path.dirname(dname)
-    #os.chdir(dname)
     print(f"Set cwd to {os.getcwd()}")
 
     parser = argparse.ArgumentParser()
@@ -780,6 +786,7 @@ if __name__ == "__main__":
     parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
     parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
     parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
+    parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.")
     args = parser.parse_args()
 
     if not args.mic:
@@ -863,6 +870,11 @@ if __name__ == "__main__":
     else:
         args.enable_lowercase_filter = False
 
+    if args.enable_debug_mode == 1:
+        args.enable_debug_mode = True
+    else:
+        args.enable_debug_mode = False
+
     estate = EmotesState()
     estate.load(args.emotes_pickle)
 
@@ -884,6 +896,7 @@ if __name__ == "__main__":
             args.remove_trailing_period,
             args.enable_uppercase_filter,
             args.enable_lowercase_filter,
+            args.enable_debug_mode,
             args.button,
             estate, window_duration_s,
             args.gpu_idx,
-- 
cgit v1.2.3