From ff7eb3c212195af71cd0ce4a3cd0c9a081d6ebda Mon Sep 17 00:00:00 2001 From: yum Date: Wed, 28 Jun 2023 20:35:10 -0700 Subject: Add toggle for debug mode Most transcription output is now gone by default. Users can enable a more verbose output by toggling `Enable debug mode`. Bugfix: Toggling off transcription would reset audio state, frequently resulting in the loss of the last few words spoken. --- GUI/GUI/GUI/Config.cpp | 3 +++ GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 18 ++++++++++++++ GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.cpp | 1 + Scripts/transcribe.py | 55 ++++++++++++++++++++++++++----------------- 6 files changed, 58 insertions(+), 21 deletions(-) diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 4f730b5..d337f77 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -79,6 +79,7 @@ AppConfig::AppConfig(wxTextCtrl* out) remove_trailing_period(false), enable_uppercase_filter(false), enable_lowercase_filter(false), + enable_debug_mode(false), reset_on_toggle(true), gpu_idx(0), keybind("ctrl+x"), @@ -131,6 +132,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("remove_trailing_period", remove_trailing_period); cm.Set("enable_uppercase_filter", enable_uppercase_filter); cm.Set("enable_lowercase_filter", enable_lowercase_filter); + cm.Set("enable_debug_mode", enable_debug_mode); cm.Set("reset_on_toggle", reset_on_toggle); cm.Set("gpu_idx", gpu_idx); cm.Set("keybind", keybind); @@ -196,6 +198,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("remove_trailing_period", c.remove_trailing_period); cm.Get("enable_uppercase_filter", c.enable_uppercase_filter); cm.Get("enable_lowercase_filter", c.enable_lowercase_filter); + cm.Get("enable_debug_mode", c.enable_debug_mode); cm.Get("reset_on_toggle", c.reset_on_toggle); cm.Get("gpu_idx", c.gpu_idx); cm.Get("keybind", c.keybind); diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index 6711d79..e4a9bf4 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -65,6 +65,7 @@ public: bool remove_trailing_period; bool enable_uppercase_filter; bool enable_lowercase_filter; + bool enable_debug_mode; bool reset_on_toggle; int gpu_idx; std::string keybind; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 1df60e7..706165b 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -47,6 +47,7 @@ namespace { ID_PY_APP_REMOVE_TRAILING_PERIOD, ID_PY_APP_ENABLE_UPPERCASE_FILTER, ID_PY_APP_ENABLE_LOWERCASE_FILTER, + ID_PY_APP_ENABLE_DEBUG_MODE, ID_PY_APP_RESET_ON_TOGGLE, ID_PY_APP_ROWS, ID_PY_APP_COLS, @@ -881,6 +882,16 @@ Frame::Frame() ); py_app_enable_lowercase_filter_ = py_app_enable_lowercase_filter; + auto* py_app_enable_debug_mode = new wxCheckBox(py_config_panel, + ID_PY_APP_ENABLE_DEBUG_MODE, "Enable debug mode"); + py_app_enable_debug_mode->SetValue(app_c_->enable_debug_mode); + py_app_enable_debug_mode->SetToolTip( + "If checked, the transcription engine will print out " + "additional information. Use this if you're debugging a " + "technical issue." + ); + py_app_enable_debug_mode_ = py_app_enable_debug_mode; + auto* py_app_reset_on_toggle = new wxCheckBox(py_config_panel, ID_PY_APP_RESET_ON_TOGGLE, "Reset transcript on toggle"); py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle); @@ -922,6 +933,8 @@ Frame::Frame() /*flags=*/wxEXPAND); sizer->Add(py_app_enable_lowercase_filter, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(py_app_enable_debug_mode, /*proportion=*/0, + /*flags=*/wxEXPAND); sizer->Add(py_app_start_button, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_stop_button, /*proportion=*/0, @@ -1440,6 +1453,9 @@ void Frame::ApplyConfigToInputFields() auto* py_app_enable_lowercase_filter = static_cast(FindWindowById(ID_PY_APP_ENABLE_LOWERCASE_FILTER)); py_app_enable_lowercase_filter->SetValue(app_c_->enable_lowercase_filter); + auto* py_app_enable_debug_mode = static_cast(FindWindowById(ID_PY_APP_ENABLE_DEBUG_MODE)); + py_app_enable_debug_mode->SetValue(app_c_->enable_debug_mode); + auto* py_app_reset_on_toggle = static_cast(FindWindowById(ID_PY_APP_RESET_ON_TOGGLE)); py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle); @@ -2028,6 +2044,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { const bool remove_trailing_period = py_app_remove_trailing_period_->GetValue(); const bool enable_uppercase_filter = py_app_enable_uppercase_filter_->GetValue(); const bool enable_lowercase_filter = py_app_enable_lowercase_filter_->GetValue(); + const bool enable_debug_mode = py_app_enable_debug_mode_->GetValue(); const bool reset_on_toggle = py_app_reset_on_toggle_->GetValue(); std::string rows_str = py_app_rows_->GetValue().ToStdString(); std::string cols_str = py_app_cols_->GetValue().ToStdString(); @@ -2126,6 +2143,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->remove_trailing_period = remove_trailing_period; app_c_->enable_uppercase_filter = enable_uppercase_filter; app_c_->enable_lowercase_filter = enable_lowercase_filter; + app_c_->enable_debug_mode = enable_debug_mode; app_c_->reset_on_toggle = reset_on_toggle; app_c_->gpu_idx = gpu_idx; app_c_->keybind = keybind; diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 7afc005..46f5bcd 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -69,6 +69,7 @@ private: wxCheckBox* py_app_remove_trailing_period_; wxCheckBox* py_app_enable_uppercase_filter_; wxCheckBox* py_app_enable_lowercase_filter_; + wxCheckBox* py_app_enable_debug_mode_; wxCheckBox* py_app_reset_on_toggle_; wxCheckBox* unity_clear_osc_; diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index e6f10c2..c5421e8 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -493,6 +493,7 @@ std::future PythonWrapper::StartApp( "--remove_trailing_period", config.remove_trailing_period ? "1" : "0", "--enable_uppercase_filter", config.enable_uppercase_filter ? "1" : "0", "--enable_lowercase_filter", config.enable_lowercase_filter ? "1" : "0", + "--enable_debug_mode", config.enable_debug_mode ? "1" : "0", "--emotes_pickle", kEmotesPickle, "--gpu_idx", std::to_string(config.gpu_idx), "--keybind", Quote(config.keybind), diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index cc1944c..28b6ca0 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -38,6 +38,9 @@ class AudioState: # This matches the framerate expected by whisper. self.RATE = 16000 + # If set, print additional information to stdout while transcribing. + self.enable_debug_mode = False + # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. self.MAX_LENGTH_S = 300 @@ -211,7 +214,8 @@ def resetAudioLocked(audio_state): audio_state.transcribe_sleep_duration_min_s if audio_state.reset_on_toggle: - print("resetAudioLocked resetting text") + if audio_state.enable_debug_mode: + print("resetAudioLocked resetting text") audio_state.text = "" audio_state.preview_text = "" audio_state.filtered_text = "" @@ -244,7 +248,8 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st without_timestamps = False) ranges = [] for s in segments: - #print(f"Segment: {s}") + if audio_state.enable_debug_mode: + print(f"Segment: {s}") ranges.append((s.start, s.end, s.text)) audio_state.ranges_ls.append(ranges) @@ -270,13 +275,14 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st max_edit = audio_state.commit_fuzz_threshold - #print(f"c0: {c0}, c1: {c1}, c2: {c2}") - #if c0 == c1 and c1 == c2 and c2 == c3: + if audio_state.enable_debug_mode: + print(f"c0: {c0}, c1: {c1}, c2: {c2}") if c0_c1_d < max_edit and c1_c2_d < max_edit and c2_c3_d < max_edit: # For simplicity, completely reset saved audio ranges. audio_state.ranges_ls = [] committed_text = c0[2] - print(f"Dropping frames until {c0[1]}") + if audio_state.enable_debug_mode: + print(f"Dropping frames until {c0[1]}") n_frames_to_drop = int(ceil(audio_state.RATE * c0[1])) audio_state.drop_frames_till_i = n_frames_to_drop @@ -320,7 +326,8 @@ def transcribeAudio(audio_state, text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu) if len(text) == 0 and len(preview_text) == 0: - print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) + if audio_state.enable_debug_mode: + print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() continue @@ -329,7 +336,8 @@ def transcribeAudio(audio_state, audio_state.text = "" audio_state.preview_text = "" audio_state.filtered_text = "" - print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time)) + if audio_state.enable_debug_mode: + print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() continue @@ -436,13 +444,14 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, if not use_builtin: osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) osc_ctrl.toggleBoard(audio_state.osc_state.client, False) - #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav")) if audio_state.reset_on_toggle: - print("Toggle detected, dropping transcript (-2)") + if audio_state.enable_debug_mode: + print("Toggle detected, dropping transcript (1)") audio_state.drop_transcription = True else: - print("Toggle detected, committing preview text (2)") + if audio_state.enable_debug_mode: + print("Toggle detected, committing preview text (1)") audio_state.text += audio_state.preview_text audio_state.audio_paused = True resetAudioLocked(audio_state) @@ -458,7 +467,6 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s audio_state.audio_paused = True - resetAudioLocked(audio_state) if enable_local_beep == 1: playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"), @@ -470,10 +478,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, osc_ctrl.toggleBoard(audio_state.osc_state.client, True) osc_ctrl.lockWorld(audio_state.osc_state.client, False) if audio_state.reset_on_toggle: - print("Toggle detected, dropping transcript (2)") + if audio_state.enable_debug_mode: + print("Toggle detected, dropping transcript (2)") audio_state.drop_transcription = True else: - print("Toggle detected, committing preview text (2)") + if audio_state.enable_debug_mode: + print("Toggle detected, committing preview text (2)") audio_state.text += audio_state.preview_text audio_state.audio_paused = False @@ -492,7 +502,8 @@ def readControllerInput(audio_state, enable_local_beep: bool, try: session = steamvr.SessionState() except: - print("steamvr is off, no controller input") + if audio_state.enable_debug_mode: + print("steamvr is off, no controller input") session = None time.sleep(5) @@ -607,6 +618,7 @@ def transcribeLoop(mic: str, remove_trailing_period: bool, enable_uppercase_filter: bool, enable_lowercase_filter: bool, + enable_debug_mode: bool, button: str, estate: EmotesState, window_duration_s: int, @@ -620,6 +632,7 @@ def transcribeLoop(mic: str, audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle audio_state.commit_fuzz_threshold = commit_fuzz_threshold + audio_state.enable_debug_mode = enable_debug_mode lang_bits = language_target.split(" | ") if len(lang_bits) == 2: @@ -721,7 +734,6 @@ def transcribeLoop(mic: str, keyboard_input_thd.daemon = True keyboard_input_thd.start() - print("Press enter to start a new message.") for line in sys.stdin: audio_state.transcribe_lock.acquire() audio_state.audio_lock.acquire() @@ -745,12 +757,6 @@ if __name__ == "__main__": print("args: {}".format(" ".join(sys.argv))) - # Set cwd to TaSTT/ - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - dname = os.path.dirname(dname) - dname = os.path.dirname(dname) - #os.chdir(dname) print(f"Set cwd to {os.getcwd()}") parser = argparse.ArgumentParser() @@ -780,6 +786,7 @@ if __name__ == "__main__": parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.") + parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.") args = parser.parse_args() if not args.mic: @@ -863,6 +870,11 @@ if __name__ == "__main__": else: args.enable_lowercase_filter = False + if args.enable_debug_mode == 1: + args.enable_debug_mode = True + else: + args.enable_debug_mode = False + estate = EmotesState() estate.load(args.emotes_pickle) @@ -884,6 +896,7 @@ if __name__ == "__main__": args.remove_trailing_period, args.enable_uppercase_filter, args.enable_lowercase_filter, + args.enable_debug_mode, args.button, estate, window_duration_s, args.gpu_idx, -- cgit v1.2.3