From cf75998dab6db1b1d21ca06bde18a56b5e896937 Mon Sep 17 00:00:00 2001 From: yum Date: Tue, 27 Jun 2023 15:16:06 -0700 Subject: Add ability to preserve transcript while using push to talk This is useful when streaming. Occasionally the STT can get into a bad state, and manually segmenting clears it up. However doing so would clear your accumulated transcript, which isn't always desired. Add ability to preserve the transcript. A small wrinkle: the new commit logic requires N consecutive identical windows before committing. To make this feature play nicely with it, I had to forcibly commit any preview text that hasn't yet been committed. Failing to do this would usually cause short utterances / the most recently said stuff to get wiped out. --- GUI/GUI/GUI/Config.cpp | 3 +++ GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 18 +++++++++++++++ GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.cpp | 1 + Scripts/transcribe.py | 51 ++++++++++++++++++++++++++++++------------- 6 files changed, 60 insertions(+), 15 deletions(-) diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index c549843..c851983 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -78,6 +78,7 @@ AppConfig::AppConfig(wxTextCtrl* out) remove_trailing_period(false), enable_uppercase_filter(false), enable_lowercase_filter(false), + reset_on_toggle(true), gpu_idx(0), keybind("ctrl+x"), @@ -128,6 +129,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("remove_trailing_period", remove_trailing_period); cm.Set("enable_uppercase_filter", enable_uppercase_filter); cm.Set("enable_lowercase_filter", enable_lowercase_filter); + cm.Set("reset_on_toggle", reset_on_toggle); cm.Set("gpu_idx", gpu_idx); cm.Set("keybind", keybind); @@ -191,6 +193,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("remove_trailing_period", c.remove_trailing_period); cm.Get("enable_uppercase_filter", c.enable_uppercase_filter); cm.Get("enable_lowercase_filter", c.enable_lowercase_filter); + cm.Get("reset_on_toggle", c.reset_on_toggle); cm.Get("gpu_idx", c.gpu_idx); cm.Get("keybind", c.keybind); diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index dd7e47a..d71aeb4 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -64,6 +64,7 @@ public: bool remove_trailing_period; bool enable_uppercase_filter; bool enable_lowercase_filter; + bool reset_on_toggle; int gpu_idx; std::string keybind; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 76b85ae..f4e99b9 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -47,6 +47,7 @@ namespace { ID_PY_APP_REMOVE_TRAILING_PERIOD, ID_PY_APP_ENABLE_UPPERCASE_FILTER, ID_PY_APP_ENABLE_LOWERCASE_FILTER, + ID_PY_APP_RESET_ON_TOGGLE, ID_PY_APP_ROWS, ID_PY_APP_COLS, ID_PY_APP_GPU_IDX, @@ -860,6 +861,16 @@ Frame::Frame() ); py_app_enable_lowercase_filter_ = py_app_enable_lowercase_filter; + auto* py_app_reset_on_toggle = new wxCheckBox(py_config_panel, + ID_PY_APP_RESET_ON_TOGGLE, "Reset transcript on toggle"); + py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle); + py_app_reset_on_toggle->SetToolTip( + "If checked, the transcript will be reset (cleared) every " + "time that transcription is toggled on. Only affects " + "keyboard controls, not the VR controls." + ); + py_app_reset_on_toggle_ = py_app_reset_on_toggle; + // Hack: Add newlines before and after the button text to make // the buttons bigger, and easier to click from inside VR. auto* py_app_start_button = new wxButton(py_config_panel, @@ -873,6 +884,8 @@ Frame::Frame() /*flags=*/wxEXPAND); sizer->Add(py_app_config_panel_pairs, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(py_app_reset_on_toggle, /*proportion=*/0, + /*flags=*/wxEXPAND); sizer->Add(py_app_enable_browser_src, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_enable_local_beep, /*proportion=*/0, @@ -1403,6 +1416,9 @@ void Frame::ApplyConfigToInputFields() auto* py_app_enable_lowercase_filter = static_cast(FindWindowById(ID_PY_APP_ENABLE_LOWERCASE_FILTER)); py_app_enable_lowercase_filter->SetValue(app_c_->enable_lowercase_filter); + auto* py_app_reset_on_toggle = static_cast(FindWindowById(ID_PY_APP_RESET_ON_TOGGLE)); + py_app_reset_on_toggle->SetValue(app_c_->reset_on_toggle); + // Unity panel auto* unity_chars_per_sync = static_cast(FindWindowById(ID_UNITY_CHARS_PER_SYNC)); unity_chars_per_sync->SetSelection(chars_idx); @@ -1988,6 +2004,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { const bool remove_trailing_period = py_app_remove_trailing_period_->GetValue(); const bool enable_uppercase_filter = py_app_enable_uppercase_filter_->GetValue(); const bool enable_lowercase_filter = py_app_enable_lowercase_filter_->GetValue(); + const bool reset_on_toggle = py_app_reset_on_toggle_->GetValue(); std::string rows_str = py_app_rows_->GetValue().ToStdString(); std::string cols_str = py_app_cols_->GetValue().ToStdString(); std::string chars_per_sync_str = @@ -2062,6 +2079,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->remove_trailing_period = remove_trailing_period; app_c_->enable_uppercase_filter = enable_uppercase_filter; app_c_->enable_lowercase_filter = enable_lowercase_filter; + app_c_->reset_on_toggle = reset_on_toggle; app_c_->gpu_idx = gpu_idx; app_c_->keybind = keybind; app_c_->Serialize(AppConfig::kConfigPath); diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index ede2afc..1856e7d 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -68,6 +68,7 @@ private: wxCheckBox* py_app_remove_trailing_period_; wxCheckBox* py_app_enable_uppercase_filter_; wxCheckBox* py_app_enable_lowercase_filter_; + wxCheckBox* py_app_reset_on_toggle_; wxCheckBox* unity_clear_osc_; std::future py_app_; diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index 9855b00..1402ed5 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -496,6 +496,7 @@ std::future PythonWrapper::StartApp( "--emotes_pickle", kEmotesPickle, "--gpu_idx", std::to_string(config.gpu_idx), "--keybind", Quote(config.keybind), + "--reset_on_toggle", config.reset_on_toggle ? "1" : "0", }, std::move(out_cb), std::move(in_cb), diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 8fe6190..694fd0b 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -49,9 +49,15 @@ class AudioState: # PyAudio stream object self.stream = None - self.committed_text = "" + self.preview_text = "" self.text = "" self.filtered_text = "" + + # If set to true, then the transcript strings (`text` and friends) will + # be reset whenever transcription is toggled on. At time of writing, + # this only applies to keyboard controls. + self.reset_on_toggle = True + # List of: # List of tuples of: # Segment start time, end time, and text @@ -198,20 +204,15 @@ def resetAudioLocked(audio_state): audio_state.transcribe_sleep_duration = \ audio_state.transcribe_sleep_duration_min_s - audio_state.text = "" - audio_state.preview_text = "" - audio_state.filtered_text = "" + if audio_state.reset_on_toggle: + print("resetAudioLocked resetting text") + audio_state.text = "" + audio_state.preview_text = "" + audio_state.filtered_text = "" def resetDisplayLocked(audio_state): osc_ctrl.clear(audio_state.osc_state) -def resetAudio(audio_state): - audio_state.transcribe_lock.acquire() - audio_state.audio_lock.acquire() - resetAudioLocked(audio_state) - audio_state.audio_lock.release() - audio_state.transcribe_lock.release() - # Transcribe the audio recorded in a file. # Returns two strings: committed text, and preview text. # Committed text is temporally stable. Preview text is *not* temporally stable, @@ -422,7 +423,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, osc_ctrl.toggleBoard(audio_state.osc_state.client, False) #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav")) - audio_state.drop_transcription = True + if audio_state.reset_on_toggle: + print("Toggle detected, dropping transcript (-2)") + audio_state.drop_transcription = True + else: + print("Toggle detected, committing preview text (2)") + audio_state.text += audio_state.preview_text audio_state.audio_paused = True resetAudioLocked(audio_state) resetDisplayLocked(audio_state) @@ -448,7 +454,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool, osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) osc_ctrl.toggleBoard(audio_state.osc_state.client, True) osc_ctrl.lockWorld(audio_state.osc_state.client, False) - audio_state.drop_transcription = True + if audio_state.reset_on_toggle: + print("Toggle detected, dropping transcript (2)") + audio_state.drop_transcription = True + else: + print("Toggle detected, committing preview text (2)") + audio_state.text += audio_state.preview_text audio_state.audio_paused = False resetAudioLocked(audio_state) @@ -585,11 +596,13 @@ def transcribeLoop(mic: str, estate: EmotesState, window_duration_s: int, gpu_idx: int, - keyboard_hotkey: str): + keyboard_hotkey: str, + reset_on_toggle: bool): audio_state = getMicStream(mic) audio_state.whisper_language = language audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s + audio_state.reset_on_toggle = reset_on_toggle lang_bits = language_target.split(" | ") if len(lang_bits) == 2: @@ -748,6 +761,7 @@ if __name__ == "__main__": parser.add_argument("--emotes_pickle", type=str, help="The path to emotes pickle. See emotes_v2.py for details.") parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") + parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") args = parser.parse_args() if not args.mic: @@ -796,6 +810,11 @@ if __name__ == "__main__": else: args.cpu = False + if args.reset_on_toggle == 1: + args.reset_on_toggle = True + else: + args.reset_on_toggle = False + if args.use_builtin == 1: args.use_builtin = True else: @@ -844,5 +863,7 @@ if __name__ == "__main__": args.enable_lowercase_filter, args.button, estate, window_duration_s, - args.gpu_idx, args.keybind) + args.gpu_idx, + args.keybind, + args.reset_on_toggle) -- cgit v1.2.3