From add7bd8ef86ec21cd1327eb45bcb739aa54f7db8 Mon Sep 17 00:00:00 2001 From: yum Date: Thu, 5 Oct 2023 18:22:55 -0700 Subject: Transcripts preceding long pauses now drop When hot-miking into the built-in chatbox, there are sometimes long pauses in conversation. After these pauses, it's undesirable to show the transcript generated before the pause. This feature makes it so that those transcripts can be dropped. Also: * Limit number of segments sent to browser source to 10. Allow this to grow up to 10 segments before dropping the first 5 segments. * Silence warnings generated by `install_in_venv`, used by e.g. translation codepath. * Enable audio normalization to improve accuracy when speaking softly, at the cost of some accuracy when speaking normally. Credit: user endo0269 on Discord suggested this feature. --- GUI/GUI/GUI/Config.cpp | 3 +++ GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 22 ++++++++++++++++++++++ GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.h | 4 ---- 5 files changed, 27 insertions(+), 4 deletions(-) (limited to 'GUI') diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 573238f..2abe5b2 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -88,6 +88,7 @@ AppConfig::AppConfig(wxTextCtrl* out) gpu_idx(0), min_silence_duration_ms(250), max_speech_duration_s(5), + reset_after_silence_s(10), transcription_loop_delay_ms(100), keybind("ctrl+x"), @@ -135,6 +136,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("gpu_idx", gpu_idx); cm.Set("min_silence_duration_ms", min_silence_duration_ms); cm.Set("max_speech_duration_s", max_speech_duration_s); + cm.Set("reset_after_silence_s", reset_after_silence_s); cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms); cm.Set("keybind", keybind); @@ -195,6 +197,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("gpu_idx", c.gpu_idx); cm.Get("min_silence_duration_ms", c.min_silence_duration_ms); 
cm.Get("max_speech_duration_s", c.max_speech_duration_s); + cm.Get("reset_after_silence_s", c.reset_after_silence_s); cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms); cm.Get("keybind", c.keybind); diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index ede21d6..0d0da66 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -74,6 +74,7 @@ public: int gpu_idx; int min_silence_duration_ms; int max_speech_duration_s; + int reset_after_silence_s; int transcription_loop_delay_ms; std::string keybind; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 23ac38c..602bf6d 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -78,6 +78,7 @@ namespace { ID_PY_APP_GPU_IDX, ID_PY_APP_MIN_SILENCE_DURATION_MS, ID_PY_APP_MAX_SPEECH_DURATION_S, + ID_PY_APP_RESET_AFTER_SILENCE_S, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS, ID_PY_APP_KEYBIND, ID_PY_APP_BROWSER_SRC_PORT, @@ -796,6 +797,16 @@ Frame::Frame() "milliseconds."); py_app_max_speech_duration_s_ = py_app_max_speech_duration_s; + auto* py_app_reset_after_silence_s = new wxTextCtrl( + py_app_config_panel_pairs, ID_PY_APP_RESET_AFTER_SILENCE_S, + std::to_string(app_c_->reset_after_silence_s), wxDefaultPosition, + wxDefaultSize, /*style=*/0); + py_app_reset_after_silence_s->SetToolTip( + "If you pause for at least this long between " + "sentences, the transcript before the pause will be " + "removed. 
To disable this feature, set it to -1."); + py_app_reset_after_silence_s_ = py_app_reset_after_silence_s; + auto* py_app_transcription_loop_delay_ms = new wxTextCtrl( py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS, std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition, @@ -905,6 +916,11 @@ Frame::Frame() sizer->Add(py_app_max_speech_duration_s, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(new wxStaticText(py_app_config_panel_pairs, + wxID_ANY, /*label=*/"Reset after silence (s):")); + sizer->Add(py_app_reset_after_silence_s, /*proportion=*/0, + /*flags=*/wxEXPAND); + sizer->Add(new wxStaticText(py_app_config_panel_pairs, wxID_ANY, /*label=*/"Transcription loop delay (ms):")); sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0, @@ -1629,6 +1645,10 @@ void Frame::ApplyConfigToInputFields() py_app_max_speech_duration_s->Clear(); py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s)); + auto* py_app_reset_after_silence_s = static_cast(FindWindowById(ID_PY_APP_RESET_AFTER_SILENCE_S)); + py_app_reset_after_silence_s->Clear(); + py_app_reset_after_silence_s->AppendText(std::to_string(app_c_->reset_after_silence_s)); + auto* py_app_transcription_loop_delay_ms = static_cast(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS)); py_app_transcription_loop_delay_ms->Clear(); py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms)); @@ -2405,6 +2425,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10)); ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000)); ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), 
"max_speech_duration_s", 1, 30)); + ASSIGN_OR_RETURN_VOID(int, reset_after_silence_s, stoiInRange(transcribe_out_, py_app_reset_after_silence_s_->GetValue().ToStdString(), "reset_after_silence_s", -1, 30)); ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000)); ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535)); @@ -2438,6 +2459,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->gpu_idx = gpu_idx; app_c_->min_silence_duration_ms = min_silence_duration_ms; app_c_->max_speech_duration_s = max_speech_duration_s; + app_c_->reset_after_silence_s = reset_after_silence_s; app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms; app_c_->keybind = keybind; app_c_->Serialize(AppConfig::kConfigPath); diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 468a650..ee91e98 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -40,6 +40,7 @@ private: wxTextCtrl* py_app_gpu_idx_; wxTextCtrl* py_app_min_silence_duration_ms_; wxTextCtrl* py_app_max_speech_duration_s_; + wxTextCtrl* py_app_reset_after_silence_s_; wxTextCtrl* py_app_transcription_loop_delay_ms_; wxTextCtrl* py_app_keybind_; wxTextCtrl* py_app_browser_src_port_; diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 31c571e..b5fe518 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -77,10 +77,6 @@ namespace PythonWrapper const std::function&& run_cb = []() { return true; }); bool InstallPip(std::string* out, std::string* err = nullptr); - // TODO(yum) both StartApp and GenerateAnimator should be - // parameterized with config files instead of these ever-growing lists of - // parameters. We could persist those files so settings would persist across - // app restarts. 
std::future StartApp( const AppConfig& app_c, const std::string& config_path, -- cgit v1.2.3