From d701f80119ecb51366a45f429ec7006926527a40 Mon Sep 17 00:00:00 2001 From: yum Date: Tue, 21 Feb 2023 15:20:38 -0800 Subject: Various refinements * Filter out transcriptions like " (music)" * Whisper mic choice auto-populates with queried values * No more manually lining up numbers! * Persist whisper mic in config * Remove setup and dump mics button from Whisper page * Redesign makes these unnecessary --- GUI/GUI/GUI/Config.cpp | 5 ++- GUI/GUI/GUI/Config.h | 1 + GUI/GUI/GUI/Frame.cpp | 85 +++++++++++++++++++++++----------------------- GUI/GUI/GUI/Frame.h | 3 +- GUI/GUI/GUI/WhisperCPP.cpp | 31 +++++++++++++---- 5 files changed, 74 insertions(+), 51 deletions(-) (limited to 'GUI') diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 1287619..50f0dca 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -89,7 +89,8 @@ AppConfig::AppConfig() menu_path(), clear_osc(false), - whisper_model("ggml-base.en.bin") + whisper_model("ggml-base.en.bin"), + whisper_mic(0) {} bool AppConfig::Serialize(const std::filesystem::path& path) { @@ -118,6 +119,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { root["clear_osc"] << clear_osc; root["whisper_model"] << whisper_model; + root["whisper_mic"] << whisper_mic; return Config::Serialize(path, &t); } @@ -159,6 +161,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { root.get_if("clear_osc", &c.clear_osc); root.get_if("whisper_model", &c.whisper_model); + root.get_if("whisper_mic", &c.whisper_mic); *this = std::move(c); return true; diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index 07a4d8c..b27dcc5 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -63,5 +63,6 @@ public: // WhisperCPP-specific settings. std::string whisper_model; + int whisper_mic; }; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index e0713c1..5332490 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -72,7 +72,7 @@ namespace { ID_WHISPER_OUT, ID_WHISPER_CONFIG_PANEL, ID_WHISPER_SETUP_BUTTON, - ID_WHISPER_DUMP_MICS_BUTTON, + //ID_WHISPER_DUMP_MICS_BUTTON, ID_WHISPER_CONFIG_PANEL_PAIRS, ID_WHISPER_MIC, ID_WHISPER_LANG, @@ -108,6 +108,21 @@ namespace { const size_t kNumMicChoices = sizeof(kMicChoices) / sizeof(kMicChoices[0]); constexpr int kMicDefault = 0; // index + wxString kWhisperMicChoices[] = { + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + }; + const size_t kNumWhisperMicChoices = sizeof(kWhisperMicChoices) / sizeof(kWhisperMicChoices[0]); + constexpr int kWhisperMicDefault = 0; // index + // lifted from whisper/tokenizer.py const wxString kLangChoices[] = { "english", @@ -322,25 +337,25 @@ Frame::Frame() auto* navbar = new wxPanel(main_panel, ID_NAVBAR); { auto* navbar_button_transcribe = new wxButton(navbar, - ID_NAVBAR_BUTTON_TRANSCRIBE, "Transcription"); + ID_NAVBAR_BUTTON_TRANSCRIBE, "Transcription (PY)"); auto* navbar_button_unity = new wxButton(navbar, ID_NAVBAR_BUTTON_UNITY, "Unity"); auto* navbar_button_debug = new wxButton(navbar, ID_NAVBAR_BUTTON_DEBUG, "Debug"); auto* navbar_button_whisper = new wxButton(navbar, - ID_NAVBAR_BUTTON_WHISPER, "WhisperCPP"); + ID_NAVBAR_BUTTON_WHISPER, "Transcription (CPP)"); auto* sizer = new wxBoxSizer(wxVERTICAL); navbar->SetSizer(sizer); sizer->Add(navbar_button_transcribe, /*proportion=*/0, + /*flags=*/wxEXPAND); + sizer->Add(navbar_button_whisper, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(navbar_button_unity, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(navbar_button_debug, /*proportion=*/0, /*flags=*/wxEXPAND); - sizer->Add(navbar_button_whisper, /*proportion=*/0, - /*flags=*/wxEXPAND); } auto* transcribe_panel = new wxPanel(main_panel, ID_PY_PANEL); @@ -843,27 +858,12 @@ Frame::Frame() auto* whisper_config_panel = new wxPanel(whisper_panel, ID_WHISPER_CONFIG_PANEL); { - auto* whisper_setup_button = new wxButton(whisper_config_panel, - ID_WHISPER_SETUP_BUTTON, "Set up Python virtual environment"); - whisper_setup_button->SetToolTip( - "TaSTT uses the Python programming language to provide both " - "transcription services and to interface with Unity. " - "It installs its dependencies into an isolated folder " - "called a 'virtual environment'. Click this button to " - "install those dependencies. This only has to be done " - "once when you install a new version of TaSTT."); - auto* whisper_dump_mics_button = new wxButton(whisper_config_panel, - ID_WHISPER_DUMP_MICS_BUTTON, "List input devices"); - whisper_dump_mics_button->SetToolTip( - "List the microphones (and input devices) attached to " - "your computer. To use a microphone, enter the number " - "to its left in the 'Microphone' dropdown."); auto* whisper_config_panel_pairs = new wxPanel(whisper_config_panel, ID_WHISPER_CONFIG_PANEL_PAIRS); { auto* whisper_mic = new wxChoice(whisper_config_panel_pairs, ID_WHISPER_MIC, wxDefaultPosition, - wxDefaultSize, kNumMicChoices, kMicChoices); + wxDefaultSize, kNumWhisperMicChoices, kWhisperMicChoices); whisper_mic->SetToolTip( "Select which microphone to listen to when " "transcribing. To get list microphones and get their " @@ -1037,10 +1037,6 @@ Frame::Frame() auto* sizer = new wxBoxSizer(wxVERTICAL); whisper_config_panel->SetSizer(sizer); - sizer->Add(whisper_setup_button, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(whisper_dump_mics_button, /*proportion=*/0, - /*flags=*/wxEXPAND); sizer->Add(whisper_config_panel_pairs, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(whisper_enable_local_beep, /*proportion=*/0, @@ -1060,8 +1056,8 @@ Frame::Frame() sizer->Add(whisper_config_panel, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(whisper_out, /*proportion=*/1, /*flags=*/wxEXPAND); } - whisper_panel_->Hide(); whisper_ = std::make_unique(whisper_out_); + whisper_panel_->Hide(); auto* debug_panel = new wxPanel(main_panel, ID_DEBUG_PANEL); debug_panel_ = debug_panel; @@ -1180,7 +1176,7 @@ Frame::Frame() Bind(wxEVT_TIMER, &Frame::OnAppDrain, this, ID_PY_APP_DRAIN); Bind(wxEVT_BUTTON, &Frame::OnSetupPython, this, ID_PY_SETUP_BUTTON); Bind(wxEVT_BUTTON, &Frame::OnDumpMics, this, ID_PY_DUMP_MICS_BUTTON); - Bind(wxEVT_BUTTON, &Frame::OnWhisperDumpMics, this, ID_WHISPER_DUMP_MICS_BUTTON); + //Bind(wxEVT_BUTTON, &Frame::OnWhisperDumpMics, this, ID_WHISPER_DUMP_MICS_BUTTON); Bind(wxEVT_BUTTON, &Frame::OnGenerateFX, this, ID_UNITY_BUTTON_GEN_ANIMATOR); Bind(wxEVT_BUTTON, &Frame::OnListPip, this, ID_DEBUG_BUTTON_LIST_PIP); @@ -1205,6 +1201,7 @@ Frame::Frame() // Initialize input fields using AppConfig. ApplyConfigToInputFields(); + PopulateDynamicInputFields(); Resize(); OnUnityParamChangeImpl(); @@ -1258,7 +1255,8 @@ void Frame::ApplyConfigToInputFields() // Whisper panel auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); - whisper_mic->SetSelection(mic_idx); + int whisper_mic_idx = app_c_.whisper_mic; + whisper_mic->SetSelection(whisper_mic_idx); auto* whisper_lang = static_cast(FindWindowById(ID_WHISPER_LANG)); whisper_lang->SetSelection(lang_idx); @@ -1266,7 +1264,7 @@ void Frame::ApplyConfigToInputFields() auto* whisper_model = static_cast(FindWindowById(ID_WHISPER_MODEL)); int whisper_model_idx = GetDropdownChoiceIndex(kWhisperModelChoices, kNumWhisperModelChoices, app_c_.whisper_model, kWhisperModelDefault); - whisper_model->SetSelection(model_idx); + whisper_model->SetSelection(whisper_model_idx); auto* whisper_button = static_cast(FindWindowById(ID_WHISPER_BUTTON)); whisper_button->SetSelection(button_idx); @@ -1301,6 +1299,20 @@ void Frame::ApplyConfigToInputFields() unity_cols->AppendText(std::to_string(app_c_.cols)); } +void Frame::PopulateDynamicInputFields() +{ + whisper_->Init(); + std::vector mics; + if (whisper_->GetMics(mics)) { + std::vector contents(mics.size()); + auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); + for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) { + contents[i] = mics[i]; + } + whisper_mic->Set(contents); + } +} + void Frame::OnExit(wxCommandEvent& event) { OnAppStop(event); @@ -1422,17 +1434,6 @@ void Frame::OnDumpMics(wxCommandEvent& event) Log(transcribe_out_, "{}\n", PythonWrapper::DumpMics()); } -void Frame::OnWhisperDumpMics(wxCommandEvent& event) -{ - whisper_->Init(); - std::vector mics; - whisper_->GetMics(mics); - Log(whisper_out_, "Microphones:\n"); - for (int i = 0; i < mics.size(); i++) { - Log(whisper_out_, " {}: {}\n", i, mics[i]); - } -} - bool GetUserPath(const std::string& raw, std::filesystem::path& clean, const std::string& err_prefix = "", bool must_exist = true) { clean = raw; @@ -1902,7 +1903,7 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { } int which_model = whisper_model_->GetSelection(); if (which_model == wxNOT_FOUND) { - which_model = kModelDefault; + which_model = kWhisperModelDefault; } int chars_per_sync_idx = whisper_chars_per_sync_->GetSelection(); if (chars_per_sync_idx == wxNOT_FOUND) { @@ -1965,7 +1966,7 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { return; } - app_c_.microphone = kMicChoices[which_mic].ToStdString(); + app_c_.whisper_mic = which_mic; app_c_.language = kLangChoices[which_lang].ToStdString(); app_c_.whisper_model = kWhisperModelChoices[which_model].ToStdString(); app_c_.chars_per_sync = chars_per_sync; diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index ef65d7f..5d69679 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -87,6 +87,8 @@ private: // Initialize GUI input fields using `app_c_`. void ApplyConfigToInputFields(); + // Populate dynamically-generated input fields, such as microphone lists. + void PopulateDynamicInputFields(); void OnExit(wxCommandEvent& event); void OnNavbarTranscribe(wxCommandEvent& event); @@ -95,7 +97,6 @@ private: void OnNavbarWhisper(wxCommandEvent& event); void OnSetupPython(wxCommandEvent& event); void OnDumpMics(wxCommandEvent& event); - void OnWhisperDumpMics(wxCommandEvent& event); void OnAppStart(wxCommandEvent& event); void OnAppStop(wxCommandEvent& event); void OnWhisperStart(wxCommandEvent& event); diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp index 742f0a4..809415e 100644 --- a/GUI/GUI/GUI/WhisperCPP.cpp +++ b/GUI/GUI/GUI/WhisperCPP.cpp @@ -251,13 +251,12 @@ void WhisperCPP::Start(const AppConfig& c) { return; } - // TODO(yum) use app config to select mic proc_ = new AppThread([&](AppThread* thd) { Log(out_, "Transcription thread top\n"); run_ = true; Whisper::iAudioCapture* mic_stream; - if (!OpenMic(1, mic_stream)) { + if (!OpenMic(c.whisper_mic, mic_stream)) { return; } ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); }); @@ -322,7 +321,8 @@ void WhisperCPP::Start(const AppConfig& c) { Whisper::sFullParams wparams{}; context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams); wparams.language = Whisper::makeLanguageKey("en"); // TODO(yum) use config - wparams.n_max_text_ctx = 20; + // This must be set to keep memory usage from growing without bound. + wparams.n_max_text_ctx = 100; wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT { wxTextCtrl* out = static_cast(user_data); @@ -346,12 +346,28 @@ void WhisperCPP::Start(const AppConfig& c) { const int s0 = length.countSegments - n_new; for (int i = s0; i < length.countSegments; i++) { const sSegment& seg = segments[i]; - Log(out, "{} ", seg.text); + bool is_metadata = false; for (int j = 0; j < seg.countTokens; j++) { const sToken& tok = tokens[seg.firstToken + j]; - if (*tok.text == 0 || tok.text[0] == '[') { + if (tok.text[0] == '[') { continue; } + if (tok.text[0] == ' ' && ( + tok.text[1] == '[' || + tok.text[1] == '(')) { + if (tok.text[strlen(tok.text) - 1] == ']') { + continue; + } + is_metadata = true; + continue; + } + if (is_metadata && ( + tok.text[strlen(tok.text) - 1] == ']' || + tok.text[strlen(tok.text) - 1] == ')')) { + is_metadata = false; + continue; + } + Log(out, "{}", tok.text); } } if (n_new) { @@ -369,11 +385,12 @@ void WhisperCPP::Start(const AppConfig& c) { Log(app->out_, "Exit transcription loop\n"); return S_FALSE; } + // Sleeping here prevents the GUI from hanging. + // For some reason, printing is also required to prevent hanging. static int i = 0; - if (++i % 10 == 0) { + if (++i % 20 == 0) { Log(app->out_, "Spin {}\n", i); } - // Sleeping here prevents the GUI from hanging. wxThread::Sleep(10); return S_OK; }; -- cgit v1.2.3