From b6dc76afb4f76a8b0eaa8f821c557cd83f69daf4 Mon Sep 17 00:00:00 2001 From: yum Date: Wed, 8 Mar 2023 14:59:57 -0800 Subject: Expose more C++ whisper parameters in GUI Expose decode method, beam search parameters, and voice activity detection parameters in GUI. * Remove WhisperCPP::Init(), do it on launch instead. * Add float support to ConfigMarshal --- GUI/GUI/GUI/Config.cpp | 29 ++++- GUI/GUI/GUI/Config.h | 9 ++ GUI/GUI/GUI/ConfigMarshal.h | 30 ++++++ GUI/GUI/GUI/Frame.cpp | 257 +++++++++++++++++++++++++++++++++++++++++--- GUI/GUI/GUI/Frame.h | 9 ++ GUI/GUI/GUI/WhisperCPP.cpp | 77 +++++++------ GUI/GUI/GUI/WhisperCPP.h | 9 +- 7 files changed, 360 insertions(+), 60 deletions(-) (limited to 'GUI') diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 5812f94..f45aa45 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -59,7 +59,7 @@ bool Config::Deserialize(const std::filesystem::path& path, return cm.Load(path); } -AppConfig::AppConfig(wxTextCtrl *out) +AppConfig::AppConfig(wxTextCtrl* out) : Config(out), microphone("index"), @@ -85,6 +85,15 @@ AppConfig::AppConfig(wxTextCtrl *out) whisper_model("ggml-medium.bin"), whisper_mic(0), + whisper_decode_method("greedy"), + whisper_max_ctxt(100), + whisper_beam_width(5), + whisper_beam_n_best(5), + whisper_vad_min_duration(0.5), + whisper_vad_max_duration(5.0), + whisper_vad_drop_start_silence(0.5), + whisper_vad_pause_duration(0.2), + whisper_vad_retain_duration(0.2), browser_src_port(9517), whisper_enable_builtin(false), @@ -118,6 +127,15 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("whisper_model", whisper_model); cm.Set("whisper_mic", whisper_mic); + cm.Set("whisper_decode_method", whisper_decode_method); + cm.Set("whisper_max_ctxt", whisper_max_ctxt); + cm.Set("whisper_beam_width", whisper_beam_width); + cm.Set("whisper_beam_n_best", whisper_beam_n_best); + cm.Set("whisper_vad_min_duration", whisper_vad_min_duration); + cm.Set("whisper_vad_max_duration", whisper_vad_max_duration); + cm.Set("whisper_vad_drop_start_silence", whisper_vad_drop_start_silence); + cm.Set("whisper_vad_pause_duration", whisper_vad_pause_duration); + cm.Set("whisper_vad_retain_duration", whisper_vad_retain_duration); cm.Set("browser_src_port", browser_src_port); cm.Set("whisper_enable_builtin", whisper_enable_builtin); @@ -164,6 +182,15 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("whisper_model", c.whisper_model); cm.Get("whisper_mic", c.whisper_mic); + cm.Get("whisper_decode_method", c.whisper_decode_method); + cm.Get("whisper_max_ctxt", c.whisper_max_ctxt); + cm.Get("whisper_beam_width", c.whisper_beam_width); + cm.Get("whisper_beam_n_best", c.whisper_beam_n_best); + cm.Get("whisper_vad_min_duration", c.whisper_vad_min_duration); + cm.Get("whisper_vad_max_duration", c.whisper_vad_max_duration); + cm.Get("whisper_vad_drop_start_silence", c.whisper_vad_drop_start_silence); + cm.Get("whisper_vad_pause_duration", c.whisper_vad_pause_duration); + cm.Get("whisper_vad_retain_duration", c.whisper_vad_retain_duration); cm.Get("browser_src_port", c.browser_src_port); cm.Get("whisper_enable_builtin", c.whisper_enable_builtin); diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index 14dcc58..f0babc2 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -74,6 +74,15 @@ public: // WhisperCPP-specific settings. std::string whisper_model; int whisper_mic; + std::string whisper_decode_method; + int whisper_max_ctxt; + int whisper_beam_width; + int whisper_beam_n_best; + float whisper_vad_min_duration; + float whisper_vad_max_duration; + float whisper_vad_drop_start_silence; + float whisper_vad_pause_duration; + float whisper_vad_retain_duration; // Browser source-specific settings. int browser_src_port; diff --git a/GUI/GUI/GUI/ConfigMarshal.h b/GUI/GUI/GUI/ConfigMarshal.h index a2f17f9..0f40ed4 100644 --- a/GUI/GUI/GUI/ConfigMarshal.h +++ b/GUI/GUI/GUI/ConfigMarshal.h @@ -30,6 +30,9 @@ public: for (const auto& [k, v] : kv_int_) { oss << k << ": " << std::to_string(v) << std::endl; } + for (const auto& [k, v] : kv_float_) { + oss << k << ": " << std::to_string(v) << std::endl; + } std::ofstream ofs(path.string()); ofs << oss.str(); @@ -66,6 +69,19 @@ public: catch (const std::invalid_argument&) {} catch (const std::out_of_range&) {} + try { + size_t pos; + float val_f = std::stof(val, &pos); + if (pos == val.length()) { + // The entire value is a float -> interpret as a float. Corollary: users + // can't store floats as strings! + kv_float_[key] = val_f; + continue; + } + } + catch (const std::invalid_argument&) {} + catch (const std::out_of_range&) {} + kv_str_[key] = val; } return true; @@ -81,6 +97,10 @@ public: kv_int_[key] = static_cast(value); return true; } + if constexpr (std::is_same_v) { + kv_float_[key] = value; + return true; + } Logging::Log(out_, "ConfigMarshal unsupported type: {}\n", typeid(T).name()); return false; } @@ -103,6 +123,15 @@ public: value = iter->second; return true; } + if constexpr (std::is_same_v) { + auto iter = kv_float_.find(key); + if (iter == kv_float_.end()) { + Logging::Log(out_, "Config contains no field named `{}`\n", key); + return false; + } + value = iter->second; + return true; + } if constexpr (std::is_same_v || std::is_same_v) { auto iter = kv_int_.find(key); if (iter == kv_int_.end()) { @@ -128,4 +157,5 @@ private: std::map kv_str_; std::map kv_int_; + std::map kv_float_; }; diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 47fe624..f7bc107 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -1,6 +1,7 @@ #include "Frame.h" #include "Logging.h" #include "PythonWrapper.h" +#include "ScopeGuard.h" #include "Util.h" #include @@ -85,6 +86,15 @@ namespace { ID_WHISPER_BROWSER_SRC_PORT, ID_WHISPER_ENABLE_LOCAL_BEEP, ID_WHISPER_USE_CPU, + ID_WHISPER_DECODE_METHOD, + ID_WHISPER_MAX_CTXT, + ID_WHISPER_BEAM_WIDTH, + ID_WHISPER_BEAM_N_BEST, + ID_WHISPER_VAD_MIN_DURATION, + ID_WHISPER_VAD_MAX_DURATION, + ID_WHISPER_VAD_DROP_START_SILENCE, + ID_WHISPER_VAD_PAUSE_DURATION, + ID_WHISPER_VAD_RETAIN_DURATION, ID_WHISPER_ENABLE_BUILTIN, ID_WHISPER_ENABLE_CUSTOM, ID_WHISPER_ENABLE_BROWSER_SRC, @@ -306,6 +316,13 @@ namespace { const size_t kNumButtons = sizeof(kButton) / sizeof(kButton[0]); constexpr int kButtonDefault = 0; + const wxString kDecodeMethods[] = { + "greedy", + "beam", + }; + const size_t kNumDecodeMethods = sizeof(kDecodeMethods) / sizeof(kDecodeMethods[0]); + constexpr int kDecodeMethodDefault = 0; + // Given the string value of a dropdown menu's entry, find its index. If no // entry matches, return `default_index`. int GetDropdownChoiceIndex(const wxString menu[], @@ -965,6 +982,74 @@ Frame::Frame() "ignore this option."); whisper_browser_src_port_ = whisper_browser_src_port; + auto* whisper_decode_method = new wxChoice( + whisper_config_panel_pairs, + ID_WHISPER_DECODE_METHOD, wxDefaultPosition, + wxDefaultSize, kNumDecodeMethods, kDecodeMethods); + whisper_decode_method->SetToolTip( + "Decoding method to use with whisper. Greedy is faster " + "and slightly less accurate."); + whisper_decode_method_ = whisper_decode_method; + + auto* whisper_max_ctxt = new wxTextCtrl( + whisper_config_panel_pairs, ID_WHISPER_MAX_CTXT, + std::to_string(app_c_->whisper_max_ctxt), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_max_ctxt->SetToolTip("TODO"); + whisper_max_ctxt_ = whisper_max_ctxt; + + auto* whisper_beam_width = new wxTextCtrl( + whisper_config_panel_pairs, ID_WHISPER_BEAM_WIDTH, + std::to_string(app_c_->whisper_beam_width), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_beam_width->SetToolTip("TODO"); + whisper_beam_width_ = whisper_beam_width; + + auto* whisper_beam_n_best = new wxTextCtrl( + whisper_config_panel_pairs, ID_WHISPER_BEAM_N_BEST, + std::to_string(app_c_->whisper_beam_n_best), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_beam_n_best->SetToolTip("TODO"); + whisper_beam_n_best_ = whisper_beam_n_best; + + auto* whisper_vad_min_duration = new wxTextCtrl( + whisper_config_panel_pairs, ID_WHISPER_VAD_MIN_DURATION, + std::to_string(app_c_->whisper_vad_min_duration), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_vad_min_duration->SetToolTip("TODO"); + whisper_vad_min_duration_ = whisper_vad_min_duration; + + auto* whisper_vad_max_duration = new wxTextCtrl( + whisper_config_panel_pairs, ID_WHISPER_VAD_MAX_DURATION, + std::to_string(app_c_->whisper_vad_max_duration), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_vad_max_duration->SetToolTip("TODO"); + whisper_vad_max_duration_ = whisper_vad_max_duration; + + auto* whisper_vad_drop_start_silence = new wxTextCtrl( + whisper_config_panel_pairs, + ID_WHISPER_VAD_DROP_START_SILENCE, + std::to_string(app_c_->whisper_vad_drop_start_silence), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_vad_drop_start_silence->SetToolTip("TODO"); + whisper_vad_drop_start_silence_ = whisper_vad_drop_start_silence; + + auto* whisper_vad_pause_duration = new wxTextCtrl( + whisper_config_panel_pairs, + ID_WHISPER_VAD_PAUSE_DURATION, + std::to_string(app_c_->whisper_vad_pause_duration), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_vad_pause_duration->SetToolTip("TODO"); + whisper_vad_pause_duration_ = whisper_vad_pause_duration; + + auto* whisper_vad_retain_duration = new wxTextCtrl( + whisper_config_panel_pairs, + ID_WHISPER_VAD_RETAIN_DURATION, + std::to_string(app_c_->whisper_vad_retain_duration), + wxDefaultPosition, wxDefaultSize, /*style=*/0); + whisper_vad_retain_duration->SetToolTip("TODO"); + whisper_vad_retain_duration_ = whisper_vad_retain_duration; + auto* sizer = new wxFlexGridSizer(/*cols=*/2); whisper_config_panel_pairs->SetSizer(sizer); @@ -983,6 +1068,51 @@ Frame::Frame() sizer->Add(whisper_model, /*proportion=*/0, /*flags=*/wxEXPAND); + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"Decode method:")); + sizer->Add(whisper_decode_method, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"Max audio contexts:")); + sizer->Add(whisper_max_ctxt, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"Beam width:")); + sizer->Add(whisper_beam_width, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"Beam n best:")); + sizer->Add(whisper_beam_n_best, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"VAD min duration:")); + sizer->Add(whisper_vad_min_duration, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"VAD max duration:")); + sizer->Add(whisper_vad_max_duration, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"VAD drop start silence:")); + sizer->Add(whisper_vad_drop_start_silence, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"VAD pause duration:")); + sizer->Add(whisper_vad_pause_duration, /*proportion=*/0, + /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(whisper_config_panel_pairs, + wxID_ANY, /*label=*/"VAD retain duration:")); + sizer->Add(whisper_vad_retain_duration, /*proportion=*/0, + /*flags=*/wxEXPAND); + #if 0 // Not implemented. sizer->Add(new wxStaticText(whisper_config_panel_pairs, @@ -1325,6 +1455,7 @@ void Frame::ApplyConfigToInputFields() auto* whisper_button = static_cast(FindWindowById(ID_WHISPER_BUTTON)); whisper_button->SetSelection(button_idx); +#if 0 auto* whisper_chars_per_sync = static_cast(FindWindowById(ID_WHISPER_CHARS_PER_SYNC)); whisper_chars_per_sync->SetSelection(chars_idx); @@ -1338,6 +1469,7 @@ void Frame::ApplyConfigToInputFields() auto* whisper_cols = static_cast(FindWindowById(ID_WHISPER_COLS)); whisper_cols->Clear(); whisper_cols->AppendText(std::to_string(app_c_->cols)); +#endif auto* whisper_browser_src_port = static_cast(FindWindowById(ID_WHISPER_BROWSER_SRC_PORT)); whisper_browser_src_port->Clear(); @@ -1358,6 +1490,35 @@ void Frame::ApplyConfigToInputFields() auto* whisper_enable_browser_src = static_cast(FindWindowById(ID_WHISPER_ENABLE_BROWSER_SRC)); whisper_enable_browser_src->SetValue(app_c_->whisper_enable_browser_src); + auto* whisper_decode_method = static_cast(FindWindowById(ID_WHISPER_DECODE_METHOD)); + int whisper_decode_method_idx = GetDropdownChoiceIndex(kDecodeMethods, + kNumDecodeMethods, app_c_->whisper_decode_method, kDecodeMethodDefault); + whisper_decode_method->SetSelection(whisper_decode_method_idx); + + auto* whisper_max_ctxt = static_cast(FindWindowById(ID_WHISPER_MAX_CTXT)); + whisper_max_ctxt->SetValue(std::to_string(app_c_->whisper_max_ctxt)); + + auto* whisper_beam_width = static_cast(FindWindowById(ID_WHISPER_BEAM_WIDTH)); + whisper_beam_width->SetValue(std::to_string(app_c_->whisper_beam_width)); + + auto* whisper_beam_n_best = static_cast(FindWindowById(ID_WHISPER_BEAM_N_BEST)); + whisper_beam_n_best->SetValue(std::to_string(app_c_->whisper_beam_n_best)); + + auto* whisper_vad_min_duration = static_cast(FindWindowById(ID_WHISPER_VAD_MIN_DURATION)); + whisper_vad_min_duration->SetValue(std::to_string(app_c_->whisper_vad_min_duration)); + + auto* whisper_vad_max_duration = static_cast(FindWindowById(ID_WHISPER_VAD_MAX_DURATION)); + whisper_vad_max_duration->SetValue(std::to_string(app_c_->whisper_vad_max_duration)); + + auto* whisper_vad_drop_start_silence = static_cast(FindWindowById(ID_WHISPER_VAD_DROP_START_SILENCE)); + whisper_vad_drop_start_silence->SetValue(std::to_string(app_c_->whisper_vad_drop_start_silence)); + + auto* whisper_vad_pause_duration = static_cast(FindWindowById(ID_WHISPER_VAD_PAUSE_DURATION)); + whisper_vad_pause_duration->SetValue(std::to_string(app_c_->whisper_vad_pause_duration)); + + auto* whisper_vad_retain_duration = static_cast(FindWindowById(ID_WHISPER_VAD_RETAIN_DURATION)); + whisper_vad_retain_duration->SetValue(std::to_string(app_c_->whisper_vad_retain_duration)); + // Unity panel auto* unity_chars_per_sync = static_cast(FindWindowById(ID_UNITY_CHARS_PER_SYNC)); unity_chars_per_sync->SetSelection(chars_idx); @@ -1376,21 +1537,25 @@ void Frame::ApplyConfigToInputFields() void Frame::PopulateDynamicInputFields() { - if (whisper_->Init()) { - std::vector mics; - if (whisper_->GetMics(mics)) { - std::vector contents(mics.size()); - auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); - for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) { - contents[i] = mics[i]; - } - int mic_idx = whisper_mic->GetSelection(); - whisper_mic->Set(contents); - if (mic_idx < contents.size()) { - whisper_mic->SetSelection(mic_idx); - } - } - } + Whisper::iMediaFoundation* f = nullptr; + if (!whisper_->GetMediaFoundation(f)) { + return; + } + ScopeGuard f_cleanup([f]() { f->Release(); }); + + std::vector mics; + if (whisper_->GetMics(f, mics)) { + std::vector contents(mics.size()); + auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); + for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) { + contents[i] = mics[i]; + } + int mic_idx = whisper_mic->GetSelection(); + whisper_mic->Set(contents); + if (mic_idx < contents.size()) { + whisper_mic->SetSelection(mic_idx); + } + } } void Frame::OnExit(wxCloseEvent& event) @@ -1990,6 +2155,10 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { if (button_idx == wxNOT_FOUND) { button_idx = kBytesDefault; } + int decode_method_idx = whisper_decode_method_->GetSelection(); + if (decode_method_idx == wxNOT_FOUND) { + decode_method_idx = kDecodeMethodDefault; + } const bool enable_local_beep = whisper_enable_local_beep_->GetValue(); const bool use_cpu = whisper_use_cpu_->GetValue(); std::string rows_str = whisper_rows_->GetValue().ToStdString(); @@ -2032,6 +2201,55 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { return; } + std::string max_ctxt_str = whisper_max_ctxt_->GetValue().ToStdString(); + std::string beam_sz_str = whisper_beam_width_->GetValue().ToStdString(); + std::string beam_wd_str = whisper_beam_n_best_->GetValue().ToStdString(); + int max_ctxt, beam_sz, beam_wd; + Log(whisper_out_, "here {}\n", __LINE__); + try { + Log(whisper_out_, "whisper max ctxt str: {}\n", max_ctxt_str); + max_ctxt = std::stoi(max_ctxt_str); + Log(whisper_out_, "whisper max ctxt: {}\n", max_ctxt); + beam_sz = std::stoi(beam_sz_str); + beam_wd = std::stoi(beam_wd_str); + } + catch (const std::invalid_argument&) { + Log(whisper_out_, "Could not parse max_ctxt '{}' beam_sz '{}' or beam_wd '{}' as an integer", + max_ctxt_str, beam_sz_str, beam_wd_str); + return; + } + catch (const std::out_of_range&) { + Log(whisper_out_, "Could not parse max_ctxt '{}', beam_sz '{}' or beam_wd '{}' as an integer: out of range", + max_ctxt_str, beam_sz_str, beam_wd_str); + return; + } + + std::string vad_min_dur_str = whisper_vad_min_duration_->GetValue().ToStdString(); + std::string vad_max_dur_str = whisper_vad_max_duration_->GetValue().ToStdString(); + std::string vad_drop_si_dur_str = whisper_vad_drop_start_silence_->GetValue().ToStdString(); + std::string vad_pause_dur_str = whisper_vad_pause_duration_->GetValue().ToStdString(); + std::string vad_ret_dur_str = whisper_vad_retain_duration_->GetValue().ToStdString(); + float vad_min_dur, vad_max_dur, vad_drop_silence_dur, vad_pause_dur, vad_retain_dur; + try { + vad_min_dur = std::stof(vad_min_dur_str); + vad_max_dur = std::stof(vad_max_dur_str); + vad_drop_silence_dur = std::stof(vad_drop_si_dur_str); + vad_pause_dur = std::stof(vad_pause_dur_str); + vad_retain_dur = std::stof(vad_ret_dur_str); + } + catch (const std::invalid_argument&) { + // TODO update error msg + Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer", + beam_sz, beam_wd); + return; + } + catch (const std::out_of_range&) { + // TODO update error msg + Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer: out of range", + beam_sz, beam_wd); + return; + } + const int min_port = 1024; const int max_port = 65535; if (browser_src_port < min_port || browser_src_port > max_port) { @@ -2054,6 +2272,15 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { app_c_->whisper_enable_browser_src = whisper_enable_browser_src_->GetValue(); app_c_->whisper_enable_builtin = whisper_enable_builtin_->GetValue(); app_c_->whisper_enable_custom = whisper_enable_custom_->GetValue(); + app_c_->whisper_decode_method = kDecodeMethods[decode_method_idx].ToStdString(); + app_c_->whisper_max_ctxt = max_ctxt; + app_c_->whisper_beam_width = beam_sz; + app_c_->whisper_beam_n_best = beam_wd; + app_c_->whisper_vad_min_duration = vad_min_dur; + app_c_->whisper_vad_max_duration = vad_max_dur; + app_c_->whisper_vad_drop_start_silence = vad_drop_silence_dur; + app_c_->whisper_vad_pause_duration = vad_pause_dur; + app_c_->whisper_vad_retain_duration = vad_retain_dur; app_c_->Serialize(AppConfig::kConfigPath); whisper_->Start(*app_c_); diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 91ec62d..0ecd268 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -45,6 +45,14 @@ private: wxTextCtrl* whisper_rows_; wxTextCtrl* whisper_cols_; wxTextCtrl* whisper_browser_src_port_; + wxTextCtrl* whisper_max_ctxt_; + wxTextCtrl* whisper_beam_width_; + wxTextCtrl* whisper_beam_n_best_; + wxTextCtrl* whisper_vad_min_duration_; + wxTextCtrl* whisper_vad_max_duration_; + wxTextCtrl* whisper_vad_drop_start_silence_; + wxTextCtrl* whisper_vad_pause_duration_; + wxTextCtrl* whisper_vad_retain_duration_; wxDirPickerCtrl* unity_assets_file_picker_; wxFilePickerCtrl* unity_animator_file_picker_; @@ -66,6 +74,7 @@ private: wxChoice* whisper_chars_per_sync_; wxChoice* whisper_bytes_per_char_; wxChoice* whisper_button_; + wxChoice* whisper_decode_method_; wxCheckBox* py_app_enable_local_beep_; wxCheckBox* py_app_use_cpu_; diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp index 6fd38b8..2b1a03f 100644 --- a/GUI/GUI/GUI/WhisperCPP.cpp +++ b/GUI/GUI/GUI/WhisperCPP.cpp @@ -65,7 +65,7 @@ namespace { }; WhisperCPP::WhisperCPP(wxTextCtrl* out) - : out_(out), f_(nullptr), did_init_(false), run_transcription_(false), run_browser_src_(false) + : out_(out), run_transcription_(false), run_browser_src_(false) { // Initialize futures so that valid() returns true. We use this as a proxy // to tell whether they're still executing. @@ -86,35 +86,23 @@ WhisperCPP::WhisperCPP(wxTextCtrl* out) } } -WhisperCPP::~WhisperCPP() { - f_->Release(); -} - -bool WhisperCPP::Init() { - if (did_init_) { - return true; - } +WhisperCPP::~WhisperCPP() {} +bool WhisperCPP::GetMediaFoundation(Whisper::iMediaFoundation*& f) { iMediaFoundation* tmp_f = nullptr; HRESULT err = initMediaFoundation(&tmp_f); if (FAILED(err)) { Log(out_, "Failed to initialize media layer: {}", err); return false; } - f_ = tmp_f; - did_init_ = true; - Log(out_, "Initialized successfully\n"); + f = tmp_f; return true; } -bool WhisperCPP::GetMics(std::vector& mics) { - if (!did_init_) { - return false; - } - +bool WhisperCPP::GetMics(Whisper::iMediaFoundation* f, std::vector& mics) { std::vector> mics_raw; - if (!GetMicsImpl(mics_raw)) { + if (!GetMicsImpl(f, mics_raw)) { return false; } @@ -126,14 +114,9 @@ bool WhisperCPP::GetMics(std::vector& mics) { return true; } -bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) { - if (!did_init_) { - Log(out_, "Whisper not initialized\n"); - return false; - } - +bool WhisperCPP::OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream) { std::vector> mics_raw; - if (!GetMicsImpl(mics_raw)) { + if (!GetMicsImpl(f, mics_raw)) { return false; } @@ -143,13 +126,13 @@ bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) { } Whisper::sCaptureParams params{}; - params.dropStartSilence = 1.0; - params.pauseDuration = 1.0; - params.minDuration = 2.0; - params.maxDuration = 3.0; - params.retainDuration = 1.5; + params.dropStartSilence = c.whisper_vad_drop_start_silence; + params.pauseDuration = c.whisper_vad_pause_duration; + params.minDuration = c.whisper_vad_min_duration; + params.maxDuration = c.whisper_vad_max_duration; + params.retainDuration = c.whisper_vad_retain_duration; stream = nullptr; - HRESULT err = f_->openCaptureDevice(mics_raw[idx]->endpoint.c_str(), + HRESULT err = f->openCaptureDevice(mics_raw[idx]->endpoint.c_str(), params, &stream); if (FAILED(err)) { Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx, @@ -255,7 +238,6 @@ bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& conte } void WhisperCPP::Start(const AppConfig& c) { - Init(); transcript_.Clear(); if (!transcription_thd_.valid()) { @@ -266,8 +248,15 @@ void WhisperCPP::Start(const AppConfig& c) { transcription_thd_ = std::async(std::launch::async, [&]() -> void { run_transcription_ = true; + iMediaFoundation* f = nullptr; + if (!GetMediaFoundation(f)) { + return; + } + ScopeGuard f_cleanup([f]() { f->Release(); }); + + Whisper::iAudioCapture* mic_stream; - if (!OpenMic(c.whisper_mic, mic_stream)) { + if (!OpenMic(f, c, c.whisper_mic, mic_stream)) { return; } ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); }); @@ -318,12 +307,22 @@ void WhisperCPP::Start(const AppConfig& c) { ScopeGuard context_cleanup([context]() { context->Release(); }); Whisper::sFullParams wparams{}; - context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams); - wparams.beam_search.beam_width = 5; - wparams.beam_search.n_best = 5; + if (c.whisper_decode_method == "greedy") { + Log(out_, "Using greedy decoding\n"); + context->fullDefaultParams(eSamplingStrategy::Greedy, &wparams); + } + else if (c.whisper_decode_method == "beam") { + Log(out_, "Using beam search decoding\n"); + context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams); + wparams.beam_search.beam_width = c.whisper_beam_width; + wparams.beam_search.n_best = c.whisper_beam_n_best; + } else { + Log(out_, "Invalid decoding method: {}\n", c.whisper_decode_method); + return; + } wparams.language = Whisper::makeLanguageKey("en"); // TODO(yum) use config // This must be set to keep memory usage from growing without bound. - wparams.n_max_text_ctx = 100; + wparams.n_max_text_ctx = c.whisper_max_ctxt; wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT { WhisperCPP* app = static_cast(user_data); @@ -540,7 +539,7 @@ void WhisperCPP::StopCustomChatbox() { Log(out_, "Done!\n"); } -bool WhisperCPP::GetMicsImpl(std::vector>& mics) { +bool WhisperCPP::GetMicsImpl(Whisper::iMediaFoundation* f, std::vector>& mics) { pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall { auto mics = static_cast>*>(pv); for (int i = 0; i < len; i++) { @@ -549,7 +548,7 @@ bool WhisperCPP::GetMicsImpl(std::vector>& mics) { return S_OK; }; mics.clear(); - HRESULT err = f_->listCaptureDevices(dev_cb, &mics); + HRESULT err = f->listCaptureDevices(dev_cb, &mics); if (FAILED(err)) { Log(out_, "Failed to get microphones: {}\n", err); return false; diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h index d58a671..530d65a 100644 --- a/GUI/GUI/GUI/WhisperCPP.h +++ b/GUI/GUI/GUI/WhisperCPP.h @@ -27,9 +27,9 @@ public: WhisperCPP(wxTextCtrl* out); ~WhisperCPP(); - bool Init(); - bool GetMics(std::vector& mics); - bool OpenMic(const int idx, Whisper::iAudioCapture*& stream); + bool GetMediaFoundation(Whisper::iMediaFoundation*& f); + bool GetMics(Whisper::iMediaFoundation* f, std::vector& mics); + bool OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream); bool InstallDependencies(); bool DownloadModel(const std::string& model_name, const std::filesystem::path& fs_path); @@ -55,11 +55,10 @@ private: std::wstring endpoint; }; bool GetMicsImpl( + Whisper::iMediaFoundation* f, std::vector>& mics); wxTextCtrl* out_; - Whisper::iMediaFoundation* f_; - bool did_init_; std::future transcription_thd_; volatile bool run_transcription_; -- cgit v1.2.3