diff options
| author | yum <yum.food.vr@gmail.com> | 2023-03-08 14:59:57 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-03-08 14:59:57 -0800 |
| commit | b6dc76afb4f76a8b0eaa8f821c557cd83f69daf4 (patch) | |
| tree | 32fcaa816aa5411e813d40cc2aebae7800aca32b /GUI | |
| parent | 5d39ab1d6091241fb228a379da64a84436138f59 (diff) | |
Expose more C++ whisper parameters in GUI
Expose the decode method, beam search parameters, and voice activity
detection (VAD) parameters in the GUI.
* Remove WhisperCPP::Init(); acquire the media foundation on launch via GetMediaFoundation() instead.
* Add float support to ConfigMarshal.
Diffstat (limited to 'GUI')
| -rw-r--r-- | GUI/GUI/GUI/Config.cpp | 29 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Config.h | 9 | ||||
| -rw-r--r-- | GUI/GUI/GUI/ConfigMarshal.h | 30 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 257 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.h | 9 | ||||
| -rw-r--r-- | GUI/GUI/GUI/WhisperCPP.cpp | 77 | ||||
| -rw-r--r-- | GUI/GUI/GUI/WhisperCPP.h | 9 |
7 files changed, 360 insertions, 60 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 5812f94..f45aa45 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -59,7 +59,7 @@ bool Config::Deserialize(const std::filesystem::path& path, return cm.Load(path);
}
-AppConfig::AppConfig(wxTextCtrl *out)
+AppConfig::AppConfig(wxTextCtrl* out)
: Config(out),
microphone("index"),
@@ -85,6 +85,15 @@ AppConfig::AppConfig(wxTextCtrl *out) whisper_model("ggml-medium.bin"),
whisper_mic(0),
+ whisper_decode_method("greedy"),
+ whisper_max_ctxt(100),
+ whisper_beam_width(5),
+ whisper_beam_n_best(5),
+ whisper_vad_min_duration(0.5),
+ whisper_vad_max_duration(5.0),
+ whisper_vad_drop_start_silence(0.5),
+ whisper_vad_pause_duration(0.2),
+ whisper_vad_retain_duration(0.2),
browser_src_port(9517),
whisper_enable_builtin(false),
@@ -118,6 +127,15 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("whisper_model", whisper_model);
cm.Set("whisper_mic", whisper_mic);
+ cm.Set("whisper_decode_method", whisper_decode_method);
+ cm.Set("whisper_max_ctxt", whisper_max_ctxt);
+ cm.Set("whisper_beam_width", whisper_beam_width);
+ cm.Set("whisper_beam_n_best", whisper_beam_n_best);
+ cm.Set("whisper_vad_min_duration", whisper_vad_min_duration);
+ cm.Set("whisper_vad_max_duration", whisper_vad_max_duration);
+ cm.Set("whisper_vad_drop_start_silence", whisper_vad_drop_start_silence);
+ cm.Set("whisper_vad_pause_duration", whisper_vad_pause_duration);
+ cm.Set("whisper_vad_retain_duration", whisper_vad_retain_duration);
cm.Set("browser_src_port", browser_src_port);
cm.Set("whisper_enable_builtin", whisper_enable_builtin);
@@ -164,6 +182,15 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("whisper_model", c.whisper_model);
cm.Get("whisper_mic", c.whisper_mic);
+ cm.Get("whisper_decode_method", c.whisper_decode_method);
+ cm.Get("whisper_max_ctxt", c.whisper_max_ctxt);
+ cm.Get("whisper_beam_width", c.whisper_beam_width);
+ cm.Get("whisper_beam_n_best", c.whisper_beam_n_best);
+ cm.Get("whisper_vad_min_duration", c.whisper_vad_min_duration);
+ cm.Get("whisper_vad_max_duration", c.whisper_vad_max_duration);
+ cm.Get("whisper_vad_drop_start_silence", c.whisper_vad_drop_start_silence);
+ cm.Get("whisper_vad_pause_duration", c.whisper_vad_pause_duration);
+ cm.Get("whisper_vad_retain_duration", c.whisper_vad_retain_duration);
cm.Get("browser_src_port", c.browser_src_port);
cm.Get("whisper_enable_builtin", c.whisper_enable_builtin);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index 14dcc58..f0babc2 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -74,6 +74,15 @@ public: // WhisperCPP-specific settings.
std::string whisper_model;
int whisper_mic;
+ std::string whisper_decode_method;
+ int whisper_max_ctxt;
+ int whisper_beam_width;
+ int whisper_beam_n_best;
+ float whisper_vad_min_duration;
+ float whisper_vad_max_duration;
+ float whisper_vad_drop_start_silence;
+ float whisper_vad_pause_duration;
+ float whisper_vad_retain_duration;
// Browser source-specific settings.
int browser_src_port;
diff --git a/GUI/GUI/GUI/ConfigMarshal.h b/GUI/GUI/GUI/ConfigMarshal.h index a2f17f9..0f40ed4 100644 --- a/GUI/GUI/GUI/ConfigMarshal.h +++ b/GUI/GUI/GUI/ConfigMarshal.h @@ -30,6 +30,9 @@ public: for (const auto& [k, v] : kv_int_) {
oss << k << ": " << std::to_string(v) << std::endl;
}
+ for (const auto& [k, v] : kv_float_) {
+ oss << k << ": " << std::to_string(v) << std::endl;
+ }
std::ofstream ofs(path.string());
ofs << oss.str();
@@ -66,6 +69,19 @@ public: catch (const std::invalid_argument&) {}
catch (const std::out_of_range&) {}
+ try {
+ size_t pos;
+ float val_f = std::stof(val, &pos);
+ if (pos == val.length()) {
+ // The entire value is a float -> interpret as a float. Corollary: users
+ // can't store floats as strings!
+ kv_float_[key] = val_f;
+ continue;
+ }
+ }
+ catch (const std::invalid_argument&) {}
+ catch (const std::out_of_range&) {}
+
kv_str_[key] = val;
}
return true;
@@ -81,6 +97,10 @@ public: kv_int_[key] = static_cast<int>(value);
return true;
}
+ if constexpr (std::is_same_v<T, float>) {
+ kv_float_[key] = value;
+ return true;
+ }
Logging::Log(out_, "ConfigMarshal unsupported type: {}\n", typeid(T).name());
return false;
}
@@ -103,6 +123,15 @@ public: value = iter->second;
return true;
}
+ if constexpr (std::is_same_v<T, float>) {
+ auto iter = kv_float_.find(key);
+ if (iter == kv_float_.end()) {
+ Logging::Log(out_, "Config contains no field named `{}`\n", key);
+ return false;
+ }
+ value = iter->second;
+ return true;
+ }
if constexpr (std::is_same_v<T, int> || std::is_same_v<T, bool>) {
auto iter = kv_int_.find(key);
if (iter == kv_int_.end()) {
@@ -128,4 +157,5 @@ private: std::map<std::string, std::string> kv_str_;
std::map<std::string, int> kv_int_;
+ std::map<std::string, float> kv_float_;
};
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 47fe624..f7bc107 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -1,6 +1,7 @@ #include "Frame.h"
#include "Logging.h"
#include "PythonWrapper.h"
+#include "ScopeGuard.h"
#include "Util.h"
#include <filesystem>
@@ -85,6 +86,15 @@ namespace { ID_WHISPER_BROWSER_SRC_PORT,
ID_WHISPER_ENABLE_LOCAL_BEEP,
ID_WHISPER_USE_CPU,
+ ID_WHISPER_DECODE_METHOD,
+ ID_WHISPER_MAX_CTXT,
+ ID_WHISPER_BEAM_WIDTH,
+ ID_WHISPER_BEAM_N_BEST,
+ ID_WHISPER_VAD_MIN_DURATION,
+ ID_WHISPER_VAD_MAX_DURATION,
+ ID_WHISPER_VAD_DROP_START_SILENCE,
+ ID_WHISPER_VAD_PAUSE_DURATION,
+ ID_WHISPER_VAD_RETAIN_DURATION,
ID_WHISPER_ENABLE_BUILTIN,
ID_WHISPER_ENABLE_CUSTOM,
ID_WHISPER_ENABLE_BROWSER_SRC,
@@ -306,6 +316,13 @@ namespace { const size_t kNumButtons = sizeof(kButton) / sizeof(kButton[0]);
constexpr int kButtonDefault = 0;
+ const wxString kDecodeMethods[] = {
+ "greedy",
+ "beam",
+ };
+ const size_t kNumDecodeMethods = sizeof(kDecodeMethods) / sizeof(kDecodeMethods[0]);
+ constexpr int kDecodeMethodDefault = 0;
+
// Given the string value of a dropdown menu's entry, find its index. If no
// entry matches, return `default_index`.
int GetDropdownChoiceIndex(const wxString menu[],
@@ -965,6 +982,74 @@ Frame::Frame() "ignore this option.");
whisper_browser_src_port_ = whisper_browser_src_port;
+ auto* whisper_decode_method = new wxChoice(
+ whisper_config_panel_pairs,
+ ID_WHISPER_DECODE_METHOD, wxDefaultPosition,
+ wxDefaultSize, kNumDecodeMethods, kDecodeMethods);
+ whisper_decode_method->SetToolTip(
+ "Decoding method to use with whisper. Greedy is faster "
+ "and slightly less accurate.");
+ whisper_decode_method_ = whisper_decode_method;
+
+ auto* whisper_max_ctxt = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_MAX_CTXT,
+ std::to_string(app_c_->whisper_max_ctxt),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_max_ctxt->SetToolTip("TODO");
+ whisper_max_ctxt_ = whisper_max_ctxt;
+
+ auto* whisper_beam_width = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_BEAM_WIDTH,
+ std::to_string(app_c_->whisper_beam_width),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_beam_width->SetToolTip("TODO");
+ whisper_beam_width_ = whisper_beam_width;
+
+ auto* whisper_beam_n_best = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_BEAM_N_BEST,
+ std::to_string(app_c_->whisper_beam_n_best),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_beam_n_best->SetToolTip("TODO");
+ whisper_beam_n_best_ = whisper_beam_n_best;
+
+ auto* whisper_vad_min_duration = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_VAD_MIN_DURATION,
+ std::to_string(app_c_->whisper_vad_min_duration),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_vad_min_duration->SetToolTip("TODO");
+ whisper_vad_min_duration_ = whisper_vad_min_duration;
+
+ auto* whisper_vad_max_duration = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_VAD_MAX_DURATION,
+ std::to_string(app_c_->whisper_vad_max_duration),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_vad_max_duration->SetToolTip("TODO");
+ whisper_vad_max_duration_ = whisper_vad_max_duration;
+
+ auto* whisper_vad_drop_start_silence = new wxTextCtrl(
+ whisper_config_panel_pairs,
+ ID_WHISPER_VAD_DROP_START_SILENCE,
+ std::to_string(app_c_->whisper_vad_drop_start_silence),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_vad_drop_start_silence->SetToolTip("TODO");
+ whisper_vad_drop_start_silence_ = whisper_vad_drop_start_silence;
+
+ auto* whisper_vad_pause_duration = new wxTextCtrl(
+ whisper_config_panel_pairs,
+ ID_WHISPER_VAD_PAUSE_DURATION,
+ std::to_string(app_c_->whisper_vad_pause_duration),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_vad_pause_duration->SetToolTip("TODO");
+ whisper_vad_pause_duration_ = whisper_vad_pause_duration;
+
+ auto* whisper_vad_retain_duration = new wxTextCtrl(
+ whisper_config_panel_pairs,
+ ID_WHISPER_VAD_RETAIN_DURATION,
+ std::to_string(app_c_->whisper_vad_retain_duration),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_vad_retain_duration->SetToolTip("TODO");
+ whisper_vad_retain_duration_ = whisper_vad_retain_duration;
+
auto* sizer = new wxFlexGridSizer(/*cols=*/2);
whisper_config_panel_pairs->SetSizer(sizer);
@@ -983,6 +1068,51 @@ Frame::Frame() sizer->Add(whisper_model, /*proportion=*/0,
/*flags=*/wxEXPAND);
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Decode method:"));
+ sizer->Add(whisper_decode_method, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Max audio contexts:"));
+ sizer->Add(whisper_max_ctxt, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Beam width:"));
+ sizer->Add(whisper_beam_width, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Beam n best:"));
+ sizer->Add(whisper_beam_n_best, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"VAD min duration:"));
+ sizer->Add(whisper_vad_min_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"VAD max duration:"));
+ sizer->Add(whisper_vad_max_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"VAD drop start silence:"));
+ sizer->Add(whisper_vad_drop_start_silence, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"VAD pause duration:"));
+ sizer->Add(whisper_vad_pause_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"VAD retain duration:"));
+ sizer->Add(whisper_vad_retain_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
#if 0
// Not implemented.
sizer->Add(new wxStaticText(whisper_config_panel_pairs,
@@ -1325,6 +1455,7 @@ void Frame::ApplyConfigToInputFields() auto* whisper_button = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_BUTTON));
whisper_button->SetSelection(button_idx);
+#if 0
auto* whisper_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_CHARS_PER_SYNC));
whisper_chars_per_sync->SetSelection(chars_idx);
@@ -1338,6 +1469,7 @@ void Frame::ApplyConfigToInputFields() auto* whisper_cols = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_COLS));
whisper_cols->Clear();
whisper_cols->AppendText(std::to_string(app_c_->cols));
+#endif
auto* whisper_browser_src_port = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_BROWSER_SRC_PORT));
whisper_browser_src_port->Clear();
@@ -1358,6 +1490,35 @@ void Frame::ApplyConfigToInputFields() auto* whisper_enable_browser_src = static_cast<wxCheckBox*>(FindWindowById(ID_WHISPER_ENABLE_BROWSER_SRC));
whisper_enable_browser_src->SetValue(app_c_->whisper_enable_browser_src);
+ auto* whisper_decode_method = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_DECODE_METHOD));
+ int whisper_decode_method_idx = GetDropdownChoiceIndex(kDecodeMethods,
+ kNumDecodeMethods, app_c_->whisper_decode_method, kDecodeMethodDefault);
+ whisper_decode_method->SetSelection(whisper_decode_method_idx);
+
+ auto* whisper_max_ctxt = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_MAX_CTXT));
+ whisper_max_ctxt->SetValue(std::to_string(app_c_->whisper_max_ctxt));
+
+ auto* whisper_beam_width = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_BEAM_WIDTH));
+ whisper_beam_width->SetValue(std::to_string(app_c_->whisper_beam_width));
+
+ auto* whisper_beam_n_best = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_BEAM_N_BEST));
+ whisper_beam_n_best->SetValue(std::to_string(app_c_->whisper_beam_n_best));
+
+ auto* whisper_vad_min_duration = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_VAD_MIN_DURATION));
+ whisper_vad_min_duration->SetValue(std::to_string(app_c_->whisper_vad_min_duration));
+
+ auto* whisper_vad_max_duration = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_VAD_MAX_DURATION));
+ whisper_vad_max_duration->SetValue(std::to_string(app_c_->whisper_vad_max_duration));
+
+ auto* whisper_vad_drop_start_silence = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_VAD_DROP_START_SILENCE));
+ whisper_vad_drop_start_silence->SetValue(std::to_string(app_c_->whisper_vad_drop_start_silence));
+
+ auto* whisper_vad_pause_duration = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_VAD_PAUSE_DURATION));
+ whisper_vad_pause_duration->SetValue(std::to_string(app_c_->whisper_vad_pause_duration));
+
+ auto* whisper_vad_retain_duration = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_VAD_RETAIN_DURATION));
+ whisper_vad_retain_duration->SetValue(std::to_string(app_c_->whisper_vad_retain_duration));
+
// Unity panel
auto* unity_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_UNITY_CHARS_PER_SYNC));
unity_chars_per_sync->SetSelection(chars_idx);
@@ -1376,21 +1537,25 @@ void Frame::ApplyConfigToInputFields() void Frame::PopulateDynamicInputFields()
{
- if (whisper_->Init()) {
- std::vector<std::string> mics;
- if (whisper_->GetMics(mics)) {
- std::vector<wxString> contents(mics.size());
- auto* whisper_mic = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MIC));
- for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) {
- contents[i] = mics[i];
- }
- int mic_idx = whisper_mic->GetSelection();
- whisper_mic->Set(contents);
- if (mic_idx < contents.size()) {
- whisper_mic->SetSelection(mic_idx);
- }
- }
- }
+ Whisper::iMediaFoundation* f = nullptr;
+ if (!whisper_->GetMediaFoundation(f)) {
+ return;
+ }
+ ScopeGuard f_cleanup([f]() { f->Release(); });
+
+ std::vector<std::string> mics;
+ if (whisper_->GetMics(f, mics)) {
+ std::vector<wxString> contents(mics.size());
+ auto* whisper_mic = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MIC));
+ for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) {
+ contents[i] = mics[i];
+ }
+ int mic_idx = whisper_mic->GetSelection();
+ whisper_mic->Set(contents);
+ if (mic_idx < contents.size()) {
+ whisper_mic->SetSelection(mic_idx);
+ }
+ }
}
void Frame::OnExit(wxCloseEvent& event)
@@ -1990,6 +2155,10 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { if (button_idx == wxNOT_FOUND) {
button_idx = kBytesDefault;
}
+ int decode_method_idx = whisper_decode_method_->GetSelection();
+ if (decode_method_idx == wxNOT_FOUND) {
+ decode_method_idx = kDecodeMethodDefault;
+ }
const bool enable_local_beep = whisper_enable_local_beep_->GetValue();
const bool use_cpu = whisper_use_cpu_->GetValue();
std::string rows_str = whisper_rows_->GetValue().ToStdString();
@@ -2032,6 +2201,55 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { return;
}
+ std::string max_ctxt_str = whisper_max_ctxt_->GetValue().ToStdString();
+ std::string beam_sz_str = whisper_beam_width_->GetValue().ToStdString();
+ std::string beam_wd_str = whisper_beam_n_best_->GetValue().ToStdString();
+ int max_ctxt, beam_sz, beam_wd;
+ Log(whisper_out_, "here {}\n", __LINE__);
+ try {
+ Log(whisper_out_, "whisper max ctxt str: {}\n", max_ctxt_str);
+ max_ctxt = std::stoi(max_ctxt_str);
+ Log(whisper_out_, "whisper max ctxt: {}\n", max_ctxt);
+ beam_sz = std::stoi(beam_sz_str);
+ beam_wd = std::stoi(beam_wd_str);
+ }
+ catch (const std::invalid_argument&) {
+ Log(whisper_out_, "Could not parse max_ctxt '{}' beam_sz '{}' or beam_wd '{}' as an integer",
+ max_ctxt_str, beam_sz_str, beam_wd_str);
+ return;
+ }
+ catch (const std::out_of_range&) {
+ Log(whisper_out_, "Could not parse max_ctxt '{}', beam_sz '{}' or beam_wd '{}' as an integer: out of range",
+ max_ctxt_str, beam_sz_str, beam_wd_str);
+ return;
+ }
+
+ std::string vad_min_dur_str = whisper_vad_min_duration_->GetValue().ToStdString();
+ std::string vad_max_dur_str = whisper_vad_max_duration_->GetValue().ToStdString();
+ std::string vad_drop_si_dur_str = whisper_vad_drop_start_silence_->GetValue().ToStdString();
+ std::string vad_pause_dur_str = whisper_vad_pause_duration_->GetValue().ToStdString();
+ std::string vad_ret_dur_str = whisper_vad_retain_duration_->GetValue().ToStdString();
+ float vad_min_dur, vad_max_dur, vad_drop_silence_dur, vad_pause_dur, vad_retain_dur;
+ try {
+ vad_min_dur = std::stof(vad_min_dur_str);
+ vad_max_dur = std::stof(vad_max_dur_str);
+ vad_drop_silence_dur = std::stof(vad_drop_si_dur_str);
+ vad_pause_dur = std::stof(vad_pause_dur_str);
+ vad_retain_dur = std::stof(vad_ret_dur_str);
+ }
+ catch (const std::invalid_argument&) {
+ // TODO update error msg
+ Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer",
+ beam_sz, beam_wd);
+ return;
+ }
+ catch (const std::out_of_range&) {
+ // TODO update error msg
+ Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer: out of range",
+ beam_sz, beam_wd);
+ return;
+ }
+
const int min_port = 1024;
const int max_port = 65535;
if (browser_src_port < min_port || browser_src_port > max_port) {
@@ -2054,6 +2272,15 @@ void Frame::OnWhisperStart(wxCommandEvent& event) { app_c_->whisper_enable_browser_src = whisper_enable_browser_src_->GetValue();
app_c_->whisper_enable_builtin = whisper_enable_builtin_->GetValue();
app_c_->whisper_enable_custom = whisper_enable_custom_->GetValue();
+ app_c_->whisper_decode_method = kDecodeMethods[decode_method_idx].ToStdString();
+ app_c_->whisper_max_ctxt = max_ctxt;
+ app_c_->whisper_beam_width = beam_sz;
+ app_c_->whisper_beam_n_best = beam_wd;
+ app_c_->whisper_vad_min_duration = vad_min_dur;
+ app_c_->whisper_vad_max_duration = vad_max_dur;
+ app_c_->whisper_vad_drop_start_silence = vad_drop_silence_dur;
+ app_c_->whisper_vad_pause_duration = vad_pause_dur;
+ app_c_->whisper_vad_retain_duration = vad_retain_dur;
app_c_->Serialize(AppConfig::kConfigPath);
whisper_->Start(*app_c_);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 91ec62d..0ecd268 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -45,6 +45,14 @@ private: wxTextCtrl* whisper_rows_;
wxTextCtrl* whisper_cols_;
wxTextCtrl* whisper_browser_src_port_;
+ wxTextCtrl* whisper_max_ctxt_;
+ wxTextCtrl* whisper_beam_width_;
+ wxTextCtrl* whisper_beam_n_best_;
+ wxTextCtrl* whisper_vad_min_duration_;
+ wxTextCtrl* whisper_vad_max_duration_;
+ wxTextCtrl* whisper_vad_drop_start_silence_;
+ wxTextCtrl* whisper_vad_pause_duration_;
+ wxTextCtrl* whisper_vad_retain_duration_;
wxDirPickerCtrl* unity_assets_file_picker_;
wxFilePickerCtrl* unity_animator_file_picker_;
@@ -66,6 +74,7 @@ private: wxChoice* whisper_chars_per_sync_;
wxChoice* whisper_bytes_per_char_;
wxChoice* whisper_button_;
+ wxChoice* whisper_decode_method_;
wxCheckBox* py_app_enable_local_beep_;
wxCheckBox* py_app_use_cpu_;
diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp index 6fd38b8..2b1a03f 100644 --- a/GUI/GUI/GUI/WhisperCPP.cpp +++ b/GUI/GUI/GUI/WhisperCPP.cpp @@ -65,7 +65,7 @@ namespace { };
WhisperCPP::WhisperCPP(wxTextCtrl* out)
- : out_(out), f_(nullptr), did_init_(false), run_transcription_(false), run_browser_src_(false)
+ : out_(out), run_transcription_(false), run_browser_src_(false)
{
// Initialize futures so that valid() returns true. We use this as a proxy
// to tell whether they're still executing.
@@ -86,35 +86,23 @@ WhisperCPP::WhisperCPP(wxTextCtrl* out) }
}
-WhisperCPP::~WhisperCPP() {
- f_->Release();
-}
-
-bool WhisperCPP::Init() {
- if (did_init_) {
- return true;
- }
+WhisperCPP::~WhisperCPP() {}
+bool WhisperCPP::GetMediaFoundation(Whisper::iMediaFoundation*& f) {
iMediaFoundation* tmp_f = nullptr;
HRESULT err = initMediaFoundation(&tmp_f);
if (FAILED(err)) {
Log(out_, "Failed to initialize media layer: {}", err);
return false;
}
- f_ = tmp_f;
- did_init_ = true;
- Log(out_, "Initialized successfully\n");
+ f = tmp_f;
return true;
}
-bool WhisperCPP::GetMics(std::vector<std::string>& mics) {
- if (!did_init_) {
- return false;
- }
-
+bool WhisperCPP::GetMics(Whisper::iMediaFoundation* f, std::vector<std::string>& mics) {
std::vector<std::unique_ptr<MicInfo>> mics_raw;
- if (!GetMicsImpl(mics_raw)) {
+ if (!GetMicsImpl(f, mics_raw)) {
return false;
}
@@ -126,14 +114,9 @@ bool WhisperCPP::GetMics(std::vector<std::string>& mics) { return true;
}
-bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) {
- if (!did_init_) {
- Log(out_, "Whisper not initialized\n");
- return false;
- }
-
+bool WhisperCPP::OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream) {
std::vector<std::unique_ptr<MicInfo>> mics_raw;
- if (!GetMicsImpl(mics_raw)) {
+ if (!GetMicsImpl(f, mics_raw)) {
return false;
}
@@ -143,13 +126,13 @@ bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) { }
Whisper::sCaptureParams params{};
- params.dropStartSilence = 1.0;
- params.pauseDuration = 1.0;
- params.minDuration = 2.0;
- params.maxDuration = 3.0;
- params.retainDuration = 1.5;
+ params.dropStartSilence = c.whisper_vad_drop_start_silence;
+ params.pauseDuration = c.whisper_vad_pause_duration;
+ params.minDuration = c.whisper_vad_min_duration;
+ params.maxDuration = c.whisper_vad_max_duration;
+ params.retainDuration = c.whisper_vad_retain_duration;
stream = nullptr;
- HRESULT err = f_->openCaptureDevice(mics_raw[idx]->endpoint.c_str(),
+ HRESULT err = f->openCaptureDevice(mics_raw[idx]->endpoint.c_str(),
params, &stream);
if (FAILED(err)) {
Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx,
@@ -255,7 +238,6 @@ bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& conte }
void WhisperCPP::Start(const AppConfig& c) {
- Init();
transcript_.Clear();
if (!transcription_thd_.valid()) {
@@ -266,8 +248,15 @@ void WhisperCPP::Start(const AppConfig& c) { transcription_thd_ = std::async(std::launch::async, [&]() -> void {
run_transcription_ = true;
+ iMediaFoundation* f = nullptr;
+ if (!GetMediaFoundation(f)) {
+ return;
+ }
+ ScopeGuard f_cleanup([f]() { f->Release(); });
+
+
Whisper::iAudioCapture* mic_stream;
- if (!OpenMic(c.whisper_mic, mic_stream)) {
+ if (!OpenMic(f, c, c.whisper_mic, mic_stream)) {
return;
}
ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); });
@@ -318,12 +307,22 @@ void WhisperCPP::Start(const AppConfig& c) { ScopeGuard context_cleanup([context]() { context->Release(); });
Whisper::sFullParams wparams{};
- context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams);
- wparams.beam_search.beam_width = 5;
- wparams.beam_search.n_best = 5;
+ if (c.whisper_decode_method == "greedy") {
+ Log(out_, "Using greedy decoding\n");
+ context->fullDefaultParams(eSamplingStrategy::Greedy, &wparams);
+ }
+ else if (c.whisper_decode_method == "beam") {
+ Log(out_, "Using beam search decoding\n");
+ context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams);
+ wparams.beam_search.beam_width = c.whisper_beam_width;
+ wparams.beam_search.n_best = c.whisper_beam_n_best;
+ } else {
+ Log(out_, "Invalid decoding method: {}\n", c.whisper_decode_method);
+ return;
+ }
wparams.language = Whisper::makeLanguageKey("en"); // TODO(yum) use config
// This must be set to keep memory usage from growing without bound.
- wparams.n_max_text_ctx = 100;
+ wparams.n_max_text_ctx = c.whisper_max_ctxt;
wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT {
WhisperCPP* app = static_cast<WhisperCPP*>(user_data);
@@ -540,7 +539,7 @@ void WhisperCPP::StopCustomChatbox() { Log(out_, "Done!\n");
}
-bool WhisperCPP::GetMicsImpl(std::vector<std::unique_ptr<MicInfo>>& mics) {
+bool WhisperCPP::GetMicsImpl(Whisper::iMediaFoundation* f, std::vector<std::unique_ptr<MicInfo>>& mics) {
pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall {
auto mics = static_cast<std::vector<std::unique_ptr<MicInfo>>*>(pv);
for (int i = 0; i < len; i++) {
@@ -549,7 +548,7 @@ bool WhisperCPP::GetMicsImpl(std::vector<std::unique_ptr<MicInfo>>& mics) { return S_OK;
};
mics.clear();
- HRESULT err = f_->listCaptureDevices(dev_cb, &mics);
+ HRESULT err = f->listCaptureDevices(dev_cb, &mics);
if (FAILED(err)) {
Log(out_, "Failed to get microphones: {}\n", err);
return false;
diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h index d58a671..530d65a 100644 --- a/GUI/GUI/GUI/WhisperCPP.h +++ b/GUI/GUI/GUI/WhisperCPP.h @@ -27,9 +27,9 @@ public: WhisperCPP(wxTextCtrl* out);
~WhisperCPP();
- bool Init();
- bool GetMics(std::vector<std::string>& mics);
- bool OpenMic(const int idx, Whisper::iAudioCapture*& stream);
+ bool GetMediaFoundation(Whisper::iMediaFoundation*& f);
+ bool GetMics(Whisper::iMediaFoundation* f, std::vector<std::string>& mics);
+ bool OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream);
bool InstallDependencies();
bool DownloadModel(const std::string& model_name,
const std::filesystem::path& fs_path);
@@ -55,11 +55,10 @@ private: std::wstring endpoint;
};
bool GetMicsImpl(
+ Whisper::iMediaFoundation* f,
std::vector<std::unique_ptr<MicInfo>>& mics);
wxTextCtrl* out_;
- Whisper::iMediaFoundation* f_;
- bool did_init_;
std::future<void> transcription_thd_;
volatile bool run_transcription_;
|
