summaryrefslogtreecommitdiffstats
path: root/GUI
diff options
context:
space:
mode:
Diffstat (limited to 'GUI')
-rw-r--r--GUI/GUI/GUI/.gitignore1
-rw-r--r--GUI/GUI/GUI/Config.cpp8
-rw-r--r--GUI/GUI/GUI/Config.h3
-rw-r--r--GUI/GUI/GUI/Frame.cpp468
-rw-r--r--GUI/GUI/GUI/Frame.h25
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj8
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj.filters6
-rw-r--r--GUI/GUI/GUI/Logging.h19
-rw-r--r--GUI/GUI/GUI/PythonWrapper.cpp16
-rw-r--r--GUI/GUI/GUI/WhisperCPP.cpp419
-rw-r--r--GUI/GUI/GUI/WhisperCPP.h63
-rw-r--r--GUI/Libraries/.gitignore1
-rw-r--r--GUI/Libraries/fetch.ps157
-rw-r--r--GUI/package.ps111
14 files changed, 1071 insertions, 34 deletions
diff --git a/GUI/GUI/GUI/.gitignore b/GUI/GUI/GUI/.gitignore
index e843fdb..36df1a2 100644
--- a/GUI/GUI/GUI/.gitignore
+++ b/GUI/GUI/GUI/.gitignore
@@ -5,3 +5,4 @@ x86
GUI.APS
# No fetched files
ryml.h
+whisper/
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 4985d52..1287619 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -87,7 +87,9 @@ AppConfig::AppConfig()
fx_path(),
params_path(),
menu_path(),
- clear_osc(false)
+ clear_osc(false),
+
+ whisper_model("ggml-base.en.bin")
{}
bool AppConfig::Serialize(const std::filesystem::path& path) {
@@ -115,6 +117,8 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
root["menu_path"] << ryml::to_substr(menu_path);
root["clear_osc"] << clear_osc;
+ root["whisper_model"] << whisper_model;
+
return Config::Serialize(path, &t);
}
@@ -154,6 +158,8 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
root.get_if("menu_path", &c.menu_path);
root.get_if("clear_osc", &c.clear_osc);
+ root.get_if("whisper_model", &c.whisper_model);
+
*this = std::move(c);
return true;
}
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index c4a2f80..07a4d8c 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -60,5 +60,8 @@ public:
std::string params_path;
std::string menu_path;
bool clear_osc;
+
+ // WhisperCPP-specific settings.
+ std::string whisper_model;
};
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 614048c..e0713c1 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -1,6 +1,7 @@
#include "Frame.h"
#include "Logging.h"
#include "PythonWrapper.h"
+#include "Util.h"
#include <filesystem>
#include <string>
@@ -15,6 +16,7 @@ namespace {
ID_NAVBAR_BUTTON_TRANSCRIBE,
ID_NAVBAR_BUTTON_UNITY,
ID_NAVBAR_BUTTON_DEBUG,
+ ID_NAVBAR_BUTTON_WHISPER,
ID_PY_PANEL,
ID_PY_CONFIG_PANEL,
ID_PY_APP_CONFIG_PANEL_PAIRS,
@@ -66,6 +68,26 @@ namespace {
ID_DEBUG_BUTTON_CLEAR_OSC,
ID_DEBUG_BUTTON_BACKUP_VENV,
ID_DEBUG_BUTTON_RESTORE_VENV,
+ ID_WHISPER_PANEL,
+ ID_WHISPER_OUT,
+ ID_WHISPER_CONFIG_PANEL,
+ ID_WHISPER_SETUP_BUTTON,
+ ID_WHISPER_DUMP_MICS_BUTTON,
+ ID_WHISPER_CONFIG_PANEL_PAIRS,
+ ID_WHISPER_MIC,
+ ID_WHISPER_LANG,
+ ID_WHISPER_MODEL,
+ ID_WHISPER_CHARS_PER_SYNC,
+ ID_WHISPER_BYTES_PER_CHAR,
+ ID_WHISPER_BUTTON,
+ ID_WHISPER_ROWS,
+ ID_WHISPER_COLS,
+ ID_WHISPER_WINDOW_DURATION,
+ ID_WHISPER_ENABLE_LOCAL_BEEP,
+ ID_WHISPER_USE_CPU,
+ ID_WHISPER_USE_BUILTIN,
+ ID_WHISPER_START_BUTTON,
+ ID_WHISPER_STOP_BUTTON,
};
const wxString kMicChoices[] = {
@@ -205,6 +227,23 @@ namespace {
const size_t kNumModelChoices = sizeof(kModelChoices) / sizeof(kModelChoices[0]);
constexpr int kModelDefault = 2; // base.en
+ // Model file names offered in the WhisperCPP panel's "Model" dropdown.
+ // The selected entry is stored in AppConfig::whisper_model, so these strings
+ // double as the persisted config values.
+ // NOTE(review): confirm every entry (in particular "ggml-large.en.bin")
+ // actually exists at the source below.
+ // Source: https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
+ const wxString kWhisperModelChoices[] = {
+ "ggml-tiny.bin",
+ "ggml-tiny.en.bin",
+ "ggml-base.bin",
+ "ggml-base.en.bin",
+ "ggml-small.bin",
+ "ggml-small.en.bin",
+ "ggml-medium.bin",
+ "ggml-medium.en.bin",
+ "ggml-large.bin",
+ "ggml-large.en.bin",
+ };
+ // Number of entries in kWhisperModelChoices.
+ const size_t kNumWhisperModelChoices = sizeof(kWhisperModelChoices) /
+ sizeof(kWhisperModelChoices[0]);
+ // Index into kWhisperModelChoices used when no valid selection exists; must
+ // be kept in sync with the order of the array above.
+ constexpr int kWhisperModelDefault = 3; // base.en
+
const wxString kCharsPerSync[] = {
"5",
"6",
@@ -266,6 +305,7 @@ namespace {
} // namespace
+using ::Logging::DrainAsyncOutput;
using ::Logging::Log;
Frame::Frame()
@@ -287,6 +327,8 @@ Frame::Frame()
ID_NAVBAR_BUTTON_UNITY, "Unity");
auto* navbar_button_debug = new wxButton(navbar,
ID_NAVBAR_BUTTON_DEBUG, "Debug");
+ auto* navbar_button_whisper = new wxButton(navbar,
+ ID_NAVBAR_BUTTON_WHISPER, "WhisperCPP");
auto* sizer = new wxBoxSizer(wxVERTICAL);
navbar->SetSizer(sizer);
@@ -297,6 +339,8 @@ Frame::Frame()
/*flags=*/wxEXPAND);
sizer->Add(navbar_button_debug, /*proportion=*/0,
/*flags=*/wxEXPAND);
+ sizer->Add(navbar_button_whisper, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
}
auto* transcribe_panel = new wxPanel(main_panel, ID_PY_PANEL);
@@ -491,7 +535,7 @@ Frame::Frame()
py_app_use_cpu_ = py_app_use_cpu;
auto* py_app_use_builtin = new wxCheckBox(py_config_panel,
- ID_PY_APP_USE_CPU, "Use built-in chatbox");
+ ID_PY_APP_USE_BUILTIN, "Use built-in chatbox");
py_app_use_builtin->SetValue(app_c_.use_builtin);
py_app_use_builtin->SetToolTip(
"If checked, text will be sent to the built-in text box "
@@ -786,6 +830,239 @@ Frame::Frame()
}
unity_panel_->Hide();
+ auto* whisper_panel = new wxPanel(main_panel, ID_WHISPER_PANEL);
+ whisper_panel_ = whisper_panel;
+ {
+ const auto whisper_out_sz = wxSize(/*x_px=*/480, /*y_px=*/160);
+ auto* whisper_out = new wxTextCtrl(whisper_panel,
+ ID_WHISPER_OUT, wxEmptyString, wxDefaultPosition,
+ whisper_out_sz, wxTE_MULTILINE | wxTE_READONLY);
+ whisper_out->SetMinSize(whisper_out_sz);
+ whisper_out_ = whisper_out;
+
+ auto* whisper_config_panel = new wxPanel(whisper_panel,
+ ID_WHISPER_CONFIG_PANEL);
+ {
+ auto* whisper_setup_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_SETUP_BUTTON, "Set up Python virtual environment");
+ whisper_setup_button->SetToolTip(
+ "TaSTT uses the Python programming language to provide both "
+ "transcription services and to interface with Unity. "
+ "It installs its dependencies into an isolated folder "
+ "called a 'virtual environment'. Click this button to "
+ "install those dependencies. This only has to be done "
+ "once when you install a new version of TaSTT.");
+ auto* whisper_dump_mics_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_DUMP_MICS_BUTTON, "List input devices");
+ whisper_dump_mics_button->SetToolTip(
+ "List the microphones (and input devices) attached to "
+ "your computer. To use a microphone, enter the number "
+ "to its left in the 'Microphone' dropdown.");
+ auto* whisper_config_panel_pairs = new wxPanel(whisper_config_panel,
+ ID_WHISPER_CONFIG_PANEL_PAIRS);
+ {
+ auto* whisper_mic = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_MIC, wxDefaultPosition,
+ wxDefaultSize, kNumMicChoices, kMicChoices);
+ whisper_mic->SetToolTip(
+ "Select which microphone to listen to when "
+ "transcribing. To get list microphones and get their "
+ "numbers, click 'List input devices'.");
+ whisper_mic_ = whisper_mic;
+
+ auto* whisper_lang = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_LANG, wxDefaultPosition, wxDefaultSize,
+ kNumLangChoices, kLangChoices);
+ whisper_lang->SetToolTip("Select which language you will "
+ "speak in. It will be whisperd into that language. "
+ "If using a language with non-ASCII characters (i.e. "
+ "not English), make sure you have 'bytes per char' "
+ "set to 2. If using something other than English, "
+ "make sure you're not using a *.en model.");
+ whisper_lang_ = whisper_lang;
+
+ auto* whisper_model = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_MODEL,
+ wxDefaultPosition, wxDefaultSize, kNumWhisperModelChoices,
+ kWhisperModelChoices);
+ whisper_model->SetToolTip("Select which version of "
+ "the transcription model to use. 'base' is a good "
+ "choice for most users. 'small' is slightly more "
+ "accurate, slower, and uses more VRAM. The *.en "
+ "models are fine-tuned English language models, and "
+ "don't work for other languages.");
+ whisper_model_ = whisper_model;
+
+ auto* whisper_chars_per_sync = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_CHARS_PER_SYNC,
+ wxDefaultPosition, wxDefaultSize, kNumCharsPerSync,
+ kCharsPerSync);
+ whisper_chars_per_sync->SetToolTip(
+ "VRChat syncs avatar parameters roughly 5 times per "
+ "second. We use this to send text to the box. By "
+ "sending more characters per sync, the box will be "
+ "faster, but you'll use more avatar parameters.");
+ whisper_chars_per_sync_ = whisper_chars_per_sync;
+
+ auto* whisper_bytes_per_char = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_BYTES_PER_CHAR,
+ wxDefaultPosition, wxDefaultSize, kNumBytesPerChar,
+ kBytesPerChar);
+ whisper_bytes_per_char->SetToolTip(
+ "If you speak a language that uses non-ASCII "
+ "characters (i.e. not English), set this to 2.");
+ whisper_bytes_per_char_ = whisper_bytes_per_char;
+
+ auto* whisper_button = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_BUTTON, wxDefaultPosition,
+ wxDefaultSize, kNumButtons, kButton);
+ whisper_button->SetToolTip(
+ "You will use this button in game to start and stop "
+ "transcription. Set it to a button you're not using "
+ "for anything else!");
+ whisper_button_ = whisper_button;
+
+ auto* whisper_rows = new wxTextCtrl(whisper_config_panel_pairs,
+ ID_WHISPER_ROWS, std::to_string(app_c_.rows),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_rows->SetToolTip(
+ "The number of rows on the text box.");
+ whisper_rows_ = whisper_rows;
+
+ auto* whisper_cols = new wxTextCtrl(whisper_config_panel_pairs,
+ ID_WHISPER_COLS, std::to_string(app_c_.cols),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_cols->SetToolTip(
+ "The number of columns on the text box.");
+ whisper_cols_ = whisper_cols;
+
+ auto* whisper_window_duration = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_WINDOW_DURATION,
+ app_c_.window_duration, wxDefaultPosition,
+ wxDefaultSize, /*style=*/0);
+ whisper_window_duration->SetToolTip(
+ "This controls how long the slice of audio that "
+ "we feed the transcription algorithm is, in seconds. "
+ "Shorter values (as low as 10 seconds) can be whisperd "
+ "more quickly, but are less accurate. Longer values "
+ "(as high as 28 seconds) take longer to whisper, "
+ "but are far more accurate.");
+ whisper_window_duration_ = whisper_window_duration;
+
+ auto* sizer = new wxFlexGridSizer(/*cols=*/2);
+ whisper_config_panel_pairs->SetSizer(sizer);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Microphone:"));
+ sizer->Add(whisper_mic, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Language:"));
+ sizer->Add(whisper_lang, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Model:"));
+ sizer->Add(whisper_model, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Characters per sync:"));
+ sizer->Add(whisper_chars_per_sync, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Bytes per character:"));
+ sizer->Add(whisper_bytes_per_char, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Button:"));
+ sizer->Add(whisper_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Text box rows:"));
+ sizer->Add(whisper_rows, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Text box columns:"));
+ sizer->Add(whisper_cols, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Window duration (s):"));
+ sizer->Add(whisper_window_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ }
+
+ auto* whisper_enable_local_beep = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_ENABLE_LOCAL_BEEP, "Enable local beep");
+ whisper_enable_local_beep->SetValue(app_c_.enable_local_beep);
+ whisper_enable_local_beep->SetToolTip(
+ "By default, TaSTT will play a sound (audible only to "
+ "you) when it begins transcription and when it stops. "
+ "Uncheck this to disable that behavior."
+ );
+ whisper_enable_local_beep_ = whisper_enable_local_beep;
+
+ auto* whisper_use_cpu = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_USE_CPU, "Use CPU");
+ whisper_use_cpu->SetValue(app_c_.use_cpu);
+ whisper_use_cpu->SetToolTip(
+ "If checked, the transcription engine will run on your "
+ "CPU instead of your GPU. This is typically much slower "
+ "and should only be used if you aren't able to use your "
+ "GPU."
+ );
+ whisper_use_cpu_ = whisper_use_cpu;
+
+ auto* whisper_use_builtin = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_USE_BUILTIN, "Use built-in chatbox");
+ whisper_use_builtin->SetValue(app_c_.use_builtin);
+ whisper_use_builtin->SetToolTip(
+ "If checked, text will be sent to the built-in text box "
+ "instead of one attached to the current avatar."
+ );
+ whisper_use_builtin_ = whisper_use_builtin;
+
+ // Hack: Add newlines before and after the button text to make
+ // the buttons bigger, and easier to click from inside VR.
+ auto* whisper_start_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_START_BUTTON, "\nBegin transcribing\n\n");
+ auto* whisper_stop_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_STOP_BUTTON, "\nStop transcribing\n\n");
+
+ auto* sizer = new wxBoxSizer(wxVERTICAL);
+ whisper_config_panel->SetSizer(sizer);
+ sizer->Add(whisper_setup_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_dump_mics_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_config_panel_pairs, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_enable_local_beep, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_use_cpu, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_use_builtin, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_start_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_stop_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ }
+
+ auto* sizer = new wxBoxSizer(wxHORIZONTAL);
+ whisper_panel->SetSizer(sizer);
+ sizer->Add(whisper_config_panel, /*proportion=*/0, /*flags=*/wxEXPAND);
+ sizer->Add(whisper_out, /*proportion=*/1, /*flags=*/wxEXPAND);
+ }
+ whisper_panel_->Hide();
+ whisper_ = std::make_unique<WhisperCPP>(whisper_out_);
+
auto* debug_panel = new wxPanel(main_panel, ID_DEBUG_PANEL);
debug_panel_ = debug_panel;
{
@@ -887,6 +1164,7 @@ Frame::Frame()
sizer->Add(transcribe_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
sizer->Add(unity_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
sizer->Add(debug_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
+ sizer->Add(whisper_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
}
Bind(wxEVT_MENU, &Frame::OnExit, this, wxID_EXIT);
@@ -894,11 +1172,15 @@ Frame::Frame()
ID_NAVBAR_BUTTON_TRANSCRIBE);
Bind(wxEVT_BUTTON, &Frame::OnNavbarUnity, this, ID_NAVBAR_BUTTON_UNITY);
Bind(wxEVT_BUTTON, &Frame::OnNavbarDebug, this, ID_NAVBAR_BUTTON_DEBUG);
+ Bind(wxEVT_BUTTON, &Frame::OnNavbarWhisper, this, ID_NAVBAR_BUTTON_WHISPER);
Bind(wxEVT_BUTTON, &Frame::OnAppStart, this, ID_PY_APP_START_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnAppStop, this, ID_PY_APP_STOP_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperStart, this, ID_WHISPER_START_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperStop, this, ID_WHISPER_STOP_BUTTON);
Bind(wxEVT_TIMER, &Frame::OnAppDrain, this, ID_PY_APP_DRAIN);
Bind(wxEVT_BUTTON, &Frame::OnSetupPython, this, ID_PY_SETUP_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnDumpMics, this, ID_PY_DUMP_MICS_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperDumpMics, this, ID_WHISPER_DUMP_MICS_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnGenerateFX, this,
ID_UNITY_BUTTON_GEN_ANIMATOR);
Bind(wxEVT_BUTTON, &Frame::OnListPip, this, ID_DEBUG_BUTTON_LIST_PIP);
@@ -974,6 +1256,35 @@ void Frame::ApplyConfigToInputFields()
py_app_cols->Clear();
py_app_cols->AppendText(std::to_string(app_c_.cols));
+ // Whisper panel
+ auto* whisper_mic = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MIC));
+ whisper_mic->SetSelection(mic_idx);
+
+ auto* whisper_lang = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_LANG));
+ whisper_lang->SetSelection(lang_idx);
+
+ auto* whisper_model = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MODEL));
+ // Look the configured model up in kWhisperModelChoices, the list this
+ // dropdown was populated from, falling back to kWhisperModelDefault.
+ int whisper_model_idx = GetDropdownChoiceIndex(kWhisperModelChoices,
+ kNumWhisperModelChoices, app_c_.whisper_model, kWhisperModelDefault);
+ whisper_model->SetSelection(whisper_model_idx);
+
+ auto* whisper_button = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_BUTTON));
+ whisper_button->SetSelection(button_idx);
+
+ auto* whisper_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_CHARS_PER_SYNC));
+ whisper_chars_per_sync->SetSelection(chars_idx);
+
+ auto* whisper_bytes_per_char = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_BYTES_PER_CHAR));
+ whisper_bytes_per_char->SetSelection(bytes_idx);
+
+ auto* whisper_rows = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_ROWS));
+ whisper_rows->Clear();
+ whisper_rows->AppendText(std::to_string(app_c_.rows));
+
+ auto* whisper_cols = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_COLS));
+ whisper_cols->Clear();
+ whisper_cols->AppendText(std::to_string(app_c_.cols));
+
// Unity panel
auto* unity_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_UNITY_CHARS_PER_SYNC));
unity_chars_per_sync->SetSelection(chars_idx);
@@ -1001,9 +1312,13 @@ void Frame::OnNavbarTranscribe(wxCommandEvent& event)
// Initialize input fields using AppConfig.
ApplyConfigToInputFields();
- transcribe_panel_->Show();
+ transcribe_panel_->Hide();
unity_panel_->Hide();
debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
+ transcribe_panel_->Show();
Resize();
}
@@ -1013,8 +1328,12 @@ void Frame::OnNavbarUnity(wxCommandEvent& event)
ApplyConfigToInputFields();
transcribe_panel_->Hide();
- unity_panel_->Show();
+ unity_panel_->Hide();
debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
+ unity_panel_->Show();
Resize();
}
@@ -1025,10 +1344,32 @@ void Frame::OnNavbarDebug(wxCommandEvent& event)
transcribe_panel_->Hide();
unity_panel_->Hide();
+ debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
debug_panel_->Show();
Resize();
}
+// Navbar handler for the "WhisperCPP" button: refreshes the input controls
+// from the current AppConfig, swaps the visible panel to the WhisperCPP one
+// and makes sure the WhisperCPP backend has been initialized.
+void Frame::OnNavbarWhisper(wxCommandEvent& event)
+{
+    // Initialize input fields using AppConfig.
+    ApplyConfigToInputFields();
+
+    // Hide every top-level panel (this one included) and re-layout before
+    // showing the target panel, mirroring the other navbar handlers.
+    wxPanel* const all_panels[] = {
+        transcribe_panel_,
+        unity_panel_,
+        debug_panel_,
+        whisper_panel_,
+    };
+    for (wxPanel* panel : all_panels) {
+        panel->Hide();
+    }
+    Resize();
+
+    whisper_panel_->Show();
+
+    // Init() returns early when it has already succeeded, so calling it on
+    // every navigation is cheap.
+    whisper_->Init();
+
+    Resize();
+}
+
void Frame::OnSetupPython(wxCommandEvent& event)
{
if (env_proc_) {
@@ -1081,6 +1422,17 @@ void Frame::OnDumpMics(wxCommandEvent& event)
Log(transcribe_out_, "{}\n", PythonWrapper::DumpMics());
}
+// Handler for the WhisperCPP panel's "List input devices" button. Logs one
+// line per microphone reported by WhisperCPP::GetMics, prefixed with its
+// index in the returned list.
+void Frame::OnWhisperDumpMics(wxCommandEvent& event)
+{
+    // GetMics needs the media layer; bail out instead of querying an
+    // uninitialized backend when Init() reports failure.
+    if (!whisper_->Init()) {
+        Log(whisper_out_, "Cannot list microphones: initialization failed\n");
+        return;
+    }
+    std::vector<std::string> mics;
+    whisper_->GetMics(mics);
+    Log(whisper_out_, "Microphones:\n");
+    // size_t index avoids the signed/unsigned comparison against size().
+    for (size_t i = 0; i < mics.size(); ++i) {
+        Log(whisper_out_, " {}: {}\n", i, mics[i]);
+    }
+}
+
bool GetUserPath(const std::string& raw, std::filesystem::path& clean,
const std::string& err_prefix = "", bool must_exist = true) {
clean = raw;
@@ -1537,25 +1889,107 @@ void Frame::OnAppStop(wxCommandEvent& event) {
}
}
-void Frame::OnAppDrain(wxTimerEvent& event) {
- DrainAsyncOutput(py_app_, transcribe_out_);
- DrainAsyncOutput(env_proc_, transcribe_out_);
-}
+// Handler for the WhisperCPP panel's "Begin transcribing" button. Reads the
+// panel's controls, validates the numeric fields, stores the values in
+// `app_c_` (serialized to AppConfig::kConfigPath) and starts the engine.
+// On a parse or range error a message is logged and nothing is started.
+void Frame::OnWhisperStart(wxCommandEvent& event) {
+    Log(whisper_out_, "Launching transcription engine\n");
-void Frame::DrainAsyncOutput(wxProcess* proc, wxTextCtrl* frame) {
- if (!proc) {
+    // A dropdown with no selection reports wxNOT_FOUND; substitute a default
+    // index so the array lookups further down stay in bounds.
+    int which_mic = whisper_mic_->GetSelection();
+    if (which_mic == wxNOT_FOUND) {
+        which_mic = kMicDefault;
+    }
+    int which_lang = whisper_lang_->GetSelection();
+    if (which_lang == wxNOT_FOUND) {
+        which_lang = kLangDefault;
+    }
+    int which_model = whisper_model_->GetSelection();
+    if (which_model == wxNOT_FOUND) {
+        // This index is used with kWhisperModelChoices below, so the
+        // fallback must be that list's default, not kModelDefault.
+        which_model = kWhisperModelDefault;
+    }
+    int chars_per_sync_idx = whisper_chars_per_sync_->GetSelection();
+    if (chars_per_sync_idx == wxNOT_FOUND) {
+        chars_per_sync_idx = kCharsDefault;
+    }
+    int bytes_per_char_idx = whisper_bytes_per_char_->GetSelection();
+    if (bytes_per_char_idx == wxNOT_FOUND) {
+        bytes_per_char_idx = kBytesDefault;
+    }
+    int button_idx = whisper_button_->GetSelection();
+    if (button_idx == wxNOT_FOUND) {
+        // NOTE(review): kBytesDefault looks like a copy-paste slip here;
+        // confirm whether a button-specific default constant exists.
+        button_idx = kBytesDefault;
+    }
+    const bool enable_local_beep = whisper_enable_local_beep_->GetValue();
+    const bool use_cpu = whisper_use_cpu_->GetValue();
+    const bool use_builtin = whisper_use_builtin_->GetValue();
+    std::string rows_str = whisper_rows_->GetValue().ToStdString();
+    std::string cols_str = whisper_cols_->GetValue().ToStdString();
+    std::string chars_per_sync_str =
+        kCharsPerSync[chars_per_sync_idx].ToStdString();
+    std::string bytes_per_char_str =
+        kBytesPerChar[bytes_per_char_idx].ToStdString();
+    std::string window_duration_str =
+        whisper_window_duration_->GetValue().ToStdString();
+    // std::stoi throws on unparsable or out-of-range text; both cases are
+    // reported to the output pane and abort the start.
+    int rows, cols, chars_per_sync, bytes_per_char, window_duration;
+    try {
+        rows = std::stoi(rows_str);
+        cols = std::stoi(cols_str);
+        chars_per_sync = std::stoi(chars_per_sync_str);
+        bytes_per_char = std::stoi(bytes_per_char_str);
+        window_duration = std::stoi(window_duration_str);
+    }
+    catch (const std::invalid_argument&) {
+        Log(whisper_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
+            "per sync \"{}\", bytes per char \"{}\" or window duration \"{}\" "
+            "as an integer\n", rows_str, cols_str, chars_per_sync_str,
+            bytes_per_char_str, window_duration_str);
+        return;
+    }
+    catch (const std::out_of_range&) {
+        Log(whisper_out_, "Rows \"{}\", cols \"{}\", chars per sync "
+            "\"{}\", bytes per char \"{}\" or window duration \"{}\" are out "
+            "of range\n", rows_str, cols_str, chars_per_sync_str,
+            bytes_per_char_str, window_duration_str);
+        return;
+    }
+    // Accepted ranges for the free-text numeric fields.
+    const int max_rows = 10;
+    const int max_cols = 240;
+    const int min_window_duration_s = 10;
+    const int max_window_duration_s = 28;
+    if (rows < 0 || rows > max_rows ||
+        cols < 0 || cols > max_cols ||
+        window_duration < min_window_duration_s ||
+        window_duration > max_window_duration_s) {
+        Log(whisper_out_, "Rows not on [{},{}] or cols not on [{},{}] or "
+            "window_duration not on [{},{}]\n",
+            0, max_rows,
+            0, max_cols,
+            min_window_duration_s, max_window_duration_s);
return;
}
- while (proc->IsInputAvailable()) {
- wxTextInputStream iss(*(proc->GetInputStream()));
- Log(frame, " {}\n", iss.ReadLine());
- }
+    // Everything validated: copy the values into the app config and save it.
+    app_c_.microphone = kMicChoices[which_mic].ToStdString();
+    app_c_.language = kLangChoices[which_lang].ToStdString();
+    app_c_.whisper_model = kWhisperModelChoices[which_model].ToStdString();
+    app_c_.chars_per_sync = chars_per_sync;
+    app_c_.bytes_per_char = bytes_per_char;
+    app_c_.button = kButton[button_idx].ToStdString();
+    app_c_.rows = rows;
+    app_c_.cols = cols;
+    app_c_.window_duration = std::to_string(window_duration);
+    app_c_.enable_local_beep = enable_local_beep;
+    app_c_.use_cpu = use_cpu;
+    app_c_.use_builtin = use_builtin;
+    app_c_.Serialize(AppConfig::kConfigPath);
- while (proc->IsErrorAvailable()) {
- wxTextInputStream iss(*(proc->GetErrorStream()));
- Log(frame, " {}\n", iss.ReadLine());
- }
+    whisper_->Start(app_c_);
+    Log(whisper_out_, "Control flow exit start button\n");
+}
+
+// Handler for the WhisperCPP panel's "Stop transcribing" button; forwards to
+// WhisperCPP::Stop().
+void Frame::OnWhisperStop(wxCommandEvent& event) {
+ whisper_->Stop();
+}
+
+// Timer handler (bound to ID_PY_APP_DRAIN): copies any pending output of the
+// transcription process and the environment-setup process into the
+// transcribe output pane. Null process pointers are ignored by
+// DrainAsyncOutput.
+void Frame::OnAppDrain(wxTimerEvent& event) {
+ DrainAsyncOutput(py_app_, transcribe_out_);
+ DrainAsyncOutput(env_proc_, transcribe_out_);
+}
void Frame::LoadAndSetIcons() {
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 7d3465f..ef65d7f 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -8,6 +8,7 @@
#endif
#include "Config.h"
+#include "WhisperCPP.h"
#include <memory>
@@ -23,10 +24,12 @@ private:
wxPanel* transcribe_panel_;
wxPanel* unity_panel_;
wxPanel* debug_panel_;
+ wxPanel* whisper_panel_;
wxTextCtrl* transcribe_out_;
wxTextCtrl* unity_out_;
wxTextCtrl* debug_out_;
+ wxTextCtrl* whisper_out_;
wxTextCtrl* unity_animator_generated_dir_;
wxTextCtrl* unity_animator_generated_name_;
@@ -38,6 +41,9 @@ private:
wxTextCtrl* py_app_window_duration_;
wxTextCtrl* unity_rows_;
wxTextCtrl* unity_cols_;
+ wxTextCtrl* whisper_rows_;
+ wxTextCtrl* whisper_cols_;
+ wxTextCtrl* whisper_window_duration_;
wxDirPickerCtrl* unity_assets_file_picker_;
wxFilePickerCtrl* unity_animator_file_picker_;
@@ -53,18 +59,32 @@ private:
wxChoice* py_app_button_;
wxChoice* unity_chars_per_sync_;
wxChoice* unity_bytes_per_char_;
+ wxChoice* whisper_mic_;
+ wxChoice* whisper_lang_;
+ wxChoice* whisper_model_;
+ wxChoice* whisper_chars_per_sync_;
+ wxChoice* whisper_bytes_per_char_;
+ wxChoice* whisper_button_;
wxCheckBox* py_app_enable_local_beep_;
wxCheckBox* py_app_use_cpu_;
wxCheckBox* py_app_use_builtin_;
wxCheckBox* unity_clear_osc_;
+ wxCheckBox* whisper_enable_local_beep_;
+ wxCheckBox* whisper_use_cpu_;
+ wxCheckBox* whisper_use_builtin_;
wxProcess* py_app_;
wxProcess* env_proc_;
wxTimer py_app_drain_;
+ wxProcess* whisper_app_;
+ wxTimer whisper_app_drain_;
+
AppConfig app_c_;
+ std::unique_ptr<WhisperCPP> whisper_;
+
// Initialize GUI input fields using `app_c_`.
void ApplyConfigToInputFields();
@@ -72,12 +92,15 @@ private:
void OnNavbarTranscribe(wxCommandEvent& event);
void OnNavbarUnity(wxCommandEvent& event);
void OnNavbarDebug(wxCommandEvent& event);
+ void OnNavbarWhisper(wxCommandEvent& event);
void OnSetupPython(wxCommandEvent& event);
void OnDumpMics(wxCommandEvent& event);
+ void OnWhisperDumpMics(wxCommandEvent& event);
void OnAppStart(wxCommandEvent& event);
void OnAppStop(wxCommandEvent& event);
+ void OnWhisperStart(wxCommandEvent& event);
+ void OnWhisperStop(wxCommandEvent& event);
void OnAppDrain(wxTimerEvent& event);
- void DrainAsyncOutput(wxProcess* proc, wxTextCtrl *frame);
void OnGenerateFX(wxCommandEvent& event);
void OnUnityParamChangeImpl();
void OnUnityParamChange(wxCommandEvent& event);
diff --git a/GUI/GUI/GUI/GUI.vcxproj b/GUI/GUI/GUI/GUI.vcxproj
index 23f3d88..0243d73 100644
--- a/GUI/GUI/GUI/GUI.vcxproj
+++ b/GUI/GUI/GUI/GUI.vcxproj
@@ -74,6 +74,10 @@
<Import Project="..\..\Libraries\wx\wxwidgets.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(ProjectDir)/whisper/</LibraryPath>
+ <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(ProjectDir)/whisper</IncludePath>
+ </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
@@ -104,6 +108,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -136,6 +141,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
@@ -145,6 +151,7 @@
<ClCompile Include="Logging.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="PythonWrapper.cpp" />
+ <ClCompile Include="WhisperCPP.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="App.h" />
@@ -156,6 +163,7 @@
<ClInclude Include="ryml.h" />
<ClInclude Include="ScopeGuard.h" />
<ClInclude Include="Util.h" />
+ <ClInclude Include="WhisperCPP.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GUI.rc" />
diff --git a/GUI/GUI/GUI/GUI.vcxproj.filters b/GUI/GUI/GUI/GUI.vcxproj.filters
index 2798d1e..1ec73d1 100644
--- a/GUI/GUI/GUI/GUI.vcxproj.filters
+++ b/GUI/GUI/GUI/GUI.vcxproj.filters
@@ -33,6 +33,9 @@
<ClCompile Include="Config.cpp">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="WhisperCPP.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="PythonWrapper.h">
@@ -62,6 +65,9 @@
<ClInclude Include="Util.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="WhisperCPP.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GUI.rc">
diff --git a/GUI/GUI/GUI/Logging.h b/GUI/GUI/GUI/Logging.h
index 9fb88fd..99462c0 100644
--- a/GUI/GUI/GUI/Logging.h
+++ b/GUI/GUI/GUI/Logging.h
@@ -8,6 +8,9 @@
#include <wx/wx.h>
#endif
+#include <wx/process.h>
+#include <wx/txtstrm.h>
+
#include <format>
#include <string>
#include <string_view>
@@ -51,5 +54,21 @@ namespace Logging {
frame->Remove(0, frame->GetLastPosition() - max_frame_len_bytes);
}
}
+
+    // Copies whatever output `proc` currently has available into `frame`,
+    // one logged line at a time: first everything pending on its standard
+    // output, then everything pending on its standard error. Does nothing
+    // when `proc` is null.
+    inline void DrainAsyncOutput(wxProcess* proc, wxTextCtrl* frame) {
+        if (proc == nullptr) {
+            return;
+        }
+
+        // Reads a single line from `in` and logs it; a fresh text reader is
+        // created per line.
+        const auto log_one_line = [frame](wxInputStream* in) {
+            wxTextInputStream line_reader(*in);
+            Log(frame, " {}\n", line_reader.ReadLine());
+        };
+
+        for (; proc->IsInputAvailable();) {
+            log_one_line(proc->GetInputStream());
+        }
+
+        for (; proc->IsErrorAvailable();) {
+            log_one_line(proc->GetErrorStream());
+        }
+    }
}
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index c353003..bcb7b1d 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -7,6 +7,7 @@
#include <stdio.h>
#include <filesystem>
+#include <fstream>
#include <sstream>
using ::Logging::Log;
@@ -167,8 +168,21 @@ std::string PythonWrapper::DumpMics() {
bool PythonWrapper::InstallPip(std::string* out) {
std::string result;
+ std::filesystem::path pip_flag = "Resources/Python/.pip_installed";
+ if (std::filesystem::exists(pip_flag)) {
+ return true;
+ }
+
std::string pip_path = "Resources/Python/get-pip.py";
- return InvokeWithArgs({ pip_path }, out);
+ if (!InvokeWithArgs({ pip_path }, out)) {
+ return false;
+ }
+
+ // Create the flag file so subsstd::chrono::milliseconds(100));equent calls don't reinstall.
+ std::ofstream flag_ofs(pip_path);
+ flag_ofs.close();
+
+ return true;
}
wxProcess* PythonWrapper::StartApp(
diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp
new file mode 100644
index 0000000..742f0a4
--- /dev/null
+++ b/GUI/GUI/GUI/WhisperCPP.cpp
@@ -0,0 +1,419 @@
+#include "Logging.h"
+#include "PythonWrapper.h"
+#include "ScopeGuard.h"
+#include "Util.h"
+#include "WhisperCPP.h"
+
+#include <unknwn.h>
+#include <wchar.h>
+#include <winerror.h>
+
+#include "whisperWindows.h"
+
+#include <charconv>
+#include <codecvt>
+#include <cwchar>
+#include <fstream>
+#include <locale>
+#include <string>
+#include <vector>
+
+using namespace Whisper;
+using ::Logging::DrainAsyncOutput;
+using ::Logging::Log;
+
+namespace {
+    // Converts a NUL-terminated wide string to a narrow (multibyte) string
+    // using the current C locale. Returns an empty string for a null input or
+    // when the text cannot be converted.
+    std::string wcharToAsciiString(const wchar_t* wc_str) {
+        if (wc_str == nullptr) {
+            return {};
+        }
+
+        // First pass: ask how many bytes are needed, terminator included.
+        // Sizing the buffer from wcslen() alone drops the last character
+        // because the terminator has to fit in the buffer as well.
+        size_t required = 0;
+        if (wcstombs_s(&required, nullptr, 0, wc_str, 0) != 0 || required == 0) {
+            return {};
+        }
+
+        std::string result(required, '\0');
+        size_t written = 0;
+        if (wcstombs_s(&written, result.data(), result.size(), wc_str, _TRUNCATE) != 0
+            || written == 0) {
+            return {};
+        }
+
+        // `written` counts the terminator; keep it out of the std::string.
+        result.resize(written - 1);
+        return result;
+    }
+
+    // Returns the system message text for `err`, or "HRESULT:<code>" when
+    // the system has no message for it. Trailing line breaks produced by
+    // FormatMessage are removed because callers append their own newline.
+    std::string hresultToString(HRESULT err) {
+        LPWSTR errorText = nullptr;
+
+        // With FORMAT_MESSAGE_ALLOCATE_BUFFER the buffer argument is really a
+        // pointer to the pointer that receives a LocalAlloc'd buffer, hence
+        // the cast.
+        const DWORD size = FormatMessageW(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+            nullptr,
+            static_cast<DWORD>(err),
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            reinterpret_cast<LPWSTR>(&errorText),
+            0,
+            nullptr);
+
+        // A zero size means no message was retrieved.
+        if (size == 0 || errorText == nullptr) {
+            return std::format("HRESULT:{}", err);
+        }
+        std::wstring errorMessage(errorText, size);
+        LocalFree(errorText);
+
+        while (!errorMessage.empty()
+            && (errorMessage.back() == L'\r' || errorMessage.back() == L'\n')) {
+            errorMessage.pop_back();
+        }
+
+        // Convert the wide string to a narrow string for printing.
+        return std::string(errorMessage.begin(), errorMessage.end());
+    }
+};
+
+// Stores the log sink and leaves the engine uninitialised: no media
+// foundation handle, no worker thread, run flag cleared. Also asks wx for a
+// concurrency level of 2 and logs whether that request was honoured.
+WhisperCPP::WhisperCPP(wxTextCtrl* out)
+    : out_(out),
+      f_(nullptr),
+      did_init_(false),
+      proc_(nullptr),
+      run_(false)
+{
+    const auto concurrency_set = wxThread::SetConcurrency(2);
+    Log(out_, "Setting concurrency to 2: {}\n", concurrency_set);
+}
+
+// Releases the media foundation handle acquired by Init().
+WhisperCPP::~WhisperCPP() {
+    // f_ stays null when Init() was never called or failed, so guard the
+    // release instead of dereferencing a null pointer.
+    if (f_ != nullptr) {
+        f_->Release();
+        f_ = nullptr;
+    }
+}
+
+// Initialises the Whisper media foundation layer once.
+// Returns true when the layer is ready (including when it already was) and
+// false, after logging the reason, when initialisation fails.
+bool WhisperCPP::Init() {
+    if (did_init_) {
+        return true;
+    }
+
+    // Initialise into a temporary so f_ is only assigned on success.
+    iMediaFoundation* tmp_f = nullptr;
+    const HRESULT err = initMediaFoundation(&tmp_f);
+    if (FAILED(err)) {
+        // Report the failure like every other method here: readable
+        // message text and a terminating newline.
+        Log(out_, "Failed to initialize media layer: {}\n", hresultToString(err));
+        return false;
+    }
+    f_ = tmp_f;
+
+    did_init_ = true;
+    Log(out_, "Initialized successfully\n");
+    return true;
+}
+
+// Replaces the contents of `mics` with the display names of the available
+// capture devices, in enumeration order.
+// Returns false, leaving `mics` untouched, when the engine is not
+// initialised or the devices cannot be enumerated.
+bool WhisperCPP::GetMics(std::vector<std::string>& mics) {
+    std::vector<sCaptureDevice> devices;
+    if (!did_init_ || !GetMicsImpl(devices)) {
+        return false;
+    }
+
+    mics.clear();
+    for (size_t i = 0; i < devices.size(); ++i) {
+        mics.push_back(wcharToAsciiString(devices[i].displayName));
+    }
+
+    return true;
+}
+
+// Opens the capture device at position `idx` of the current device list and
+// stores the resulting stream in `stream`.
+// Returns false after logging when the engine is not initialised, the index
+// is out of range, or the device cannot be opened. `stream` is reset to
+// null before the open attempt.
+bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) {
+    if (!did_init_) {
+        Log(out_, "Whisper not initialized\n");
+        return false;
+    }
+
+    std::vector<sCaptureDevice> devices;
+    if (!GetMicsImpl(devices)) {
+        return false;
+    }
+
+    // A negative index is rejected together with one past the end.
+    const bool in_range = idx >= 0 && static_cast<size_t>(idx) < devices.size();
+    if (!in_range) {
+        Log(out_, "Mic index out of range: {} vs. {}\n", idx, devices.size());
+        return false;
+    }
+
+    const sCaptureDevice& device = devices[idx];
+    Whisper::sCaptureParams capture_params{};
+    stream = nullptr;
+    const HRESULT hr = f_->openCaptureDevice(device.endpoint, capture_params, &stream);
+    if (SUCCEEDED(hr)) {
+        return true;
+    }
+
+    Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx,
+        wcharToAsciiString(device.displayName),
+        hresultToString(hr));
+    return false;
+}
+
+// Starts an asynchronous `pip install` of the Whisper Python requirements.
+//
+// On return `proc` is the running installer process, or null when a marker
+// file shows the dependencies were installed by an earlier run.
+// Returns false only when the installer process could not be launched.
+bool WhisperCPP::InstallDependencies(wxProcess*& proc) {
+    const std::filesystem::path flag_file =
+        std::filesystem::path("Resources/.whisper_deps_installed").lexically_normal();
+
+    if (std::filesystem::exists(flag_file)) {
+        proc = nullptr;
+        return true;
+    }
+
+    // Runs when the installer exits. The marker file is written here, and
+    // only for exit code 0, so a failed or killed installation is retried on
+    // the next start instead of being recorded as done. The path is captured
+    // by value because the callback outlives this function.
+    auto cb = [this, flag_file](wxProcess* finished, int ret) -> void {
+        Log(out_, "Dependency installation exited with code {}\n", ret);
+        if (ret == 0) {
+            Log(out_, "Dependency installation finished\n");
+            // Create the flag file so subsequent calls don't reinstall.
+            std::ofstream flagfile_ofs(flag_file);
+            flagfile_ofs.close();
+        }
+        DrainAsyncOutput(finished, out_);
+    };
+
+    proc = PythonWrapper::InvokeAsyncWithArgs({
+        "-u",  // Unbuffered output
+        "-m pip",
+        "install",
+        "-r Resources/Scripts/whisper_requirements.txt",
+        }, std::move(cb));
+    if (!proc) {
+        Log(out_, "Failed to launch installation thread!\n");
+        return false;
+    }
+
+    return true;
+}
+
+// Starts an asynchronous download of `model_name` from the whisper.cpp
+// dataset on Hugging Face into `fs_path`, using Python's wget module.
+// On success `proc` is the running download process. Returns false after
+// logging when the process could not be launched.
+bool WhisperCPP::DownloadModel(const std::string& model_name,
+    const std::filesystem::path& fs_path, wxProcess*& proc) {
+    // Runs when the download process exits: report the exit code and flush
+    // whatever output the process still has pending.
+    auto on_exit = [&](wxProcess* finished, int exit_code) -> void {
+        Log(out_, "Model download completed with code {}\n", exit_code);
+        if (exit_code == 0) {
+            Log(out_, "Model download finished\n");
+        }
+        DrainAsyncOutput(finished, out_);
+    };
+
+    const std::string url =
+        "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/" + model_name;
+    Log(out_, "Model will be saved to {}\n", fs_path.lexically_normal().string());
+
+    proc = PythonWrapper::InvokeAsyncWithArgs({
+        "-u",  // Unbuffered output
+        "-m wget",
+        url,
+        "-o", fs_path.string(),
+        }, std::move(on_exit));
+    if (proc) {
+        return true;
+    }
+
+    Log(out_, "Failed to launch download thread!\n");
+    return false;
+}
+
+// Converts UTF-8 text to UTF-16.
+// Returns an empty string for empty input or when the conversion fails.
+std::wstring utf8ToUtf16(const std::string& utf8) {
+    if (utf8.empty()) {
+        return {};
+    }
+
+    // Pass the explicit byte count rather than -1: with -1 the reported
+    // length includes the NUL terminator, which then ends up as an extra
+    // character inside the returned std::wstring.
+    const int utf8_len = static_cast<int>(utf8.size());
+    const int wide_str_len = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), utf8_len, nullptr, 0);
+    if (wide_str_len <= 0) {
+        return {};
+    }
+
+    std::wstring utf16(static_cast<size_t>(wide_str_len), L'\0');
+    if (MultiByteToWideChar(CP_UTF8, 0, utf8.data(), utf8_len, utf16.data(), wide_str_len) <= 0) {
+        return {};
+    }
+    return utf16;
+}
+
+// Loads the model file at `path` (UTF-8) with the GPU implementation and
+// stores it in `model`. `model` is reset to null first. Returns false after
+// logging the reason when loading fails.
+bool WhisperCPP::LoadModel(const std::string& path, Whisper::iModel*& model) {
+    model = nullptr;
+
+    const std::wstring wide_path = utf8ToUtf16(path);
+    const HRESULT hr = Whisper::loadModel(wide_path.c_str(),
+        eModelImplementation::GPU, /*flags=*/0, /*callbacks=*/nullptr, &model);
+    if (SUCCEEDED(hr)) {
+        return true;
+    }
+
+    Log(out_, "Failed to load model: {}\n", hresultToString(hr));
+    return false;
+}
+
+// Creates a transcription context from `model` and stores it in `context`.
+// `context` is reset to null first. Returns false after logging the reason
+// when creation fails.
+bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& context) {
+    context = nullptr;
+
+    const HRESULT hr = model->createContext(&context);
+    if (SUCCEEDED(hr)) {
+        return true;
+    }
+
+    Log(out_, "Failed to create context: {}\n", hresultToString(hr));
+    return false;
+}
+
+// Builds a detached wx thread that will run `cb` and reports back to `app`.
+// `cb` is a const rvalue reference, so cb_(cb) stores a copy of it.
+WhisperCPP::AppThread::AppThread(
+ const std::function<void(AppThread* thd)>&& cb,
+ WhisperCPP* app)
+ : wxThread(wxTHREAD_DETACHED), cb_(cb), app_(app)
+{}
+// Logs the teardown and clears the owner's proc_ pointer, which is what
+// allows WhisperCPP::Start() to launch a new worker afterwards.
+// NOTE(review): assumes app_ is still alive when the thread object is
+// destroyed - confirm against the owner's lifetime.
+WhisperCPP::AppThread::~AppThread()
+{
+ Log(app_->out_, "Destroy transcription thread\n");
+ app_->proc_ = nullptr;
+}
+
+// Thread body: runs the stored callback with this thread as its argument.
+// Always returns nullptr.
+void* WhisperCPP::AppThread::Entry() {
+ cb_(this);
+ return nullptr;
+}
+
+// Launches the transcription worker thread unless one is already running.
+//
+// The worker opens the microphone, makes sure pip and the Python
+// dependencies are installed, downloads the model named by
+// `c.whisper_model` into Resources/Models when it is missing, loads it and
+// then blocks in the capture loop until Stop() is called or the thread is
+// asked to exit. Every failure is logged and ends the worker.
+//
+// NOTE(review): the worker logs to out_ from a secondary thread; confirm
+// that Log() is safe to call off the GUI thread.
+void WhisperCPP::Start(const AppConfig& c) {
+    if (proc_) {
+        Log(out_, "Transcription engine already running\n");
+        return;
+    }
+
+    // Raise the run flag before the worker exists so that a Stop() issued
+    // right after Start() is not overwritten by the worker's own startup.
+    run_ = true;
+
+    // The worker is detached and outlives this call, so it must not keep a
+    // reference to `c` or to any local: capture `this` plus a private copy
+    // of the model name instead of capturing everything by reference.
+    // TODO(yum) use app config to select mic
+    AppThread* const worker = new AppThread(
+        [this, whisper_model = c.whisper_model](AppThread* thd) {
+        Log(out_, "Transcription thread top\n");
+
+        Whisper::iAudioCapture* mic_stream = nullptr;
+        if (!OpenMic(1, mic_stream)) {
+            return;
+        }
+        ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); });
+
+        {
+            // A pip failure is logged but not fatal; the dependency step
+            // below reports its own errors.
+            std::string output;
+            Log(out_, "Installing pip\n");
+            if (!PythonWrapper::InstallPip(&output)) {
+                Log(out_, "Failed to install pip: {}\n", output);
+            }
+        }
+
+        {
+            Log(out_, "Installing Python dependencies\n");
+            wxProcess* proc = nullptr;
+            if (!InstallDependencies(proc)) {
+                return;
+            }
+            // proc is null when the dependencies were already installed.
+            // Otherwise poll until the installer exits, killing it if the
+            // engine is stopped in the meantime.
+            while (proc && proc->Exists(proc->GetPid())) {
+                if (!run_ || thd->TestDestroy()) {
+                    proc->Kill(proc->GetPid(), wxSIGKILL);
+                    return;
+                }
+                wxThread::Sleep(100);
+            }
+        }
+
+        std::filesystem::path model_path = "Resources/Models";
+        model_path /= whisper_model;
+        if (std::filesystem::exists(model_path)) {
+            Log(out_, "Model found at {}\n", model_path.string());
+        }
+        else {
+            Log(out_, "Downloading model {}\n", whisper_model);
+            wxProcess* proc = nullptr;
+            model_path = model_path.lexically_normal();
+            if (!DownloadModel(whisper_model, model_path, proc)) {
+                return;
+            }
+            while (proc->Exists(proc->GetPid())) {
+                if (!run_ || thd->TestDestroy()) {
+                    proc->Kill(proc->GetPid(), wxSIGKILL);
+                    // Remove the partial file so the next start retries the
+                    // download instead of loading a truncated model.
+                    std::filesystem::remove(model_path);
+                    return;
+                }
+                wxThread::Sleep(100);
+            }
+        }
+
+        Whisper::iModel* model = nullptr;
+        if (!LoadModel(model_path.string(), model)) {
+            return;
+        }
+        ScopeGuard model_cleanup([model]() { model->Release(); });
+
+        Whisper::iContext* context = nullptr;
+        if (!CreateContext(model, context)) {
+            return;
+        }
+        ScopeGuard context_cleanup([context]() { context->Release(); });
+
+        Whisper::sFullParams wparams{};
+        context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams);
+        wparams.language = Whisper::makeLanguageKey("en");  // TODO(yum) use config
+        wparams.n_max_text_ctx = 20;
+
+        // Called by the engine when new segments are available: logs the
+        // text of the `n_new` most recent segments. Failures are logged and
+        // swallowed (S_OK) so that capture keeps running.
+        wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT {
+            wxTextCtrl* out = static_cast<wxTextCtrl*>(user_data);
+            iTranscribeResult* results = nullptr;
+            HRESULT err = context->getResults(eResultFlags::Timestamps | eResultFlags::Tokens, &results);
+            if (FAILED(err)) {
+                Log(out, "Failed to get transcription: {}\n", hresultToString(err));
+                return S_OK;
+            }
+            ScopeGuard results_cleanup([results]() { results->Release(); });
+
+            sTranscribeLength length;
+            err = results->getSize(length);
+            if (FAILED(err)) {
+                Log(out, "Failed to get transcription size: {}\n", hresultToString(err));
+                return S_OK;
+            }
+
+            const sSegment* const segments = results->getSegments();
+            const int s0 = length.countSegments - n_new;
+            for (int i = s0; i < length.countSegments; i++) {
+                const sSegment& seg = segments[i];
+                Log(out, "{} ", seg.text);
+            }
+            if (n_new) {
+                Log(out, "\n");
+            }
+
+            return S_OK;
+        };
+        wparams.new_segment_callback_user_data = out_;
+
+        // Polled by the capture loop: S_FALSE ends the capture, S_OK keeps
+        // it going.
+        sCaptureCallbacks callbacks{};
+        callbacks.shouldCancel = [](void* pv) noexcept -> HRESULT __stdcall {
+            WhisperCPP* app = static_cast<WhisperCPP*>(pv);
+            if (app->proc_->TestDestroy() || !app->run_) {
+                Log(app->out_, "Exit transcription loop\n");
+                return S_FALSE;
+            }
+            static int i = 0;
+            if (++i % 10 == 0) {
+                Log(app->out_, "Spin {}\n", i);
+            }
+            // Sleeping here prevents the GUI from hanging.
+            wxThread::Sleep(10);
+            return S_OK;
+        };
+        callbacks.pv = this;
+
+        // This will block.
+        HRESULT err = context->runCapture(wparams, callbacks, mic_stream);
+        if (FAILED(err)) {
+            Log(out_, "Capture failed: {}\n", hresultToString(err));
+            return;
+        }
+
+        Log(out_, "Exit transcription engine\n");
+    }, this);
+
+    proc_ = worker;
+    if (worker->Run() != wxTHREAD_NO_ERROR) {
+        // A detached thread that never started does not delete itself.
+        // Deleting it here also resets proc_ (see ~AppThread), so a later
+        // Start() can try again.
+        Log(out_, "Failed to start transcription thread\n");
+        run_ = false;
+        delete worker;
+        return;
+    }
+
+    Log(out_, "Success!\n");
+    return;
+}
+
+// Asks the transcription worker to stop by clearing run_. This only sets
+// the flag and returns immediately; the worker checks it in its polling
+// loops and in the capture cancel callback.
+void WhisperCPP::Stop() {
+ Log(out_, "Stopping transcription engine...\n");
+ run_ = false;
+}
+
+// Replaces the contents of `mics` with the capture devices reported by the
+// media foundation layer. Returns false after logging when enumeration
+// fails. Callers are expected to have checked did_init_ first, since f_ is
+// used without a null check.
+// NOTE(review): the sCaptureDevice records are copied as-is; confirm that
+// the strings they point to stay valid after the enumeration returns.
+bool WhisperCPP::GetMicsImpl(std::vector<sCaptureDevice>& mics) {
+    // Enumeration callback: appends the reported devices to the vector
+    // passed through `pv`.
+    pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall {
+        std::vector<sCaptureDevice>* mics = static_cast<std::vector<sCaptureDevice>*>(pv);
+        for (int i = 0; i < len; i++) {
+            mics->push_back(buf[i]);
+        }
+        return S_OK;
+    };
+    mics.clear();
+    const HRESULT err = f_->listCaptureDevices(dev_cb, &mics);
+    if (FAILED(err)) {
+        // Log the readable message like the other methods do.
+        Log(out_, "Failed to get microphones: {}\n", hresultToString(err));
+        return false;
+    }
+
+    return true;
+}
diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h
new file mode 100644
index 0000000..20b0106
--- /dev/null
+++ b/GUI/GUI/GUI/WhisperCPP.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <wx/filepicker.h>
+#include <wx/wxprec.h>
+#include <wx/process.h>
+#include <wx/thread.h>
+
+#ifndef WX_PRECOMP
+#include <wx/wx.h>
+#endif
+
+#include <unknwn.h>
+#include <wchar.h>
+#include <winerror.h>
+
+#include "whisperWindows.h"
+
+#include "Config.h"
+
+#include <filesystem>
+#include <functional>
+#include <string>
+#include <vector>
+
+// Drives speech-to-text with the Whisper library: microphone enumeration,
+// dependency and model setup, and a background transcription thread whose
+// log output goes to a wxTextCtrl.
+class WhisperCPP {
+public:
+    // `out` receives all log output.
+    WhisperCPP(wxTextCtrl* out);
+    ~WhisperCPP();
+
+    // The object holds a media foundation handle that is released in the
+    // destructor, and the worker thread keeps a pointer back to it, so
+    // copies would release the handle twice and confuse the thread.
+    WhisperCPP(const WhisperCPP&) = delete;
+    WhisperCPP& operator=(const WhisperCPP&) = delete;
+
+    // Initialises the media layer. Must succeed before GetMics()/OpenMic().
+    bool Init();
+    // Fills `mics` with the display names of the capture devices.
+    bool GetMics(std::vector<std::string>& mics);
+    // Opens the capture device at index `idx` into `stream`.
+    bool OpenMic(const int idx, Whisper::iAudioCapture*& stream);
+    // Starts the Python dependency installation; `proc` is set to null when
+    // the dependencies are already installed.
+    bool InstallDependencies(wxProcess*& proc);
+    // Starts downloading `model_name` to `fs_path`; `proc` is the download
+    // process.
+    bool DownloadModel(const std::string& model_name,
+        const std::filesystem::path& fs_path, wxProcess*& proc);
+    // Loads the model file at `path` into `model`.
+    bool LoadModel(const std::string& path, Whisper::iModel*& model);
+    // Creates a transcription context for `model`.
+    bool CreateContext(Whisper::iModel* model, Whisper::iContext*& context);
+
+    // Starts the background transcription thread if none is running.
+    void Start(const AppConfig& c);
+    // Asks the background thread to stop; returns without waiting.
+    void Stop();
+
+private:
+    // Enumerates the raw capture devices into `mics`.
+    bool GetMicsImpl(std::vector<Whisper::sCaptureDevice>& mics);
+
+    // Detached worker thread that runs a callback and clears the owner's
+    // proc_ pointer when it is destroyed.
+    class AppThread : public wxThread {
+    public:
+        AppThread(const std::function<void(AppThread* thd)>&& cb, WhisperCPP* app);
+
+        virtual ~AppThread();
+
+        virtual void* Entry() wxOVERRIDE;
+
+    private:
+        const std::function<void(AppThread* thd)> cb_;
+        WhisperCPP* app_;
+    };
+
+    wxTextCtrl* out_;                // Log sink.
+    Whisper::iMediaFoundation* f_;   // Set by Init(); null until then.
+    bool did_init_;                  // True once Init() has succeeded.
+    // Both members below are shared between the GUI and the worker thread.
+    // NOTE(review): `volatile` provides no synchronisation in C++;
+    // std::atomic would be the safer choice here.
+    AppThread* volatile proc_;       // Running worker, or null.
+    volatile bool run_;              // Cleared by Stop() to end the worker.
+};
diff --git a/GUI/Libraries/.gitignore b/GUI/Libraries/.gitignore
index fb46029..0e14a69 100644
--- a/GUI/Libraries/.gitignore
+++ b/GUI/Libraries/.gitignore
@@ -1,4 +1,5 @@
# Don't check in anything we fetch
wx
rapidyaml
+whisper
diff --git a/GUI/Libraries/fetch.ps1 b/GUI/Libraries/fetch.ps1
index 78bf0d5..f13bad5 100644
--- a/GUI/Libraries/fetch.ps1
+++ b/GUI/Libraries/fetch.ps1
@@ -1,34 +1,65 @@
+param(
+ [switch]$overwrite = $false
+)
+
Set-PSDebug -trace 0
$WX_3_2_1_URL = "https://github.com/wxWidgets/wxWidgets/releases/download/v3.2.1/wxWidgets-3.2.1.zip"
$WX_URL = $WX_3_2_1_URL
$WX_FILE = $(Split-Path -Path $WX_URL -Leaf)
+$WHISPER_1_7_0_URL = "https://github.com/Const-me/Whisper/releases/download/1.7.0/Library.zip"
+$WHISPER_URL = $WHISPER_1_7_0_URL
+$WHISPER_FILE = $(Split-Path -Path $WHISPER_URL -Leaf)
+
pushd $PSScriptRoot
# WX
-if (Test-Path wx) {
+if ((Test-Path wx) -And ($overwrite)) {
rm -Recurse wx
}
-mkdir wx
-pushd wx > $null
-Invoke-WebRequest $WX_URL -OutFile $WX_FILE
-Expand-Archive $WX_FILE -DestinationPath .
-popd > $null
+# Fetch wxWidgets only when there is no wx directory yet. An existing one is
+# removed beforehand when -overwrite is given.
+if (-Not (Test-Path wx)) {
+ mkdir wx
+ pushd wx > $null
+ # Download the release archive into wx/ and unpack it in place.
+ Invoke-WebRequest $WX_URL -OutFile $WX_FILE
+ Expand-Archive $WX_FILE -DestinationPath .
+ popd > $null
+}
# RAPIDYAML
-if (Test-Path rapidyaml) {
+if ((Test-Path rapidyaml) -And ($overwrite)) {
rm -Recurse rapidyaml
}
-git clone https://github.com/biojppm/rapidyaml
-pushd rapidyaml > $null
-git checkout v0.5.0
-git submodule update --init --recursive
+# Clone rapidyaml at v0.5.0, build the single-header ryml.h and copy it into
+# the GUI sources. Skipped when a rapidyaml directory already exists.
+if (-Not (Test-Path rapidyaml)) {
git clone https://github.com/biojppm/rapidyaml
pushd rapidyaml > $null
git checkout v0.5.0
git submodule update --init --recursive
+
+ python3 tools/amalgamate.py ryml.h
+ cp ryml.h ../../GUI/GUI/ryml.h
+ # Leave the checkout again: the pushd above only happens inside this
+ # block, and the steps that follow use paths relative to the Libraries
+ # directory.
+ popd > $null
+}
+
+if ((Test-Path whisper) -And ($overwrite)) {
+ rm -Recurse whisper
+}
-python3 tools/amalgamate.py ryml.h
-cp ryml.h ../../GUI/GUI/ryml.h
+# Fetch the prebuilt Whisper library only when there is no whisper directory
+# yet, then stage its headers, import library and DLL inside the GUI project.
+if (-Not (Test-Path whisper)) {
+ mkdir whisper
+ pushd whisper > $null
+ Invoke-WebRequest $WHISPER_URL -OutFile $WHISPER_FILE
+ Expand-Archive $WHISPER_FILE -DestinationPath .
+ # Start from an empty staging directory in the GUI project.
+ if (Test-Path ../../GUI/GUI/whisper/) {
+ rm -Recurse ../../GUI/GUI/whisper/
+ }
+ mkdir ../../GUI/GUI/whisper/
+ cp Include/*.h ../../GUI/GUI/whisper/
+ # The .lib and .dll are copied under fixed names (Whisper.lib / Whisper.dll).
+ cp Linker/*.lib ../../GUI/GUI/whisper/Whisper.lib
+ cp Binary/*.dll ../../GUI/GUI/whisper/Whisper.dll
+ popd > $null
+}
popd > $null # rapidyaml
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 400ad6c..fa9eee4 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -5,7 +5,7 @@ param(
$install_dir = "TaSTT"
if (Test-Path $install_dir) {
- rm -Recurse $install_dir
+ rm -Recurse -Force $install_dir
}
$py_dir = "Python"
@@ -61,6 +61,12 @@ if (-Not (Test-Path $git_dir)) {
Read-Host -Prompt "Press enter once PortableGit is installed at $pwd\PortableGit"
}
+#$WHISPER_CHECKPOINT_URL = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
+#$WHISPER_CHECKPOINT_FILE = $(Split-Path -Path $WHISPER_CHECKPOINT_URL -Leaf)
+#if (-Not (Test-Path $WHISPER_CHECKPOINT_FILE)) {
+# Invoke-WebRequest $WHISPER_CHECKPOINT_URL -OutFile $WHISPER_CHECKPOINT_FILE
+#}
+
mkdir $install_dir > $null
mkdir $install_dir/Resources > $null
cp -Recurse ../Animations TaSTT/Resources/Animations
@@ -75,6 +81,9 @@ cp -Recurse ../Shaders TaSTT/Resources/Shaders
cp -Recurse ../Sounds TaSTT/Resources/Sounds
cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets
cp GUI/x64/Release/GUI.exe TaSTT/TaSTT.exe
+cp GUI/GUI/Whisper/Whisper.dll TaSTT/Whisper.dll
+mkdir TaSTT/Resources/Models
+#cp $WHISPER_CHECKPOINT_FILE TaSTT/Resources/Models/
if (-Not $skip_zip) {
Compress-Archive -Path "$install_dir" -DestinationPath "$install_dir.zip" -Force