summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-02-21 13:19:43 -0800
committeryum <yum.food.vr@gmail.com>2023-02-22 21:49:29 -0800
commit9a97fbc3c583ccd518d838faaaa36ed9aa5558e1 (patch)
tree92646de70fdd4971092de3d5cf76dce79978cd8e
parentcece1ee8f1b985c2a89adb661dd02c6d44787f67 (diff)
Begin work on C++ implementation
Use Const-me/Whisper to perform transcription. This implementation is vastly more efficient: CPU usage, memory usage, and VRAM usage are all dramatically reduced. It's slightly less accurate when comparing the same model (due to the lack of beam search decoding), but since you can use larger models, the impact is largely a wash.
-rw-r--r--GUI/GUI/GUI/.gitignore1
-rw-r--r--GUI/GUI/GUI/Config.cpp8
-rw-r--r--GUI/GUI/GUI/Config.h3
-rw-r--r--GUI/GUI/GUI/Frame.cpp468
-rw-r--r--GUI/GUI/GUI/Frame.h25
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj8
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj.filters6
-rw-r--r--GUI/GUI/GUI/Logging.h19
-rw-r--r--GUI/GUI/GUI/PythonWrapper.cpp16
-rw-r--r--GUI/GUI/GUI/WhisperCPP.cpp419
-rw-r--r--GUI/GUI/GUI/WhisperCPP.h63
-rw-r--r--GUI/Libraries/.gitignore1
-rw-r--r--GUI/Libraries/fetch.ps157
-rw-r--r--GUI/package.ps111
-rw-r--r--Scripts/string_matcher.py4
-rw-r--r--Scripts/whisper_requirements.txt8
16 files changed, 1083 insertions, 34 deletions
diff --git a/GUI/GUI/GUI/.gitignore b/GUI/GUI/GUI/.gitignore
index e843fdb..36df1a2 100644
--- a/GUI/GUI/GUI/.gitignore
+++ b/GUI/GUI/GUI/.gitignore
@@ -5,3 +5,4 @@ x86
GUI.APS
# No fetched files
ryml.h
+whisper/
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 4985d52..1287619 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -87,7 +87,9 @@ AppConfig::AppConfig()
fx_path(),
params_path(),
menu_path(),
- clear_osc(false)
+ clear_osc(false),
+
+ whisper_model("ggml-base.en.bin")
{}
bool AppConfig::Serialize(const std::filesystem::path& path) {
@@ -115,6 +117,8 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
root["menu_path"] << ryml::to_substr(menu_path);
root["clear_osc"] << clear_osc;
+ root["whisper_model"] << whisper_model;
+
return Config::Serialize(path, &t);
}
@@ -154,6 +158,8 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
root.get_if("menu_path", &c.menu_path);
root.get_if("clear_osc", &c.clear_osc);
+ root.get_if("whisper_model", &c.whisper_model);
+
*this = std::move(c);
return true;
}
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index c4a2f80..07a4d8c 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -60,5 +60,8 @@ public:
std::string params_path;
std::string menu_path;
bool clear_osc;
+
+ // WhisperCPP-specific settings.
+ std::string whisper_model;
};
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 614048c..e0713c1 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -1,6 +1,7 @@
#include "Frame.h"
#include "Logging.h"
#include "PythonWrapper.h"
+#include "Util.h"
#include <filesystem>
#include <string>
@@ -15,6 +16,7 @@ namespace {
ID_NAVBAR_BUTTON_TRANSCRIBE,
ID_NAVBAR_BUTTON_UNITY,
ID_NAVBAR_BUTTON_DEBUG,
+ ID_NAVBAR_BUTTON_WHISPER,
ID_PY_PANEL,
ID_PY_CONFIG_PANEL,
ID_PY_APP_CONFIG_PANEL_PAIRS,
@@ -66,6 +68,26 @@ namespace {
ID_DEBUG_BUTTON_CLEAR_OSC,
ID_DEBUG_BUTTON_BACKUP_VENV,
ID_DEBUG_BUTTON_RESTORE_VENV,
+ ID_WHISPER_PANEL,
+ ID_WHISPER_OUT,
+ ID_WHISPER_CONFIG_PANEL,
+ ID_WHISPER_SETUP_BUTTON,
+ ID_WHISPER_DUMP_MICS_BUTTON,
+ ID_WHISPER_CONFIG_PANEL_PAIRS,
+ ID_WHISPER_MIC,
+ ID_WHISPER_LANG,
+ ID_WHISPER_MODEL,
+ ID_WHISPER_CHARS_PER_SYNC,
+ ID_WHISPER_BYTES_PER_CHAR,
+ ID_WHISPER_BUTTON,
+ ID_WHISPER_ROWS,
+ ID_WHISPER_COLS,
+ ID_WHISPER_WINDOW_DURATION,
+ ID_WHISPER_ENABLE_LOCAL_BEEP,
+ ID_WHISPER_USE_CPU,
+ ID_WHISPER_USE_BUILTIN,
+ ID_WHISPER_START_BUTTON,
+ ID_WHISPER_STOP_BUTTON,
};
const wxString kMicChoices[] = {
@@ -205,6 +227,23 @@ namespace {
const size_t kNumModelChoices = sizeof(kModelChoices) / sizeof(kModelChoices[0]);
constexpr int kModelDefault = 2; // base.en
+ // File names of the ggml Whisper models offered in the WhisperCPP panel's
+ // "Model" dropdown.
+ // Source: https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
+ // NOTE(review): "ggml-large.en.bin" may not exist upstream (the large model
+ // appears to be published as multilingual only) - confirm against the
+ // source listing above.
+ const wxString kWhisperModelChoices[] = {
+ "ggml-tiny.bin",
+ "ggml-tiny.en.bin",
+ "ggml-base.bin",
+ "ggml-base.en.bin",
+ "ggml-small.bin",
+ "ggml-small.en.bin",
+ "ggml-medium.bin",
+ "ggml-medium.en.bin",
+ "ggml-large.bin",
+ "ggml-large.en.bin",
+ };
+ // Number of entries in kWhisperModelChoices.
+ const size_t kNumWhisperModelChoices = sizeof(kWhisperModelChoices) /
+ sizeof(kWhisperModelChoices[0]);
+ // Index into kWhisperModelChoices; must stay in sync with the array order.
+ constexpr int kWhisperModelDefault = 3; // base.en
+
const wxString kCharsPerSync[] = {
"5",
"6",
@@ -266,6 +305,7 @@ namespace {
} // namespace
+using ::Logging::DrainAsyncOutput;
using ::Logging::Log;
Frame::Frame()
@@ -287,6 +327,8 @@ Frame::Frame()
ID_NAVBAR_BUTTON_UNITY, "Unity");
auto* navbar_button_debug = new wxButton(navbar,
ID_NAVBAR_BUTTON_DEBUG, "Debug");
+ auto* navbar_button_whisper = new wxButton(navbar,
+ ID_NAVBAR_BUTTON_WHISPER, "WhisperCPP");
auto* sizer = new wxBoxSizer(wxVERTICAL);
navbar->SetSizer(sizer);
@@ -297,6 +339,8 @@ Frame::Frame()
/*flags=*/wxEXPAND);
sizer->Add(navbar_button_debug, /*proportion=*/0,
/*flags=*/wxEXPAND);
+ sizer->Add(navbar_button_whisper, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
}
auto* transcribe_panel = new wxPanel(main_panel, ID_PY_PANEL);
@@ -491,7 +535,7 @@ Frame::Frame()
py_app_use_cpu_ = py_app_use_cpu;
auto* py_app_use_builtin = new wxCheckBox(py_config_panel,
- ID_PY_APP_USE_CPU, "Use built-in chatbox");
+ ID_PY_APP_USE_BUILTIN, "Use built-in chatbox");
py_app_use_builtin->SetValue(app_c_.use_builtin);
py_app_use_builtin->SetToolTip(
"If checked, text will be sent to the built-in text box "
@@ -786,6 +830,239 @@ Frame::Frame()
}
unity_panel_->Hide();
+ auto* whisper_panel = new wxPanel(main_panel, ID_WHISPER_PANEL);
+ whisper_panel_ = whisper_panel;
+ {
+ const auto whisper_out_sz = wxSize(/*x_px=*/480, /*y_px=*/160);
+ auto* whisper_out = new wxTextCtrl(whisper_panel,
+ ID_WHISPER_OUT, wxEmptyString, wxDefaultPosition,
+ whisper_out_sz, wxTE_MULTILINE | wxTE_READONLY);
+ whisper_out->SetMinSize(whisper_out_sz);
+ whisper_out_ = whisper_out;
+
+ auto* whisper_config_panel = new wxPanel(whisper_panel,
+ ID_WHISPER_CONFIG_PANEL);
+ {
+ auto* whisper_setup_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_SETUP_BUTTON, "Set up Python virtual environment");
+ whisper_setup_button->SetToolTip(
+ "TaSTT uses the Python programming language to provide both "
+ "transcription services and to interface with Unity. "
+ "It installs its dependencies into an isolated folder "
+ "called a 'virtual environment'. Click this button to "
+ "install those dependencies. This only has to be done "
+ "once when you install a new version of TaSTT.");
+ auto* whisper_dump_mics_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_DUMP_MICS_BUTTON, "List input devices");
+ whisper_dump_mics_button->SetToolTip(
+ "List the microphones (and input devices) attached to "
+ "your computer. To use a microphone, enter the number "
+ "to its left in the 'Microphone' dropdown.");
+ auto* whisper_config_panel_pairs = new wxPanel(whisper_config_panel,
+ ID_WHISPER_CONFIG_PANEL_PAIRS);
+ {
+ auto* whisper_mic = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_MIC, wxDefaultPosition,
+ wxDefaultSize, kNumMicChoices, kMicChoices);
+ whisper_mic->SetToolTip(
+ "Select which microphone to listen to when "
+ "transcribing. To get list microphones and get their "
+ "numbers, click 'List input devices'.");
+ whisper_mic_ = whisper_mic;
+
+ auto* whisper_lang = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_LANG, wxDefaultPosition, wxDefaultSize,
+ kNumLangChoices, kLangChoices);
+ whisper_lang->SetToolTip("Select which language you will "
+ "speak in. It will be whisperd into that language. "
+ "If using a language with non-ASCII characters (i.e. "
+ "not English), make sure you have 'bytes per char' "
+ "set to 2. If using something other than English, "
+ "make sure you're not using a *.en model.");
+ whisper_lang_ = whisper_lang;
+
+ auto* whisper_model = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_MODEL,
+ wxDefaultPosition, wxDefaultSize, kNumWhisperModelChoices,
+ kWhisperModelChoices);
+ whisper_model->SetToolTip("Select which version of "
+ "the transcription model to use. 'base' is a good "
+ "choice for most users. 'small' is slightly more "
+ "accurate, slower, and uses more VRAM. The *.en "
+ "models are fine-tuned English language models, and "
+ "don't work for other languages.");
+ whisper_model_ = whisper_model;
+
+ auto* whisper_chars_per_sync = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_CHARS_PER_SYNC,
+ wxDefaultPosition, wxDefaultSize, kNumCharsPerSync,
+ kCharsPerSync);
+ whisper_chars_per_sync->SetToolTip(
+ "VRChat syncs avatar parameters roughly 5 times per "
+ "second. We use this to send text to the box. By "
+ "sending more characters per sync, the box will be "
+ "faster, but you'll use more avatar parameters.");
+ whisper_chars_per_sync_ = whisper_chars_per_sync;
+
+ auto* whisper_bytes_per_char = new wxChoice(
+ whisper_config_panel_pairs, ID_WHISPER_BYTES_PER_CHAR,
+ wxDefaultPosition, wxDefaultSize, kNumBytesPerChar,
+ kBytesPerChar);
+ whisper_bytes_per_char->SetToolTip(
+ "If you speak a language that uses non-ASCII "
+ "characters (i.e. not English), set this to 2.");
+ whisper_bytes_per_char_ = whisper_bytes_per_char;
+
+ auto* whisper_button = new wxChoice(whisper_config_panel_pairs,
+ ID_WHISPER_BUTTON, wxDefaultPosition,
+ wxDefaultSize, kNumButtons, kButton);
+ whisper_button->SetToolTip(
+ "You will use this button in game to start and stop "
+ "transcription. Set it to a button you're not using "
+ "for anything else!");
+ whisper_button_ = whisper_button;
+
+ auto* whisper_rows = new wxTextCtrl(whisper_config_panel_pairs,
+ ID_WHISPER_ROWS, std::to_string(app_c_.rows),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_rows->SetToolTip(
+ "The number of rows on the text box.");
+ whisper_rows_ = whisper_rows;
+
+ auto* whisper_cols = new wxTextCtrl(whisper_config_panel_pairs,
+ ID_WHISPER_COLS, std::to_string(app_c_.cols),
+ wxDefaultPosition, wxDefaultSize, /*style=*/0);
+ whisper_cols->SetToolTip(
+ "The number of columns on the text box.");
+ whisper_cols_ = whisper_cols;
+
+ auto* whisper_window_duration = new wxTextCtrl(
+ whisper_config_panel_pairs, ID_WHISPER_WINDOW_DURATION,
+ app_c_.window_duration, wxDefaultPosition,
+ wxDefaultSize, /*style=*/0);
+ whisper_window_duration->SetToolTip(
+ "This controls how long the slice of audio that "
+ "we feed the transcription algorithm is, in seconds. "
+ "Shorter values (as low as 10 seconds) can be whisperd "
+ "more quickly, but are less accurate. Longer values "
+ "(as high as 28 seconds) take longer to whisper, "
+ "but are far more accurate.");
+ whisper_window_duration_ = whisper_window_duration;
+
+ auto* sizer = new wxFlexGridSizer(/*cols=*/2);
+ whisper_config_panel_pairs->SetSizer(sizer);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Microphone:"));
+ sizer->Add(whisper_mic, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Language:"));
+ sizer->Add(whisper_lang, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Model:"));
+ sizer->Add(whisper_model, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Characters per sync:"));
+ sizer->Add(whisper_chars_per_sync, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Bytes per character:"));
+ sizer->Add(whisper_bytes_per_char, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Button:"));
+ sizer->Add(whisper_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Text box rows:"));
+ sizer->Add(whisper_rows, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Text box columns:"));
+ sizer->Add(whisper_cols, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(whisper_config_panel_pairs,
+ wxID_ANY, /*label=*/"Window duration (s):"));
+ sizer->Add(whisper_window_duration, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ }
+
+ auto* whisper_enable_local_beep = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_ENABLE_LOCAL_BEEP, "Enable local beep");
+ whisper_enable_local_beep->SetValue(app_c_.enable_local_beep);
+ whisper_enable_local_beep->SetToolTip(
+ "By default, TaSTT will play a sound (audible only to "
+ "you) when it begins transcription and when it stops. "
+ "Uncheck this to disable that behavior."
+ );
+ whisper_enable_local_beep_ = whisper_enable_local_beep;
+
+ auto* whisper_use_cpu = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_USE_CPU, "Use CPU");
+ whisper_use_cpu->SetValue(app_c_.use_cpu);
+ whisper_use_cpu->SetToolTip(
+ "If checked, the transcription engine will run on your "
+ "CPU instead of your GPU. This is typically much slower "
+ "and should only be used if you aren't able to use your "
+ "GPU."
+ );
+ whisper_use_cpu_ = whisper_use_cpu;
+
+ auto* whisper_use_builtin = new wxCheckBox(whisper_config_panel,
+ ID_WHISPER_USE_BUILTIN, "Use built-in chatbox");
+ whisper_use_builtin->SetValue(app_c_.use_builtin);
+ whisper_use_builtin->SetToolTip(
+ "If checked, text will be sent to the built-in text box "
+ "instead of one attached to the current avatar."
+ );
+ whisper_use_builtin_ = whisper_use_builtin;
+
+ // Hack: Add newlines before and after the button text to make
+ // the buttons bigger, and easier to click from inside VR.
+ auto* whisper_start_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_START_BUTTON, "\nBegin transcribing\n\n");
+ auto* whisper_stop_button = new wxButton(whisper_config_panel,
+ ID_WHISPER_STOP_BUTTON, "\nStop transcribing\n\n");
+
+ auto* sizer = new wxBoxSizer(wxVERTICAL);
+ whisper_config_panel->SetSizer(sizer);
+ sizer->Add(whisper_setup_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_dump_mics_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_config_panel_pairs, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_enable_local_beep, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_use_cpu, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_use_builtin, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_start_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ sizer->Add(whisper_stop_button, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+ }
+
+ auto* sizer = new wxBoxSizer(wxHORIZONTAL);
+ whisper_panel->SetSizer(sizer);
+ sizer->Add(whisper_config_panel, /*proportion=*/0, /*flags=*/wxEXPAND);
+ sizer->Add(whisper_out, /*proportion=*/1, /*flags=*/wxEXPAND);
+ }
+ whisper_panel_->Hide();
+ whisper_ = std::make_unique<WhisperCPP>(whisper_out_);
+
auto* debug_panel = new wxPanel(main_panel, ID_DEBUG_PANEL);
debug_panel_ = debug_panel;
{
@@ -887,6 +1164,7 @@ Frame::Frame()
sizer->Add(transcribe_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
sizer->Add(unity_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
sizer->Add(debug_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
+ sizer->Add(whisper_panel, /*proportion=*/1, /*flags=*/wxEXPAND);
}
Bind(wxEVT_MENU, &Frame::OnExit, this, wxID_EXIT);
@@ -894,11 +1172,15 @@ Frame::Frame()
ID_NAVBAR_BUTTON_TRANSCRIBE);
Bind(wxEVT_BUTTON, &Frame::OnNavbarUnity, this, ID_NAVBAR_BUTTON_UNITY);
Bind(wxEVT_BUTTON, &Frame::OnNavbarDebug, this, ID_NAVBAR_BUTTON_DEBUG);
+ Bind(wxEVT_BUTTON, &Frame::OnNavbarWhisper, this, ID_NAVBAR_BUTTON_WHISPER);
Bind(wxEVT_BUTTON, &Frame::OnAppStart, this, ID_PY_APP_START_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnAppStop, this, ID_PY_APP_STOP_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperStart, this, ID_WHISPER_START_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperStop, this, ID_WHISPER_STOP_BUTTON);
Bind(wxEVT_TIMER, &Frame::OnAppDrain, this, ID_PY_APP_DRAIN);
Bind(wxEVT_BUTTON, &Frame::OnSetupPython, this, ID_PY_SETUP_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnDumpMics, this, ID_PY_DUMP_MICS_BUTTON);
+ Bind(wxEVT_BUTTON, &Frame::OnWhisperDumpMics, this, ID_WHISPER_DUMP_MICS_BUTTON);
Bind(wxEVT_BUTTON, &Frame::OnGenerateFX, this,
ID_UNITY_BUTTON_GEN_ANIMATOR);
Bind(wxEVT_BUTTON, &Frame::OnListPip, this, ID_DEBUG_BUTTON_LIST_PIP);
@@ -974,6 +1256,35 @@ void Frame::ApplyConfigToInputFields()
py_app_cols->Clear();
py_app_cols->AppendText(std::to_string(app_c_.cols));
+ // Whisper panel
+ auto* whisper_mic = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MIC));
+ whisper_mic->SetSelection(mic_idx);
+
+ auto* whisper_lang = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_LANG));
+ whisper_lang->SetSelection(lang_idx);
+
+ // Select the configured ggml model. The index has to be looked up in
+ // kWhisperModelChoices; `model_idx` (computed earlier in this function)
+ // presumably indexes the Python panel's kModelChoices, whose entries differ.
+ auto* whisper_model = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_MODEL));
+ const int whisper_model_idx = GetDropdownChoiceIndex(kWhisperModelChoices,
+ kNumWhisperModelChoices, app_c_.whisper_model, kWhisperModelDefault);
+ whisper_model->SetSelection(whisper_model_idx);
+
+ auto* whisper_button = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_BUTTON));
+ whisper_button->SetSelection(button_idx);
+
+ auto* whisper_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_CHARS_PER_SYNC));
+ whisper_chars_per_sync->SetSelection(chars_idx);
+
+ auto* whisper_bytes_per_char = static_cast<wxChoice*>(FindWindowById(ID_WHISPER_BYTES_PER_CHAR));
+ whisper_bytes_per_char->SetSelection(bytes_idx);
+
+ auto* whisper_rows = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_ROWS));
+ whisper_rows->Clear();
+ whisper_rows->AppendText(std::to_string(app_c_.rows));
+
+ auto* whisper_cols = static_cast<wxTextCtrl*>(FindWindowById(ID_WHISPER_COLS));
+ whisper_cols->Clear();
+ whisper_cols->AppendText(std::to_string(app_c_.cols));
+
// Unity panel
auto* unity_chars_per_sync = static_cast<wxChoice*>(FindWindowById(ID_UNITY_CHARS_PER_SYNC));
unity_chars_per_sync->SetSelection(chars_idx);
@@ -1001,9 +1312,13 @@ void Frame::OnNavbarTranscribe(wxCommandEvent& event)
// Initialize input fields using AppConfig.
ApplyConfigToInputFields();
- transcribe_panel_->Show();
+ transcribe_panel_->Hide();
unity_panel_->Hide();
debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
+ transcribe_panel_->Show();
Resize();
}
@@ -1013,8 +1328,12 @@ void Frame::OnNavbarUnity(wxCommandEvent& event)
ApplyConfigToInputFields();
transcribe_panel_->Hide();
- unity_panel_->Show();
+ unity_panel_->Hide();
debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
+ unity_panel_->Show();
Resize();
}
@@ -1025,10 +1344,32 @@ void Frame::OnNavbarDebug(wxCommandEvent& event)
transcribe_panel_->Hide();
unity_panel_->Hide();
+ debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
debug_panel_->Show();
Resize();
}
+// Navbar handler: switch the main view to the WhisperCPP panel.
+void Frame::OnNavbarWhisper(wxCommandEvent& event)
+{
+ // Initialize input fields using AppConfig.
+ ApplyConfigToInputFields();
+
+ // Hide every panel (this one included) and re-layout before showing the
+ // target panel, the same sequence the other navbar handlers use.
+ transcribe_panel_->Hide();
+ unity_panel_->Hide();
+ debug_panel_->Hide();
+ whisper_panel_->Hide();
+ Resize();
+
+ whisper_panel_->Show();
+
+ // Init() returns immediately once it has succeeded, so calling it on every
+ // navigation is harmless. NOTE(review): its bool result is ignored here.
+ whisper_->Init();
+
+ Resize();
+}
+
void Frame::OnSetupPython(wxCommandEvent& event)
{
if (env_proc_) {
@@ -1081,6 +1422,17 @@ void Frame::OnDumpMics(wxCommandEvent& event)
Log(transcribe_out_, "{}\n", PythonWrapper::DumpMics());
}
+// Button handler: log the index and name of every input device reported by
+// the WhisperCPP backend to the whisper output pane.
+void Frame::OnWhisperDumpMics(wxCommandEvent& event)
+{
+    // Init() logs its own failure reason. Without a successful Init() the
+    // backend's media layer pointer is still null, so do not go on to
+    // query it (GetMics presumably relies on that layer).
+    if (!whisper_->Init()) {
+        return;
+    }
+
+    std::vector<std::string> mics;
+    whisper_->GetMics(mics);
+    Log(whisper_out_, "Microphones:\n");
+    // size_t index: avoids the signed/unsigned comparison against size().
+    for (size_t i = 0; i < mics.size(); i++) {
+        Log(whisper_out_, " {}: {}\n", i, mics[i]);
+    }
+}
+
bool GetUserPath(const std::string& raw, std::filesystem::path& clean,
const std::string& err_prefix = "", bool must_exist = true) {
clean = raw;
@@ -1537,25 +1889,107 @@ void Frame::OnAppStop(wxCommandEvent& event) {
}
}
-void Frame::OnAppDrain(wxTimerEvent& event) {
- DrainAsyncOutput(py_app_, transcribe_out_);
- DrainAsyncOutput(env_proc_, transcribe_out_);
-}
+// Button handler: read the WhisperCPP panel's inputs, validate them, persist
+// them to the app config, and hand the config to the transcription engine.
+// Logs a message and returns without starting if any numeric field fails to
+// parse or is out of its allowed range.
+void Frame::OnWhisperStart(wxCommandEvent& event) {
+ Log(whisper_out_, "Launching transcription engine\n");
-void Frame::DrainAsyncOutput(wxProcess* proc, wxTextCtrl* frame) {
- if (!proc) {
+ // Dropdowns with nothing selected fall back to their default index.
+ int which_mic = whisper_mic_->GetSelection();
+ if (which_mic == wxNOT_FOUND) {
+ which_mic = kMicDefault;
+ }
+ int which_lang = whisper_lang_->GetSelection();
+ if (which_lang == wxNOT_FOUND) {
+ which_lang = kLangDefault;
+ }
+ int which_model = whisper_model_->GetSelection();
+ if (which_model == wxNOT_FOUND) {
+ // which_model indexes kWhisperModelChoices below, so the fallback must
+ // be that list's default rather than the Python model list's.
+ which_model = kWhisperModelDefault;
+ }
+ int chars_per_sync_idx = whisper_chars_per_sync_->GetSelection();
+ if (chars_per_sync_idx == wxNOT_FOUND) {
+ chars_per_sync_idx = kCharsDefault;
+ }
+ int bytes_per_char_idx = whisper_bytes_per_char_->GetSelection();
+ if (bytes_per_char_idx == wxNOT_FOUND) {
+ bytes_per_char_idx = kBytesDefault;
+ }
+ int button_idx = whisper_button_->GetSelection();
+ if (button_idx == wxNOT_FOUND) {
+ // NOTE(review): this falls back to the bytes-per-char default, which
+ // looks like a copy/paste slip; confirm which index of kButton is the
+ // intended default.
+ button_idx = kBytesDefault;
+ }
+ const bool enable_local_beep = whisper_enable_local_beep_->GetValue();
+ const bool use_cpu = whisper_use_cpu_->GetValue();
+ const bool use_builtin = whisper_use_builtin_->GetValue();
+ std::string rows_str = whisper_rows_->GetValue().ToStdString();
+ std::string cols_str = whisper_cols_->GetValue().ToStdString();
+ std::string chars_per_sync_str =
+ kCharsPerSync[chars_per_sync_idx].ToStdString();
+ std::string bytes_per_char_str =
+ kBytesPerChar[bytes_per_char_idx].ToStdString();
+ std::string window_duration_str =
+ whisper_window_duration_->GetValue().ToStdString();
+ // Parse all numeric fields together; one failure rejects the whole form.
+ int rows, cols, chars_per_sync, bytes_per_char, window_duration;
+ try {
+ rows = std::stoi(rows_str);
+ cols = std::stoi(cols_str);
+ chars_per_sync = std::stoi(chars_per_sync_str);
+ bytes_per_char = std::stoi(bytes_per_char_str);
+ window_duration = std::stoi(window_duration_str);
+ }
+ catch (const std::invalid_argument&) {
+ Log(whisper_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
+ "per sync \"{}\", bytes per char \"{}\" or window duration \"{}\" "
+ "as an integer\n", rows_str, cols_str, chars_per_sync_str,
+ bytes_per_char_str, window_duration_str);
+ return;
+ }
+ catch (const std::out_of_range&) {
+ Log(whisper_out_, "Rows \"{}\", cols \"{}\", chars per sync "
+ "\"{}\", bytes per char \"{}\" or window duration \"{}\" are out "
+ "of range\n", rows_str, cols_str, chars_per_sync_str,
+ bytes_per_char_str, window_duration_str);
+ return;
+ }
+ // Range limits for the text box dimensions and the audio window length.
+ const int max_rows = 10;
+ const int max_cols = 240;
+ const int min_window_duration_s = 10;
+ const int max_window_duration_s = 28;
+ if (rows < 0 || rows > max_rows ||
+ cols < 0 || cols > max_cols ||
+ window_duration < min_window_duration_s ||
+ window_duration > max_window_duration_s) {
+ Log(whisper_out_, "Rows not on [{},{}] or cols not on [{},{}] or "
+ "window_duration not on [{},{}]\n",
+ 0, max_rows,
+ 0, max_cols,
+ min_window_duration_s, max_window_duration_s);
return;
}
- while (proc->IsInputAvailable()) {
- wxTextInputStream iss(*(proc->GetInputStream()));
- Log(frame, " {}\n", iss.ReadLine());
- }
+ // Everything validated: copy the values into the config and save it
+ // before starting the engine.
+ app_c_.microphone = kMicChoices[which_mic].ToStdString();
+ app_c_.language = kLangChoices[which_lang].ToStdString();
+ app_c_.whisper_model = kWhisperModelChoices[which_model].ToStdString();
+ app_c_.chars_per_sync = chars_per_sync;
+ app_c_.bytes_per_char = bytes_per_char;
+ app_c_.button = kButton[button_idx].ToStdString();
+ app_c_.rows = rows;
+ app_c_.cols = cols;
+ app_c_.window_duration = std::to_string(window_duration);
+ app_c_.enable_local_beep = enable_local_beep;
+ app_c_.use_cpu = use_cpu;
+ app_c_.use_builtin = use_builtin;
+ app_c_.Serialize(AppConfig::kConfigPath);
- while (proc->IsErrorAvailable()) {
- wxTextInputStream iss(*(proc->GetErrorStream()));
- Log(frame, " {}\n", iss.ReadLine());
- }
+ whisper_->Start(app_c_);
+ Log(whisper_out_, "Control flow exit start button\n");
+}
+
+// Handler for the "Stop transcribing" button on the WhisperCPP panel;
+// forwards to WhisperCPP::Stop().
+void Frame::OnWhisperStop(wxCommandEvent& event) {
+ whisper_->Stop();
+}
+
+// Timer handler: forward any pending stdout/stderr output of the py_app_ and
+// env_proc_ processes to the transcription output pane. A null process is
+// skipped by DrainAsyncOutput.
+void Frame::OnAppDrain(wxTimerEvent& event) {
+ DrainAsyncOutput(py_app_, transcribe_out_);
+ DrainAsyncOutput(env_proc_, transcribe_out_);
}
void Frame::LoadAndSetIcons() {
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 7d3465f..ef65d7f 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -8,6 +8,7 @@
#endif
#include "Config.h"
+#include "WhisperCPP.h"
#include <memory>
@@ -23,10 +24,12 @@ private:
wxPanel* transcribe_panel_;
wxPanel* unity_panel_;
wxPanel* debug_panel_;
+ wxPanel* whisper_panel_;
wxTextCtrl* transcribe_out_;
wxTextCtrl* unity_out_;
wxTextCtrl* debug_out_;
+ wxTextCtrl* whisper_out_;
wxTextCtrl* unity_animator_generated_dir_;
wxTextCtrl* unity_animator_generated_name_;
@@ -38,6 +41,9 @@ private:
wxTextCtrl* py_app_window_duration_;
wxTextCtrl* unity_rows_;
wxTextCtrl* unity_cols_;
+ wxTextCtrl* whisper_rows_;
+ wxTextCtrl* whisper_cols_;
+ wxTextCtrl* whisper_window_duration_;
wxDirPickerCtrl* unity_assets_file_picker_;
wxFilePickerCtrl* unity_animator_file_picker_;
@@ -53,18 +59,32 @@ private:
wxChoice* py_app_button_;
wxChoice* unity_chars_per_sync_;
wxChoice* unity_bytes_per_char_;
+ wxChoice* whisper_mic_;
+ wxChoice* whisper_lang_;
+ wxChoice* whisper_model_;
+ wxChoice* whisper_chars_per_sync_;
+ wxChoice* whisper_bytes_per_char_;
+ wxChoice* whisper_button_;
wxCheckBox* py_app_enable_local_beep_;
wxCheckBox* py_app_use_cpu_;
wxCheckBox* py_app_use_builtin_;
wxCheckBox* unity_clear_osc_;
+ wxCheckBox* whisper_enable_local_beep_;
+ wxCheckBox* whisper_use_cpu_;
+ wxCheckBox* whisper_use_builtin_;
wxProcess* py_app_;
wxProcess* env_proc_;
wxTimer py_app_drain_;
+ wxProcess* whisper_app_;
+ wxTimer whisper_app_drain_;
+
AppConfig app_c_;
+ std::unique_ptr<WhisperCPP> whisper_;
+
// Initialize GUI input fields using `app_c_`.
void ApplyConfigToInputFields();
@@ -72,12 +92,15 @@ private:
void OnNavbarTranscribe(wxCommandEvent& event);
void OnNavbarUnity(wxCommandEvent& event);
void OnNavbarDebug(wxCommandEvent& event);
+ void OnNavbarWhisper(wxCommandEvent& event);
void OnSetupPython(wxCommandEvent& event);
void OnDumpMics(wxCommandEvent& event);
+ void OnWhisperDumpMics(wxCommandEvent& event);
void OnAppStart(wxCommandEvent& event);
void OnAppStop(wxCommandEvent& event);
+ void OnWhisperStart(wxCommandEvent& event);
+ void OnWhisperStop(wxCommandEvent& event);
void OnAppDrain(wxTimerEvent& event);
- void DrainAsyncOutput(wxProcess* proc, wxTextCtrl *frame);
void OnGenerateFX(wxCommandEvent& event);
void OnUnityParamChangeImpl();
void OnUnityParamChange(wxCommandEvent& event);
diff --git a/GUI/GUI/GUI/GUI.vcxproj b/GUI/GUI/GUI/GUI.vcxproj
index 23f3d88..0243d73 100644
--- a/GUI/GUI/GUI/GUI.vcxproj
+++ b/GUI/GUI/GUI/GUI.vcxproj
@@ -74,6 +74,10 @@
<Import Project="..\..\Libraries\wx\wxwidgets.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(ProjectDir)/whisper/</LibraryPath>
+ <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(ProjectDir)/whisper</IncludePath>
+ </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
@@ -104,6 +108,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -136,6 +141,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
@@ -145,6 +151,7 @@
<ClCompile Include="Logging.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="PythonWrapper.cpp" />
+ <ClCompile Include="WhisperCPP.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="App.h" />
@@ -156,6 +163,7 @@
<ClInclude Include="ryml.h" />
<ClInclude Include="ScopeGuard.h" />
<ClInclude Include="Util.h" />
+ <ClInclude Include="WhisperCPP.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GUI.rc" />
diff --git a/GUI/GUI/GUI/GUI.vcxproj.filters b/GUI/GUI/GUI/GUI.vcxproj.filters
index 2798d1e..1ec73d1 100644
--- a/GUI/GUI/GUI/GUI.vcxproj.filters
+++ b/GUI/GUI/GUI/GUI.vcxproj.filters
@@ -33,6 +33,9 @@
<ClCompile Include="Config.cpp">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="WhisperCPP.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="PythonWrapper.h">
@@ -62,6 +65,9 @@
<ClInclude Include="Util.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="WhisperCPP.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GUI.rc">
diff --git a/GUI/GUI/GUI/Logging.h b/GUI/GUI/GUI/Logging.h
index 9fb88fd..99462c0 100644
--- a/GUI/GUI/GUI/Logging.h
+++ b/GUI/GUI/GUI/Logging.h
@@ -8,6 +8,9 @@
#include <wx/wx.h>
#endif
+#include <wx/process.h>
+#include <wx/txtstrm.h>
+
#include <format>
#include <string>
#include <string_view>
@@ -51,5 +54,21 @@ namespace Logging {
frame->Remove(0, frame->GetLastPosition() - max_frame_len_bytes);
}
}
+
+ // Logs everything currently readable from `proc`'s stdout, then from its
+ // stderr, to `frame`, with one Log() call per line read. Does nothing when
+ // `proc` is null.
+ inline void DrainAsyncOutput(wxProcess* proc, wxTextCtrl* frame) {
+ if (!proc) {
+ return;
+ }
+
+ // Standard output first.
+ while (proc->IsInputAvailable()) {
+ wxTextInputStream iss(*(proc->GetInputStream()));
+ Log(frame, " {}\n", iss.ReadLine());
+ }
+
+ // Then standard error.
+ while (proc->IsErrorAvailable()) {
+ wxTextInputStream iss(*(proc->GetErrorStream()));
+ Log(frame, " {}\n", iss.ReadLine());
+ }
+ }
}
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index c353003..bcb7b1d 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -7,6 +7,7 @@
#include <stdio.h>
#include <filesystem>
+#include <fstream>
#include <sstream>
using ::Logging::Log;
@@ -167,8 +168,21 @@ std::string PythonWrapper::DumpMics() {
bool PythonWrapper::InstallPip(std::string* out) {
std::string result;
+ // Runs get-pip.py once. A flag file records a successful install so
+ // later calls return true without running the installer again.
+ std::filesystem::path pip_flag = "Resources/Python/.pip_installed";
+ if (std::filesystem::exists(pip_flag)) {
+ return true;
+ }
+
std::string pip_path = "Resources/Python/get-pip.py";
- return InvokeWithArgs({ pip_path }, out);
+ if (!InvokeWithArgs({ pip_path }, out)) {
+ return false;
+ }
+
+ // Create the flag file so subsequent calls don't reinstall. This must
+ // open pip_flag, not pip_path: opening pip_path for writing would
+ // truncate the get-pip.py installer and never create the flag.
+ std::ofstream flag_ofs(pip_flag);
+ flag_ofs.close();
+
+ return true;
}
wxProcess* PythonWrapper::StartApp(
diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp
new file mode 100644
index 0000000..742f0a4
--- /dev/null
+++ b/GUI/GUI/GUI/WhisperCPP.cpp
@@ -0,0 +1,419 @@
+#include "Logging.h"
+#include "PythonWrapper.h"
+#include "ScopeGuard.h"
+#include "Util.h"
+#include "WhisperCPP.h"
+
+#include <unknwn.h>
+#include <wchar.h>
+#include <winerror.h>
+
+#include "whisperWindows.h"
+
+#include <charconv>
+#include <codecvt>
+#include <cwchar>
+#include <filesystem>
+#include <format>
+#include <fstream>
+#include <locale>
+#include <sstream>
+#include <string>
+#include <system_error>
+#include <vector>
+
+using namespace Whisper;
+using ::Logging::DrainAsyncOutput;
+using ::Logging::Log;
+
+namespace {
+ // Converts a NUL-terminated wide string to a narrow string with
+ // wcstombs_s. Returns an empty string for a null input or when the
+ // conversion fails.
+ std::string wcharToAsciiString(const wchar_t* wc_str) {
+     if (wc_str == nullptr) {
+         return {};
+     }
+
+     // With a null destination wcstombs_s only reports the required size in
+     // bytes, including the terminating NUL.
+     size_t required = 0;
+     if (wcstombs_s(&required, nullptr, 0, wc_str, 0) != 0 || required == 0) {
+         return {};
+     }
+
+     // The buffer must have room for the terminator; a buffer of only
+     // wcslen() bytes makes _TRUNCATE drop the last character.
+     std::string result(required, '\0');
+     size_t written = 0;
+     if (wcstombs_s(&written, result.data(), result.size(), wc_str, _TRUNCATE) != 0 ||
+         written == 0) {
+         return {};
+     }
+
+     // `written` counts the terminator; keep it out of the std::string.
+     result.resize(written - 1);
+     return result;
+ }
+
+ // Returns the system message for `err`, or "HRESULT:0x........" when the
+ // system has no message for it. Trailing whitespace (FormatMessage appends
+ // CR/LF) is stripped so callers control their own line endings.
+ std::string hresultToString(HRESULT err) {
+     LPWSTR errorText = nullptr;
+
+     // Use the wide variant explicitly: the buffer is LPWSTR regardless of
+     // whether UNICODE is defined. With FORMAT_MESSAGE_ALLOCATE_BUFFER the
+     // buffer argument is really a pointer to the pointer to be filled in.
+     const DWORD size = FormatMessageW(
+         FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+         nullptr,
+         static_cast<DWORD>(err),
+         MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+         reinterpret_cast<LPWSTR>(&errorText),
+         0,
+         nullptr);
+
+     // FormatMessage returns 0 on failure.
+     if (size == 0 || errorText == nullptr) {
+         return std::format("HRESULT:{:#010x}", static_cast<unsigned long>(err));
+     }
+     std::wstring errorMessage(errorText, size);
+     LocalFree(errorText);
+
+     while (!errorMessage.empty() &&
+            (errorMessage.back() == L'\r' || errorMessage.back() == L'\n' ||
+             errorMessage.back() == L' ')) {
+         errorMessage.pop_back();
+     }
+
+     // Convert the wide string to a narrow string for printing. This narrows
+     // each code unit, so non-ASCII characters are not preserved.
+     return std::string(errorMessage.begin(), errorMessage.end());
+ }
+};
+
+// Stores the log sink and leaves the engine uninitialized: no media layer,
+// no worker thread, run flag cleared. Call Init() before using the mics.
+WhisperCPP::WhisperCPP(wxTextCtrl* out)
+ : out_(out), f_(nullptr), did_init_(false), proc_(nullptr), run_(false)
+{
+ // Logs the value returned by wxThread::SetConcurrency(2).
+ Log(out_, "Setting concurrency to 2: {}\n", wxThread::SetConcurrency(2));
+}
+
+// Releases the media layer if Init() acquired one. f_ stays null when Init()
+// was never called or failed, so it must be checked before use.
+WhisperCPP::~WhisperCPP() {
+    if (f_ != nullptr) {
+        f_->Release();
+        f_ = nullptr;
+    }
+}
+
+// Acquires the media layer used to list and open microphones. Calling it
+// again after a success is a no-op.
+// Returns true on success; logs the error and returns false otherwise.
+bool WhisperCPP::Init() {
+    if (did_init_) {
+        return true;
+    }
+
+    iMediaFoundation* tmp_f = nullptr;
+    const HRESULT err = initMediaFoundation(&tmp_f);
+    if (FAILED(err)) {
+        // Same format as every other error path: readable message plus newline.
+        Log(out_, "Failed to initialize media layer: {}\n", hresultToString(err));
+        return false;
+    }
+    f_ = tmp_f;
+
+    did_init_ = true;
+    Log(out_, "Initialized successfully\n");
+    return true;
+}
+
+// Replaces the contents of `mics` with the display names of the capture
+// devices. Returns false, leaving `mics` untouched, when the engine is not
+// initialized or the devices cannot be listed.
+bool WhisperCPP::GetMics(std::vector<std::string>& mics) {
+    std::vector<sCaptureDevice> devices;
+    if (!did_init_ || !GetMicsImpl(devices)) {
+        return false;
+    }
+
+    mics.clear();
+    for (size_t i = 0; i < devices.size(); ++i) {
+        mics.push_back(wcharToAsciiString(devices[i].displayName));
+    }
+    return true;
+}
+
+// Opens the capture device at position `idx` of the current device list and
+// stores the capture interface in `stream`. `stream` is reset to nullptr
+// before the open attempt.
+// Returns false, after logging, when the engine is not initialized, the
+// devices cannot be listed, `idx` is out of range, or the open fails.
+bool WhisperCPP::OpenMic(const int idx, Whisper::iAudioCapture*& stream) {
+    if (!did_init_) {
+        Log(out_, "Whisper not initialized\n");
+        return false;
+    }
+
+    std::vector<sCaptureDevice> devices;
+    if (!GetMicsImpl(devices)) {
+        return false;
+    }
+
+    // Same unsigned comparison as `size() <= idx`: a negative idx becomes a
+    // huge value and is rejected here.
+    const size_t wanted = static_cast<size_t>(idx);
+    if (wanted >= devices.size()) {
+        Log(out_, "Mic index out of range: {} vs. {}\n", idx, devices.size());
+        return false;
+    }
+
+    const sCaptureDevice& device = devices[idx];
+    Whisper::sCaptureParams params{};
+    stream = nullptr;
+    const HRESULT err = f_->openCaptureDevice(device.endpoint, params, &stream);
+    if (SUCCEEDED(err)) {
+        return true;
+    }
+
+    Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx,
+        wcharToAsciiString(device.displayName),
+        hresultToString(err));
+    return false;
+}
+
+// Starts an asynchronous `pip install -r whisper_requirements.txt`.
+//
+// `proc` receives the launched process, or nullptr when a marker file shows
+// the dependencies are already installed. Returns false only when the process
+// cannot be launched.
+//
+// The marker file is written from the completion callback, and only for exit
+// code 0, so a failed or interrupted install is retried next time.
+bool WhisperCPP::InstallDependencies(wxProcess*& proc) {
+    const std::filesystem::path flag_file =
+        std::filesystem::path("Resources/.whisper_deps_installed").lexically_normal();
+
+    if (std::filesystem::exists(flag_file)) {
+        proc = nullptr;
+        return true;
+    }
+
+    // The callback runs after this function has returned, so it captures
+    // copies (`this`, the flag path) and never references locals.
+    auto cb = [this, flag_file](wxProcess* finished_proc, int ret) -> void {
+        Log(out_, "Dependency installation exited with code {}\n", ret);
+        if (ret == 0) {
+            Log(out_, "Dependency installation finished\n");
+            // Create the flag file so subsequent calls don't reinstall.
+            std::ofstream flagfile_ofs(flag_file);
+            flagfile_ofs.close();
+        }
+        DrainAsyncOutput(finished_proc, out_);
+    };
+
+    proc = PythonWrapper::InvokeAsyncWithArgs({
+        "-u",  // Unbuffered output
+        "-m pip",
+        "install",
+        "-r Resources/Scripts/whisper_requirements.txt",
+        }, std::move(cb));
+    if (!proc) {
+        Log(out_, "Failed to launch installation thread!\n");
+        return false;
+    }
+
+    return true;
+}
+
+// Starts an asynchronous download of `model_name` from the whisper.cpp
+// Hugging Face dataset to `fs_path`, using Python's `wget` module.
+// `proc` receives the launched process. Returns false when it cannot be
+// launched.
+bool WhisperCPP::DownloadModel(const std::string& model_name,
+    const std::filesystem::path& fs_path, wxProcess*& proc) {
+    // Runs when the download process exits; only members are used.
+    auto on_exit = [this](wxProcess* finished_proc, int exit_code) -> void {
+        Log(out_, "Model download completed with code {}\n", exit_code);
+        if (exit_code == 0) {
+            Log(out_, "Model download finished\n");
+        }
+        DrainAsyncOutput(finished_proc, out_);
+    };
+
+    const std::string url =
+        std::string("https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/") +
+        model_name;
+    Log(out_, "Model will be saved to {}\n", fs_path.lexically_normal().string());
+
+    proc = PythonWrapper::InvokeAsyncWithArgs({
+        "-u",  // Unbuffered output
+        "-m wget",
+        url,
+        "-o", fs_path.string(),
+        }, std::move(on_exit));
+    if (proc) {
+        return true;
+    }
+
+    Log(out_, "Failed to launch download thread!\n");
+    return false;
+}
+
+// Converts a UTF-8 string to UTF-16. Returns an empty string for empty input
+// or when the conversion fails.
+std::wstring utf8ToUtf16(const std::string& utf8) {
+    if (utf8.empty()) {
+        return {};
+    }
+
+    // Pass the byte count explicitly. With -1 the returned length includes
+    // the terminating NUL, which would then sit inside the std::wstring.
+    const int utf8_len = static_cast<int>(utf8.size());
+    const int wide_len = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), utf8_len, nullptr, 0);
+    if (wide_len <= 0) {
+        return {};
+    }
+
+    std::wstring utf16(static_cast<size_t>(wide_len), L'\0');
+    if (MultiByteToWideChar(CP_UTF8, 0, utf8.data(), utf8_len, utf16.data(), wide_len) <= 0) {
+        return {};
+    }
+    return utf16;
+}
+
+// Loads the model file at `path` with the GPU implementation and stores it
+// in `model`, which is reset to nullptr first.
+// Returns false after logging the error when loading fails.
+bool WhisperCPP::LoadModel(const std::string& path, Whisper::iModel*& model) {
+    model = nullptr;
+
+    const std::wstring wide_path = utf8ToUtf16(path);
+    const HRESULT err = Whisper::loadModel(wide_path.c_str(),
+        eModelImplementation::GPU, /*flags=*/0, /*callbacks=*/nullptr, &model);
+    if (SUCCEEDED(err)) {
+        return true;
+    }
+
+    Log(out_, "Failed to load model: {}\n", hresultToString(err));
+    return false;
+}
+
+// Creates a transcription context from `model` and stores it in `context`,
+// which is reset to nullptr first.
+// Returns false after logging the error when creation fails.
+bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& context) {
+    context = nullptr;
+
+    const HRESULT err = model->createContext(&context);
+    if (SUCCEEDED(err)) {
+        return true;
+    }
+
+    Log(out_, "Failed to create context: {}\n", hresultToString(err));
+    return false;
+}
+
+// Creates a detached wxThread that will run `cb` and report back to `app`.
+// `cb` is a const rvalue reference, so cb_ is copy-constructed from it.
+WhisperCPP::AppThread::AppThread(
+ const std::function<void(AppThread* thd)>&& cb,
+ WhisperCPP* app)
+ : wxThread(wxTHREAD_DETACHED), cb_(cb), app_(app)
+{}
+// Logs the teardown and clears the owner's proc_ pointer so that a later
+// Start() sees no running engine.
+// NOTE(review): this dereferences app_, so it assumes the owning WhisperCPP
+// is still alive when the thread object is destroyed; confirm.
+WhisperCPP::AppThread::~AppThread()
+{
+ Log(app_->out_, "Destroy transcription thread\n");
+ app_->proc_ = nullptr;
+}
+
+// Thread body: runs the stored callback with this thread as its argument and
+// always returns nullptr.
+void* WhisperCPP::AppThread::Entry() {
+ cb_(this);
+ return nullptr;
+}
+
+// Starts the transcription engine on a detached worker thread. Does nothing
+// but log when an engine is already running.
+//
+// The worker opens the microphone, installs pip and the Python dependencies,
+// downloads the model named by `c.whisper_model` into Resources/Models when
+// it is missing, loads it, and then blocks in runCapture() until Stop() is
+// called or the thread is asked to exit. Each new segment is logged.
+//
+// Only `c.whisper_model` is read, and it is copied into the worker: the
+// worker outlives this call, so it must not keep a reference to `c`.
+void WhisperCPP::Start(const AppConfig& c) {
+    if (proc_) {
+        Log(out_, "Transcription engine already running\n");
+        return;
+    }
+
+    // Raise the run flag before the worker exists. Setting it from inside the
+    // worker could overwrite a Stop() issued right after Start().
+    run_ = true;
+
+    // TODO(yum) use app config to select mic
+    AppThread* const worker = new AppThread(
+        [this, whisper_model = c.whisper_model](AppThread* thd) {
+        Log(out_, "Transcription thread top\n");
+
+        Whisper::iAudioCapture* mic_stream = nullptr;
+        if (!OpenMic(1, mic_stream)) {
+            return;
+        }
+        ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); });
+
+        {
+            std::string output;
+            Log(out_, "Installing pip\n");
+            // Best effort: a failure is logged and the worker carries on.
+            if (!PythonWrapper::InstallPip(&output)) {
+                Log(out_, "Failed to install pip: {}\n", output);
+            }
+        }
+
+        {
+            Log(out_, "Installing Python dependencies\n");
+            wxProcess* proc = nullptr;
+            if (!InstallDependencies(proc)) {
+                return;
+            }
+            // proc is null when the dependencies were already installed.
+            while (proc && proc->Exists(proc->GetPid())) {
+                if (!run_ || thd->TestDestroy()) {
+                    proc->Kill(proc->GetPid(), wxSIGKILL);
+                    return;
+                }
+                wxThread::Sleep(100);
+            }
+        }
+
+        std::filesystem::path model_path = "Resources/Models";
+        model_path /= whisper_model;
+        if (std::filesystem::exists(model_path)) {
+            Log(out_, "Model found at {}\n", model_path.string());
+        }
+        else {
+            Log(out_, "Downloading model {}\n", whisper_model);
+            wxProcess* proc = nullptr;
+            model_path = model_path.lexically_normal();
+            if (!DownloadModel(whisper_model, model_path, proc)) {
+                return;
+            }
+            while (proc->Exists(proc->GetPid())) {
+                if (!run_ || thd->TestDestroy()) {
+                    proc->Kill(proc->GetPid(), wxSIGKILL);
+                    // Drop the partial download. The non-throwing overload
+                    // keeps exceptions from escaping the thread entry point.
+                    std::error_code ec;
+                    std::filesystem::remove(model_path, ec);
+                    return;
+                }
+                wxThread::Sleep(100);
+            }
+        }
+
+        Whisper::iModel* model = nullptr;
+        if (!LoadModel(model_path.string(), model)) {
+            return;
+        }
+        ScopeGuard model_cleanup([model]() { model->Release(); });
+
+        Whisper::iContext* context = nullptr;
+        if (!CreateContext(model, context)) {
+            return;
+        }
+        ScopeGuard context_cleanup([context]() { context->Release(); });
+
+        Whisper::sFullParams wparams{};
+        context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams);
+        wparams.language = Whisper::makeLanguageKey("en");  // TODO(yum) use config
+        wparams.n_max_text_ctx = 20;
+
+        // Logs the text of each newly produced segment. Errors are logged
+        // and swallowed (S_OK) so capture keeps running.
+        wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT {
+            wxTextCtrl* out = static_cast<wxTextCtrl*>(user_data);
+            iTranscribeResult* results = nullptr;
+            HRESULT err = context->getResults(eResultFlags::Timestamps | eResultFlags::Tokens, &results);
+            if (FAILED(err)) {
+                Log(out, "Failed to get transcription: {}\n", hresultToString(err));
+                return S_OK;
+            }
+            ScopeGuard results_cleanup([results]() { results->Release(); });
+
+            sTranscribeLength length;
+            err = results->getSize(length);
+            if (FAILED(err)) {
+                Log(out, "Failed to get transcription size: {}\n", hresultToString(err));
+                return S_OK;
+            }
+
+            // Only the last n_new segments are new since the previous call.
+            const sSegment* const segments = results->getSegments();
+            const int s0 = length.countSegments - n_new;
+            for (int i = s0; i < length.countSegments; i++) {
+                const sSegment& seg = segments[i];
+                Log(out, "{} ", seg.text);
+            }
+            if (n_new) {
+                Log(out, "\n");
+            }
+
+            return S_OK;
+        };
+        wparams.new_segment_callback_user_data = out_;
+
+        sCaptureCallbacks callbacks{};
+        // Polled by runCapture(): S_FALSE ends the capture loop.
+        callbacks.shouldCancel = [](void* pv) noexcept -> HRESULT __stdcall {
+            WhisperCPP* app = static_cast<WhisperCPP*>(pv);
+            if (app->proc_->TestDestroy() || !app->run_) {
+                Log(app->out_, "Exit transcription loop\n");
+                return S_FALSE;
+            }
+            static int i = 0;
+            if (++i % 10 == 0) {
+                Log(app->out_, "Spin {}\n", i);
+            }
+            // Sleeping here prevents the GUI from hanging.
+            wxThread::Sleep(10);
+            return S_OK;
+        };
+        callbacks.pv = this;
+
+        // This will block.
+        HRESULT err = context->runCapture(wparams, callbacks, mic_stream);
+        if (FAILED(err)) {
+            Log(out_, "Capture failed: {}\n", hresultToString(err));
+            return;
+        }
+
+        Log(out_, "Exit transcription engine\n");
+    }, this);
+
+    // Publish the thread before it runs: shouldCancel reads proc_.
+    proc_ = worker;
+
+    if (worker->Run() != wxTHREAD_NO_ERROR) {
+        Log(out_, "Failed to start transcription thread\n");
+        run_ = false;
+        // A detached thread that never ran is not cleaned up for us; its
+        // destructor also resets proc_.
+        delete worker;
+        return;
+    }
+
+    Log(out_, "Success!\n");
+    return;
+}
+
+// Asks the transcription worker to stop by clearing run_. This does not wait:
+// the worker notices the flag in its polling loops and its shouldCancel
+// callback.
+void WhisperCPP::Stop() {
+ Log(out_, "Stopping transcription engine...\n");
+ run_ = false;
+}
+
+// Replaces the contents of `mics` with the capture devices reported by the
+// media layer. Returns false after logging when the media layer is missing
+// or the enumeration fails.
+//
+// NOTE(review): each sCaptureDevice is copied as-is, including its
+// displayName/endpoint pointers. Confirm those strings stay valid after
+// listCaptureDevices() returns.
+bool WhisperCPP::GetMicsImpl(std::vector<sCaptureDevice>& mics) {
+    mics.clear();
+    if (f_ == nullptr) {
+        Log(out_, "Whisper not initialized\n");
+        return false;
+    }
+
+    // Appends every reported device to the vector passed through `pv`.
+    pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall {
+        auto* found = static_cast<std::vector<sCaptureDevice>*>(pv);
+        if (buf != nullptr && len > 0) {
+            found->insert(found->end(), buf, buf + len);
+        }
+        return S_OK;
+    };
+
+    const HRESULT err = f_->listCaptureDevices(dev_cb, &mics);
+    if (FAILED(err)) {
+        Log(out_, "Failed to get microphones: {}\n", hresultToString(err));
+        return false;
+    }
+
+    return true;
+}
diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h
new file mode 100644
index 0000000..20b0106
--- /dev/null
+++ b/GUI/GUI/GUI/WhisperCPP.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <wx/filepicker.h>
+#include <wx/wxprec.h>
+#include <wx/process.h>
+#include <wx/thread.h>
+
+#ifndef WX_PRECOMP
+#include <wx/wx.h>
+#endif
+
+#include <unknwn.h>
+#include <wchar.h>
+#include <winerror.h>
+
+#include "whisperWindows.h"
+
+#include "Config.h"
+
+#include <filesystem>
+#include <functional>
+#include <string>
+#include <vector>
+
+// Transcription engine built on the Whisper library. Owns the media layer
+// used to reach microphones and at most one worker thread that runs the
+// capture loop. Messages are written to the wxTextCtrl given at
+// construction.
+class WhisperCPP {
+public:
+ // `out` receives all log output.
+ WhisperCPP(wxTextCtrl* out);
+ ~WhisperCPP();
+
+ // Acquires the media layer. Must succeed before GetMics()/OpenMic().
+ bool Init();
+ // Fills `mics` with the display names of the capture devices.
+ bool GetMics(std::vector<std::string>& mics);
+ // Opens the capture device at index `idx`; `stream` receives it.
+ bool OpenMic(const int idx, Whisper::iAudioCapture*& stream);
+ // Launches the pip install of the Python requirements. `proc` is set to
+ // nullptr when nothing needs installing.
+ bool InstallDependencies(wxProcess*& proc);
+ // Launches the download of `model_name` to `fs_path`.
+ bool DownloadModel(const std::string& model_name,
+ const std::filesystem::path& fs_path, wxProcess*& proc);
+ // Loads the model file at `path` into `model`.
+ bool LoadModel(const std::string& path, Whisper::iModel*& model);
+ // Creates a transcription context for `model`.
+ bool CreateContext(Whisper::iModel* model, Whisper::iContext*& context);
+
+ // Starts the worker thread; no-op when one is already running.
+ void Start(const AppConfig& c);
+ // Signals the worker thread to stop; does not wait for it.
+ void Stop();
+
+private:
+ // Fills `mics` with the raw device descriptors from the media layer.
+ bool GetMicsImpl(std::vector<Whisper::sCaptureDevice>& mics);
+
+ // Detached worker thread that runs a caller-supplied callback and clears
+ // the owner's proc_ when it is destroyed.
+ class AppThread : public wxThread {
+ public:
+ AppThread(const std::function<void(AppThread* thd)>&& cb, WhisperCPP* app);
+
+ virtual ~AppThread();
+
+ virtual void* Entry() wxOVERRIDE;
+
+ private:
+ // Work to run on the thread.
+ const std::function<void(AppThread* thd)> cb_;
+ // Owning engine (not owned by the thread).
+ WhisperCPP* app_;
+ };
+
+ // Log sink.
+ wxTextCtrl* out_;
+ // Media layer; nullptr until Init() succeeds. Released in the destructor.
+ Whisper::iMediaFoundation* f_;
+ // True once Init() has succeeded.
+ bool did_init_;
+ // Current worker thread, or nullptr when none is running.
+ // NOTE(review): proc_ and run_ are shared between the GUI and worker
+ // threads. `volatile` does not provide synchronization; consider
+ // std::atomic here together with the matching .cpp changes.
+ AppThread* volatile proc_;
+ // Cleared by Stop() to ask the worker to exit.
+ volatile bool run_;
+};
diff --git a/GUI/Libraries/.gitignore b/GUI/Libraries/.gitignore
index fb46029..0e14a69 100644
--- a/GUI/Libraries/.gitignore
+++ b/GUI/Libraries/.gitignore
@@ -1,4 +1,5 @@
# Don't check in anything we fetch
wx
rapidyaml
+whisper
diff --git a/GUI/Libraries/fetch.ps1 b/GUI/Libraries/fetch.ps1
index 78bf0d5..f13bad5 100644
--- a/GUI/Libraries/fetch.ps1
+++ b/GUI/Libraries/fetch.ps1
@@ -1,34 +1,65 @@
+param(
+ [switch]$overwrite = $false
+)
+
Set-PSDebug -trace 0
$WX_3_2_1_URL = "https://github.com/wxWidgets/wxWidgets/releases/download/v3.2.1/wxWidgets-3.2.1.zip"
$WX_URL = $WX_3_2_1_URL
$WX_FILE = $(Split-Path -Path $WX_URL -Leaf)
+$WHISPER_1_7_0_URL = "https://github.com/Const-me/Whisper/releases/download/1.7.0/Library.zip"
+$WHISPER_URL = $WHISPER_1_7_0_URL
+$WHISPER_FILE = $(Split-Path -Path $WHISPER_URL -Leaf)
+
pushd $PSScriptRoot
# WX
-if (Test-Path wx) {
+if ((Test-Path wx) -And ($overwrite)) {
rm -Recurse wx
}
-mkdir wx
-pushd wx > $null
-Invoke-WebRequest $WX_URL -OutFile $WX_FILE
-Expand-Archive $WX_FILE -DestinationPath .
-popd > $null
+# Download and unpack wxWidgets only when the wx directory is absent (it is
+# removed beforehand when -overwrite is passed).
+if (-Not (Test-Path wx)) {
+ mkdir wx
+ pushd wx > $null
+ Invoke-WebRequest $WX_URL -OutFile $WX_FILE
+ Expand-Archive $WX_FILE -DestinationPath .
+ popd > $null
+}
# RAPIDYAML
-if (Test-Path rapidyaml) {
+if ((Test-Path rapidyaml) -And ($overwrite)) {
rm -Recurse rapidyaml
}
-git clone https://github.com/biojppm/rapidyaml
-pushd rapidyaml > $null
-git checkout v0.5.0
-git submodule update --init --recursive
+# Clone rapidyaml at v0.5.0 and generate the amalgamated ryml.h, only when the
+# rapidyaml directory is absent (it is removed beforehand when -overwrite is
+# passed).
+if (-Not (Test-Path rapidyaml)) {
+  git clone https://github.com/biojppm/rapidyaml
+  pushd rapidyaml > $null
+  git checkout v0.5.0
+  git submodule update --init --recursive
+
+  python3 tools/amalgamate.py ryml.h
+  cp ryml.h ../../GUI/GUI/ryml.h
+  # Leave rapidyaml/ again. Without this the Whisper section below runs
+  # inside rapidyaml/ and its relative paths resolve to the wrong place.
+  popd > $null
+}
+
+if ((Test-Path whisper) -And ($overwrite)) {
+ rm -Recurse whisper
+}
-python3 tools/amalgamate.py ryml.h
-cp ryml.h ../../GUI/GUI/ryml.h
+# Download and unpack the Whisper release only when the whisper directory is
+# absent, then copy its headers, import library and DLL into the GUI project.
+if (-Not (Test-Path whisper)) {
+ mkdir whisper
+ pushd whisper > $null
+ Invoke-WebRequest $WHISPER_URL -OutFile $WHISPER_FILE
+ Expand-Archive $WHISPER_FILE -DestinationPath .
+ # Start from an empty destination so stale files are not kept.
+ if (Test-Path ../../GUI/GUI/whisper/) {
+ rm -Recurse ../../GUI/GUI/whisper/
+ }
+ mkdir ../../GUI/GUI/whisper/
+ cp Include/*.h ../../GUI/GUI/whisper/
+ # NOTE(review): the wildcards below are copied to a single file name, which
+ # assumes the archive has exactly one .lib and one .dll; confirm.
+ cp Linker/*.lib ../../GUI/GUI/whisper/Whisper.lib
+ cp Binary/*.dll ../../GUI/GUI/whisper/Whisper.dll
+ popd > $null
+}
popd > $null # rapidyaml
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 400ad6c..fa9eee4 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -5,7 +5,7 @@ param(
$install_dir = "TaSTT"
if (Test-Path $install_dir) {
- rm -Recurse $install_dir
+ rm -Recurse -Force $install_dir
}
$py_dir = "Python"
@@ -61,6 +61,12 @@ if (-Not (Test-Path $git_dir)) {
Read-Host -Prompt "Press enter once PortableGit is installed at $pwd\PortableGit"
}
+#$WHISPER_CHECKPOINT_URL = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
+#$WHISPER_CHECKPOINT_FILE = $(Split-Path -Path $WHISPER_CHECKPOINT_URL -Leaf)
+#if (-Not (Test-Path $WHISPER_CHECKPOINT_FILE)) {
+# Invoke-WebRequest $WHISPER_CHECKPOINT_URL -OutFile $WHISPER_CHECKPOINT_FILE
+#}
+
mkdir $install_dir > $null
mkdir $install_dir/Resources > $null
cp -Recurse ../Animations TaSTT/Resources/Animations
@@ -75,6 +81,9 @@ cp -Recurse ../Shaders TaSTT/Resources/Shaders
cp -Recurse ../Sounds TaSTT/Resources/Sounds
cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets
cp GUI/x64/Release/GUI.exe TaSTT/TaSTT.exe
+cp GUI/GUI/Whisper/Whisper.dll TaSTT/Whisper.dll
+mkdir TaSTT/Resources/Models
+#cp $WHISPER_CHECKPOINT_FILE TaSTT/Resources/Models/
if (-Not $skip_zip) {
Compress-Archive -Path "$install_dir" -DestinationPath "$install_dir.zip" -Force
diff --git a/Scripts/string_matcher.py b/Scripts/string_matcher.py
index 26241f2..a56308a 100644
--- a/Scripts/string_matcher.py
+++ b/Scripts/string_matcher.py
@@ -55,6 +55,10 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
if DEBUG:
print("STRING MATCH exception path 1")
return old_text
+ elif len(new_text) == 0:
+ return old_text
+ elif len(old_text) == 0:
+ return new_text
elif len(old_text) >= window_size and len(new_text) >= window_size:
# Find the window where the cumulative string distance
# between the text in that window in the old/new transcription
diff --git a/Scripts/whisper_requirements.txt b/Scripts/whisper_requirements.txt
new file mode 100644
index 0000000..e99fe9e
--- /dev/null
+++ b/Scripts/whisper_requirements.txt
@@ -0,0 +1,8 @@
+editdistance
+future==0.18.2
+openvr
+pillow
+playsound==1.2.2
+pyaudio
+python-osc
+wget