summaryrefslogtreecommitdiffstats
path: root/GUI
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-06-26 17:21:59 -0700
committeryum <yum.food.vr@gmail.com>2023-06-26 17:21:59 -0700
commit694756a96a6109cd79a77221dd4e40638ff55b82 (patch)
treeb0602a3e794ca3a773c11ca4ea00a0f0254ddcb8 /GUI
parent011cfdd4bab866a64b06406ceaa7563294af9225 (diff)
Scrub out old C++-based Whisper code
No longer used.
Diffstat (limited to 'GUI')
-rw-r--r--GUI/GUI/GUI/Frame.cpp32
-rw-r--r--GUI/GUI/GUI/Frame.h25
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj4
-rw-r--r--GUI/GUI/GUI/GUI.vcxproj.filters6
-rw-r--r--GUI/GUI/GUI/WhisperCPP.cpp558
-rw-r--r--GUI/GUI/GUI/WhisperCPP.h73
-rw-r--r--GUI/Libraries/fetch.ps19
-rw-r--r--GUI/README.md23
-rw-r--r--GUI/package.ps11
9 files changed, 9 insertions, 722 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 191aa82..76b85ae 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -20,7 +20,6 @@ namespace {
ID_NAVBAR_BUTTON_TRANSCRIBE,
ID_NAVBAR_BUTTON_UNITY,
ID_NAVBAR_BUTTON_DEBUG,
- ID_NAVBAR_BUTTON_WHISPER,
ID_PY_PANEL,
ID_PY_CONFIG_PANEL,
ID_PY_APP_CONFIG_PANEL_PAIRS,
@@ -81,37 +80,6 @@ namespace {
ID_DEBUG_BUTTON_BACKUP_VENV,
ID_DEBUG_BUTTON_RESTORE_VENV,
ID_DEBUG_BUTTON_SETUP_VENV,
- ID_WHISPER_PANEL,
- ID_WHISPER_OUT,
- ID_WHISPER_CONFIG_PANEL,
- ID_WHISPER_SETUP_BUTTON,
- //ID_WHISPER_DUMP_MICS_BUTTON,
- ID_WHISPER_CONFIG_PANEL_PAIRS,
- ID_WHISPER_MIC,
- ID_WHISPER_LANG,
- ID_WHISPER_MODEL,
- ID_WHISPER_CHARS_PER_SYNC,
- ID_WHISPER_BYTES_PER_CHAR,
- ID_WHISPER_BUTTON,
- ID_WHISPER_ROWS,
- ID_WHISPER_COLS,
- ID_WHISPER_BROWSER_SRC_PORT,
- ID_WHISPER_ENABLE_LOCAL_BEEP,
- ID_WHISPER_USE_CPU,
- ID_WHISPER_DECODE_METHOD,
- ID_WHISPER_MAX_CTXT,
- ID_WHISPER_BEAM_WIDTH,
- ID_WHISPER_BEAM_N_BEST,
- ID_WHISPER_VAD_MIN_DURATION,
- ID_WHISPER_VAD_MAX_DURATION,
- ID_WHISPER_VAD_DROP_START_SILENCE,
- ID_WHISPER_VAD_PAUSE_DURATION,
- ID_WHISPER_VAD_RETAIN_DURATION,
- ID_WHISPER_ENABLE_BUILTIN,
- ID_WHISPER_ENABLE_CUSTOM,
- ID_WHISPER_ENABLE_BROWSER_SRC,
- ID_WHISPER_START_BUTTON,
- ID_WHISPER_STOP_BUTTON,
};
const wxString kMicChoices[] = {
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 5969cd8..ede2afc 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -25,12 +25,10 @@ private:
wxPanel* transcribe_panel_;
wxPanel* unity_panel_;
wxPanel* debug_panel_;
- wxPanel* whisper_panel_;
wxTextCtrl* transcribe_out_;
wxTextCtrl* unity_out_;
wxTextCtrl* debug_out_;
- wxTextCtrl* whisper_out_;
wxTextCtrl* unity_animator_generated_dir_;
wxTextCtrl* unity_animator_generated_name_;
@@ -44,17 +42,6 @@ private:
wxTextCtrl* py_app_browser_src_port_;
wxTextCtrl* unity_rows_;
wxTextCtrl* unity_cols_;
- wxTextCtrl* whisper_rows_;
- wxTextCtrl* whisper_cols_;
- wxTextCtrl* whisper_browser_src_port_;
- wxTextCtrl* whisper_max_ctxt_;
- wxTextCtrl* whisper_beam_width_;
- wxTextCtrl* whisper_beam_n_best_;
- wxTextCtrl* whisper_vad_min_duration_;
- wxTextCtrl* whisper_vad_max_duration_;
- wxTextCtrl* whisper_vad_drop_start_silence_;
- wxTextCtrl* whisper_vad_pause_duration_;
- wxTextCtrl* whisper_vad_retain_duration_;
wxDirPickerCtrl* unity_assets_file_picker_;
wxFilePickerCtrl* unity_animator_file_picker_;
@@ -72,13 +59,6 @@ private:
wxChoice* py_app_button_;
wxChoice* unity_chars_per_sync_;
wxChoice* unity_bytes_per_char_;
- wxChoice* whisper_mic_;
- wxChoice* whisper_lang_;
- wxChoice* whisper_model_;
- wxChoice* whisper_chars_per_sync_;
- wxChoice* whisper_bytes_per_char_;
- wxChoice* whisper_button_;
- wxChoice* whisper_decode_method_;
wxCheckBox* py_app_enable_local_beep_;
wxCheckBox* py_app_enable_browser_src_;
@@ -89,11 +69,6 @@ private:
wxCheckBox* py_app_enable_uppercase_filter_;
wxCheckBox* py_app_enable_lowercase_filter_;
wxCheckBox* unity_clear_osc_;
- wxCheckBox* whisper_enable_local_beep_;
- wxCheckBox* whisper_use_cpu_;
- wxCheckBox* whisper_enable_builtin_;
- wxCheckBox* whisper_enable_custom_;
- wxCheckBox* whisper_enable_browser_src_;
std::future<bool> py_app_;
std::future<bool> obs_app_;
diff --git a/GUI/GUI/GUI/GUI.vcxproj b/GUI/GUI/GUI/GUI.vcxproj
index 2d08d30..e5874c4 100644
--- a/GUI/GUI/GUI/GUI.vcxproj
+++ b/GUI/GUI/GUI/GUI.vcxproj
@@ -147,7 +147,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
- <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
@@ -162,7 +162,6 @@
<ClCompile Include="PythonWrapper.cpp" />
<ClCompile Include="Transcript.cpp" />
<ClCompile Include="WebServer.cpp" />
- <ClCompile Include="WhisperCPP.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="App.h" />
@@ -180,7 +179,6 @@
<ClInclude Include="Util.h" />
<ClInclude Include="WebCommon.h" />
<ClInclude Include="WebServer.h" />
- <ClInclude Include="WhisperCPP.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GUI.rc" />
diff --git a/GUI/GUI/GUI/GUI.vcxproj.filters b/GUI/GUI/GUI/GUI.vcxproj.filters
index 6a41329..10069e8 100644
--- a/GUI/GUI/GUI/GUI.vcxproj.filters
+++ b/GUI/GUI/GUI/GUI.vcxproj.filters
@@ -36,9 +36,6 @@
<ClCompile Include="Config.cpp">
<Filter>Source Files</Filter>
</ClCompile>
- <ClCompile Include="WhisperCPP.cpp">
- <Filter>Source Files</Filter>
- </ClCompile>
<ClCompile Include="BrowserSource.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@@ -80,9 +77,6 @@
<ClInclude Include="Util.h">
<Filter>Header Files</Filter>
</ClInclude>
- <ClInclude Include="WhisperCPP.h">
- <Filter>Header Files</Filter>
- </ClInclude>
<ClInclude Include="BrowserSource.h">
<Filter>Header Files</Filter>
</ClInclude>
diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp
deleted file mode 100644
index fbe78ae..0000000
--- a/GUI/GUI/GUI/WhisperCPP.cpp
+++ /dev/null
@@ -1,558 +0,0 @@
-#include "BrowserSource.h"
-#include "Logging.h"
-#include "PythonWrapper.h"
-#include "ScopeGuard.h"
-#include "Util.h"
-#include "WhisperCPP.h"
-
-#include <unknwn.h>
-#include <wchar.h>
-#include <winerror.h>
-
-#include "whisper/whisperWindows.h"
-
-#include <charconv>
-#include <codecvt>
-#include <cwchar>
-#include <fstream>
-#include <future>
-#include <locale>
-#include <string>
-#include <vector>
-
-using namespace Whisper;
-using ::Logging::DrainAsyncOutput;
-using ::Logging::Log;
-
-namespace {
- std::string wcharToAsciiString(const wchar_t* wc_str) {
- int len = wcslen(wc_str);
- if (len == 0) {
- return "";
- }
-
- std::string result(len + 1, 0);
- size_t len_out;
- wcstombs_s(&len_out, result.data(), len, wc_str, _TRUNCATE);
-
- return result;
- }
-
- std::string hresultToString(HRESULT err) {
- LPWSTR errorText = nullptr;
-
- // Call FormatMessage to retrieve the error message
- DWORD size = FormatMessage(
- FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
- nullptr,
- err,
- MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
- (LPWSTR)&errorText,
- 0,
- nullptr);
-
- // Check if the error message was retrieved successfully
- if (size <= 0) {
- std::ostringstream oss;
- oss << "HRESULT:" << err;
- return oss.str();
- }
- std::wstring errorMessage(errorText, size);
- LocalFree(errorText);
- // Convert the wide string to a narrow string for printing
- return std::string(errorMessage.begin(), errorMessage.end());
- }
-};
-
-WhisperCPP::WhisperCPP(wxTextCtrl* out)
- : out_(out), run_transcription_(false), run_browser_src_(false)
-{
- // Initialize futures so that valid() returns true. We use this as a proxy
- // to tell whether they're still executing.
- {
- auto p = std::promise<void>();
- transcription_thd_ = p.get_future();
- p.set_value();
- }
- {
- auto p = std::promise<void>();
- browser_src_thd_ = p.get_future();
- p.set_value();
- }
- {
- auto p = std::promise<void>();
- custom_chatbox_thd_ = p.get_future();
- p.set_value();
- }
-}
-
-WhisperCPP::~WhisperCPP() {}
-
-bool WhisperCPP::GetMediaFoundation(Whisper::iMediaFoundation*& f) {
- iMediaFoundation* tmp_f = nullptr;
- HRESULT err = initMediaFoundation(&tmp_f);
- if (FAILED(err)) {
- Log(out_, "Failed to initialize media layer: {}", err);
- return false;
- }
-
- f = tmp_f;
- return true;
-}
-
-bool WhisperCPP::GetMics(Whisper::iMediaFoundation* f, std::vector<std::string>& mics) {
- std::vector<std::unique_ptr<MicInfo>> mics_raw;
- if (!GetMicsImpl(f, mics_raw)) {
- return false;
- }
-
- mics.clear();
- for (const auto& raw_mic : mics_raw) {
- mics.push_back(wcharToAsciiString(raw_mic->name.c_str()));
- }
-
- return true;
-}
-
-bool WhisperCPP::OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream) {
- std::vector<std::unique_ptr<MicInfo>> mics_raw;
- if (!GetMicsImpl(f, mics_raw)) {
- return false;
- }
-
- if (mics_raw.size() <= idx) {
- Log(out_, "Mic index out of range: {} vs. {}\n", idx, mics_raw.size());
- return false;
- }
-
- Whisper::sCaptureParams params{};
- params.dropStartSilence = c.whisper_vad_drop_start_silence;
- params.pauseDuration = c.whisper_vad_pause_duration;
- params.minDuration = c.whisper_vad_min_duration;
- params.maxDuration = c.whisper_vad_max_duration;
- params.retainDuration = c.whisper_vad_retain_duration;
- stream = nullptr;
- HRESULT err = f->openCaptureDevice(mics_raw[idx]->endpoint.c_str(),
- params, &stream);
- if (FAILED(err)) {
- Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx,
- wcharToAsciiString(mics_raw[idx]->name.c_str()),
- hresultToString(err));
- return false;
- }
-
- return true;
-}
-
-bool WhisperCPP::InstallDependencies() {
- std::filesystem::path flag_file = "Resources/.whisper_deps_installed";
- flag_file = flag_file.lexically_normal();
-
- if (std::filesystem::exists(flag_file)) {
- return true;
- }
-
- auto out_cb = [&](const std::string& out, const std::string& err) -> void {
- Log(out_, "{}", out);
- Log(out_, "{}", err);
- };
- auto in_cb = [&](std::string& in) {};
- auto run_cb = [&]() -> bool {
- return run_transcription_;
- };
- bool ret = PythonWrapper::InvokeWithArgs({
- "-u", // Unbuffered output
- "-m pip",
- "install",
- "-r Resources/Scripts/whisper_requirements.txt",
- }, std::move(out_cb), std::move(in_cb), std::move(run_cb));
-
- if (!ret) {
- Log(out_, "Failed to install dependencies!\n");
- return false;
- }
-
- // Create the flag file so subsequent calls don't reinstall.
- std::ofstream flagfile_ofs(flag_file);
- flagfile_ofs.close();
-
- return true;
-}
-
-bool WhisperCPP::DownloadModel(const std::string& model_name,
- const std::filesystem::path& fs_path) {
- std::ostringstream url_oss;
- url_oss << "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/";
- url_oss << model_name;
- Log(out_, "Model will be saved to {}\n", fs_path.lexically_normal().string());
- auto out_cb = [&](const std::string& out, const std::string& err) {
- Log(out_, "{}", out);
- Log(out_, "{}", err);
- };
- auto in_cb = [&](std::string& in) {};
- auto run_cb = [&]() -> bool {
- return run_transcription_;
- };
- bool ret = PythonWrapper::InvokeWithArgs({
- "-u", // Unbuffered output
- "-m wget",
- url_oss.str(),
- "-o", fs_path.string(),
- }, std::move(out_cb), std::move(in_cb), std::move(run_cb));
- if (!ret) {
- Log(out_, "Failed to download model!\n");
- return false;
- }
-
- return true;
-}
-
-std::wstring utf8ToUtf16(const std::string& utf8) {
- int wide_str_len = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, NULL, 0);
- std::wstring utf16(wide_str_len, 0);
- MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, utf16.data(), wide_str_len);
- return utf16;
-}
-
-bool WhisperCPP::LoadModel(const std::string& path, Whisper::iModel*& model) {
- model = nullptr;
- HRESULT err = Whisper::loadModel(utf8ToUtf16(path).c_str(),
- eModelImplementation::GPU, /*flags=*/0, /*callbacks=*/nullptr, &model);
- if (FAILED(err)) {
- Log(out_, "Failed to load model: {}\n", hresultToString(err));
- return false;
- }
-
- return true;
-}
-
-bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& context) {
- context = nullptr;
- HRESULT err = model->createContext(&context);
- if (FAILED(err)) {
- Log(out_, "Failed to create context: {}\n", hresultToString(err));
- return false;
- }
-
- return true;
-}
-
-void WhisperCPP::Start(const AppConfig& c) {
- transcript_.Clear();
-
- if (!transcription_thd_.valid()) {
- Log(out_, "Transcription engine already running\n");
- return;
- }
-
- transcription_thd_ = std::async(std::launch::async, [&]() -> void {
- run_transcription_ = true;
-
- iMediaFoundation* f = nullptr;
- if (!GetMediaFoundation(f)) {
- return;
- }
- ScopeGuard f_cleanup([f]() { f->Release(); });
-
-
- Whisper::iAudioCapture* mic_stream;
- if (!OpenMic(f, c, c.whisper_mic, mic_stream)) {
- return;
- }
- ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); });
-
- {
- auto out_cb = [&](const std::string& out, const std::string& err) -> void {
- Log(out_, "{}", out);
- Log(out_, "{}", err);
- };
- auto in_cb = [&](std::string& in) {};
- auto run_cb = [&]() -> bool {
- return run_transcription_;
- };
- Log(out_, "Installing pip\n");
- if (!PythonWrapper::InstallPip(std::move(out_cb), std::move(in_cb),
- std::move(run_cb))) {
- Log(out_, "Failed to install pip!\n");
- return;
- }
- }
- Log(out_, "Installing Python dependencies\n");
- if (!InstallDependencies()) {
- return;
- }
-
- std::filesystem::path model_path = "Resources/Models";
- model_path /= c.whisper_model;
- if (std::filesystem::exists(model_path)) {
- Log(out_, "Model found at {}\n", model_path.string());
- }
- else {
- Log(out_, "Downloading model {}\n", c.whisper_model);
- if (!DownloadModel(c.whisper_model, model_path)) {
- return;
- }
- }
-
- Whisper::iModel* model;
- if (!LoadModel(model_path.string(), model)) {
- return;
- }
- ScopeGuard model_cleanup([model]() { model->Release(); });
-
- Whisper::iContext* context;
- if (!CreateContext(model, context)) {
- return;
- }
- ScopeGuard context_cleanup([context]() { context->Release(); });
-
- Whisper::sFullParams wparams{};
- if (c.whisper_decode_method == "greedy") {
- Log(out_, "Using greedy decoding\n");
- context->fullDefaultParams(eSamplingStrategy::Greedy, &wparams);
- }
- else if (c.whisper_decode_method == "beam") {
- Log(out_, "Using beam search decoding\n");
- context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams);
- wparams.beam_search.beam_width = c.whisper_beam_width;
- wparams.beam_search.n_best = c.whisper_beam_n_best;
- } else {
- Log(out_, "Invalid decoding method: {}\n", c.whisper_decode_method);
- return;
- }
- wparams.language = Whisper::makeLanguageKey("en"); // TODO(yum) use config
- // This must be set to keep memory usage from growing without bound.
- wparams.n_max_text_ctx = c.whisper_max_ctxt;
-
- wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT {
- WhisperCPP* app = static_cast<WhisperCPP*>(user_data);
- iTranscribeResult* results = nullptr;
- HRESULT err = context->getResults(eResultFlags::Timestamps | eResultFlags::Tokens, &results);
- if (FAILED(err)) {
- Log(app->out_, "Failed to get transcription: {}\n", hresultToString(err));
- return S_OK;
- }
- ScopeGuard results_cleanup([results]() { results->Release(); });
-
- sTranscribeLength length;
- err = results->getSize(length);
- if (FAILED(err)) {
- Log(app->out_, "Failed to get transcription size: {}\n", hresultToString(err));
- return S_OK;
- }
-
- // Scanning a vector is faster than using a hashtable up to ~1k
- // entries (source: I heard it from someone once).
- static const std::vector<std::string> banned_words{
- " -",
- " *fades out*",
- " *no audio*",
- };
-
- const sSegment* const segments = results->getSegments();
- const sToken* const tokens = results->getTokens();
- const int s0 = length.countSegments - n_new;
- int n_tok = 0;
- for (int i = s0; i < length.countSegments; i++) {
- const sSegment& seg = segments[i];
- bool is_metadata = false;
- for (int j = 0; j < seg.countTokens; j++) {
- const sToken& tok = tokens[seg.firstToken + j];
- std::string_view tok_str(tok.text);
- if (tok_str.starts_with("[") ||
- tok_str.starts_with("(") ||
- tok_str.starts_with(" [") ||
- tok_str.starts_with(" (")) {
- is_metadata = true;
- }
- if (is_metadata) {
- if (tok_str.ends_with("]") ||
- tok_str.ends_with(")")) {
- is_metadata = false;
- }
- continue;
- }
- std::vector<std::string>::const_iterator word_iter =
- std::find(banned_words.cbegin(), banned_words.cend(),
- tok_str);
- if (word_iter != banned_words.end()) {
- continue;
- }
- ++n_tok;
- Log(app->out_, "{}", tok.text);
- app->transcript_.Append(tok.text);
- }
- }
- if (n_tok) {
- Log(app->out_, "\n");
- }
-
- return S_OK;
- };
- wparams.new_segment_callback_user_data = this;
-
- sCaptureCallbacks callbacks{};
-
- callbacks.shouldCancel = [](void* pv) noexcept -> HRESULT __stdcall {
- WhisperCPP* app = static_cast<WhisperCPP*>(pv);
- if (!app->run_transcription_) {
- Log(app->out_, "Exit transcription loop\n");
- return S_FALSE;
- }
- return S_OK;
- };
- callbacks.pv = this;
-
- // This will block.
- HRESULT err = context->runCapture(wparams, callbacks, mic_stream);
- if (FAILED(err)) {
- Log(out_, "Capture failed: {}\n", hresultToString(err));
- return;
- }
-
- Log(out_, "Exit transcription engine\n");
- });
-
- return;
-}
-
-void WhisperCPP::Stop() {
- Log(out_, "Stopping transcription engine...\n");
- run_transcription_ = false;
- transcription_thd_.wait();
- Log(out_, "Done!\n");
-}
-
-void WhisperCPP::StartBrowserSource(const AppConfig& c) {
- if (!browser_src_thd_.valid()) {
- Log(out_, "Browser source already running\n");
- return;
- }
-
- browser_src_thd_ = std::async(std::launch::async, [&]() -> void {
- run_browser_src_ = true;
- BrowserSource src(c.browser_src_port, out_, &transcript_);
- src.Run(&run_browser_src_);
- Log(out_, "Browser source thread exit\n");
- });
-}
-
-void WhisperCPP::StopBrowserSource() {
- Log(out_, "Stopping browser source...\n");
- run_browser_src_ = false;
- browser_src_thd_.wait();
- Log(out_, "Done!\n");
-}
-
-// TODO(yum) we should have a thread which simply tells us when to
-// start/stop transcription.
-void WhisperCPP::StartCustomChatbox(const AppConfig& c) {
- if (!custom_chatbox_thd_.valid()) {
- Log(out_, "Custom chatbox already running\n");
- return;
- }
-
- custom_chatbox_thd_ = std::async(std::launch::async, [&]() -> void {
- run_custom_chatbox_ = true;
- Log(out_, "Launching custom chatbox OSC layer\n");
-
- while (run_custom_chatbox_) {
- bool send_transcript = false;
- auto out_cb = [&](const std::string& out, const std::string& err) {
- std::string delim = "\r\n";
- size_t begin = 0;
- size_t end = out.size();
- while (begin < out.size()) {
- end = out.find(delim, begin);
- if (end == std::string::npos) {
- end = out.size();
- }
- ScopeGuard advance_begin([&]() { begin = end + delim.size(); });
- std::string line = out.substr(begin, end - begin);
- if (line == "1") {
- Log(out_, "Control message get: send transcript\n");
- transcript_.Clear();
- send_transcript = true;
- }
- else if (line == "0") {
- // TODO pause transcription loop?
- Log(out_, "Control message get: stop transcript\n");
- send_transcript = false;
- }
- else {
- Log(out_, " custom chatbox: Unrecognized control sequence: {}\n", line);
- }
- }
-
- begin = 0;
- end = err.size();
- while (begin < err.size()) {
- end = err.find(delim, begin);
- if (end == std::string::npos) {
- end = err.size();
- }
- ScopeGuard advance_begin([&]() { begin = end + delim.size(); });
- std::string line = err.substr(begin, end - begin);
- Log(out_, " {}\n", line);
- }
- };
- auto in_cb = [&](std::string& in) {
- if (!send_transcript) {
- return;
- }
- // TODO(yum) use a streaming interface for this. As written, we
- // have to copy a ton of redundant text every time.
- const std::vector<std::string> segments = transcript_.Get();
- std::ostringstream oss;
- for (const auto& segment : segments) {
- oss << segment;
- }
- oss << std::endl;
- in = oss.str();
- };
- auto run_cb = [&]() {
- return run_custom_chatbox_;
- };
- if (!PythonWrapper::InvokeWithArgs({
- "Resources/Scripts/cpp_transcribe.py",
- "--bytes_per_char", std::to_string(c.bytes_per_char),
- "--chars_per_sync", std::to_string(c.chars_per_sync),
- "--rows", std::to_string(c.rows),
- "--cols", std::to_string(c.cols),
- "--button", Quote(c.button),
- "--enable_local_beep", c.enable_local_beep ? "1" : "0",
- "--use_builtin", "0",
- }, out_cb, in_cb, run_cb)) {
- Log(out_, "Failed to launch custom chatbox OSC layer!\n");
- break;
- }
- }
-
- Log(out_, "Custom chatbox thread exit\n");
- });
-}
-
-void WhisperCPP::StopCustomChatbox() {
- Log(out_, "Stopping custom chatbox...\n");
- run_custom_chatbox_ = false;
- custom_chatbox_thd_.wait();
- Log(out_, "Done!\n");
-}
-
-bool WhisperCPP::GetMicsImpl(Whisper::iMediaFoundation* f, std::vector<std::unique_ptr<MicInfo>>& mics) {
- pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall {
- auto mics = static_cast<std::vector<std::unique_ptr<MicInfo>>*>(pv);
- for (int i = 0; i < len; i++) {
- mics->push_back(std::make_unique<MicInfo>(buf[i].displayName, buf[i].endpoint));
- }
- return S_OK;
- };
- mics.clear();
- HRESULT err = f->listCaptureDevices(dev_cb, &mics);
- if (FAILED(err)) {
- Log(out_, "Failed to get microphones: {}\n", err);
- return false;
- }
-
- return true;
-}
diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h
deleted file mode 100644
index 530d65a..0000000
--- a/GUI/GUI/GUI/WhisperCPP.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#pragma once
-
-#include <wx/wxprec.h>
-
-#ifndef WX_PRECOMP
-#include <wx/wx.h>
-#endif
-
-#include <unknwn.h>
-#include <wchar.h>
-#include <winerror.h>
-
-#include "whisper/whisperWindows.h"
-
-#include "Config.h"
-#include "Transcript.h"
-
-#include <filesystem>
-#include <functional>
-#include <future>
-#include <memory>
-#include <string>
-#include <vector>
-
-class WhisperCPP {
-public:
- WhisperCPP(wxTextCtrl* out);
- ~WhisperCPP();
-
- bool GetMediaFoundation(Whisper::iMediaFoundation*& f);
- bool GetMics(Whisper::iMediaFoundation* f, std::vector<std::string>& mics);
- bool OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream);
- bool InstallDependencies();
- bool DownloadModel(const std::string& model_name,
- const std::filesystem::path& fs_path);
- bool LoadModel(const std::string& path, Whisper::iModel*& model);
- bool CreateContext(Whisper::iModel* model, Whisper::iContext*& context);
-
- void Start(const AppConfig& c);
- void Stop();
-
- void StartBrowserSource(const AppConfig& c);
- void StopBrowserSource();
-
- void StartCustomChatbox(const AppConfig& c);
- void StopCustomChatbox();
-
-private:
- struct MicInfo {
- MicInfo(const wchar_t* n, const wchar_t* e)
- : name(n), endpoint(e)
- {}
-
- std::wstring name;
- std::wstring endpoint;
- };
- bool GetMicsImpl(
- Whisper::iMediaFoundation* f,
- std::vector<std::unique_ptr<MicInfo>>& mics);
-
- wxTextCtrl* out_;
-
- std::future<void> transcription_thd_;
- volatile bool run_transcription_;
-
- std::future<void> browser_src_thd_;
- volatile bool run_browser_src_;
-
- std::future<void> custom_chatbox_thd_;
- volatile bool run_custom_chatbox_;
-
- Transcript transcript_;
-};
diff --git a/GUI/Libraries/fetch.ps1 b/GUI/Libraries/fetch.ps1
index d71ce44..f558d3f 100644
--- a/GUI/Libraries/fetch.ps1
+++ b/GUI/Libraries/fetch.ps1
@@ -27,14 +27,5 @@ if (-Not (Test-Path wx)) {
popd > $null
}
-
-if (Test-Path ../GUI/GUI/whisper/) {
- rm -Recurse ../GUI/GUI/whisper/
-}
-
-mkdir ../GUI/GUI/whisper/
-cp ../../TaSTT-Whisper/Whisper/API/*.h ../GUI/GUI/whisper/
-cp ../../TaSTT-Whisper/x64/Release/Whisper.lib ../GUI/GUI/whisper/
-
popd > $null # $PSScriptRoot
diff --git a/GUI/README.md b/GUI/README.md
index 6d5049d..ff3e358 100644
--- a/GUI/README.md
+++ b/GUI/README.md
@@ -18,8 +18,6 @@ $ git submodule update
Unrestricted` in an admin instance of powershell. Heed the warning,
this is a security risk! Never run code from someone you don't trust
unless you've carefully audited it.
- 3.1. If you haven't built TaSTT-Whisper before, you'll see an error. Ignore
- it.
4. Open `Libraries/wx/build/msw/wx_vc17.sln` with Visual Studio 2022.
5. Select every project in the Solution Explorer except for `_custom_build`.
6. Right click, select Properties, go to C/C++, Code Generation, and set
@@ -29,18 +27,14 @@ $ git submodule update
1. The build configuration is in the top. By default it's probably Debug/x64.
2. To build: ctrl+shift+B
3. If you saw an error in 3.1, rerun Libraries/fetch.ps1.
-8. Follow `Build instructions` section of TaSTT-Whisper/Readme.md and build it
- as x64/Release.
- 8.0. If you see a message like `Based on your solution... you might need to
- install additional components`, do it.
-9. Open GUI/GUI.sln with Visual Studio 2022.
-10. Build x64/Release.
-11. Run package.ps1 from powershell.
- 11.0. If you're not creating a redistributable release, use this command
+8. Open GUI/GUI.sln with Visual Studio 2022.
+9. Build x64/Release.
+10. Run package.ps1 from powershell.
+ 10.0. If you're not creating a redistributable release, use this command
instead (it's way faster): `package.ps1 -skip_zip`.
- 11.1. When PortableGit creates a window, wait for it to complete, then press
+ 10.1. When PortableGit creates a window, wait for it to complete,
then press enter in Powershell.
- 11.2. The first time you run this it'll take a long time since it has to
+ 10.2. The first time you run this it'll take a long time since it has to
fetch a few large packages. Subsequent invocations will be much faster
since it won't reacquire anything already downloaded. On my connection,
it took 90 minutes to finish downloading, mostly because Google Drive
@@ -49,9 +43,8 @@ $ git submodule update
## High level design
* The GUI is written using wxWidgets.
-* Python executes core business logic. We can't migrate away since
- there's no CUDA-enabled Whisper implementation available in a good
- systems programming language.
+* Python executes core business logic. With libraries like faster\_whisper
+ available, this provides a nice balance between flexibility and performance.
* To skirt licensing complexity, we distribute an embedded python
that's hacked up to allow installing packages via pip. We use this
to install packages at runtime (like a net installer), so we don't
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 808bedd..4800777 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -149,7 +149,6 @@ cp -Recurse ../Sounds TaSTT/Resources/Sounds
cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets
cp -Recurse ../BrowserSource TaSTT/Resources/BrowserSource
cp GUI/x64/$release/GUI.exe TaSTT/TaSTT.exe
-cp ../"TaSTT-Whisper"/x64/Release/Whisper.dll TaSTT/Whisper.dll
mkdir TaSTT/Resources/Models
mkdir TaSTT/Resources/Uwu
cp UwwwuPP/build/Src/Debug/Uwwwu.exe TaSTT/Resources/Uwu/