From 694756a96a6109cd79a77221dd4e40638ff55b82 Mon Sep 17 00:00:00 2001 From: yum Date: Mon, 26 Jun 2023 17:21:59 -0700 Subject: Scrub out old C++-based Whisper code No longer used. --- GUI/GUI/GUI/Frame.cpp | 32 --- GUI/GUI/GUI/Frame.h | 25 -- GUI/GUI/GUI/GUI.vcxproj | 4 +- GUI/GUI/GUI/GUI.vcxproj.filters | 6 - GUI/GUI/GUI/WhisperCPP.cpp | 558 ---------------------------------------- GUI/GUI/GUI/WhisperCPP.h | 73 ------ GUI/Libraries/fetch.ps1 | 9 - GUI/README.md | 23 +- GUI/package.ps1 | 1 - 9 files changed, 9 insertions(+), 722 deletions(-) delete mode 100644 GUI/GUI/GUI/WhisperCPP.cpp delete mode 100644 GUI/GUI/GUI/WhisperCPP.h (limited to 'GUI') diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 191aa82..76b85ae 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -20,7 +20,6 @@ namespace { ID_NAVBAR_BUTTON_TRANSCRIBE, ID_NAVBAR_BUTTON_UNITY, ID_NAVBAR_BUTTON_DEBUG, - ID_NAVBAR_BUTTON_WHISPER, ID_PY_PANEL, ID_PY_CONFIG_PANEL, ID_PY_APP_CONFIG_PANEL_PAIRS, @@ -81,37 +80,6 @@ namespace { ID_DEBUG_BUTTON_BACKUP_VENV, ID_DEBUG_BUTTON_RESTORE_VENV, ID_DEBUG_BUTTON_SETUP_VENV, - ID_WHISPER_PANEL, - ID_WHISPER_OUT, - ID_WHISPER_CONFIG_PANEL, - ID_WHISPER_SETUP_BUTTON, - //ID_WHISPER_DUMP_MICS_BUTTON, - ID_WHISPER_CONFIG_PANEL_PAIRS, - ID_WHISPER_MIC, - ID_WHISPER_LANG, - ID_WHISPER_MODEL, - ID_WHISPER_CHARS_PER_SYNC, - ID_WHISPER_BYTES_PER_CHAR, - ID_WHISPER_BUTTON, - ID_WHISPER_ROWS, - ID_WHISPER_COLS, - ID_WHISPER_BROWSER_SRC_PORT, - ID_WHISPER_ENABLE_LOCAL_BEEP, - ID_WHISPER_USE_CPU, - ID_WHISPER_DECODE_METHOD, - ID_WHISPER_MAX_CTXT, - ID_WHISPER_BEAM_WIDTH, - ID_WHISPER_BEAM_N_BEST, - ID_WHISPER_VAD_MIN_DURATION, - ID_WHISPER_VAD_MAX_DURATION, - ID_WHISPER_VAD_DROP_START_SILENCE, - ID_WHISPER_VAD_PAUSE_DURATION, - ID_WHISPER_VAD_RETAIN_DURATION, - ID_WHISPER_ENABLE_BUILTIN, - ID_WHISPER_ENABLE_CUSTOM, - ID_WHISPER_ENABLE_BROWSER_SRC, - ID_WHISPER_START_BUTTON, - ID_WHISPER_STOP_BUTTON, }; const wxString kMicChoices[] = { diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 5969cd8..ede2afc 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -25,12 +25,10 @@ private: wxPanel* transcribe_panel_; wxPanel* unity_panel_; wxPanel* debug_panel_; - wxPanel* whisper_panel_; wxTextCtrl* transcribe_out_; wxTextCtrl* unity_out_; wxTextCtrl* debug_out_; - wxTextCtrl* whisper_out_; wxTextCtrl* unity_animator_generated_dir_; wxTextCtrl* unity_animator_generated_name_; @@ -44,17 +42,6 @@ private: wxTextCtrl* py_app_browser_src_port_; wxTextCtrl* unity_rows_; wxTextCtrl* unity_cols_; - wxTextCtrl* whisper_rows_; - wxTextCtrl* whisper_cols_; - wxTextCtrl* whisper_browser_src_port_; - wxTextCtrl* whisper_max_ctxt_; - wxTextCtrl* whisper_beam_width_; - wxTextCtrl* whisper_beam_n_best_; - wxTextCtrl* whisper_vad_min_duration_; - wxTextCtrl* whisper_vad_max_duration_; - wxTextCtrl* whisper_vad_drop_start_silence_; - wxTextCtrl* whisper_vad_pause_duration_; - wxTextCtrl* whisper_vad_retain_duration_; wxDirPickerCtrl* unity_assets_file_picker_; wxFilePickerCtrl* unity_animator_file_picker_; @@ -72,13 +59,6 @@ private: wxChoice* py_app_button_; wxChoice* unity_chars_per_sync_; wxChoice* unity_bytes_per_char_; - wxChoice* whisper_mic_; - wxChoice* whisper_lang_; - wxChoice* whisper_model_; - wxChoice* whisper_chars_per_sync_; - wxChoice* whisper_bytes_per_char_; - wxChoice* whisper_button_; - wxChoice* whisper_decode_method_; wxCheckBox* py_app_enable_local_beep_; wxCheckBox* py_app_enable_browser_src_; @@ -89,11 +69,6 @@ private: wxCheckBox* py_app_enable_uppercase_filter_; wxCheckBox* py_app_enable_lowercase_filter_; wxCheckBox* unity_clear_osc_; - wxCheckBox* whisper_enable_local_beep_; - wxCheckBox* whisper_use_cpu_; - wxCheckBox* whisper_enable_builtin_; - wxCheckBox* whisper_enable_custom_; - wxCheckBox* whisper_enable_browser_src_; std::future py_app_; std::future obs_app_; diff --git a/GUI/GUI/GUI/GUI.vcxproj b/GUI/GUI/GUI/GUI.vcxproj index 2d08d30..e5874c4 100644 --- a/GUI/GUI/GUI/GUI.vcxproj +++ b/GUI/GUI/GUI/GUI.vcxproj @@ -147,7 +147,7 @@ true true true - kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;Whisper.lib;%(AdditionalDependencies) + kernel32.lib;user32.lib;gdi32.lib;comdlg32.lib;winspool.lib;shell32.lib;shlwapi.lib;ole32.lib;oleaut32.lib;uuid.lib;advapi32.lib;version.lib;comctl32.lib;rpcrt4.lib;ws2_32.lib;wininet.lib;winmm.lib;%(AdditionalDependencies) @@ -162,7 +162,6 @@ - @@ -180,7 +179,6 @@ - diff --git a/GUI/GUI/GUI/GUI.vcxproj.filters b/GUI/GUI/GUI/GUI.vcxproj.filters index 6a41329..10069e8 100644 --- a/GUI/GUI/GUI/GUI.vcxproj.filters +++ b/GUI/GUI/GUI/GUI.vcxproj.filters @@ -36,9 +36,6 @@ Source Files - - Source Files - Source Files @@ -80,9 +77,6 @@ Header Files - - Header Files - Header Files diff --git a/GUI/GUI/GUI/WhisperCPP.cpp b/GUI/GUI/GUI/WhisperCPP.cpp deleted file mode 100644 index fbe78ae..0000000 --- a/GUI/GUI/GUI/WhisperCPP.cpp +++ /dev/null @@ -1,558 +0,0 @@ -#include "BrowserSource.h" -#include "Logging.h" -#include "PythonWrapper.h" -#include "ScopeGuard.h" -#include "Util.h" -#include "WhisperCPP.h" - -#include -#include -#include - -#include "whisper/whisperWindows.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace Whisper; -using ::Logging::DrainAsyncOutput; -using ::Logging::Log; - -namespace { - std::string wcharToAsciiString(const wchar_t* wc_str) { - int len = wcslen(wc_str); - if (len == 0) { - return ""; - } - - std::string result(len + 1, 0); - size_t len_out; - wcstombs_s(&len_out, result.data(), len, wc_str, _TRUNCATE); - - return result; - } - - std::string hresultToString(HRESULT err) { - LPWSTR errorText = nullptr; - - // Call FormatMessage to retrieve the error message - DWORD size = FormatMessage( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - nullptr, - err, - MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPWSTR)&errorText, - 0, - nullptr); - - // Check if the error message was retrieved successfully - if (size <= 0) { - std::ostringstream oss; - oss << "HRESULT:" << err; - return oss.str(); - } - std::wstring errorMessage(errorText, size); - LocalFree(errorText); - // Convert the wide string to a narrow string for printing - return std::string(errorMessage.begin(), errorMessage.end()); - } -}; - -WhisperCPP::WhisperCPP(wxTextCtrl* out) - : out_(out), run_transcription_(false), run_browser_src_(false) -{ - // Initialize futures so that valid() returns true. We use this as a proxy - // to tell whether they're still executing. - { - auto p = std::promise(); - transcription_thd_ = p.get_future(); - p.set_value(); - } - { - auto p = std::promise(); - browser_src_thd_ = p.get_future(); - p.set_value(); - } - { - auto p = std::promise(); - custom_chatbox_thd_ = p.get_future(); - p.set_value(); - } -} - -WhisperCPP::~WhisperCPP() {} - -bool WhisperCPP::GetMediaFoundation(Whisper::iMediaFoundation*& f) { - iMediaFoundation* tmp_f = nullptr; - HRESULT err = initMediaFoundation(&tmp_f); - if (FAILED(err)) { - Log(out_, "Failed to initialize media layer: {}", err); - return false; - } - - f = tmp_f; - return true; -} - -bool WhisperCPP::GetMics(Whisper::iMediaFoundation* f, std::vector& mics) { - std::vector> mics_raw; - if (!GetMicsImpl(f, mics_raw)) { - return false; - } - - mics.clear(); - for (const auto& raw_mic : mics_raw) { - mics.push_back(wcharToAsciiString(raw_mic->name.c_str())); - } - - return true; -} - -bool WhisperCPP::OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream) { - std::vector> mics_raw; - if (!GetMicsImpl(f, mics_raw)) { - return false; - } - - if (mics_raw.size() <= idx) { - Log(out_, "Mic index out of range: {} vs. {}\n", idx, mics_raw.size()); - return false; - } - - Whisper::sCaptureParams params{}; - params.dropStartSilence = c.whisper_vad_drop_start_silence; - params.pauseDuration = c.whisper_vad_pause_duration; - params.minDuration = c.whisper_vad_min_duration; - params.maxDuration = c.whisper_vad_max_duration; - params.retainDuration = c.whisper_vad_retain_duration; - stream = nullptr; - HRESULT err = f->openCaptureDevice(mics_raw[idx]->endpoint.c_str(), - params, &stream); - if (FAILED(err)) { - Log(out_, "Failed to open mic with idx {} ({}): {}\n", idx, - wcharToAsciiString(mics_raw[idx]->name.c_str()), - hresultToString(err)); - return false; - } - - return true; -} - -bool WhisperCPP::InstallDependencies() { - std::filesystem::path flag_file = "Resources/.whisper_deps_installed"; - flag_file = flag_file.lexically_normal(); - - if (std::filesystem::exists(flag_file)) { - return true; - } - - auto out_cb = [&](const std::string& out, const std::string& err) -> void { - Log(out_, "{}", out); - Log(out_, "{}", err); - }; - auto in_cb = [&](std::string& in) {}; - auto run_cb = [&]() -> bool { - return run_transcription_; - }; - bool ret = PythonWrapper::InvokeWithArgs({ - "-u", // Unbuffered output - "-m pip", - "install", - "-r Resources/Scripts/whisper_requirements.txt", - }, std::move(out_cb), std::move(in_cb), std::move(run_cb)); - - if (!ret) { - Log(out_, "Failed to install dependencies!\n"); - return false; - } - - // Create the flag file so subsequent calls don't reinstall. - std::ofstream flagfile_ofs(flag_file); - flagfile_ofs.close(); - - return true; -} - -bool WhisperCPP::DownloadModel(const std::string& model_name, - const std::filesystem::path& fs_path) { - std::ostringstream url_oss; - url_oss << "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/"; - url_oss << model_name; - Log(out_, "Model will be saved to {}\n", fs_path.lexically_normal().string()); - auto out_cb = [&](const std::string& out, const std::string& err) { - Log(out_, "{}", out); - Log(out_, "{}", err); - }; - auto in_cb = [&](std::string& in) {}; - auto run_cb = [&]() -> bool { - return run_transcription_; - }; - bool ret = PythonWrapper::InvokeWithArgs({ - "-u", // Unbuffered output - "-m wget", - url_oss.str(), - "-o", fs_path.string(), - }, std::move(out_cb), std::move(in_cb), std::move(run_cb)); - if (!ret) { - Log(out_, "Failed to download model!\n"); - return false; - } - - return true; -} - -std::wstring utf8ToUtf16(const std::string& utf8) { - int wide_str_len = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, NULL, 0); - std::wstring utf16(wide_str_len, 0); - MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, utf16.data(), wide_str_len); - return utf16; -} - -bool WhisperCPP::LoadModel(const std::string& path, Whisper::iModel*& model) { - model = nullptr; - HRESULT err = Whisper::loadModel(utf8ToUtf16(path).c_str(), - eModelImplementation::GPU, /*flags=*/0, /*callbacks=*/nullptr, &model); - if (FAILED(err)) { - Log(out_, "Failed to load model: {}\n", hresultToString(err)); - return false; - } - - return true; -} - -bool WhisperCPP::CreateContext(Whisper::iModel* model, Whisper::iContext*& context) { - context = nullptr; - HRESULT err = model->createContext(&context); - if (FAILED(err)) { - Log(out_, "Failed to create context: {}\n", hresultToString(err)); - return false; - } - - return true; -} - -void WhisperCPP::Start(const AppConfig& c) { - transcript_.Clear(); - - if (!transcription_thd_.valid()) { - Log(out_, "Transcription engine already running\n"); - return; - } - - transcription_thd_ = std::async(std::launch::async, [&]() -> void { - run_transcription_ = true; - - iMediaFoundation* f = nullptr; - if (!GetMediaFoundation(f)) { - return; - } - ScopeGuard f_cleanup([f]() { f->Release(); }); - - - Whisper::iAudioCapture* mic_stream; - if (!OpenMic(f, c, c.whisper_mic, mic_stream)) { - return; - } - ScopeGuard mic_stream_cleanup([mic_stream]() { mic_stream->Release(); }); - - { - auto out_cb = [&](const std::string& out, const std::string& err) -> void { - Log(out_, "{}", out); - Log(out_, "{}", err); - }; - auto in_cb = [&](std::string& in) {}; - auto run_cb = [&]() -> bool { - return run_transcription_; - }; - Log(out_, "Installing pip\n"); - if (!PythonWrapper::InstallPip(std::move(out_cb), std::move(in_cb), - std::move(run_cb))) { - Log(out_, "Failed to install pip!\n"); - return; - } - } - Log(out_, "Installing Python dependencies\n"); - if (!InstallDependencies()) { - return; - } - - std::filesystem::path model_path = "Resources/Models"; - model_path /= c.whisper_model; - if (std::filesystem::exists(model_path)) { - Log(out_, "Model found at {}\n", model_path.string()); - } - else { - Log(out_, "Downloading model {}\n", c.whisper_model); - if (!DownloadModel(c.whisper_model, model_path)) { - return; - } - } - - Whisper::iModel* model; - if (!LoadModel(model_path.string(), model)) { - return; - } - ScopeGuard model_cleanup([model]() { model->Release(); }); - - Whisper::iContext* context; - if (!CreateContext(model, context)) { - return; - } - ScopeGuard context_cleanup([context]() { context->Release(); }); - - Whisper::sFullParams wparams{}; - if (c.whisper_decode_method == "greedy") { - Log(out_, "Using greedy decoding\n"); - context->fullDefaultParams(eSamplingStrategy::Greedy, &wparams); - } - else if (c.whisper_decode_method == "beam") { - Log(out_, "Using beam search decoding\n"); - context->fullDefaultParams(eSamplingStrategy::BeamSearch, &wparams); - wparams.beam_search.beam_width = c.whisper_beam_width; - wparams.beam_search.n_best = c.whisper_beam_n_best; - } else { - Log(out_, "Invalid decoding method: {}\n", c.whisper_decode_method); - return; - } - wparams.language = Whisper::makeLanguageKey("en"); // TODO(yum) use config - // This must be set to keep memory usage from growing without bound. - wparams.n_max_text_ctx = c.whisper_max_ctxt; - - wparams.new_segment_callback = [](iContext* context, uint32_t n_new, void* user_data) noexcept -> HRESULT { - WhisperCPP* app = static_cast(user_data); - iTranscribeResult* results = nullptr; - HRESULT err = context->getResults(eResultFlags::Timestamps | eResultFlags::Tokens, &results); - if (FAILED(err)) { - Log(app->out_, "Failed to get transcription: {}\n", hresultToString(err)); - return S_OK; - } - ScopeGuard results_cleanup([results]() { results->Release(); }); - - sTranscribeLength length; - err = results->getSize(length); - if (FAILED(err)) { - Log(app->out_, "Failed to get transcription size: {}\n", hresultToString(err)); - return S_OK; - } - - // Scanning a vector is faster than using a hashtable up to ~1k - // entries (source: I heard it from someone once). - static const std::vector banned_words{ - " -", - " *fades out*", - " *no audio*", - }; - - const sSegment* const segments = results->getSegments(); - const sToken* const tokens = results->getTokens(); - const int s0 = length.countSegments - n_new; - int n_tok = 0; - for (int i = s0; i < length.countSegments; i++) { - const sSegment& seg = segments[i]; - bool is_metadata = false; - for (int j = 0; j < seg.countTokens; j++) { - const sToken& tok = tokens[seg.firstToken + j]; - std::string_view tok_str(tok.text); - if (tok_str.starts_with("[") || - tok_str.starts_with("(") || - tok_str.starts_with(" [") || - tok_str.starts_with(" (")) { - is_metadata = true; - } - if (is_metadata) { - if (tok_str.ends_with("]") || - tok_str.ends_with(")")) { - is_metadata = false; - } - continue; - } - std::vector::const_iterator word_iter = - std::find(banned_words.cbegin(), banned_words.cend(), - tok_str); - if (word_iter != banned_words.end()) { - continue; - } - ++n_tok; - Log(app->out_, "{}", tok.text); - app->transcript_.Append(tok.text); - } - } - if (n_tok) { - Log(app->out_, "\n"); - } - - return S_OK; - }; - wparams.new_segment_callback_user_data = this; - - sCaptureCallbacks callbacks{}; - - callbacks.shouldCancel = [](void* pv) noexcept -> HRESULT __stdcall { - WhisperCPP* app = static_cast(pv); - if (!app->run_transcription_) { - Log(app->out_, "Exit transcription loop\n"); - return S_FALSE; - } - return S_OK; - }; - callbacks.pv = this; - - // This will block. - HRESULT err = context->runCapture(wparams, callbacks, mic_stream); - if (FAILED(err)) { - Log(out_, "Capture failed: {}\n", hresultToString(err)); - return; - } - - Log(out_, "Exit transcription engine\n"); - }); - - return; -} - -void WhisperCPP::Stop() { - Log(out_, "Stopping transcription engine...\n"); - run_transcription_ = false; - transcription_thd_.wait(); - Log(out_, "Done!\n"); -} - -void WhisperCPP::StartBrowserSource(const AppConfig& c) { - if (!browser_src_thd_.valid()) { - Log(out_, "Browser source already running\n"); - return; - } - - browser_src_thd_ = std::async(std::launch::async, [&]() -> void { - run_browser_src_ = true; - BrowserSource src(c.browser_src_port, out_, &transcript_); - src.Run(&run_browser_src_); - Log(out_, "Browser source thread exit\n"); - }); -} - -void WhisperCPP::StopBrowserSource() { - Log(out_, "Stopping browser source...\n"); - run_browser_src_ = false; - browser_src_thd_.wait(); - Log(out_, "Done!\n"); -} - -// TODO(yum) we should have a thread which simply tells us when to -// start/stop transcription. -void WhisperCPP::StartCustomChatbox(const AppConfig& c) { - if (!custom_chatbox_thd_.valid()) { - Log(out_, "Custom chatbox already running\n"); - return; - } - - custom_chatbox_thd_ = std::async(std::launch::async, [&]() -> void { - run_custom_chatbox_ = true; - Log(out_, "Launching custom chatbox OSC layer\n"); - - while (run_custom_chatbox_) { - bool send_transcript = false; - auto out_cb = [&](const std::string& out, const std::string& err) { - std::string delim = "\r\n"; - size_t begin = 0; - size_t end = out.size(); - while (begin < out.size()) { - end = out.find(delim, begin); - if (end == std::string::npos) { - end = out.size(); - } - ScopeGuard advance_begin([&]() { begin = end + delim.size(); }); - std::string line = out.substr(begin, end - begin); - if (line == "1") { - Log(out_, "Control message get: send transcript\n"); - transcript_.Clear(); - send_transcript = true; - } - else if (line == "0") { - // TODO pause transcription loop? - Log(out_, "Control message get: stop transcript\n"); - send_transcript = false; - } - else { - Log(out_, " custom chatbox: Unrecognized control sequence: {}\n", line); - } - } - - begin = 0; - end = err.size(); - while (begin < err.size()) { - end = err.find(delim, begin); - if (end == std::string::npos) { - end = err.size(); - } - ScopeGuard advance_begin([&]() { begin = end + delim.size(); }); - std::string line = err.substr(begin, end - begin); - Log(out_, " {}\n", line); - } - }; - auto in_cb = [&](std::string& in) { - if (!send_transcript) { - return; - } - // TODO(yum) use a streaming interface for this. As written, we - // have to copy a ton of redundant text every time. - const std::vector segments = transcript_.Get(); - std::ostringstream oss; - for (const auto& segment : segments) { - oss << segment; - } - oss << std::endl; - in = oss.str(); - }; - auto run_cb = [&]() { - return run_custom_chatbox_; - }; - if (!PythonWrapper::InvokeWithArgs({ - "Resources/Scripts/cpp_transcribe.py", - "--bytes_per_char", std::to_string(c.bytes_per_char), - "--chars_per_sync", std::to_string(c.chars_per_sync), - "--rows", std::to_string(c.rows), - "--cols", std::to_string(c.cols), - "--button", Quote(c.button), - "--enable_local_beep", c.enable_local_beep ? "1" : "0", - "--use_builtin", "0", - }, out_cb, in_cb, run_cb)) { - Log(out_, "Failed to launch custom chatbox OSC layer!\n"); - break; - } - } - - Log(out_, "Custom chatbox thread exit\n"); - }); -} - -void WhisperCPP::StopCustomChatbox() { - Log(out_, "Stopping custom chatbox...\n"); - run_custom_chatbox_ = false; - custom_chatbox_thd_.wait(); - Log(out_, "Done!\n"); -} - -bool WhisperCPP::GetMicsImpl(Whisper::iMediaFoundation* f, std::vector>& mics) { - pfnFoundCaptureDevices dev_cb = [](int len, const sCaptureDevice* buf, void* pv)->HRESULT __stdcall { - auto mics = static_cast>*>(pv); - for (int i = 0; i < len; i++) { - mics->push_back(std::make_unique(buf[i].displayName, buf[i].endpoint)); - } - return S_OK; - }; - mics.clear(); - HRESULT err = f->listCaptureDevices(dev_cb, &mics); - if (FAILED(err)) { - Log(out_, "Failed to get microphones: {}\n", err); - return false; - } - - return true; -} diff --git a/GUI/GUI/GUI/WhisperCPP.h b/GUI/GUI/GUI/WhisperCPP.h deleted file mode 100644 index 530d65a..0000000 --- a/GUI/GUI/GUI/WhisperCPP.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include - -#ifndef WX_PRECOMP -#include -#endif - -#include -#include -#include - -#include "whisper/whisperWindows.h" - -#include "Config.h" -#include "Transcript.h" - -#include -#include -#include -#include -#include -#include - -class WhisperCPP { -public: - WhisperCPP(wxTextCtrl* out); - ~WhisperCPP(); - - bool GetMediaFoundation(Whisper::iMediaFoundation*& f); - bool GetMics(Whisper::iMediaFoundation* f, std::vector& mics); - bool OpenMic(Whisper::iMediaFoundation* f, const AppConfig& c, const int idx, Whisper::iAudioCapture*& stream); - bool InstallDependencies(); - bool DownloadModel(const std::string& model_name, - const std::filesystem::path& fs_path); - bool LoadModel(const std::string& path, Whisper::iModel*& model); - bool CreateContext(Whisper::iModel* model, Whisper::iContext*& context); - - void Start(const AppConfig& c); - void Stop(); - - void StartBrowserSource(const AppConfig& c); - void StopBrowserSource(); - - void StartCustomChatbox(const AppConfig& c); - void StopCustomChatbox(); - -private: - struct MicInfo { - MicInfo(const wchar_t* n, const wchar_t* e) - : name(n), endpoint(e) - {} - - std::wstring name; - std::wstring endpoint; - }; - bool GetMicsImpl( - Whisper::iMediaFoundation* f, - std::vector>& mics); - - wxTextCtrl* out_; - - std::future transcription_thd_; - volatile bool run_transcription_; - - std::future browser_src_thd_; - volatile bool run_browser_src_; - - std::future custom_chatbox_thd_; - volatile bool run_custom_chatbox_; - - Transcript transcript_; -}; diff --git a/GUI/Libraries/fetch.ps1 b/GUI/Libraries/fetch.ps1 index d71ce44..f558d3f 100644 --- a/GUI/Libraries/fetch.ps1 +++ b/GUI/Libraries/fetch.ps1 @@ -27,14 +27,5 @@ if (-Not (Test-Path wx)) { popd > $null } - -if (Test-Path ../GUI/GUI/whisper/) { - rm -Recurse ../GUI/GUI/whisper/ -} - -mkdir ../GUI/GUI/whisper/ -cp ../../TaSTT-Whisper/Whisper/API/*.h ../GUI/GUI/whisper/ -cp ../../TaSTT-Whisper/x64/Release/Whisper.lib ../GUI/GUI/whisper/ - popd > $null # $PSScriptRoot diff --git a/GUI/README.md b/GUI/README.md index 6d5049d..ff3e358 100644 --- a/GUI/README.md +++ b/GUI/README.md @@ -18,8 +18,6 @@ $ git submodule update Unrestricted` in an admin instance of powershell. Heed the warning, this is a security risk! Never run code from someone you don't trust unless you've carefully audited it. - 3.1. If you haven't built TaSTT-Whisper before, you'll see an error. Ignore - it. 4. Open `Libraries/wx/build/msw/wx_vc17.sln` with Visual Studio 2022. 5. Select every project in the Solution Explorer except for `_custom_build`. 6. Right click, select Properties, go to C/C++, Code Generation, and set @@ -29,18 +27,14 @@ $ git submodule update 1. The build configuration is in the top. By default it's probably Debug/x64. 2. To build: ctrl+shift+B 3. If you saw an error in 3.1, rerun Libraries/fetch.ps1. -8. Follow `Build instructions` section of TaSTT-Whisper/Readme.md and build it - as x64/Release. - 8.0. If you see a message like `Based on your solution... you might need to - install additional components`, do it. -9. Open GUI/GUI.sln with Visual Studio 2022. -10. Build x64/Release. -11. Run package.ps1 from powershell. - 11.0. If you're not creating a redistributable release, use this command +8. Open GUI/GUI.sln with Visual Studio 2022. +9. Build x64/Release. +10. Run package.ps1 from powershell. + 10.0. If you're not creating a redistributable release, use this command instead (it's way faster): `package.ps1 -skip_zip`. - 11.1. When PortableGit creates a window, wait for it to complete, then press + 10.1. When PortableGit creates a window, wait for it to complete, then press then press enter in Powershell. - 11.2. The first time you run this it'll take a long time since it has to + 10.2. The first time you run this it'll take a long time since it has to fetch a few large packages. Subsequent invocations will be much faster since it won't reacquire anything already downloaded. On my connection, it took 90 minutes to finish downloading, mostly because Google Drive @@ -49,9 +43,8 @@ $ git submodule update ## High level design * The GUI is written using wxWidgets. -* Python executes core business logic. We can't migrate away since - there's no CUDA-enabled Whisper implementation available in a good - systems programming language. +* Python executes core business logic. With libraries like faster\_whisper + available, this provides a nice balance between flexibility and performance. * To skirt licensing complexity, we distribute an embedded python that's hacked up to allow installing packages via pip. We use this to install packages at runtime (like a net installer), so we don't diff --git a/GUI/package.ps1 b/GUI/package.ps1 index 808bedd..4800777 100644 --- a/GUI/package.ps1 +++ b/GUI/package.ps1 @@ -149,7 +149,6 @@ cp -Recurse ../Sounds TaSTT/Resources/Sounds cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets cp -Recurse ../BrowserSource TaSTT/Resources/BrowserSource cp GUI/x64/$release/GUI.exe TaSTT/TaSTT.exe -cp ../"TaSTT-Whisper"/x64/Release/Whisper.dll TaSTT/Whisper.dll mkdir TaSTT/Resources/Models mkdir TaSTT/Resources/Uwu cp UwwwuPP/build/Src/Debug/Uwwwu.exe TaSTT/Resources/Uwu/ -- cgit v1.2.3