From 0f08da58a59a126f5307395e822fd140f15f8b36 Mon Sep 17 00:00:00 2001 From: yum Date: Tue, 23 May 2023 12:28:01 -0700 Subject: Automatically set up virtual env Remove the button. This is a big source of confusion for new users. Now it happens automatically upon starting any task that needs it. * Begin removing CPP implementation of Whisper. faster-whisper is a much easier/better solution. * Flip default of `clear OSC configs` from false to true. --- GUI/GUI/GUI/Config.cpp | 2 +- GUI/GUI/GUI/Frame.cpp | 784 ++++-------------------------------------- GUI/GUI/GUI/Frame.h | 12 +- GUI/GUI/GUI/PythonWrapper.cpp | 4 +- GUI/GUI/GUI/PythonWrapper.h | 3 +- 5 files changed, 70 insertions(+), 735 deletions(-) (limited to 'GUI') diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 73b28bc..0cf6d9d 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -83,7 +83,7 @@ AppConfig::AppConfig(wxTextCtrl* out) fx_path(), params_path(), menu_path(), - clear_osc(false), + clear_osc(true), whisper_model("ggml-medium.bin"), whisper_mic(0), diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 3bf1545..74a0233 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -5,6 +5,7 @@ #include "Util.h" #include +#include #include #include #include @@ -21,7 +22,6 @@ namespace { ID_PY_PANEL, ID_PY_CONFIG_PANEL, ID_PY_APP_CONFIG_PANEL_PAIRS, - ID_PY_SETUP_BUTTON, ID_PY_DUMP_MICS_BUTTON, ID_PY_APP_DRAIN, ID_PY_APP_START_BUTTON, @@ -122,21 +122,6 @@ namespace { const size_t kNumMicChoices = sizeof(kMicChoices) / sizeof(kMicChoices[0]); constexpr int kMicDefault = 0; // index - wxString kWhisperMicChoices[] = { - "0", - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - }; - const size_t kNumWhisperMicChoices = sizeof(kWhisperMicChoices) / sizeof(kWhisperMicChoices[0]); - constexpr int kWhisperMicDefault = 0; // index - // lifted from whisper/tokenizer.py const wxString kLangChoices[] = { "english", @@ -258,22 +243,6 @@ namespace { const size_t kNumModelChoices = sizeof(kModelChoices) / sizeof(kModelChoices[0]); constexpr int kModelDefault = 2; // base.en - // Source: https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main - const wxString kWhisperModelChoices[] = { - "ggml-tiny.bin", - "ggml-tiny.en.bin", - "ggml-base.bin", - "ggml-base.en.bin", - "ggml-small.bin", - "ggml-small.en.bin", - "ggml-medium.bin", - "ggml-medium.en.bin", - "ggml-large.bin", - }; - const size_t kNumWhisperModelChoices = sizeof(kWhisperModelChoices) / - sizeof(kWhisperModelChoices[0]); - constexpr int kWhisperModelDefault = 6; // medium.bin - const wxString kCharsPerSync[] = { "5", "6", @@ -386,9 +355,7 @@ Frame::Frame() auto* navbar = new wxPanel(main_panel, ID_NAVBAR); { auto* navbar_button_transcribe = new wxButton(navbar, - ID_NAVBAR_BUTTON_TRANSCRIBE, "Transcription (PY)"); - auto* navbar_button_whisper = new wxButton(navbar, - ID_NAVBAR_BUTTON_WHISPER, "Transcription (CPP)"); + ID_NAVBAR_BUTTON_TRANSCRIBE, "Transcription"); auto* navbar_button_unity = new wxButton(navbar, ID_NAVBAR_BUTTON_UNITY, "Unity"); auto* navbar_button_debug = new wxButton(navbar, @@ -398,8 +365,6 @@ Frame::Frame() navbar->SetSizer(sizer); sizer->Add(navbar_button_transcribe, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(navbar_button_whisper, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(navbar_button_unity, /*proportion=*/0, /*flags=*/wxEXPAND); @@ -422,15 +387,6 @@ Frame::Frame() auto* py_config_panel = new wxPanel(transcribe_panel, ID_PY_CONFIG_PANEL); { - auto* py_setup_button = new wxButton(py_config_panel, - ID_PY_SETUP_BUTTON, "Set up Python virtual environment"); - py_setup_button->SetToolTip( - "TaSTT uses the Python programming language to provide both " - "transcription services and to interface with Unity. " - "It installs its dependencies into an isolated folder " - "called a 'virtual environment'. Click this button to " - "install those dependencies. This only has to be done " - "once when you install a new version of TaSTT."); auto* py_dump_mics_button = new wxButton(py_config_panel, ID_PY_DUMP_MICS_BUTTON, "List input devices"); py_dump_mics_button->SetToolTip( @@ -646,8 +602,6 @@ Frame::Frame() auto* sizer = new wxBoxSizer(wxVERTICAL); py_config_panel->SetSizer(sizer); - sizer->Add(py_setup_button, /*proportion=*/0, - /*flags=*/wxEXPAND); sizer->Add(py_dump_mics_button, /*proportion=*/0, /*flags=*/wxEXPAND); sizer->Add(py_app_config_panel_pairs, /*proportion=*/0, @@ -924,366 +878,6 @@ Frame::Frame() } unity_panel_->Hide(); - auto* whisper_panel = new wxPanel(main_panel, ID_WHISPER_PANEL); - whisper_panel_ = whisper_panel; - { - const auto whisper_out_sz = wxSize(/*x_px=*/480, /*y_px=*/160); - auto* whisper_out = new wxTextCtrl(whisper_panel, - ID_WHISPER_OUT, wxEmptyString, wxDefaultPosition, - whisper_out_sz, wxTE_MULTILINE | wxTE_READONLY); - whisper_out->SetMinSize(whisper_out_sz); - whisper_out_ = whisper_out; - - auto* whisper_config_panel = new wxPanel(whisper_panel, - ID_WHISPER_CONFIG_PANEL); - { - auto* whisper_config_panel_pairs = new wxPanel(whisper_config_panel, - ID_WHISPER_CONFIG_PANEL_PAIRS); - { - auto* whisper_mic = new wxChoice(whisper_config_panel_pairs, - ID_WHISPER_MIC, wxDefaultPosition, - wxDefaultSize, kNumWhisperMicChoices, kWhisperMicChoices); - whisper_mic->SetToolTip( - "Select which microphone to listen to when " - "transcribing. To get list microphones and get their " - "numbers, click 'List input devices'."); - whisper_mic_ = whisper_mic; - - auto* whisper_lang = new wxChoice(whisper_config_panel_pairs, - ID_WHISPER_LANG, wxDefaultPosition, wxDefaultSize, - kNumLangChoices, kLangChoices); - whisper_lang->SetToolTip("Select which language you will " - "speak in. It will be whisperd into that language. " - "If using a language with non-ASCII characters (i.e. " - "not English), make sure you have 'bytes per char' " - "set to 2. If using something other than English, " - "make sure you're not using a *.en model."); - whisper_lang_ = whisper_lang; - - auto* whisper_model = new wxChoice( - whisper_config_panel_pairs, ID_WHISPER_MODEL, - wxDefaultPosition, wxDefaultSize, kNumWhisperModelChoices, - kWhisperModelChoices); - whisper_model->SetToolTip("Select which version of " - "the transcription model to use. 'base' is a good " - "choice for most users. 'small' is slightly more " - "accurate, slower, and uses more VRAM. The *.en " - "models are fine-tuned English language models, and " - "don't work for other languages."); - whisper_model_ = whisper_model; - - auto* whisper_chars_per_sync = new wxChoice( - whisper_config_panel_pairs, ID_WHISPER_CHARS_PER_SYNC, - wxDefaultPosition, wxDefaultSize, kNumCharsPerSync, - kCharsPerSync); - whisper_chars_per_sync->SetToolTip( - "VRChat syncs avatar parameters roughly 5 times per " - "second. We use this to send text to the box. By " - "sending more characters per sync, the box will be " - "faster, but you'll use more avatar parameters."); - whisper_chars_per_sync_ = whisper_chars_per_sync; - - auto* whisper_bytes_per_char = new wxChoice( - whisper_config_panel_pairs, ID_WHISPER_BYTES_PER_CHAR, - wxDefaultPosition, wxDefaultSize, kNumBytesPerChar, - kBytesPerChar); - whisper_bytes_per_char->SetToolTip( - "If you speak a language that uses non-ASCII " - "characters (i.e. not English), set this to 2."); - whisper_bytes_per_char_ = whisper_bytes_per_char; - - auto* whisper_button = new wxChoice(whisper_config_panel_pairs, - ID_WHISPER_BUTTON, wxDefaultPosition, - wxDefaultSize, kNumButtons, kButton); - whisper_button->SetToolTip( - "You will use this button in game to start and stop " - "transcription. Set it to a button you're not using " - "for anything else!"); - whisper_button_ = whisper_button; - - auto* whisper_rows = new wxTextCtrl(whisper_config_panel_pairs, - ID_WHISPER_ROWS, std::to_string(app_c_->rows), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_rows->SetToolTip( - "The number of rows on the text box."); - whisper_rows_ = whisper_rows; - - auto* whisper_cols = new wxTextCtrl(whisper_config_panel_pairs, - ID_WHISPER_COLS, std::to_string(app_c_->cols), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_cols->SetToolTip( - "The number of columns on the text box."); - whisper_cols_ = whisper_cols; - - auto* whisper_browser_src_port = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_BROWSER_SRC_PORT, - std::to_string(app_c_->browser_src_port), wxDefaultPosition, - wxDefaultSize, /*style=*/0); - whisper_browser_src_port->SetToolTip( - "This is the port that the browser source is hosted " - "on. If you aren't using TaSTT to stream, you can " - "ignore this option."); - whisper_browser_src_port_ = whisper_browser_src_port; - - auto* whisper_decode_method = new wxChoice( - whisper_config_panel_pairs, - ID_WHISPER_DECODE_METHOD, wxDefaultPosition, - wxDefaultSize, kNumDecodeMethods, kDecodeMethods); - whisper_decode_method->SetToolTip( - "Decoding method to use with whisper. Greedy is faster " - "and slightly less accurate."); - whisper_decode_method_ = whisper_decode_method; - - auto* whisper_max_ctxt = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_MAX_CTXT, - std::to_string(app_c_->whisper_max_ctxt), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_max_ctxt->SetToolTip("TODO"); - whisper_max_ctxt_ = whisper_max_ctxt; - - auto* whisper_beam_width = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_BEAM_WIDTH, - std::to_string(app_c_->whisper_beam_width), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_beam_width->SetToolTip("TODO"); - whisper_beam_width_ = whisper_beam_width; - - auto* whisper_beam_n_best = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_BEAM_N_BEST, - std::to_string(app_c_->whisper_beam_n_best), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_beam_n_best->SetToolTip("TODO"); - whisper_beam_n_best_ = whisper_beam_n_best; - - auto* whisper_vad_min_duration = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_VAD_MIN_DURATION, - std::to_string(app_c_->whisper_vad_min_duration), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_vad_min_duration->SetToolTip("TODO"); - whisper_vad_min_duration_ = whisper_vad_min_duration; - - auto* whisper_vad_max_duration = new wxTextCtrl( - whisper_config_panel_pairs, ID_WHISPER_VAD_MAX_DURATION, - std::to_string(app_c_->whisper_vad_max_duration), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_vad_max_duration->SetToolTip("TODO"); - whisper_vad_max_duration_ = whisper_vad_max_duration; - - auto* whisper_vad_drop_start_silence = new wxTextCtrl( - whisper_config_panel_pairs, - ID_WHISPER_VAD_DROP_START_SILENCE, - std::to_string(app_c_->whisper_vad_drop_start_silence), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_vad_drop_start_silence->SetToolTip("TODO"); - whisper_vad_drop_start_silence_ = whisper_vad_drop_start_silence; - - auto* whisper_vad_pause_duration = new wxTextCtrl( - whisper_config_panel_pairs, - ID_WHISPER_VAD_PAUSE_DURATION, - std::to_string(app_c_->whisper_vad_pause_duration), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_vad_pause_duration->SetToolTip("TODO"); - whisper_vad_pause_duration_ = whisper_vad_pause_duration; - - auto* whisper_vad_retain_duration = new wxTextCtrl( - whisper_config_panel_pairs, - ID_WHISPER_VAD_RETAIN_DURATION, - std::to_string(app_c_->whisper_vad_retain_duration), - wxDefaultPosition, wxDefaultSize, /*style=*/0); - whisper_vad_retain_duration->SetToolTip("TODO"); - whisper_vad_retain_duration_ = whisper_vad_retain_duration; - - auto* sizer = new wxFlexGridSizer(/*cols=*/2); - whisper_config_panel_pairs->SetSizer(sizer); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Microphone:")); - sizer->Add(whisper_mic, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Language:")); - sizer->Add(whisper_lang, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Model:")); - sizer->Add(whisper_model, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Decode method:")); - sizer->Add(whisper_decode_method, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Max audio contexts:")); - sizer->Add(whisper_max_ctxt, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Beam width:")); - sizer->Add(whisper_beam_width, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Beam n best:")); - sizer->Add(whisper_beam_n_best, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"VAD min duration:")); - sizer->Add(whisper_vad_min_duration, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"VAD max duration:")); - sizer->Add(whisper_vad_max_duration, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"VAD drop start silence:")); - sizer->Add(whisper_vad_drop_start_silence, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"VAD pause duration:")); - sizer->Add(whisper_vad_pause_duration, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"VAD retain duration:")); - sizer->Add(whisper_vad_retain_duration, /*proportion=*/0, - /*flags=*/wxEXPAND); - -#if 0 - // Not implemented. - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Characters per sync:")); - sizer->Add(whisper_chars_per_sync, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Bytes per character:")); - sizer->Add(whisper_bytes_per_char, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Button:")); - sizer->Add(whisper_button, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Text box rows:")); - sizer->Add(whisper_rows, /*proportion=*/0, - /*flags=*/wxEXPAND); - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Text box columns:")); - sizer->Add(whisper_cols, /*proportion=*/0, - /*flags=*/wxEXPAND); -#else - whisper_chars_per_sync->Hide(); - whisper_bytes_per_char->Hide(); - whisper_button->Hide(); - whisper_rows->Hide(); - whisper_cols->Hide(); -#endif - - sizer->Add(new wxStaticText(whisper_config_panel_pairs, - wxID_ANY, /*label=*/"Browser source port:")); - sizer->Add(whisper_browser_src_port, /*proportion=*/0, - /*flags=*/wxEXPAND); - } - - auto* whisper_enable_local_beep = new wxCheckBox(whisper_config_panel, - ID_WHISPER_ENABLE_LOCAL_BEEP, "Enable local beep"); - whisper_enable_local_beep->SetValue(app_c_->enable_local_beep); - whisper_enable_local_beep->SetToolTip( - "By default, TaSTT will play a sound (audible only to " - "you) when it begins transcription and when it stops. " - "Uncheck this to disable that behavior." - ); - whisper_enable_local_beep_ = whisper_enable_local_beep; - - auto* whisper_use_cpu = new wxCheckBox(whisper_config_panel, - ID_WHISPER_USE_CPU, "Use CPU"); - whisper_use_cpu->SetValue(app_c_->use_cpu); - whisper_use_cpu->SetToolTip( - "If checked, the transcription engine will run on your " - "CPU instead of your GPU. This is typically much slower " - "and should only be used if you aren't able to use your " - "GPU." - ); - whisper_use_cpu_ = whisper_use_cpu; - - auto* whisper_enable_builtin = new wxCheckBox(whisper_config_panel, - ID_WHISPER_ENABLE_BUILTIN, "Send to built-in chatbox"); - whisper_enable_builtin->SetValue(app_c_->whisper_enable_builtin); - whisper_enable_builtin->SetToolTip( - "If checked, text will be sent to the built-in text box." - ); - whisper_enable_builtin_ = whisper_enable_builtin; - - auto* whisper_enable_custom = new wxCheckBox(whisper_config_panel, - ID_WHISPER_ENABLE_CUSTOM, "Send to custom chatbox"); - whisper_enable_custom->SetValue(app_c_->whisper_enable_custom); - whisper_enable_custom->SetToolTip( - "If checked, text will be sent to the custom text box." - ); - whisper_enable_custom_ = whisper_enable_custom; - - auto* whisper_enable_browser_src = new wxCheckBox(whisper_config_panel, - ID_WHISPER_ENABLE_BROWSER_SRC, "Send to browser source"); - whisper_enable_browser_src->SetValue(app_c_->whisper_enable_browser_src); - whisper_enable_browser_src->SetToolTip( - "If checked, text will be sent to a browser source. If " - "you're not using TaSTT to stream, you can ignore this option."); - whisper_enable_browser_src_ = whisper_enable_browser_src; - - // Hack: Add newlines before and after the button text to make - // the buttons bigger, and easier to click from inside VR. - auto* whisper_start_button = new wxButton(whisper_config_panel, - ID_WHISPER_START_BUTTON, "\nBegin transcribing\n\n"); - auto* whisper_stop_button = new wxButton(whisper_config_panel, - ID_WHISPER_STOP_BUTTON, "\nStop transcribing\n\n"); - - auto* sizer = new wxBoxSizer(wxVERTICAL); - whisper_config_panel->SetSizer(sizer); - sizer->Add(whisper_config_panel_pairs, /*proportion=*/0, - /*flags=*/wxEXPAND); -#if 0 - sizer->Add(whisper_enable_local_beep, /*proportion=*/0, - /*flags=*/wxEXPAND); - // Not yet implemented. - sizer->Add(whisper_use_cpu, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(whisper_enable_builtin, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(whisper_enable_custom, /*proportion=*/0, - /*flags=*/wxEXPAND); -#else - whisper_enable_local_beep->Hide(); - whisper_use_cpu->Hide(); - whisper_enable_builtin->Hide(); - whisper_enable_custom->Hide(); -#endif - sizer->Add(whisper_enable_browser_src, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(whisper_start_button, /*proportion=*/0, - /*flags=*/wxEXPAND); - sizer->Add(whisper_stop_button, /*proportion=*/0, - /*flags=*/wxEXPAND); - } - - auto* sizer = new wxBoxSizer(wxHORIZONTAL); - whisper_panel->SetSizer(sizer); - sizer->Add(whisper_config_panel, /*proportion=*/0, /*flags=*/wxEXPAND); - sizer->Add(whisper_out, /*proportion=*/1, /*flags=*/wxEXPAND); - } - whisper_ = std::make_unique(whisper_out_); - whisper_panel_->Hide(); - auto* debug_panel = new wxPanel(main_panel, ID_DEBUG_PANEL); debug_panel_ = debug_panel; { @@ -1385,7 +979,6 @@ Frame::Frame() sizer->Add(transcribe_panel, /*proportion=*/1, /*flags=*/wxEXPAND); sizer->Add(unity_panel, /*proportion=*/1, /*flags=*/wxEXPAND); sizer->Add(debug_panel, /*proportion=*/1, /*flags=*/wxEXPAND); - sizer->Add(whisper_panel, /*proportion=*/1, /*flags=*/wxEXPAND); } // Now that transcribe_out_ has been created, we can deserialize. @@ -1397,15 +990,10 @@ Frame::Frame() ID_NAVBAR_BUTTON_TRANSCRIBE); Bind(wxEVT_BUTTON, &Frame::OnNavbarUnity, this, ID_NAVBAR_BUTTON_UNITY); Bind(wxEVT_BUTTON, &Frame::OnNavbarDebug, this, ID_NAVBAR_BUTTON_DEBUG); - Bind(wxEVT_BUTTON, &Frame::OnNavbarWhisper, this, ID_NAVBAR_BUTTON_WHISPER); Bind(wxEVT_BUTTON, &Frame::OnAppStart, this, ID_PY_APP_START_BUTTON); Bind(wxEVT_BUTTON, &Frame::OnAppStop, this, ID_PY_APP_STOP_BUTTON); - Bind(wxEVT_BUTTON, &Frame::OnWhisperStart, this, ID_WHISPER_START_BUTTON); - Bind(wxEVT_BUTTON, &Frame::OnWhisperStop, this, ID_WHISPER_STOP_BUTTON); Bind(wxEVT_TIMER, &Frame::OnAppDrain, this, ID_PY_APP_DRAIN); - Bind(wxEVT_BUTTON, &Frame::OnSetupPython, this, ID_PY_SETUP_BUTTON); Bind(wxEVT_BUTTON, &Frame::OnDumpMics, this, ID_PY_DUMP_MICS_BUTTON); - //Bind(wxEVT_BUTTON, &Frame::OnWhisperDumpMics, this, ID_WHISPER_DUMP_MICS_BUTTON); Bind(wxEVT_BUTTON, &Frame::OnGenerateFX, this, ID_UNITY_BUTTON_GEN_ANIMATOR); Bind(wxEVT_BUTTON, &Frame::OnListPip, this, ID_DEBUG_BUTTON_LIST_PIP); @@ -1430,7 +1018,6 @@ Frame::Frame() // Initialize input fields using AppConfig. ApplyConfigToInputFields(); - PopulateDynamicInputFields(); Resize(); OnUnityParamChangeImpl(); @@ -1482,86 +1069,6 @@ void Frame::ApplyConfigToInputFields() py_app_cols->Clear(); py_app_cols->AppendText(std::to_string(app_c_->cols)); - // Whisper panel - auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); - int whisper_mic_idx = app_c_->whisper_mic; - whisper_mic->SetSelection(whisper_mic_idx); - - auto* whisper_lang = static_cast(FindWindowById(ID_WHISPER_LANG)); - whisper_lang->SetSelection(lang_idx); - - auto* whisper_model = static_cast(FindWindowById(ID_WHISPER_MODEL)); - int whisper_model_idx = GetDropdownChoiceIndex(kWhisperModelChoices, - kNumWhisperModelChoices, app_c_->whisper_model, kWhisperModelDefault); - whisper_model->SetSelection(whisper_model_idx); - - auto* whisper_button = static_cast(FindWindowById(ID_WHISPER_BUTTON)); - whisper_button->SetSelection(button_idx); - -#if 0 - auto* whisper_chars_per_sync = static_cast(FindWindowById(ID_WHISPER_CHARS_PER_SYNC)); - whisper_chars_per_sync->SetSelection(chars_idx); - - auto* whisper_bytes_per_char = static_cast(FindWindowById(ID_WHISPER_BYTES_PER_CHAR)); - whisper_bytes_per_char->SetSelection(bytes_idx); - - auto* whisper_rows = static_cast(FindWindowById(ID_WHISPER_ROWS)); - whisper_rows->Clear(); - whisper_rows->AppendText(std::to_string(app_c_->rows)); - - auto* whisper_cols = static_cast(FindWindowById(ID_WHISPER_COLS)); - whisper_cols->Clear(); - whisper_cols->AppendText(std::to_string(app_c_->cols)); -#endif - - auto* whisper_browser_src_port = static_cast(FindWindowById(ID_WHISPER_BROWSER_SRC_PORT)); - whisper_browser_src_port->Clear(); - whisper_browser_src_port->AppendText(std::to_string(app_c_->browser_src_port)); - - auto* whisper_enable_local_beep = static_cast(FindWindowById(ID_WHISPER_ENABLE_LOCAL_BEEP)); - whisper_enable_local_beep->SetValue(app_c_->enable_local_beep); - - auto* whisper_use_cpu = static_cast(FindWindowById(ID_WHISPER_USE_CPU)); - whisper_use_cpu->SetValue(app_c_->use_cpu); - - auto* whisper_enable_builtin = static_cast(FindWindowById(ID_WHISPER_ENABLE_BUILTIN)); - whisper_enable_builtin->SetValue(app_c_->whisper_enable_builtin); - - auto* whisper_enable_custom = static_cast(FindWindowById(ID_WHISPER_ENABLE_CUSTOM)); - whisper_enable_custom->SetValue(app_c_->whisper_enable_custom); - - auto* whisper_enable_browser_src = static_cast(FindWindowById(ID_WHISPER_ENABLE_BROWSER_SRC)); - whisper_enable_browser_src->SetValue(app_c_->whisper_enable_browser_src); - - auto* whisper_decode_method = static_cast(FindWindowById(ID_WHISPER_DECODE_METHOD)); - int whisper_decode_method_idx = GetDropdownChoiceIndex(kDecodeMethods, - kNumDecodeMethods, app_c_->whisper_decode_method, kDecodeMethodDefault); - whisper_decode_method->SetSelection(whisper_decode_method_idx); - - auto* whisper_max_ctxt = static_cast(FindWindowById(ID_WHISPER_MAX_CTXT)); - whisper_max_ctxt->SetValue(std::to_string(app_c_->whisper_max_ctxt)); - - auto* whisper_beam_width = static_cast(FindWindowById(ID_WHISPER_BEAM_WIDTH)); - whisper_beam_width->SetValue(std::to_string(app_c_->whisper_beam_width)); - - auto* whisper_beam_n_best = static_cast(FindWindowById(ID_WHISPER_BEAM_N_BEST)); - whisper_beam_n_best->SetValue(std::to_string(app_c_->whisper_beam_n_best)); - - auto* whisper_vad_min_duration = static_cast(FindWindowById(ID_WHISPER_VAD_MIN_DURATION)); - whisper_vad_min_duration->SetValue(std::to_string(app_c_->whisper_vad_min_duration)); - - auto* whisper_vad_max_duration = static_cast(FindWindowById(ID_WHISPER_VAD_MAX_DURATION)); - whisper_vad_max_duration->SetValue(std::to_string(app_c_->whisper_vad_max_duration)); - - auto* whisper_vad_drop_start_silence = static_cast(FindWindowById(ID_WHISPER_VAD_DROP_START_SILENCE)); - whisper_vad_drop_start_silence->SetValue(std::to_string(app_c_->whisper_vad_drop_start_silence)); - - auto* whisper_vad_pause_duration = static_cast(FindWindowById(ID_WHISPER_VAD_PAUSE_DURATION)); - whisper_vad_pause_duration->SetValue(std::to_string(app_c_->whisper_vad_pause_duration)); - - auto* whisper_vad_retain_duration = static_cast(FindWindowById(ID_WHISPER_VAD_RETAIN_DURATION)); - whisper_vad_retain_duration->SetValue(std::to_string(app_c_->whisper_vad_retain_duration)); - // Unity panel auto* unity_chars_per_sync = static_cast(FindWindowById(ID_UNITY_CHARS_PER_SYNC)); unity_chars_per_sync->SetSelection(chars_idx); @@ -1578,33 +1085,9 @@ void Frame::ApplyConfigToInputFields() unity_cols->AppendText(std::to_string(app_c_->cols)); } -void Frame::PopulateDynamicInputFields() -{ - Whisper::iMediaFoundation* f = nullptr; - if (!whisper_->GetMediaFoundation(f)) { - return; - } - ScopeGuard f_cleanup([f]() { f->Release(); }); - - std::vector mics; - if (whisper_->GetMics(f, mics)) { - std::vector contents(mics.size()); - auto* whisper_mic = static_cast(FindWindowById(ID_WHISPER_MIC)); - for (int i = 0; i < std::min(mics.size(), kNumWhisperMicChoices); i++) { - contents[i] = mics[i]; - } - int mic_idx = whisper_mic->GetSelection(); - whisper_mic->Set(contents); - if (mic_idx < contents.size()) { - whisper_mic->SetSelection(mic_idx); - } - } -} - void Frame::OnExit(wxCloseEvent& event) { OnAppStop(); - OnWhisperStop(); } void Frame::OnNavbarTranscribe(wxCommandEvent& event) @@ -1612,11 +1095,9 @@ void Frame::OnNavbarTranscribe(wxCommandEvent& event) transcribe_panel_->Hide(); unity_panel_->Hide(); debug_panel_->Hide(); - whisper_panel_->Hide(); // Initialize input fields using AppConfig. ApplyConfigToInputFields(); - PopulateDynamicInputFields(); transcribe_panel_->Show(); Resize(); @@ -1627,11 +1108,9 @@ void Frame::OnNavbarUnity(wxCommandEvent& event) transcribe_panel_->Hide(); unity_panel_->Hide(); debug_panel_->Hide(); - whisper_panel_->Hide(); // Initialize input fields using AppConfig. ApplyConfigToInputFields(); - PopulateDynamicInputFields(); unity_panel_->Show(); Resize(); @@ -1642,32 +1121,15 @@ void Frame::OnNavbarDebug(wxCommandEvent& event) transcribe_panel_->Hide(); unity_panel_->Hide(); debug_panel_->Hide(); - whisper_panel_->Hide(); // Initialize input fields using AppConfig. ApplyConfigToInputFields(); - PopulateDynamicInputFields(); debug_panel_->Show(); Resize(); } -void Frame::OnNavbarWhisper(wxCommandEvent& event) -{ - transcribe_panel_->Hide(); - unity_panel_->Hide(); - debug_panel_->Hide(); - whisper_panel_->Hide(); - - // Initialize input fields using AppConfig. - ApplyConfigToInputFields(); - PopulateDynamicInputFields(); - - whisper_panel_->Show(); - Resize(); -} - -void Frame::OnSetupPython(wxCommandEvent& event) +void Frame::EnsureVirtualEnv(bool block) { auto status = env_proc_.wait_for(std::chrono::seconds(0)); if (status != std::future_status::ready) { @@ -1675,11 +1137,44 @@ void Frame::OnSetupPython(wxCommandEvent& event) return; } + static const std::filesystem::path venv_flag = std::filesystem::current_path() / ".venv_is_set_up"; + if (std::filesystem::exists(venv_flag)) { + std::ifstream venv_flag_ifs(venv_flag); + std::string venv_flag_ts_str; + std::getline(venv_flag_ifs, venv_flag_ts_str); + + int64_t venv_flag_ts; + bool is_valid = false; + try { + venv_flag_ts = std::stol(venv_flag_ts_str); + is_valid = true; + } + catch (const std::invalid_argument&) { + Log(transcribe_out_, "Could not venv flag timestamp \"{}\" as long " + "- will re-setup venv"); + } + catch (const std::out_of_range&) { + Log(transcribe_out_, "Could not venv flag timestamp \"{}\" as long " + "- will re-setup venv"); + } + if (is_valid) { + auto now = std::chrono::system_clock::now(); + const int64_t seconds_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); + int64_t seconds_old = seconds_since_epoch - venv_flag_ts; + if (seconds_old >= 0 && + seconds_old < 60 * 60) { + return; + } + Log(transcribe_out_, "Virtual environment last set up {} seconds " + "ago, verifying installation\n", seconds_old); + } + } + env_proc_ = std::move(std::async(std::launch::async, [&]() { Log(transcribe_out_, "Setting up Python virtual environment\n"); Log(transcribe_out_, "This could take several minutes, please be " "patient!\n"); - Log(transcribe_out_, "This will download ~5GB of dependencies.\n"); + Log(transcribe_out_, "This will download ~1GB of dependencies.\n"); { Log(transcribe_out_, " Installing pip\n"); @@ -1708,8 +1203,24 @@ void Frame::OnSetupPython(wxCommandEvent& event) return false; } Log(transcribe_out_, "Successfully set up virtual environment!\n"); + + std::ofstream venv_flag_ofs(venv_flag); + auto now = std::chrono::system_clock::now(); + const int64_t seconds_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); + venv_flag_ofs << std::to_string(seconds_since_epoch); + return true; })); + + if (block) { + // Spinning prevents the GUI from hanging. + while (true) { + auto status = env_proc_.wait_for(std::chrono::milliseconds(1)); + if (status == std::future_status::ready) { + break; + } + } + } } void Frame::OnDumpMics(wxCommandEvent& event) @@ -1720,6 +1231,7 @@ void Frame::OnDumpMics(wxCommandEvent& event) return; } dump_mics_ = std::move(std::async(std::launch::async, [&]() { + EnsureVirtualEnv(/*block=*/true); Log(transcribe_out_, "Getting mics...\n"); Log(transcribe_out_, "{}\n", PythonWrapper::DumpMics()); return true; @@ -1751,6 +1263,8 @@ void Frame::OnGenerateFX(wxCommandEvent& event) unity_app_ = std::move(std::async(std::launch::async, [&]() { Log(unity_out_, "Generating animator\n"); + EnsureVirtualEnv(/*block=*/true); + std::filesystem::path unity_assets_path; if (!GetUserPath(unity_out_, unity_assets_file_picker_->GetPath().ToStdString(), @@ -2044,7 +1558,7 @@ void Frame::OnRestoreVenv(wxCommandEvent& event) Log(debug_out_, "Setting up virtual env to ensure consistency. Most " "packages should not be re-acquired. Output is printed to the " "transcription panel.\n"); - OnSetupPython(event); + EnsureVirtualEnv(/*block=*/false); } void Frame::OnUnityParamChangeImpl() { @@ -2205,7 +1719,11 @@ void Frame::OnAppStart(wxCommandEvent& event) { return run_py_app_; }; run_py_app_ = true; - py_app_ = std::move(PythonWrapper::StartApp(*app_c_, std::move(out_cb), std::move(in_cb), std::move(run_cb))); + auto prestart_cb = [this]() -> void { + EnsureVirtualEnv(/*block=*/true); + }; + py_app_ = std::move(PythonWrapper::StartApp(*app_c_, std::move(out_cb), + std::move(in_cb), std::move(run_cb), std::move(prestart_cb))); Log(transcribe_out_, "py app valid: {}\n", py_app_.valid()); } @@ -2224,184 +1742,6 @@ void Frame::OnAppStop(wxCommandEvent& event) { OnAppStop(); } -void Frame::OnWhisperStart(wxCommandEvent& event) { - Log(whisper_out_, "Launching transcription engine\n"); - - int which_mic = whisper_mic_->GetSelection(); - if (which_mic == wxNOT_FOUND) { - which_mic = kMicDefault; - } - int which_lang = whisper_lang_->GetSelection(); - if (which_lang == wxNOT_FOUND) { - which_lang = kLangDefault; - } - int which_model = whisper_model_->GetSelection(); - if (which_model == wxNOT_FOUND) { - which_model = kWhisperModelDefault; - } - int chars_per_sync_idx = whisper_chars_per_sync_->GetSelection(); - if (chars_per_sync_idx == wxNOT_FOUND) { - chars_per_sync_idx = kCharsDefault; - } - int bytes_per_char_idx = whisper_bytes_per_char_->GetSelection(); - if (bytes_per_char_idx == wxNOT_FOUND) { - bytes_per_char_idx = kBytesDefault; - } - int button_idx = whisper_button_->GetSelection(); - if (button_idx == wxNOT_FOUND) { - button_idx = kBytesDefault; - } - int decode_method_idx = whisper_decode_method_->GetSelection(); - if (decode_method_idx == wxNOT_FOUND) { - decode_method_idx = kDecodeMethodDefault; - } - const bool enable_local_beep = whisper_enable_local_beep_->GetValue(); - const bool use_cpu = whisper_use_cpu_->GetValue(); - std::string rows_str = whisper_rows_->GetValue().ToStdString(); - std::string cols_str = whisper_cols_->GetValue().ToStdString(); - std::string chars_per_sync_str = - kCharsPerSync[chars_per_sync_idx].ToStdString(); - std::string bytes_per_char_str = - kBytesPerChar[bytes_per_char_idx].ToStdString(); - std::string browser_src_port_str = - whisper_browser_src_port_->GetValue().ToStdString(); - int rows, cols, chars_per_sync, bytes_per_char, browser_src_port; - try { - rows = std::stoi(rows_str); - cols = std::stoi(cols_str); - chars_per_sync = std::stoi(chars_per_sync_str); - bytes_per_char = std::stoi(bytes_per_char_str); - browser_src_port = std::stoi(browser_src_port_str); - } - catch (const std::invalid_argument&) { - Log(whisper_out_, "Could not parse rows \"{}\", cols \"{}\", chars " - "per sync \"{}\", or bytes per char \"{}\" " - "as an integer\n", rows_str, cols_str, chars_per_sync_str, - bytes_per_char_str); - return; - } - catch (const std::out_of_range&) { - Log(whisper_out_, "Rows \"{}\", cols \"{}\", chars per sync " - "\"{}\", or bytes per char \"{}\" are out " - "of range\n", rows_str, cols_str, chars_per_sync_str, - bytes_per_char_str); - return; - } - const int max_rows = 10; - const int max_cols = 240; - if (rows < 0 || rows > max_rows || - cols < 0 || cols > max_cols) { - Log(whisper_out_, "Rows not on [{},{}] or cols not on [{},{}]\n", - 0, max_rows, - 0, max_cols); - return; - } - - std::string max_ctxt_str = whisper_max_ctxt_->GetValue().ToStdString(); - std::string beam_sz_str = whisper_beam_width_->GetValue().ToStdString(); - std::string beam_wd_str = whisper_beam_n_best_->GetValue().ToStdString(); - int max_ctxt, beam_sz, beam_wd; - Log(whisper_out_, "here {}\n", __LINE__); - try { - Log(whisper_out_, "whisper max ctxt str: {}\n", max_ctxt_str); - max_ctxt = std::stoi(max_ctxt_str); - Log(whisper_out_, "whisper max ctxt: {}\n", max_ctxt); - beam_sz = std::stoi(beam_sz_str); - beam_wd = std::stoi(beam_wd_str); - } - catch (const std::invalid_argument&) { - Log(whisper_out_, "Could not parse max_ctxt '{}' beam_sz '{}' or beam_wd '{}' as an integer", - max_ctxt_str, beam_sz_str, beam_wd_str); - return; - } - catch (const std::out_of_range&) { - Log(whisper_out_, "Could not parse max_ctxt '{}', beam_sz '{}' or beam_wd '{}' as an integer: out of range", - max_ctxt_str, beam_sz_str, beam_wd_str); - return; - } - - std::string vad_min_dur_str = whisper_vad_min_duration_->GetValue().ToStdString(); - std::string vad_max_dur_str = whisper_vad_max_duration_->GetValue().ToStdString(); - std::string vad_drop_si_dur_str = whisper_vad_drop_start_silence_->GetValue().ToStdString(); - std::string vad_pause_dur_str = whisper_vad_pause_duration_->GetValue().ToStdString(); - std::string vad_ret_dur_str = whisper_vad_retain_duration_->GetValue().ToStdString(); - float vad_min_dur, vad_max_dur, vad_drop_silence_dur, vad_pause_dur, vad_retain_dur; - try { - vad_min_dur = std::stof(vad_min_dur_str); - vad_max_dur = std::stof(vad_max_dur_str); - vad_drop_silence_dur = std::stof(vad_drop_si_dur_str); - vad_pause_dur = std::stof(vad_pause_dur_str); - vad_retain_dur = std::stof(vad_ret_dur_str); - } - catch (const std::invalid_argument&) { - // TODO update error msg - Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer", - beam_sz, beam_wd); - return; - } - catch (const std::out_of_range&) { - // TODO update error msg - Log(whisper_out_, "Could not parse beam_sz '{}' or beam_wd '{}' as an integer: out of range", - beam_sz, beam_wd); - return; - } - - const int min_port = 1024; - const int max_port = 65535; - if (browser_src_port < min_port || browser_src_port > max_port) { - Log(whisper_out_, "Browser source port not on [{},{}]\n", - min_port, max_port); - return; - } - - app_c_->whisper_mic = which_mic; - app_c_->language = kLangChoices[which_lang].ToStdString(); - app_c_->whisper_model = kWhisperModelChoices[which_model].ToStdString(); - app_c_->chars_per_sync = chars_per_sync; - app_c_->bytes_per_char = bytes_per_char; - app_c_->button = kButton[button_idx].ToStdString(); - app_c_->rows = rows; - app_c_->cols = cols; - app_c_->enable_local_beep = enable_local_beep; - app_c_->use_cpu = use_cpu; - app_c_->browser_src_port = browser_src_port; - app_c_->whisper_enable_browser_src = whisper_enable_browser_src_->GetValue(); - app_c_->whisper_enable_builtin = whisper_enable_builtin_->GetValue(); - app_c_->whisper_enable_custom = whisper_enable_custom_->GetValue(); - app_c_->whisper_decode_method = kDecodeMethods[decode_method_idx].ToStdString(); - app_c_->whisper_max_ctxt = max_ctxt; - app_c_->whisper_beam_width = beam_sz; - app_c_->whisper_beam_n_best = beam_wd; - app_c_->whisper_vad_min_duration = vad_min_dur; - app_c_->whisper_vad_max_duration = vad_max_dur; - app_c_->whisper_vad_drop_start_silence = vad_drop_silence_dur; - app_c_->whisper_vad_pause_duration = vad_pause_dur; - app_c_->whisper_vad_retain_duration = vad_retain_dur; - app_c_->Serialize(AppConfig::kConfigPath); - - whisper_->Start(*app_c_); - if (whisper_enable_browser_src_->GetValue()) { - whisper_->StartBrowserSource(*app_c_); - } - if (whisper_enable_custom_->GetValue()) { - whisper_->StartCustomChatbox(*app_c_); - } -} - -void Frame::OnWhisperStop() { - whisper_->Stop(); - if (whisper_enable_browser_src_->GetValue()) { - whisper_->StopBrowserSource(); - } - if (whisper_enable_custom_->GetValue()) { - whisper_->StopCustomChatbox(); - } -} - -void Frame::OnWhisperStop(wxCommandEvent& event) { - OnWhisperStop(); -} - void Frame::OnAppDrain(wxTimerEvent& event) { Logging::kThreadLogger.Drain(); } diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 7e55347..082172e 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -8,7 +8,6 @@ #endif #include "Config.h" -#include "WhisperCPP.h" #include #include @@ -99,26 +98,19 @@ private: std::unique_ptr app_c_; - std::unique_ptr whisper_; - // Initialize GUI input fields using `app_c_`. void ApplyConfigToInputFields(); - // Populate dynamically-generated input fields, such as microphone lists. - void PopulateDynamicInputFields(); + // Ensure that virtual env is set up. + void EnsureVirtualEnv(bool block); void OnExit(wxCloseEvent& event); void OnNavbarTranscribe(wxCommandEvent& event); void OnNavbarUnity(wxCommandEvent& event); void OnNavbarDebug(wxCommandEvent& event); - void OnNavbarWhisper(wxCommandEvent& event); - void OnSetupPython(wxCommandEvent& event); void OnDumpMics(wxCommandEvent& event); void OnAppStart(wxCommandEvent& event); void OnAppStop(); void OnAppStop(wxCommandEvent& event); - void OnWhisperStart(wxCommandEvent& event); - void OnWhisperStop(); - void OnWhisperStop(wxCommandEvent& event); void OnAppDrain(wxTimerEvent& event); void OnGenerateFX(wxCommandEvent& event); void OnUnityParamChangeImpl(); diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index c875874..2347248 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -450,12 +450,14 @@ std::future PythonWrapper::StartApp( const AppConfig& config, const std::function&& out_cb, const std::function&& in_cb, - const std::function&& run_cb) { + const std::function&& run_cb, + const std::function&& prestart_cb) { return std::move(std::async(std::launch::async, [&]( const std::function&& out_cb, const std::function&& in_cb, const std::function&& run_cb) -> bool { + prestart_cb(); return InvokeWithArgs({ "-u", // Unbuffered output "Resources/Scripts/transcribe.py", diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 3c8f53a..17f5e1d 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -75,7 +75,8 @@ namespace PythonWrapper const AppConfig& config, const std::function&& out_cb, const std::function&& in_cb = [](std::string&) {}, - const std::function&& run_cb = []() { return true; }); + const std::function&& run_cb = []() { return true; }, + const std::function&& prestart_cb = []() {}); bool GenerateAnimator( const AppConfig& config, -- cgit v1.2.3