From 9a7190a04bbe6feba2abe4a1590eb8114c04f683 Mon Sep 17 00:00:00 2001 From: yum Date: Fri, 30 Dec 2022 01:35:11 -0800 Subject: GUI: Expose transcription window duration Users can pick longer transcription durations for accuracy-critical tasks, or shorter durations for latency-critical tasks. --- GUI/GUI/GUI/Frame.cpp | 39 ++++++++++++++++++++++++++++++++++++--- GUI/GUI/GUI/Frame.h | 1 + GUI/GUI/GUI/PythonWrapper.cpp | 3 ++- GUI/GUI/GUI/PythonWrapper.h | 20 ++++++++++---------- Scripts/transcribe.py | 14 ++++++++++++-- 5 files changed, 61 insertions(+), 16 deletions(-) diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index b708953..28c0e4f 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -34,6 +34,7 @@ namespace { ID_PY_APP_ENABLE_LOCAL_BEEP, ID_PY_APP_ROWS, ID_PY_APP_COLS, + ID_PY_APP_WINDOW_DURATION, ID_UNITY_PANEL, ID_UNITY_CONFIG_PANEL, ID_UNITY_OUT, @@ -342,6 +343,18 @@ Frame::Frame() "The number of columns on the text box."); py_app_cols_ = py_app_cols; + auto* py_app_window_duration = new wxTextCtrl(py_app_config_panel_pairs, + ID_PY_APP_WINDOW_DURATION, /*value=*/"15", + wxDefaultPosition, wxDefaultSize, /*style=*/0); + py_app_window_duration->SetToolTip( + "This controls how long the slice of audio that " + "we feed the transcription algorithm is, in seconds. " + "Shorter values (as low as 10 seconds) can be transcribed " + "more quickly, but are less accurate. 
Longer values " + "(as high as 28 seconds) take longer to transcribe, " + "but are far more accurate."); + py_app_window_duration_ = py_app_window_duration; + auto* sizer = new wxFlexGridSizer(/*cols=*/2); py_app_config_panel_pairs->SetSizer(sizer); @@ -365,6 +378,9 @@ Frame::Frame() sizer->Add(new wxStaticText(py_app_config_panel_pairs, wxID_ANY, /*label=*/"Text box columns:")); sizer->Add(py_app_cols, /*proportion=*/0, /*flags=*/wxEXPAND); + + sizer->Add(new wxStaticText(py_app_config_panel_pairs, wxID_ANY, /*label=*/"Window duration (s):")); + sizer->Add(py_app_window_duration, /*proportion=*/0, /*flags=*/wxEXPAND); } auto* py_app_enable_local_beep = new wxCheckBox(py_config_panel, @@ -873,17 +889,33 @@ void Frame::OnAppStart(wxCommandEvent& event) { const bool enable_local_beep = py_app_enable_local_beep_->GetValue(); std::string rows_str = py_app_rows_->GetValue().ToStdString(); std::string cols_str = py_app_cols_->GetValue().ToStdString(); - int rows, cols; + std::string window_duration_str = py_app_window_duration_->GetValue().ToStdString(); + int rows, cols, window_duration; try { rows = std::stoi(rows_str); cols = std::stoi(cols_str); + window_duration = std::stoi(window_duration_str); } catch (const std::invalid_argument& e) { - Log(transcribe_out_, "Could not parse rows \"{}\" or cols \"{}\" as an integer\n", rows_str, cols_str); + Log(transcribe_out_, "Could not parse rows \"{}\", cols \"{}\", or window duration \"{}\" as an integer\n", rows_str, cols_str, window_duration_str); return; } catch (const std::out_of_range& e) { - Log(transcribe_out_, "Rows \"{}\" or cols \"{}\" are out of range\n", rows_str, cols_str); + Log(transcribe_out_, "Rows \"{}\", cols \"{}\", or window duration \"{}\" are out of range\n", rows_str, cols_str, window_duration_str); + return; + } + const int max_rows = 10; + const int max_cols = 240; + const int min_window_duration_s = 10; + const int max_window_duration_s = 28; + if (rows < 0 || rows > max_rows || + cols < 0 || cols > max_cols || + 
window_duration < min_window_duration_s || window_duration > max_window_duration_s) { + Log(transcribe_out_, "Rows not on [{},{}] or cols not on [{},{}] or " + "window_duration not on [{},{}]\n", + 0, max_rows, + 0, max_cols, + min_window_duration_s, max_window_duration_s); return; } @@ -895,6 +927,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { kBytesPerChar[bytes_per_char_idx].ToStdString(), rows, cols, + window_duration, enable_local_beep); if (!p) { Log(transcribe_out_, "Failed to launch transcription engine\n"); diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 3ce6cb9..2a2760a 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -31,6 +31,7 @@ private: wxTextCtrl* py_app_rows_; wxTextCtrl* py_app_cols_; + wxTextCtrl* py_app_window_duration_; wxTextCtrl* unity_rows_; wxTextCtrl* unity_cols_; diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index 4162a4e..da63a4a 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -125,7 +125,7 @@ wxProcess* PythonWrapper::StartApp( std::function&& exit_callback, const std::string& mic, const std::string& lang, const std::string& model, const std::string& chars_per_sync, const std::string& bytes_per_char, - int rows, int cols, bool enable_local_beep) { + int rows, int cols, int window_duration_s, bool enable_local_beep) { return InvokeAsyncWithArgs({ "-u", "Resources/Scripts/transcribe.py", @@ -137,6 +137,7 @@ wxProcess* PythonWrapper::StartApp( "--enable_local_beep", enable_local_beep ? 
"1" : "0", "--rows", std::to_string(rows), "--cols", std::to_string(cols), + "--window_duration_s", std::to_string(window_duration_s), }, std::move(exit_callback)); } diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 95195b9..25855a4 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -44,20 +44,20 @@ namespace PythonWrapper std::function&& exit_callback, const std::string& mic, const std::string& lang, const std::string& model, const std::string& chars_per_sync, const std::string& bytes_per_char, - int rows, int cols, bool enable_local_beep + int rows, int cols, int window_duration_s, bool enable_local_beep ); bool GenerateAnimator( const std::string& unity_assets_path, - const std::string& unity_animator_path, - const std::string& unity_parameters_path, - const std::string& unity_menu_path, - const std::string& unity_animator_generated_dir, - const std::string& unity_animator_generated_name, - const std::string& unity_parameters_generated_name, - const std::string& unity_menu_generated_name, - const std::string& chars_per_sync, - const std::string& bytes_per_char, + const std::string& unity_animator_path, + const std::string& unity_parameters_path, + const std::string& unity_menu_path, + const std::string& unity_animator_generated_dir, + const std::string& unity_animator_generated_name, + const std::string& unity_parameters_generated_name, + const std::string& unity_menu_generated_name, + const std::string& chars_per_sync, + const std::string& bytes_per_char, int rows, int cols, wxTextCtrl* out); diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 0530946..f90867a 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -25,6 +25,13 @@ import wave # License: MIT. import whisper +class Config: + def __init__(self): + # The maximum length that recordAudio() will put into frames before it + # starts dropping from the start. 
+ self.MAX_LENGTH_S = 10 +config = Config() + class AudioState: def __init__(self): self.CHUNK = 1024 @@ -35,7 +42,6 @@ class AudioState: # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. - self.MAX_LENGTH_S = 10 self.MAX_LENGTH_S_WHISPER = 30 # The minimum length that recordAudio() will wait for before saving audio. self.MIN_LENGTH_S = 1 @@ -119,7 +125,7 @@ def onAudioFramesAvailable( audio_state.frames.append(decimated) - max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK) + max_frames = int(input_rate * config.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: audio_state.frames = audio_state.frames[-1 * max_frames :] @@ -428,6 +434,7 @@ if __name__ == "__main__": parser.add_argument("--enable_local_beep", type=int, help="Whether to play a local auditory indicator when transcription starts/stops."); parser.add_argument("--rows", type=int, help="The number of rows on the board") parser.add_argument("--cols", type=int, help="The number of columns on the board") + parser.add_argument("--window_duration_s", type=int, help="The length in seconds of the audio recording handed to the transcription algorithm"); args = parser.parse_args() if not args.mic: @@ -447,6 +454,9 @@ if __name__ == "__main__": print("--rows and --cols required", file=sys.stderr) sys.exit(1) + if args.window_duration_s: + config.MAX_LENGTH_S = int(args.window_duration_s) + generate_utils.config.BYTES_PER_CHAR = int(args.bytes_per_char) generate_utils.config.CHARS_PER_SYNC = int(args.chars_per_sync) generate_utils.config.BOARD_ROWS = int(args.rows) -- cgit v1.2.3