diff options
| author | yum <yum.food.vr@gmail.com> | 2023-08-30 17:45:53 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-08-30 17:45:53 -0700 |
| commit | 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch) | |
| tree | 0750a03f9620fe8b8bf66355feb399efb3ec29da | |
| parent | 358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff) | |
Deprecate commit similarity threshold
This is now dynamically set inside transcribe.py.
As the buffer grows long, the threshold grows exponentially, keeping the
buffer short. The threshold starts small so that transcription starts
strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 52 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.cpp | 1 | ||||
| -rw-r--r-- | Scripts/transcribe.py | 22 |
3 files changed, 29 insertions, 46 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 82e4536..63962df 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -733,20 +733,6 @@ Frame::Frame() "value you configure here.");
py_app_browser_src_port_ = py_app_browser_src_port;
- auto* py_app_commit_fuzz_threshold = new wxTextCtrl(
- py_app_config_panel_pairs, ID_PY_APP_COMMIT_FUZZ_THRESHOLD,
- std::to_string(app_c_->commit_fuzz_threshold), wxDefaultPosition,
- wxDefaultSize, /*style=*/0);
- py_app_commit_fuzz_threshold->SetToolTip(
- "The transcription app requires subsequent "
- "transcripts to be within this edit distance of each "
- "other before it commits them. Higher values make "
- "transcripts commit more easily, making the app "
- "faster but less accurate. Lower values make "
- "transcripts commit less easily, making the app "
- "slower but more accurate.");
- py_app_commit_fuzz_threshold_ = py_app_commit_fuzz_threshold;
-
auto* sizer = new wxFlexGridSizer(/*cols=*/2);
py_app_config_panel_pairs->SetSizer(sizer);
@@ -811,11 +797,6 @@ Frame::Frame() /*flags=*/wxEXPAND);
sizer->Add(new wxStaticText(py_app_config_panel_pairs,
- wxID_ANY, /*label=*/"Commit similarity threshold:"));
- sizer->Add(py_app_commit_fuzz_threshold, /*proportion=*/0,
- /*flags=*/wxEXPAND);
-
- sizer->Add(new wxStaticText(py_app_config_panel_pairs,
wxID_ANY, /*label=*/"Browser source port:"));
sizer->Add(py_app_browser_src_port, /*proportion=*/0,
/*flags=*/wxEXPAND);
@@ -1459,10 +1440,6 @@ void Frame::ApplyConfigToInputFields() py_app_desktop_browser_src_port->Clear();
py_app_desktop_browser_src_port->AppendText(std::to_string(app_c_->browser_src_port));
- auto* py_app_desktop_commit_fuzz_threshold = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_COMMIT_FUZZ_THRESHOLD));
- py_app_desktop_commit_fuzz_threshold->Clear();
- py_app_desktop_commit_fuzz_threshold->AppendText(std::to_string(app_c_->commit_fuzz_threshold));
-
auto* py_app_rows = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_ROWS));
py_app_rows->Clear();
py_app_rows->AppendText(std::to_string(app_c_->rows));
@@ -2225,9 +2202,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { py_app_keybind_->GetValue().ToStdString();
std::string browser_src_port_str =
py_app_browser_src_port_->GetValue().ToStdString();
- std::string commit_fuzz_threshold_str =
- py_app_commit_fuzz_threshold_->GetValue().ToStdString();
- int rows, cols, chars_per_sync, bytes_per_char, gpu_idx, browser_src_port, commit_fuzz_threshold;
+ int rows, cols, chars_per_sync, bytes_per_char, gpu_idx, browser_src_port;
try {
rows = std::stoi(rows_str);
cols = std::stoi(cols_str);
@@ -2235,30 +2210,26 @@ void Frame::OnAppStart(wxCommandEvent& event) { bytes_per_char = std::stoi(bytes_per_char_str);
gpu_idx = std::stoi(gpu_idx_str);
browser_src_port = std::stoi(browser_src_port_str);
- commit_fuzz_threshold = std::stoi(commit_fuzz_threshold_str);
}
catch (const std::invalid_argument&) {
- Log(transcribe_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
+ Log(transcribe_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
"per sync \"{}\", "
"bytes per char \"{}\", "
"gpu_idx \"{}\", "
- "browser src port \"{}\"", ""
- "or commit_fuzz_threshold \"{}\""
+ "or browser src port \"{}\""
"as an integer\n", rows_str, cols_str, chars_per_sync_str,
- bytes_per_char_str, gpu_idx_str, browser_src_port_str,
- commit_fuzz_threshold_str);
+ bytes_per_char_str, gpu_idx_str, browser_src_port_str);
return;
}
catch (const std::out_of_range&) {
- Log(transcribe_out_, "Rows \"{}\", "
+ Log(transcribe_out_, "Rows \"{}\", "
"cols \"{}\", "
"chars per sync \"{}\", "
"bytes per char \"{}\", "
"gpu idx \"{}\", "
- "browser src port \"{}\", "
- "or commit_fuzz_threshold \"{}\" "
+ "or browser src port \"{}\" "
"are out of range\n", rows_str, cols_str, chars_per_sync_str,
- bytes_per_char_str, gpu_idx_str, browser_src_port_str, commit_fuzz_threshold_str);
+ bytes_per_char_str, gpu_idx_str, browser_src_port_str);
return;
}
const int max_rows = 10;
@@ -2267,13 +2238,10 @@ void Frame::OnAppStart(wxCommandEvent& event) { const int max_gpu_idx = 10;
const int min_browser_src_port = 1024;
const int max_browser_src_port = 65535;
- const int min_commit_fuzz_threshold = 0;
- const int max_commit_fuzz_threshold = 100;
if (rows < 0 || rows > max_rows ||
cols < 0 || cols > max_cols ||
gpu_idx < min_gpu_idx || gpu_idx > max_gpu_idx ||
- browser_src_port < min_browser_src_port || browser_src_port > max_browser_src_port ||
- commit_fuzz_threshold < min_commit_fuzz_threshold || commit_fuzz_threshold > max_commit_fuzz_threshold) {
+ browser_src_port < min_browser_src_port || browser_src_port > max_browser_src_port) {
Log(transcribe_out_, "Rows not on [{},{}] or cols not on [{},{}] or "
"gpu_idx not on [{}, {}] or "
"browser src port not on [{}, {}] or "
@@ -2282,8 +2250,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { 0, max_rows,
0, max_cols,
min_gpu_idx, max_gpu_idx,
- min_browser_src_port, max_browser_src_port,
- min_commit_fuzz_threshold, max_commit_fuzz_threshold);
+ min_browser_src_port, max_browser_src_port);
return;
}
@@ -2300,7 +2267,6 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->enable_local_beep = enable_local_beep;
app_c_->enable_browser_src = enable_browser_src;
app_c_->browser_src_port = browser_src_port;
- app_c_->commit_fuzz_threshold = commit_fuzz_threshold;
app_c_->use_cpu = use_cpu;
app_c_->use_builtin = use_builtin;
app_c_->enable_uwu_filter = enable_uwu_filter;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index cf210fa..c4367e8 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -499,7 +499,6 @@ std::future<bool> PythonWrapper::StartApp( "--gpu_idx", std::to_string(config.gpu_idx), "--keybind", Quote(config.keybind), "--reset_on_toggle", config.reset_on_toggle ? "1" : "0", - "--commit_fuzz_threshold", std::to_string(config.commit_fuzz_threshold), }, std::move(out_cb), std::move(in_cb), diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 5301b0b..cea2da0 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -66,7 +66,7 @@ class AudioState: # The edit distance under which two consecutive transcripts are # considered to match. This affects how easily `preview_text` # gets appended to `text`. - self.commit_fuzz_threshold = 8 + self.commit_fuzz_threshold = 1 # If set, profanity in transcriptions will have their vowels replaced # with asterisks. Only works in English. @@ -157,6 +157,19 @@ def onAudioFramesAvailable( if not audio_state.audio_paused: audio_state.frames.append(decimated) + # If buffer is getting long, tell the transcription loop to be more ready + # to accept transcripts. + fps = int(input_rate / audio_state.CHUNK) + cur_len_s = len(audio_state.frames) / fps + double_at_s = 3.0 + double_every_s = 1.5 + delta_s = cur_len_s - double_at_s + n_doubles = ceil(delta_s / double_every_s) + if n_doubles >= 1: + audio_state.commit_fuzz_threshold = 2 ** n_doubles + else: + audio_state.commit_fuzz_threshold = 1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: @@ -380,7 +393,12 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() + # Prevent audio buffer from holding more than 1 second of silence + # before real speech. 
+ audio_state.MAX_LENGTH_S = 1 continue + else: + audio_state.MAX_LENGTH_S = 300 if audio_state.drop_transcription: audio_state.drop_transcription = False @@ -720,7 +738,7 @@ def transcribeLoop(mic: str, audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - audio_state.commit_fuzz_threshold = commit_fuzz_threshold + #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter |
