summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-08-30 17:45:53 -0700
committeryum <yum.food.vr@gmail.com>2023-08-30 17:45:53 -0700
commit4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (patch)
tree0750a03f9620fe8b8bf66355feb399efb3ec29da
parent358f3ed8c44bbe45d8f4546afeeb0afaae85ea8b (diff)
Deprecate commit similarity threshold
This is now dynamically set inside transcribe.py. As the buffer grows long, the threshold grows exponentially, keeping the buffer short. The threshold starts small so that transcription starts strict (accurate, slow) and gets looser (inaccurate, fast) as needed.
-rw-r--r--GUI/GUI/GUI/Frame.cpp52
-rw-r--r--GUI/GUI/GUI/PythonWrapper.cpp1
-rw-r--r--Scripts/transcribe.py22
3 files changed, 29 insertions, 46 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 82e4536..63962df 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -733,20 +733,6 @@ Frame::Frame()
"value you configure here.");
py_app_browser_src_port_ = py_app_browser_src_port;
- auto* py_app_commit_fuzz_threshold = new wxTextCtrl(
- py_app_config_panel_pairs, ID_PY_APP_COMMIT_FUZZ_THRESHOLD,
- std::to_string(app_c_->commit_fuzz_threshold), wxDefaultPosition,
- wxDefaultSize, /*style=*/0);
- py_app_commit_fuzz_threshold->SetToolTip(
- "The transcription app requires subsequent "
- "transcripts to be within this edit distance of each "
- "other before it commits them. Higher values make "
- "transcripts commit more easily, making the app "
- "faster but less accurate. Lower values make "
- "transcripts commit less easily, making the app "
- "slower but more accurate.");
- py_app_commit_fuzz_threshold_ = py_app_commit_fuzz_threshold;
-
auto* sizer = new wxFlexGridSizer(/*cols=*/2);
py_app_config_panel_pairs->SetSizer(sizer);
@@ -811,11 +797,6 @@ Frame::Frame()
/*flags=*/wxEXPAND);
sizer->Add(new wxStaticText(py_app_config_panel_pairs,
- wxID_ANY, /*label=*/"Commit similarity threshold:"));
- sizer->Add(py_app_commit_fuzz_threshold, /*proportion=*/0,
- /*flags=*/wxEXPAND);
-
- sizer->Add(new wxStaticText(py_app_config_panel_pairs,
wxID_ANY, /*label=*/"Browser source port:"));
sizer->Add(py_app_browser_src_port, /*proportion=*/0,
/*flags=*/wxEXPAND);
@@ -1459,10 +1440,6 @@ void Frame::ApplyConfigToInputFields()
py_app_desktop_browser_src_port->Clear();
py_app_desktop_browser_src_port->AppendText(std::to_string(app_c_->browser_src_port));
- auto* py_app_desktop_commit_fuzz_threshold = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_COMMIT_FUZZ_THRESHOLD));
- py_app_desktop_commit_fuzz_threshold->Clear();
- py_app_desktop_commit_fuzz_threshold->AppendText(std::to_string(app_c_->commit_fuzz_threshold));
-
auto* py_app_rows = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_ROWS));
py_app_rows->Clear();
py_app_rows->AppendText(std::to_string(app_c_->rows));
@@ -2225,9 +2202,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
py_app_keybind_->GetValue().ToStdString();
std::string browser_src_port_str =
py_app_browser_src_port_->GetValue().ToStdString();
- std::string commit_fuzz_threshold_str =
- py_app_commit_fuzz_threshold_->GetValue().ToStdString();
- int rows, cols, chars_per_sync, bytes_per_char, gpu_idx, browser_src_port, commit_fuzz_threshold;
+ int rows, cols, chars_per_sync, bytes_per_char, gpu_idx, browser_src_port;
try {
rows = std::stoi(rows_str);
cols = std::stoi(cols_str);
@@ -2235,30 +2210,26 @@ void Frame::OnAppStart(wxCommandEvent& event) {
bytes_per_char = std::stoi(bytes_per_char_str);
gpu_idx = std::stoi(gpu_idx_str);
browser_src_port = std::stoi(browser_src_port_str);
- commit_fuzz_threshold = std::stoi(commit_fuzz_threshold_str);
}
catch (const std::invalid_argument&) {
- Log(transcribe_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
+ Log(transcribe_out_, "Could not parse rows \"{}\", cols \"{}\", chars "
"per sync \"{}\", "
"bytes per char \"{}\", "
"gpu_idx \"{}\", "
- "browser src port \"{}\"", ""
- "or commit_fuzz_threshold \"{}\""
+ "or browser src port \"{}\""
"as an integer\n", rows_str, cols_str, chars_per_sync_str,
- bytes_per_char_str, gpu_idx_str, browser_src_port_str,
- commit_fuzz_threshold_str);
+ bytes_per_char_str, gpu_idx_str, browser_src_port_str);
return;
}
catch (const std::out_of_range&) {
- Log(transcribe_out_, "Rows \"{}\", "
+ Log(transcribe_out_, "Rows \"{}\", "
"cols \"{}\", "
"chars per sync \"{}\", "
"bytes per char \"{}\", "
"gpu idx \"{}\", "
- "browser src port \"{}\", "
- "or commit_fuzz_threshold \"{}\" "
+ "or browser src port \"{}\" "
"are out of range\n", rows_str, cols_str, chars_per_sync_str,
- bytes_per_char_str, gpu_idx_str, browser_src_port_str, commit_fuzz_threshold_str);
+ bytes_per_char_str, gpu_idx_str, browser_src_port_str);
return;
}
const int max_rows = 10;
@@ -2267,13 +2238,10 @@ void Frame::OnAppStart(wxCommandEvent& event) {
const int max_gpu_idx = 10;
const int min_browser_src_port = 1024;
const int max_browser_src_port = 65535;
- const int min_commit_fuzz_threshold = 0;
- const int max_commit_fuzz_threshold = 100;
if (rows < 0 || rows > max_rows ||
cols < 0 || cols > max_cols ||
gpu_idx < min_gpu_idx || gpu_idx > max_gpu_idx ||
- browser_src_port < min_browser_src_port || browser_src_port > max_browser_src_port ||
- commit_fuzz_threshold < min_commit_fuzz_threshold || commit_fuzz_threshold > max_commit_fuzz_threshold) {
+ browser_src_port < min_browser_src_port || browser_src_port > max_browser_src_port) {
Log(transcribe_out_, "Rows not on [{},{}] or cols not on [{},{}] or "
"gpu_idx not on [{}, {}] or "
"browser src port not on [{}, {}] or "
@@ -2282,8 +2250,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
0, max_rows,
0, max_cols,
min_gpu_idx, max_gpu_idx,
- min_browser_src_port, max_browser_src_port,
- min_commit_fuzz_threshold, max_commit_fuzz_threshold);
+ min_browser_src_port, max_browser_src_port);
return;
}
@@ -2300,7 +2267,6 @@ void Frame::OnAppStart(wxCommandEvent& event) {
app_c_->enable_local_beep = enable_local_beep;
app_c_->enable_browser_src = enable_browser_src;
app_c_->browser_src_port = browser_src_port;
- app_c_->commit_fuzz_threshold = commit_fuzz_threshold;
app_c_->use_cpu = use_cpu;
app_c_->use_builtin = use_builtin;
app_c_->enable_uwu_filter = enable_uwu_filter;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index cf210fa..c4367e8 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -499,7 +499,6 @@ std::future<bool> PythonWrapper::StartApp(
"--gpu_idx", std::to_string(config.gpu_idx),
"--keybind", Quote(config.keybind),
"--reset_on_toggle", config.reset_on_toggle ? "1" : "0",
- "--commit_fuzz_threshold", std::to_string(config.commit_fuzz_threshold),
},
std::move(out_cb),
std::move(in_cb),
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 5301b0b..cea2da0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -66,7 +66,7 @@ class AudioState:
# The edit distance under which two consecutive transcripts are
# considered to match. This affects how easily `preview_text`
# gets appended to `text`.
- self.commit_fuzz_threshold = 8
+ self.commit_fuzz_threshold = 1
# If set, profanity in transcriptions will have their vowels replaced
# with asterisks. Only works in English.
@@ -157,6 +157,19 @@ def onAudioFramesAvailable(
if not audio_state.audio_paused:
audio_state.frames.append(decimated)
+ # If buffer is getting long, tell the transcription loop to be more ready
+ # to accept transcripts.
+ fps = int(input_rate / audio_state.CHUNK)
+ cur_len_s = len(audio_state.frames) / fps
+ double_at_s = 3.0
+ double_every_s = 1.5
+ delta_s = cur_len_s - double_at_s
+ n_doubles = ceil(delta_s / double_every_s)
+ if n_doubles >= 1:
+ audio_state.commit_fuzz_threshold = 2 ** n_doubles
+ else:
+ audio_state.commit_fuzz_threshold = 1
+
max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
@@ -380,7 +393,12 @@ def transcribeAudio(audio_state,
if audio_state.enable_debug_mode:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
+ # Prevent audio buffer from holding more than 1 second of silence
+ # before real speech.
+ audio_state.MAX_LENGTH_S = 1
continue
+ else:
+ audio_state.MAX_LENGTH_S = 300
if audio_state.drop_transcription:
audio_state.drop_transcription = False
@@ -720,7 +738,7 @@ def transcribeLoop(mic: str,
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
audio_state.reset_on_toggle = reset_on_toggle
- audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+ #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
audio_state.enable_debug_mode = enable_debug_mode
audio_state.enable_profanity_filter = enable_profanity_filter