7 files changed, 49 insertions, 11 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index dd3d55c..91fd1e9 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -87,6 +87,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	gpu_idx(0),
 	min_silence_duration_ms(250),
 	max_speech_duration_s(5),
+	transcription_loop_delay_ms(100),
 	keybind("ctrl+x"),
 
 	chars_per_sync(8),
@@ -131,6 +132,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
 	cm.Set("gpu_idx", gpu_idx);
 	cm.Set("min_silence_duration_ms", min_silence_duration_ms);
 	cm.Set("max_speech_duration_s", max_speech_duration_s);
+	cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms);
 	cm.Set("keybind", keybind);
 
 	cm.Set("chars_per_sync", chars_per_sync);
@@ -188,6 +190,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
 	cm.Get("gpu_idx", c.gpu_idx);
 	cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
 	cm.Get("max_speech_duration_s", c.max_speech_duration_s);
+	cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms);
 	cm.Get("keybind", c.keybind);
 
 	cm.Get("chars_per_sync", c.chars_per_sync);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index a366090..762adc5 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -73,6 +73,7 @@ public:
 	int gpu_idx;
 	int min_silence_duration_ms;
 	int max_speech_duration_s;
+	int transcription_loop_delay_ms;
 	std::string keybind;
 
 	// Unity and transcription shared settings.
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 384f2a2..a61c821 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -77,6 +77,7 @@ namespace {
         ID_PY_APP_GPU_IDX,
         ID_PY_APP_MIN_SILENCE_DURATION_MS,
         ID_PY_APP_MAX_SPEECH_DURATION_S,
+        ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
         ID_PY_APP_KEYBIND,
         ID_PY_APP_BROWSER_SRC_PORT,
         ID_PY_APP_COMMIT_FUZZ_THRESHOLD,
@@ -775,6 +776,16 @@ Frame::Frame()
                         "milliseconds.");
 					py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
 
+					auto* py_app_transcription_loop_delay_ms = new wxTextCtrl(
+                        py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
+                        std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition,
+						wxDefaultSize, /*style=*/0);
+                    py_app_transcription_loop_delay_ms->SetToolTip(
+						"The amount of time, in milliseconds, that the "
+                        "application will sleep between every subsequent "
+                        "transcription.");
+					py_app_transcription_loop_delay_ms_ = py_app_transcription_loop_delay_ms;
+
 					auto* py_app_keybind = new wxTextCtrl(
 						py_app_config_panel_pairs, ID_PY_APP_KEYBIND,
 						app_c_->keybind, wxDefaultPosition,
@@ -870,6 +881,11 @@ Frame::Frame()
                         /*flags=*/wxEXPAND);
 
                     sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+                        wxID_ANY, /*label=*/"Transcription loop delay (ms):"));
+                    sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0,
+                        /*flags=*/wxEXPAND);
+
+                    sizer->Add(new wxStaticText(py_app_config_panel_pairs,
                         wxID_ANY, /*label=*/"Browser source port:"));
                     sizer->Add(py_app_browser_src_port, /*proportion=*/0,
                         /*flags=*/wxEXPAND);
@@ -1571,6 +1587,10 @@ void Frame::ApplyConfigToInputFields()
 	py_app_max_speech_duration_s->Clear();
 	py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
 
+	auto* py_app_transcription_loop_delay_ms = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS));
+	py_app_transcription_loop_delay_ms->Clear();
+	py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms));
+
     auto* py_app_enable_local_beep = static_cast<wxCheckBox*>(FindWindowById(ID_PY_APP_ENABLE_LOCAL_BEEP));
     py_app_enable_local_beep->SetValue(app_c_->enable_local_beep);
 
@@ -2326,6 +2346,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 	ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
 	ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
 	ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
+	ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000));
 	ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
 
     std::string keybind = py_app_keybind_->GetValue().ToStdString();
@@ -2357,6 +2378,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     app_c_->gpu_idx = gpu_idx;
     app_c_->min_silence_duration_ms = min_silence_duration_ms;
     app_c_->max_speech_duration_s = max_speech_duration_s;
+    app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms;
     app_c_->keybind = keybind;
     app_c_->Serialize(AppConfig::kConfigPath);
 
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 615726a..21f1220 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -40,6 +40,7 @@ private:
     wxTextCtrl* py_app_gpu_idx_;
     wxTextCtrl* py_app_min_silence_duration_ms_;
     wxTextCtrl* py_app_max_speech_duration_s_;
+    wxTextCtrl* py_app_transcription_loop_delay_ms_;
     wxTextCtrl* py_app_keybind_;
     wxTextCtrl* py_app_browser_src_port_;
     wxTextCtrl* py_app_commit_fuzz_threshold_;
diff --git a/GUI/GUI/GUI/Logging.cpp b/GUI/GUI/GUI/Logging.cpp
index 5d0e23e..f6ad3ab 100644
--- a/GUI/GUI/GUI/Logging.cpp
+++ b/GUI/GUI/GUI/Logging.cpp
@@ -48,15 +48,16 @@ void Logging::ThreadLogger::Drain()
 			log_ofs << message;
 		}
 
-		// Constrain wxTextCtrl's to 100-200 lines to keep memory usage /
+		// Constrain wxTextCtrl's to 1000-2000 lines to keep memory usage /
 		// general snappiness in check.
 		if (frame) {
 			wxString allText = frame->GetValue();
 			wxArrayString lines = wxStringTokenize(allText, "\n");
 			size_t count = lines.GetCount();
-			if (count > 200) {
-				// Keep only the last 100 lines.
-				size_t linesToRemove = count - 100;
+			constexpr int kHalfMaxLines = 1000;
+			if (count > kHalfMaxLines * 2) {
+				// Keep only the last kHalfMaxLines lines.
+				size_t linesToRemove = count - kHalfMaxLines;
 
 				// Remove lines from the beginning
 				lines.RemoveAt(0, linesToRemove);
diff --git a/GUI/GUI/GUI/Transcript.cpp b/GUI/GUI/GUI/Transcript.cpp
index eb798d9..11bab31 100644
--- a/GUI/GUI/GUI/Transcript.cpp
+++ b/GUI/GUI/GUI/Transcript.cpp
@@ -20,6 +20,7 @@ void Transcript::SetPreview(std::string&& segment) {
 void Transcript::Clear() {
 	std::scoped_lock l(mu_);
 	segments_.clear();
+	previews_.clear();
 }
 
 std::vector<std::string> Transcript::Get() {
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index df333bc..f0e994f 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -367,7 +367,8 @@ class Segment:
             end_ts: float,
             wall_ts: float,
             avg_logprob: float,
-            no_speech_prob: float):
+            no_speech_prob: float,
+            compression_ratio: float):
         self.transcript = transcript
         # start_ts, end_ts are timestamps in seconds relative to `wall_ts`.
         self.start_ts = start_ts
@@ -377,6 +378,7 @@ class Segment:
         self.wall_ts = wall_ts
         self.avg_logprob = avg_logprob
         self.no_speech_prob = no_speech_prob
+        self.compression_ratio = compression_ratio
 
     def __str__(self):
         ts = f"(ts: {self.start_ts}-{self.end_ts}) "
@@ -438,11 +440,17 @@ class Whisper:
         for s in segments:
             # Manual touchup. I see a decent number of hallucinations sneaking
             # in with high `no_speech_prob` and modest `avg_logprob`.
-            if s.no_speech_prob > 0.8 and s.avg_logprob < -0.5:
+            if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+                continue
+            if cfg["enable_debug_mode"]:
+                print(f"s get: {s}")
+            if s.avg_logprob < -1.0:
+                continue
+            if s.compression_ratio > 2.4:
                 continue
             res.append(Segment(s.text, s.start, s.end,
                 self.collector.begin(),
-                s.avg_logprob, s.no_speech_prob))
+                s.avg_logprob, s.no_speech_prob, s.compression_ratio))
         return res
 
 class TranscriptCommit:
@@ -490,11 +498,12 @@ class VadCommitter:
             commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
 
             segments = self.whisper.transcribe(commit_audio)
-            for s in segments:
-                print(f"commit segment: {s}", file=sys.stderr)
             delta = ''.join(s.transcript for s in segments)
-            print(f"delta get: {delta}", file=sys.stderr)
             audio = self.collector.getAudio()
+            if cfg["enable_debug_mode"]:
+                for s in segments:
+                    print(f"commit segment: {s}", file=sys.stderr)
+                print(f"delta get: {delta}", file=sys.stderr)
 
             #ts = datetime.fromtimestamp(self.collector.now() - latency_s)
             #filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
@@ -665,7 +674,7 @@ def optimize(cfg,
 
 def transcriptionThread(ctrl: ThreadControl):
     while ctrl.run_app:
-        time.sleep(.005)
+        time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0);
 
         op = None