8 files changed, 83 insertions, 19 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 573238f..2abe5b2 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -88,6 +88,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	gpu_idx(0),
 	min_silence_duration_ms(250),
 	max_speech_duration_s(5),
+	reset_after_silence_s(10),
 	transcription_loop_delay_ms(100),
 	keybind("ctrl+x"),
 
@@ -135,6 +136,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
 	cm.Set("gpu_idx", gpu_idx);
 	cm.Set("min_silence_duration_ms", min_silence_duration_ms);
 	cm.Set("max_speech_duration_s", max_speech_duration_s);
+	cm.Set("reset_after_silence_s", reset_after_silence_s);
 	cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms);
 	cm.Set("keybind", keybind);
 
@@ -195,6 +197,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
 	cm.Get("gpu_idx", c.gpu_idx);
 	cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
 	cm.Get("max_speech_duration_s", c.max_speech_duration_s);
+	cm.Get("reset_after_silence_s", c.reset_after_silence_s);
 	cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms);
 	cm.Get("keybind", c.keybind);
 
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index ede21d6..0d0da66 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -74,6 +74,7 @@ public:
 	int gpu_idx;
 	int min_silence_duration_ms;
 	int max_speech_duration_s;
+	int reset_after_silence_s;
 	int transcription_loop_delay_ms;
 	std::string keybind;
 
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 23ac38c..602bf6d 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -78,6 +78,7 @@ namespace {
         ID_PY_APP_GPU_IDX,
         ID_PY_APP_MIN_SILENCE_DURATION_MS,
         ID_PY_APP_MAX_SPEECH_DURATION_S,
+        ID_PY_APP_RESET_AFTER_SILENCE_S,
         ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
         ID_PY_APP_KEYBIND,
         ID_PY_APP_BROWSER_SRC_PORT,
@@ -796,6 +797,16 @@ Frame::Frame()
                         "milliseconds.");
 					py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
 
+					auto* py_app_reset_after_silence_s = new wxTextCtrl(
+                        py_app_config_panel_pairs, ID_PY_APP_RESET_AFTER_SILENCE_S,
+                        std::to_string(app_c_->reset_after_silence_s), wxDefaultPosition,
+						wxDefaultSize, /*style=*/0);
+                    py_app_reset_after_silence_s->SetToolTip(
+                        "If you pause for at least this long between "
+                        "sentences, the transcript before the pause will be "
+                        "removed. To disable this feature, set it to -1.");
+					py_app_reset_after_silence_s_ = py_app_reset_after_silence_s;
+
 					auto* py_app_transcription_loop_delay_ms = new wxTextCtrl(
                         py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
                         std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition,
@@ -906,6 +917,11 @@ Frame::Frame()
                         /*flags=*/wxEXPAND);
 
                     sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+                        wxID_ANY, /*label=*/"Reset after silence (s):"));
+                    sizer->Add(py_app_reset_after_silence_s, /*proportion=*/0,
+                        /*flags=*/wxEXPAND);
+
+                    sizer->Add(new wxStaticText(py_app_config_panel_pairs,
                         wxID_ANY, /*label=*/"Transcription loop delay (ms):"));
                     sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0,
                         /*flags=*/wxEXPAND);
@@ -1629,6 +1645,10 @@ void Frame::ApplyConfigToInputFields()
 	py_app_max_speech_duration_s->Clear();
 	py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
 
+	auto* py_app_reset_after_silence_s = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_RESET_AFTER_SILENCE_S));
+	py_app_reset_after_silence_s->Clear();
+	py_app_reset_after_silence_s->AppendText(std::to_string(app_c_->reset_after_silence_s));
+
 	auto* py_app_transcription_loop_delay_ms = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS));
 	py_app_transcription_loop_delay_ms->Clear();
 	py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms));
@@ -2405,6 +2425,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 	ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
 	ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
 	ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
+	ASSIGN_OR_RETURN_VOID(int, reset_after_silence_s, stoiInRange(transcribe_out_, py_app_reset_after_silence_s_->GetValue().ToStdString(), "reset_after_silence_s", -1, 30));
 	ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000));
 	ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
 
@@ -2438,6 +2459,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     app_c_->gpu_idx = gpu_idx;
     app_c_->min_silence_duration_ms = min_silence_duration_ms;
     app_c_->max_speech_duration_s = max_speech_duration_s;
+    app_c_->reset_after_silence_s = reset_after_silence_s;
     app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms;
     app_c_->keybind = keybind;
     app_c_->Serialize(AppConfig::kConfigPath);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 468a650..ee91e98 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -40,6 +40,7 @@ private:
     wxTextCtrl* py_app_gpu_idx_;
     wxTextCtrl* py_app_min_silence_duration_ms_;
     wxTextCtrl* py_app_max_speech_duration_s_;
+    wxTextCtrl* py_app_reset_after_silence_s_;
     wxTextCtrl* py_app_transcription_loop_delay_ms_;
     wxTextCtrl* py_app_keybind_;
     wxTextCtrl* py_app_browser_src_port_;
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 31c571e..b5fe518 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -77,10 +77,6 @@ namespace PythonWrapper
 		const std::function<bool()>&& run_cb = []() { return true; });
 	bool InstallPip(std::string* out, std::string* err = nullptr);
 
-	// TODO(yum) both StartApp and GenerateAnimator should be
-	// parameterized with config files instead of these ever-growing lists of
-	// parameters. We could persist those files so settings would persist across
-	// app restarts.
 	std::future<bool> StartApp(
 		const AppConfig& app_c,
 		const std::string& config_path,
diff --git a/Scripts/browser_src.py b/Scripts/browser_src.py
index befb2db..4ed3407 100644
--- a/Scripts/browser_src.py
+++ b/Scripts/browser_src.py
@@ -51,6 +51,10 @@ class MyHandler(http.server.BaseHTTPRequestHandler):
         self.http_server_instance = http_server_instance
         super().__init__(*args, **kwargs)
 
+    def log_message(self, format, *args):
+        # No-op override. TODO: log when cfg["enable_debug_mode"] is set.
+        return
+
     def do_GET(self):
         self.handle_request('GET')
 
@@ -96,6 +100,12 @@ class BrowserSource(StreamingPlugin):
         del commit.audio
         if commit.delta:
             self.commits.append(commit)
+        # Cap the commit history so it cannot grow without bound: once more
+        # than max_commits entries have accumulated, trim it down to the
+        # newest max_commits // 2.
+        max_commits = 10
+        if len(self.commits) > max_commits:
+            self.commits = self.commits[-(max_commits // 2):]
         self.preview_commit = commit
         return original_commit
 
diff --git a/Scripts/transcribe_pipeline.py b/Scripts/transcribe_pipeline.py
index 3f48b08..5914afc 100644
--- a/Scripts/transcribe_pipeline.py
+++ b/Scripts/transcribe_pipeline.py
@@ -5,15 +5,22 @@ class TranscriptCommit:
     def __init__(self,
             delta: str,
             preview: str,
-            latency_s: int = None,
+            latency_s: float = None,
             thresh_at_commit: int = None,
-            audio: bytes = None):
+            audio: bytes = None,
+            duration_s: float = None,
+            start_ts: float = None):
         self.delta = delta
         self.preview = preview
         self.latency_s = latency_s
         self.thresh_at_commit = thresh_at_commit
         self.audio = audio
+        # Timestamp (time.time()) at which this commit object is created.
         self.ts = time.time()
+        # Timestamp of the start of the audio segment (None if not supplied).
+        self.start_ts = start_ts
+        # Duration of the audio segment, in seconds (None if not supplied).
+        self.duration_s = duration_s
 
 
 class StreamingPlugin:
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 2bf605d..889e1cf 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -217,11 +217,11 @@ class AudioCollector:
         return self.frames
 
     def dropAudioPrefix(self, dur_s: float) -> bytes:
-        n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ
+        n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ
         n_bytes = min(n_bytes, len(self.frames))
         cut_portion = self.frames[:n_bytes]
         self.frames = self.frames[n_bytes:]
-        self.wall_ts = self.wall_ts + self.duration()
+        self.wall_ts += (n_bytes / self.stream.FRAME_SZ) / AudioStream.FPS
         return cut_portion
 
     def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes:
@@ -229,7 +229,7 @@ class AudioCollector:
         n_bytes = min(n_bytes, len(self.frames))
         cut_portion = self.frames[:n_bytes]
         self.frames = self.frames[n_bytes:]
-        self.wall_ts = self.wall_ts + self.duration()
+        self.wall_ts += (n_bytes / self.stream.FRAME_SZ) / AudioStream.FPS
         return cut_portion
 
     def keepLast(self, dur_s: float) -> bytes:
@@ -243,7 +243,7 @@ class AudioCollector:
         return cut_portion
 
     def duration(self):
-        return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ)
+        return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ)
 
     def begin(self):
         return self.wall_ts
@@ -486,9 +486,13 @@ class VadCommitter:
         delta = ""
         commit_audio = None
         latency_s = None
+        duration_s = self.collector.duration()
+        start_ts = self.collector.begin()
         if has_audio and stable_cutoff:
             #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
             latency_s = self.collector.now() - self.collector.begin()
+            duration_s = stable_cutoff / AudioStream.FPS
+            start_ts = self.collector.begin()
             commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
 
             segments = self.whisper.transcribe(commit_audio)
@@ -516,13 +520,15 @@ class VadCommitter:
                 delta,
                 preview,
                 latency_s,
-                audio=audio)
+                audio=audio,
+                duration_s=duration_s,
+                start_ts=start_ts)
 
 def install_in_venv(pkgs: typing.List[str]) -> bool:
     pkgs_str = " ".join(pkgs)
     print(f"Installing {pkgs_str}")
     pip_proc = subprocess.Popen(
-            f"Resources/Python/python.exe -m pip install {pkgs_str}".split(),
+            f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(),
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE)
     pip_stdout, pip_stderr = pip_proc.communicate()
@@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool:
     if pip_proc.returncode != 0:
         print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}",
                 file=sys.stderr)
+        return False
+    return True
 
 class TranslationPlugin(StreamingPlugin):
     def __init__(self, cfg):
@@ -847,6 +855,8 @@ def optimize(cfg,
     return optimized_params
 
 def transcriptionThread(ctrl: ThreadControl):
+    last_stable_commit = None
+
     while ctrl.run_app:
         time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0);
 
@@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl):
             commit = plugin.transform(commit)
 
         if len(commit.delta) > 0 or len(commit.preview) > 0:
+            # Avoid re-sending text after long pauses. User controls the length
+            # of the pause in the UI.
+            if ctrl.cfg["reset_after_silence_s"] > 0:
+                silence_duration = 0
+                if last_stable_commit:
+                    last_commit_end_ts = \
+                            last_stable_commit.start_ts + \
+                            last_stable_commit.duration_s
+                    silence_duration = commit.start_ts - last_commit_end_ts
+                if silence_duration > ctrl.cfg["reset_after_silence_s"]:
+                    print(f"Resetting transcript after {silence_duration}-second "
+                            "silence", file=sys.stderr)
+                    ctrl.transcript = ""
+                    ctrl.preview = ""
+                if commit.delta:
+                    last_stable_commit = commit
+
             # Hard-cap displayed transcript length at 4k characters to prevent
             # runaway memory use in UI. Keep the full transcript to avoid
             # breaking OSC pager.
@@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl):
             try:
                 print(f"Transcript: {transcript}")
             except UnicodeEncodeError:
-                print("Failed to encode transcript - discarding delta")
+                print("Failed to encode transcript - discarding delta",
+                        file=sys.stderr)
                 continue
             try:
                 print(f"Preview: {preview}")
             except UnicodeEncodeError:
-                print("Failed to encode preview - discarding")
+                print("Failed to encode preview - discarding", file=sys.stderr)
 
             if cfg["enable_debug_mode"]:
                 print(f"commit latency: {commit.latency_s}", file=sys.stderr)
                 print(f"commit thresh: {commit.thresh_at_commit}",
                         file=sys.stderr)
-            if len(commit.preview) > 0:
-                print("Finalized: 0")
-            else:
-                print("Finalized: 1")
 
         ctrl.transcript += commit.delta
         ctrl.preview = ctrl.transcript + commit.preview
@@ -1125,7 +1149,7 @@ def run(cfg):
 
     collector = AudioCollector(stream)
     #collector = LengthEnforcingAudioCollector(collector, 5.0)
-    #collector = NormalizingAudioCollector(collector)
+    collector = NormalizingAudioCollector(collector)
     collector = CompressingAudioCollector(collector)
     whisper = Whisper(collector, cfg)
     segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"],