diff options
| -rw-r--r-- | GUI/GUI/GUI/Config.cpp | 3 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Config.h | 1 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 22 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.h | 1 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.h | 4 | ||||
| -rw-r--r-- | Scripts/browser_src.py | 10 | ||||
| -rw-r--r-- | Scripts/transcribe_pipeline.py | 11 | ||||
| -rw-r--r-- | Scripts/transcribe_v2.py | 50 |
8 files changed, 83 insertions, 19 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 573238f..2abe5b2 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -88,6 +88,7 @@ AppConfig::AppConfig(wxTextCtrl* out) gpu_idx(0),
min_silence_duration_ms(250),
max_speech_duration_s(5),
+ reset_after_silence_s(10),
transcription_loop_delay_ms(100),
keybind("ctrl+x"),
@@ -135,6 +136,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("gpu_idx", gpu_idx);
cm.Set("min_silence_duration_ms", min_silence_duration_ms);
cm.Set("max_speech_duration_s", max_speech_duration_s);
+ cm.Set("reset_after_silence_s", reset_after_silence_s);
cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms);
cm.Set("keybind", keybind);
@@ -195,6 +197,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("gpu_idx", c.gpu_idx);
cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
cm.Get("max_speech_duration_s", c.max_speech_duration_s);
+ cm.Get("reset_after_silence_s", c.reset_after_silence_s);
cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms);
cm.Get("keybind", c.keybind);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index ede21d6..0d0da66 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -74,6 +74,7 @@ public: int gpu_idx;
int min_silence_duration_ms;
int max_speech_duration_s;
+ int reset_after_silence_s;
int transcription_loop_delay_ms;
std::string keybind;
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 23ac38c..602bf6d 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -78,6 +78,7 @@ namespace { ID_PY_APP_GPU_IDX,
ID_PY_APP_MIN_SILENCE_DURATION_MS,
ID_PY_APP_MAX_SPEECH_DURATION_S,
+ ID_PY_APP_RESET_AFTER_SILENCE_S,
ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
ID_PY_APP_KEYBIND,
ID_PY_APP_BROWSER_SRC_PORT,
@@ -796,6 +797,16 @@ Frame::Frame() "milliseconds.");
py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
+ auto* py_app_reset_after_silence_s = new wxTextCtrl(
+ py_app_config_panel_pairs, ID_PY_APP_RESET_AFTER_SILENCE_S,
+ std::to_string(app_c_->reset_after_silence_s), wxDefaultPosition,
+ wxDefaultSize, /*style=*/0);
+ py_app_reset_after_silence_s->SetToolTip(
+ "If you pause for at least this long between "
+ "sentences, the transcript before the pause will be "
+ "removed. To disable this feature, set it to -1.");
+ py_app_reset_after_silence_s_ = py_app_reset_after_silence_s;
+
auto* py_app_transcription_loop_delay_ms = new wxTextCtrl(
py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition,
@@ -906,6 +917,11 @@ Frame::Frame() /*flags=*/wxEXPAND);
sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+ wxID_ANY, /*label=*/"Reset after silence (s):"));
+ sizer->Add(py_app_reset_after_silence_s, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(py_app_config_panel_pairs,
wxID_ANY, /*label=*/"Transcription loop delay (ms):"));
sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0,
/*flags=*/wxEXPAND);
@@ -1629,6 +1645,10 @@ void Frame::ApplyConfigToInputFields() py_app_max_speech_duration_s->Clear();
py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
+ auto* py_app_reset_after_silence_s = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_RESET_AFTER_SILENCE_S));
+ py_app_reset_after_silence_s->Clear();
+ py_app_reset_after_silence_s->AppendText(std::to_string(app_c_->reset_after_silence_s));
+
auto* py_app_transcription_loop_delay_ms = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS));
py_app_transcription_loop_delay_ms->Clear();
py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms));
@@ -2405,6 +2425,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
+ ASSIGN_OR_RETURN_VOID(int, reset_after_silence_s, stoiInRange(transcribe_out_, py_app_reset_after_silence_s_->GetValue().ToStdString(), "reset_after_silence_s", -1, 30));
ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000));
ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
@@ -2438,6 +2459,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->gpu_idx = gpu_idx;
app_c_->min_silence_duration_ms = min_silence_duration_ms;
app_c_->max_speech_duration_s = max_speech_duration_s;
+ app_c_->reset_after_silence_s = reset_after_silence_s;
app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms;
app_c_->keybind = keybind;
app_c_->Serialize(AppConfig::kConfigPath);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 468a650..ee91e98 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -40,6 +40,7 @@ private: wxTextCtrl* py_app_gpu_idx_;
wxTextCtrl* py_app_min_silence_duration_ms_;
wxTextCtrl* py_app_max_speech_duration_s_;
+ wxTextCtrl* py_app_reset_after_silence_s_;
wxTextCtrl* py_app_transcription_loop_delay_ms_;
wxTextCtrl* py_app_keybind_;
wxTextCtrl* py_app_browser_src_port_;
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 31c571e..b5fe518 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -77,10 +77,6 @@ namespace PythonWrapper const std::function<bool()>&& run_cb = []() { return true; }); bool InstallPip(std::string* out, std::string* err = nullptr); - // TODO(yum) both StartApp and GenerateAnimator should be - // parameterized with config files instead of these ever-growing lists of - // parameters. We could persist those files so settings would persist across - // app restarts. std::future<bool> StartApp( const AppConfig& app_c, const std::string& config_path, diff --git a/Scripts/browser_src.py b/Scripts/browser_src.py index befb2db..4ed3407 100644 --- a/Scripts/browser_src.py +++ b/Scripts/browser_src.py @@ -51,6 +51,10 @@ class MyHandler(http.server.BaseHTTPRequestHandler): self.http_server_instance = http_server_instance super().__init__(*args, **kwargs) + def log_message(self, format, *args): + # TODO log if cfg["debug_mode_enabled"] is set + return + def do_GET(self): self.handle_request('GET') @@ -96,6 +100,12 @@ class BrowserSource(StreamingPlugin): del commit.audio if commit.delta: self.commits.append(commit) + # Limit commits to last N. + now = time.time() + self.commits = [commit for commit in self.commits] + max_commits = 10 + if len(self.commits) > max_commits: + self.commits = self.commits[-int(max_commits/2):] self.preview_commit = commit return original_commit diff --git a/Scripts/transcribe_pipeline.py b/Scripts/transcribe_pipeline.py index 3f48b08..5914afc 100644 --- a/Scripts/transcribe_pipeline.py +++ b/Scripts/transcribe_pipeline.py @@ -5,15 +5,22 @@ class TranscriptCommit: def __init__(self, delta: str, preview: str, - latency_s: int = None, + latency_s: float = None, thresh_at_commit: int = None, - audio: bytes = None): + audio: bytes = None, + duration_s: float = None, + start_ts: float = None): self.delta = delta self.preview = preview self.latency_s = latency_s self.thresh_at_commit = thresh_at_commit self.audio = audio + # Time at which the commit is generated self.ts = time.time() + # Time corresponding to the start of the segment + self.start_ts = start_ts + # The duration of the audio segment, in seconds. + self.duration_s = duration_s class StreamingPlugin: diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 2bf605d..889e1cf 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -217,11 +217,11 @@ class AudioCollector: return self.frames def dropAudioPrefix(self, dur_s: float) -> bytes: - n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ + n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes: @@ -229,7 +229,7 @@ class AudioCollector: n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def keepLast(self, dur_s: float) -> bytes: @@ -243,7 +243,7 @@ class AudioCollector: return cut_portion def duration(self): - return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ) + return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ) def begin(self): return self.wall_ts @@ -486,9 +486,13 @@ class VadCommitter: delta = "" commit_audio = None latency_s = None + duration_s = self.collector.duration() + start_ts = self.collector.begin() if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() + duration_s = stable_cutoff / AudioStream.FPS + start_ts = self.collector.begin() commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) segments = self.whisper.transcribe(commit_audio) @@ -516,13 +520,15 @@ class VadCommitter: delta, preview, latency_s, - audio=audio) + audio=audio, + duration_s=duration_s, + start_ts=start_ts) def install_in_venv(pkgs: typing.List[str]) -> bool: pkgs_str = " ".join(pkgs) print(f"Installing {pkgs_str}") pip_proc = subprocess.Popen( - f"Resources/Python/python.exe -m pip install {pkgs_str}".split(), + f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) pip_stdout, pip_stderr = pip_proc.communicate() @@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool: if pip_proc.returncode != 0: print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}", file=sys.stderr) + return False + return True class TranslationPlugin(StreamingPlugin): def __init__(self, cfg): @@ -847,6 +855,8 @@ def optimize(cfg, return optimized_params def transcriptionThread(ctrl: ThreadControl): + last_stable_commit = None + while ctrl.run_app: time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0); @@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl): commit = plugin.transform(commit) if len(commit.delta) > 0 or len(commit.preview) > 0: + # Avoid re-sending text after long pauses. User controls the length + # of the pause in the UI. + if ctrl.cfg["reset_after_silence_s"] > 0: + silence_duration = 0 + if last_stable_commit: + last_commit_end_ts = \ + last_stable_commit.start_ts + \ + last_stable_commit.duration_s + silence_duration = commit.start_ts - last_commit_end_ts + if silence_duration > ctrl.cfg["reset_after_silence_s"]: + print(f"Resetting transcript after {silence_duration}-second " + "silence", file=sys.stderr) + ctrl.transcript = "" + ctrl.preview = "" + if commit.delta: + last_stable_commit = commit + # Hard-cap displayed transcript length at 4k characters to prevent # runaway memory use in UI. Keep the full transcript to avoid # breaking OSC pager. @@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl): try: print(f"Transcript: {transcript}") except UnicodeEncodeError: - print("Failed to encode transcript - discarding delta") + print("Failed to encode transcript - discarding delta", + file=sys.stderr) continue try: print(f"Preview: {preview}") except UnicodeEncodeError: - print("Failed to encode preview - discarding") + print("Failed to encode preview - discarding", file=sys.stderr) if cfg["enable_debug_mode"]: print(f"commit latency: {commit.latency_s}", file=sys.stderr) print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr) - if len(commit.preview) > 0: - print("Finalized: 0") - else: - print("Finalized: 1") ctrl.transcript += commit.delta ctrl.preview = ctrl.transcript + commit.preview @@ -1125,7 +1149,7 @@ def run(cfg): collector = AudioCollector(stream) #collector = LengthEnforcingAudioCollector(collector, 5.0) - #collector = NormalizingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) whisper = Whisper(collector, cfg) segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"], |
