diff options
| author | yum <yum.food.vr@gmail.com> | 2023-10-05 18:22:55 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-10-05 18:28:42 -0700 |
| commit | add7bd8ef86ec21cd1327eb45bcb739aa54f7db8 (patch) | |
| tree | f342e37917c93073552854a125696e12afbd4c39 | |
| parent | c2bc70c18d2fd1c3601b32f2a93b3b4a704786a5 (diff) | |
Transcripts preceding long pauses now drop (v0.16.0)
When hot-miking into the built-in chatbox, there are sometimes long
pauses in conversation. After these pauses, it's undesirable to show the
transcript generated before the pause. This feature makes it so that
those transcripts can be dropped.
Also:
* Limit the number of segments sent to the browser source. The list may
grow to 10 segments; once it exceeds that, only the most recent 5 are kept.
* Silence warnings generated by `install_in_venv`, used by e.g.
translation codepath.
* Enable audio normalization to improve accuracy when speaking softly,
at the cost of some accuracy when speaking normally.
Credit: user endo0269 on Discord suggested this feature.
| -rw-r--r-- | GUI/GUI/GUI/Config.cpp | 3 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Config.h | 1 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 22 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.h | 1 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.h | 4 | ||||
| -rw-r--r-- | Scripts/browser_src.py | 10 | ||||
| -rw-r--r-- | Scripts/transcribe_pipeline.py | 11 | ||||
| -rw-r--r-- | Scripts/transcribe_v2.py | 50 |
8 files changed, 83 insertions, 19 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index 573238f..2abe5b2 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -88,6 +88,7 @@ AppConfig::AppConfig(wxTextCtrl* out) gpu_idx(0),
min_silence_duration_ms(250),
max_speech_duration_s(5),
+ reset_after_silence_s(10),
transcription_loop_delay_ms(100),
keybind("ctrl+x"),
@@ -135,6 +136,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) { cm.Set("gpu_idx", gpu_idx);
cm.Set("min_silence_duration_ms", min_silence_duration_ms);
cm.Set("max_speech_duration_s", max_speech_duration_s);
+ cm.Set("reset_after_silence_s", reset_after_silence_s);
cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms);
cm.Set("keybind", keybind);
@@ -195,6 +197,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) { cm.Get("gpu_idx", c.gpu_idx);
cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
cm.Get("max_speech_duration_s", c.max_speech_duration_s);
+ cm.Get("reset_after_silence_s", c.reset_after_silence_s);
cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms);
cm.Get("keybind", c.keybind);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h index ede21d6..0d0da66 100644 --- a/GUI/GUI/GUI/Config.h +++ b/GUI/GUI/GUI/Config.h @@ -74,6 +74,7 @@ public: int gpu_idx;
int min_silence_duration_ms;
int max_speech_duration_s;
+ int reset_after_silence_s;
int transcription_loop_delay_ms;
std::string keybind;
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 23ac38c..602bf6d 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -78,6 +78,7 @@ namespace { ID_PY_APP_GPU_IDX,
ID_PY_APP_MIN_SILENCE_DURATION_MS,
ID_PY_APP_MAX_SPEECH_DURATION_S,
+ ID_PY_APP_RESET_AFTER_SILENCE_S,
ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
ID_PY_APP_KEYBIND,
ID_PY_APP_BROWSER_SRC_PORT,
@@ -796,6 +797,16 @@ Frame::Frame() "milliseconds.");
py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
+ auto* py_app_reset_after_silence_s = new wxTextCtrl(
+ py_app_config_panel_pairs, ID_PY_APP_RESET_AFTER_SILENCE_S,
+ std::to_string(app_c_->reset_after_silence_s), wxDefaultPosition,
+ wxDefaultSize, /*style=*/0);
+ py_app_reset_after_silence_s->SetToolTip(
+ "If you pause for at least this long between "
+ "sentences, the transcript before the pause will be "
+ "removed. To disable this feature, set it to -1.");
+ py_app_reset_after_silence_s_ = py_app_reset_after_silence_s;
+
auto* py_app_transcription_loop_delay_ms = new wxTextCtrl(
py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition,
@@ -906,6 +917,11 @@ Frame::Frame() /*flags=*/wxEXPAND);
sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+ wxID_ANY, /*label=*/"Reset after silence (s):"));
+ sizer->Add(py_app_reset_after_silence_s, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(py_app_config_panel_pairs,
wxID_ANY, /*label=*/"Transcription loop delay (ms):"));
sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0,
/*flags=*/wxEXPAND);
@@ -1629,6 +1645,10 @@ void Frame::ApplyConfigToInputFields() py_app_max_speech_duration_s->Clear();
py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
+ auto* py_app_reset_after_silence_s = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_RESET_AFTER_SILENCE_S));
+ py_app_reset_after_silence_s->Clear();
+ py_app_reset_after_silence_s->AppendText(std::to_string(app_c_->reset_after_silence_s));
+
auto* py_app_transcription_loop_delay_ms = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS));
py_app_transcription_loop_delay_ms->Clear();
py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms));
@@ -2405,6 +2425,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
+ ASSIGN_OR_RETURN_VOID(int, reset_after_silence_s, stoiInRange(transcribe_out_, py_app_reset_after_silence_s_->GetValue().ToStdString(), "reset_after_silence_s", -1, 30));
ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000));
ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
@@ -2438,6 +2459,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { app_c_->gpu_idx = gpu_idx;
app_c_->min_silence_duration_ms = min_silence_duration_ms;
app_c_->max_speech_duration_s = max_speech_duration_s;
+ app_c_->reset_after_silence_s = reset_after_silence_s;
app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms;
app_c_->keybind = keybind;
app_c_->Serialize(AppConfig::kConfigPath);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 468a650..ee91e98 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -40,6 +40,7 @@ private: wxTextCtrl* py_app_gpu_idx_;
wxTextCtrl* py_app_min_silence_duration_ms_;
wxTextCtrl* py_app_max_speech_duration_s_;
+ wxTextCtrl* py_app_reset_after_silence_s_;
wxTextCtrl* py_app_transcription_loop_delay_ms_;
wxTextCtrl* py_app_keybind_;
wxTextCtrl* py_app_browser_src_port_;
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 31c571e..b5fe518 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -77,10 +77,6 @@ namespace PythonWrapper const std::function<bool()>&& run_cb = []() { return true; }); bool InstallPip(std::string* out, std::string* err = nullptr); - // TODO(yum) both StartApp and GenerateAnimator should be - // parameterized with config files instead of these ever-growing lists of - // parameters. We could persist those files so settings would persist across - // app restarts. std::future<bool> StartApp( const AppConfig& app_c, const std::string& config_path, diff --git a/Scripts/browser_src.py b/Scripts/browser_src.py index befb2db..4ed3407 100644 --- a/Scripts/browser_src.py +++ b/Scripts/browser_src.py @@ -51,6 +51,10 @@ class MyHandler(http.server.BaseHTTPRequestHandler): self.http_server_instance = http_server_instance super().__init__(*args, **kwargs) + def log_message(self, format, *args): + # TODO log if cfg["debug_mode_enabled"] is set + return + def do_GET(self): self.handle_request('GET') @@ -96,6 +100,12 @@ class BrowserSource(StreamingPlugin): del commit.audio if commit.delta: self.commits.append(commit) + # Limit commits to last N. 
+ now = time.time() + self.commits = [commit for commit in self.commits] + max_commits = 10 + if len(self.commits) > max_commits: + self.commits = self.commits[-int(max_commits/2):] self.preview_commit = commit return original_commit diff --git a/Scripts/transcribe_pipeline.py b/Scripts/transcribe_pipeline.py index 3f48b08..5914afc 100644 --- a/Scripts/transcribe_pipeline.py +++ b/Scripts/transcribe_pipeline.py @@ -5,15 +5,22 @@ class TranscriptCommit: def __init__(self, delta: str, preview: str, - latency_s: int = None, + latency_s: float = None, thresh_at_commit: int = None, - audio: bytes = None): + audio: bytes = None, + duration_s: float = None, + start_ts: float = None): self.delta = delta self.preview = preview self.latency_s = latency_s self.thresh_at_commit = thresh_at_commit self.audio = audio + # Time at which the commit is generated self.ts = time.time() + # Time corresponding to the start of the segment + self.start_ts = start_ts + # The duration of the audio segment, in seconds. 
+ self.duration_s = duration_s class StreamingPlugin: diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 2bf605d..889e1cf 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -217,11 +217,11 @@ class AudioCollector: return self.frames def dropAudioPrefix(self, dur_s: float) -> bytes: - n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ + n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes: @@ -229,7 +229,7 @@ class AudioCollector: n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def keepLast(self, dur_s: float) -> bytes: @@ -243,7 +243,7 @@ class AudioCollector: return cut_portion def duration(self): - return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ) + return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ) def begin(self): return self.wall_ts @@ -486,9 +486,13 @@ class VadCommitter: delta = "" commit_audio = None latency_s = None + duration_s = self.collector.duration() + start_ts = self.collector.begin() if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() + duration_s = stable_cutoff / AudioStream.FPS + start_ts = self.collector.begin() commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) segments = self.whisper.transcribe(commit_audio) @@ -516,13 +520,15 @@ class VadCommitter: delta, preview, latency_s, - audio=audio) + audio=audio, + 
duration_s=duration_s, + start_ts=start_ts) def install_in_venv(pkgs: typing.List[str]) -> bool: pkgs_str = " ".join(pkgs) print(f"Installing {pkgs_str}") pip_proc = subprocess.Popen( - f"Resources/Python/python.exe -m pip install {pkgs_str}".split(), + f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) pip_stdout, pip_stderr = pip_proc.communicate() @@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool: if pip_proc.returncode != 0: print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}", file=sys.stderr) + return False + return True class TranslationPlugin(StreamingPlugin): def __init__(self, cfg): @@ -847,6 +855,8 @@ def optimize(cfg, return optimized_params def transcriptionThread(ctrl: ThreadControl): + last_stable_commit = None + while ctrl.run_app: time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0); @@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl): commit = plugin.transform(commit) if len(commit.delta) > 0 or len(commit.preview) > 0: + # Avoid re-sending text after long pauses. User controls the length + # of the pause in the UI. + if ctrl.cfg["reset_after_silence_s"] > 0: + silence_duration = 0 + if last_stable_commit: + last_commit_end_ts = \ + last_stable_commit.start_ts + \ + last_stable_commit.duration_s + silence_duration = commit.start_ts - last_commit_end_ts + if silence_duration > ctrl.cfg["reset_after_silence_s"]: + print(f"Resetting transcript after {silence_duration}-second " + "silence", file=sys.stderr) + ctrl.transcript = "" + ctrl.preview = "" + if commit.delta: + last_stable_commit = commit + # Hard-cap displayed transcript length at 4k characters to prevent # runaway memory use in UI. Keep the full transcript to avoid # breaking OSC pager. 
@@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl): try: print(f"Transcript: {transcript}") except UnicodeEncodeError: - print("Failed to encode transcript - discarding delta") + print("Failed to encode transcript - discarding delta", + file=sys.stderr) continue try: print(f"Preview: {preview}") except UnicodeEncodeError: - print("Failed to encode preview - discarding") + print("Failed to encode preview - discarding", file=sys.stderr) if cfg["enable_debug_mode"]: print(f"commit latency: {commit.latency_s}", file=sys.stderr) print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr) - if len(commit.preview) > 0: - print("Finalized: 0") - else: - print("Finalized: 1") ctrl.transcript += commit.delta ctrl.preview = ctrl.transcript + commit.preview @@ -1125,7 +1149,7 @@ def run(cfg): collector = AudioCollector(stream) #collector = LengthEnforcingAudioCollector(collector, 5.0) - #collector = NormalizingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) whisper = Whisper(collector, cfg) segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"], |
