summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- GUI/GUI/GUI/Config.cpp 3
-rw-r--r-- GUI/GUI/GUI/Config.h 1
-rw-r--r-- GUI/GUI/GUI/Frame.cpp 22
-rw-r--r-- GUI/GUI/GUI/Frame.h 1
-rw-r--r-- GUI/GUI/GUI/PythonWrapper.h 4
-rw-r--r-- Scripts/browser_src.py 10
-rw-r--r-- Scripts/transcribe_pipeline.py 11
-rw-r--r-- Scripts/transcribe_v2.py 50
8 files changed, 83 insertions, 19 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 573238f..2abe5b2 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -88,6 +88,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
gpu_idx(0),
min_silence_duration_ms(250),
max_speech_duration_s(5),
+ reset_after_silence_s(10),
transcription_loop_delay_ms(100),
keybind("ctrl+x"),
@@ -135,6 +136,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
cm.Set("gpu_idx", gpu_idx);
cm.Set("min_silence_duration_ms", min_silence_duration_ms);
cm.Set("max_speech_duration_s", max_speech_duration_s);
+ cm.Set("reset_after_silence_s", reset_after_silence_s);
cm.Set("transcription_loop_delay_ms", transcription_loop_delay_ms);
cm.Set("keybind", keybind);
@@ -195,6 +197,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
cm.Get("gpu_idx", c.gpu_idx);
cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
cm.Get("max_speech_duration_s", c.max_speech_duration_s);
+ cm.Get("reset_after_silence_s", c.reset_after_silence_s);
cm.Get("transcription_loop_delay_ms", c.transcription_loop_delay_ms);
cm.Get("keybind", c.keybind);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index ede21d6..0d0da66 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -74,6 +74,7 @@ public:
int gpu_idx;
int min_silence_duration_ms;
int max_speech_duration_s;
+ int reset_after_silence_s;
int transcription_loop_delay_ms;
std::string keybind;
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 23ac38c..602bf6d 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -78,6 +78,7 @@ namespace {
ID_PY_APP_GPU_IDX,
ID_PY_APP_MIN_SILENCE_DURATION_MS,
ID_PY_APP_MAX_SPEECH_DURATION_S,
+ ID_PY_APP_RESET_AFTER_SILENCE_S,
ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
ID_PY_APP_KEYBIND,
ID_PY_APP_BROWSER_SRC_PORT,
@@ -796,6 +797,16 @@ Frame::Frame()
"milliseconds.");
py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
+ auto* py_app_reset_after_silence_s = new wxTextCtrl(
+ py_app_config_panel_pairs, ID_PY_APP_RESET_AFTER_SILENCE_S,
+ std::to_string(app_c_->reset_after_silence_s), wxDefaultPosition,
+ wxDefaultSize, /*style=*/0);
+ py_app_reset_after_silence_s->SetToolTip(
+ "If you pause for at least this long between "
+ "sentences, the transcript before the pause will be "
+ "removed. To disable this feature, set it to -1.");
+ py_app_reset_after_silence_s_ = py_app_reset_after_silence_s;
+
auto* py_app_transcription_loop_delay_ms = new wxTextCtrl(
py_app_config_panel_pairs, ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS,
std::to_string(app_c_->transcription_loop_delay_ms), wxDefaultPosition,
@@ -906,6 +917,11 @@ Frame::Frame()
/*flags=*/wxEXPAND);
sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+ wxID_ANY, /*label=*/"Reset after silence (s):"));
+ sizer->Add(py_app_reset_after_silence_s, /*proportion=*/0,
+ /*flags=*/wxEXPAND);
+
+ sizer->Add(new wxStaticText(py_app_config_panel_pairs,
wxID_ANY, /*label=*/"Transcription loop delay (ms):"));
sizer->Add(py_app_transcription_loop_delay_ms, /*proportion=*/0,
/*flags=*/wxEXPAND);
@@ -1629,6 +1645,10 @@ void Frame::ApplyConfigToInputFields()
py_app_max_speech_duration_s->Clear();
py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
+ auto* py_app_reset_after_silence_s = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_RESET_AFTER_SILENCE_S));
+ py_app_reset_after_silence_s->Clear();
+ py_app_reset_after_silence_s->AppendText(std::to_string(app_c_->reset_after_silence_s));
+
auto* py_app_transcription_loop_delay_ms = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_TRANSCRIPTION_LOOP_DELAY_MS));
py_app_transcription_loop_delay_ms->Clear();
py_app_transcription_loop_delay_ms->AppendText(std::to_string(app_c_->transcription_loop_delay_ms));
@@ -2405,6 +2425,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
+ ASSIGN_OR_RETURN_VOID(int, reset_after_silence_s, stoiInRange(transcribe_out_, py_app_reset_after_silence_s_->GetValue().ToStdString(), "reset_after_silence_s", -1, 30));
ASSIGN_OR_RETURN_VOID(int, transcription_loop_delay_ms, stoiInRange(transcribe_out_, py_app_transcription_loop_delay_ms_->GetValue().ToStdString(), "transcription_loop_delay_ms", 0, 10000));
ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
@@ -2438,6 +2459,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
app_c_->gpu_idx = gpu_idx;
app_c_->min_silence_duration_ms = min_silence_duration_ms;
app_c_->max_speech_duration_s = max_speech_duration_s;
+ app_c_->reset_after_silence_s = reset_after_silence_s;
app_c_->transcription_loop_delay_ms = transcription_loop_delay_ms;
app_c_->keybind = keybind;
app_c_->Serialize(AppConfig::kConfigPath);
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 468a650..ee91e98 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -40,6 +40,7 @@ private:
wxTextCtrl* py_app_gpu_idx_;
wxTextCtrl* py_app_min_silence_duration_ms_;
wxTextCtrl* py_app_max_speech_duration_s_;
+ wxTextCtrl* py_app_reset_after_silence_s_;
wxTextCtrl* py_app_transcription_loop_delay_ms_;
wxTextCtrl* py_app_keybind_;
wxTextCtrl* py_app_browser_src_port_;
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 31c571e..b5fe518 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -77,10 +77,6 @@ namespace PythonWrapper
const std::function<bool()>&& run_cb = []() { return true; });
bool InstallPip(std::string* out, std::string* err = nullptr);
- // TODO(yum) both StartApp and GenerateAnimator should be
- // parameterized with config files instead of these ever-growing lists of
- // parameters. We could persist those files so settings would persist across
- // app restarts.
std::future<bool> StartApp(
const AppConfig& app_c,
const std::string& config_path,
diff --git a/Scripts/browser_src.py b/Scripts/browser_src.py
index befb2db..4ed3407 100644
--- a/Scripts/browser_src.py
+++ b/Scripts/browser_src.py
@@ -51,6 +51,10 @@ class MyHandler(http.server.BaseHTTPRequestHandler):
self.http_server_instance = http_server_instance
super().__init__(*args, **kwargs)
+ def log_message(self, format, *args):
+ # TODO log if cfg["debug_mode_enabled"] is set
+ return
+
def do_GET(self):
self.handle_request('GET')
@@ -96,6 +100,12 @@ class BrowserSource(StreamingPlugin):
del commit.audio
if commit.delta:
self.commits.append(commit)
+ # Limit commits to last N.
+ now = time.time()
+ self.commits = [commit for commit in self.commits]
+ max_commits = 10
+ if len(self.commits) > max_commits:
+ self.commits = self.commits[-int(max_commits/2):]
self.preview_commit = commit
return original_commit
diff --git a/Scripts/transcribe_pipeline.py b/Scripts/transcribe_pipeline.py
index 3f48b08..5914afc 100644
--- a/Scripts/transcribe_pipeline.py
+++ b/Scripts/transcribe_pipeline.py
@@ -5,15 +5,22 @@ class TranscriptCommit:
def __init__(self,
delta: str,
preview: str,
- latency_s: int = None,
+ latency_s: float = None,
thresh_at_commit: int = None,
- audio: bytes = None):
+ audio: bytes = None,
+ duration_s: float = None,
+ start_ts: float = None):
self.delta = delta
self.preview = preview
self.latency_s = latency_s
self.thresh_at_commit = thresh_at_commit
self.audio = audio
+ # Time at which the commit is generated
self.ts = time.time()
+ # Time corresponding to the start of the segment
+ self.start_ts = start_ts
+ # The duration of the audio segment, in seconds.
+ self.duration_s = duration_s
class StreamingPlugin:
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 2bf605d..889e1cf 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -217,11 +217,11 @@ class AudioCollector:
return self.frames
def dropAudioPrefix(self, dur_s: float) -> bytes:
- n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ
+ n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ
n_bytes = min(n_bytes, len(self.frames))
cut_portion = self.frames[:n_bytes]
self.frames = self.frames[n_bytes:]
- self.wall_ts = self.wall_ts + self.duration()
+ self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS
return cut_portion
def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes:
@@ -229,7 +229,7 @@ class AudioCollector:
n_bytes = min(n_bytes, len(self.frames))
cut_portion = self.frames[:n_bytes]
self.frames = self.frames[n_bytes:]
- self.wall_ts = self.wall_ts + self.duration()
+ self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS
return cut_portion
def keepLast(self, dur_s: float) -> bytes:
@@ -243,7 +243,7 @@ class AudioCollector:
return cut_portion
def duration(self):
- return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ)
+ return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ)
def begin(self):
return self.wall_ts
@@ -486,9 +486,13 @@ class VadCommitter:
delta = ""
commit_audio = None
latency_s = None
+ duration_s = self.collector.duration()
+ start_ts = self.collector.begin()
if has_audio and stable_cutoff:
#print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
latency_s = self.collector.now() - self.collector.begin()
+ duration_s = stable_cutoff / AudioStream.FPS
+ start_ts = self.collector.begin()
commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
segments = self.whisper.transcribe(commit_audio)
@@ -516,13 +520,15 @@ class VadCommitter:
delta,
preview,
latency_s,
- audio=audio)
+ audio=audio,
+ duration_s=duration_s,
+ start_ts=start_ts)
def install_in_venv(pkgs: typing.List[str]) -> bool:
pkgs_str = " ".join(pkgs)
print(f"Installing {pkgs_str}")
pip_proc = subprocess.Popen(
- f"Resources/Python/python.exe -m pip install {pkgs_str}".split(),
+ f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
pip_stdout, pip_stderr = pip_proc.communicate()
@@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool:
if pip_proc.returncode != 0:
print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}",
file=sys.stderr)
+ return False
+ return True
class TranslationPlugin(StreamingPlugin):
def __init__(self, cfg):
@@ -847,6 +855,8 @@ def optimize(cfg,
return optimized_params
def transcriptionThread(ctrl: ThreadControl):
+ last_stable_commit = None
+
while ctrl.run_app:
time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0);
@@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl):
commit = plugin.transform(commit)
if len(commit.delta) > 0 or len(commit.preview) > 0:
+ # Avoid re-sending text after long pauses. User controls the length
+ # of the pause in the UI.
+ if ctrl.cfg["reset_after_silence_s"] > 0:
+ silence_duration = 0
+ if last_stable_commit:
+ last_commit_end_ts = \
+ last_stable_commit.start_ts + \
+ last_stable_commit.duration_s
+ silence_duration = commit.start_ts - last_commit_end_ts
+ if silence_duration > ctrl.cfg["reset_after_silence_s"]:
+ print(f"Resetting transcript after {silence_duration}-second "
+ "silence", file=sys.stderr)
+ ctrl.transcript = ""
+ ctrl.preview = ""
+ if commit.delta:
+ last_stable_commit = commit
+
# Hard-cap displayed transcript length at 4k characters to prevent
# runaway memory use in UI. Keep the full transcript to avoid
# breaking OSC pager.
@@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl):
try:
print(f"Transcript: {transcript}")
except UnicodeEncodeError:
- print("Failed to encode transcript - discarding delta")
+ print("Failed to encode transcript - discarding delta",
+ file=sys.stderr)
continue
try:
print(f"Preview: {preview}")
except UnicodeEncodeError:
- print("Failed to encode preview - discarding")
+ print("Failed to encode preview - discarding", file=sys.stderr)
if cfg["enable_debug_mode"]:
print(f"commit latency: {commit.latency_s}", file=sys.stderr)
print(f"commit thresh: {commit.thresh_at_commit}",
file=sys.stderr)
- if len(commit.preview) > 0:
- print("Finalized: 0")
- else:
- print("Finalized: 1")
ctrl.transcript += commit.delta
ctrl.preview = ctrl.transcript + commit.preview
@@ -1125,7 +1149,7 @@ def run(cfg):
collector = AudioCollector(stream)
#collector = LengthEnforcingAudioCollector(collector, 5.0)
- #collector = NormalizingAudioCollector(collector)
+ collector = NormalizingAudioCollector(collector)
collector = CompressingAudioCollector(collector)
whisper = Whisper(collector, cfg)
segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"],