diff options
Diffstat (limited to 'Scripts')
| -rw-r--r-- | Scripts/browser_src.py | 10 | ||||
| -rw-r--r-- | Scripts/transcribe_pipeline.py | 11 | ||||
| -rw-r--r-- | Scripts/transcribe_v2.py | 50 |
3 files changed, 56 insertions, 15 deletions
diff --git a/Scripts/browser_src.py b/Scripts/browser_src.py index befb2db..4ed3407 100644 --- a/Scripts/browser_src.py +++ b/Scripts/browser_src.py @@ -51,6 +51,10 @@ class MyHandler(http.server.BaseHTTPRequestHandler): self.http_server_instance = http_server_instance super().__init__(*args, **kwargs) + def log_message(self, format, *args): + # TODO log if cfg["debug_mode_enabled"] is set + return + def do_GET(self): self.handle_request('GET') @@ -96,6 +100,12 @@ class BrowserSource(StreamingPlugin): del commit.audio if commit.delta: self.commits.append(commit) + # Limit commits to last N. + now = time.time() + self.commits = [commit for commit in self.commits] + max_commits = 10 + if len(self.commits) > max_commits: + self.commits = self.commits[-int(max_commits/2):] self.preview_commit = commit return original_commit diff --git a/Scripts/transcribe_pipeline.py b/Scripts/transcribe_pipeline.py index 3f48b08..5914afc 100644 --- a/Scripts/transcribe_pipeline.py +++ b/Scripts/transcribe_pipeline.py @@ -5,15 +5,22 @@ class TranscriptCommit: def __init__(self, delta: str, preview: str, - latency_s: int = None, + latency_s: float = None, thresh_at_commit: int = None, - audio: bytes = None): + audio: bytes = None, + duration_s: float = None, + start_ts: float = None): self.delta = delta self.preview = preview self.latency_s = latency_s self.thresh_at_commit = thresh_at_commit self.audio = audio + # Time at which the commit is generated self.ts = time.time() + # Time corresponding to the start of the segment + self.start_ts = start_ts + # The duration of the audio segment, in seconds. + self.duration_s = duration_s class StreamingPlugin: diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 2bf605d..889e1cf 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -217,11 +217,11 @@ class AudioCollector: return self.frames def dropAudioPrefix(self, dur_s: float) -> bytes: - n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ + n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes: @@ -229,7 +229,7 @@ class AudioCollector: n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def keepLast(self, dur_s: float) -> bytes: @@ -243,7 +243,7 @@ class AudioCollector: return cut_portion def duration(self): - return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ) + return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ) def begin(self): return self.wall_ts @@ -486,9 +486,13 @@ class VadCommitter: delta = "" commit_audio = None latency_s = None + duration_s = self.collector.duration() + start_ts = self.collector.begin() if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() + duration_s = stable_cutoff / AudioStream.FPS + start_ts = self.collector.begin() commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) segments = self.whisper.transcribe(commit_audio) @@ -516,13 +520,15 @@ class VadCommitter: delta, preview, latency_s, - audio=audio) + audio=audio, + duration_s=duration_s, + start_ts=start_ts) def install_in_venv(pkgs: typing.List[str]) -> bool: pkgs_str = " ".join(pkgs) print(f"Installing {pkgs_str}") pip_proc = subprocess.Popen( - f"Resources/Python/python.exe -m pip install {pkgs_str}".split(), + f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) pip_stdout, pip_stderr = pip_proc.communicate() @@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool: if pip_proc.returncode != 0: print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}", file=sys.stderr) + return False + return True class TranslationPlugin(StreamingPlugin): def __init__(self, cfg): @@ -847,6 +855,8 @@ def optimize(cfg, return optimized_params def transcriptionThread(ctrl: ThreadControl): + last_stable_commit = None + while ctrl.run_app: time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0); @@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl): commit = plugin.transform(commit) if len(commit.delta) > 0 or len(commit.preview) > 0: + # Avoid re-sending text after long pauses. User controls the length + # of the pause in the UI. + if ctrl.cfg["reset_after_silence_s"] > 0: + silence_duration = 0 + if last_stable_commit: + last_commit_end_ts = \ + last_stable_commit.start_ts + \ + last_stable_commit.duration_s + silence_duration = commit.start_ts - last_commit_end_ts + if silence_duration > ctrl.cfg["reset_after_silence_s"]: + print(f"Resetting transcript after {silence_duration}-second " + "silence", file=sys.stderr) + ctrl.transcript = "" + ctrl.preview = "" + if commit.delta: + last_stable_commit = commit + # Hard-cap displayed transcript length at 4k characters to prevent # runaway memory use in UI. Keep the full transcript to avoid # breaking OSC pager. @@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl): try: print(f"Transcript: {transcript}") except UnicodeEncodeError: - print("Failed to encode transcript - discarding delta") + print("Failed to encode transcript - discarding delta", + file=sys.stderr) continue try: print(f"Preview: {preview}") except UnicodeEncodeError: - print("Failed to encode preview - discarding") + print("Failed to encode preview - discarding", file=sys.stderr) if cfg["enable_debug_mode"]: print(f"commit latency: {commit.latency_s}", file=sys.stderr) print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr) - if len(commit.preview) > 0: - print("Finalized: 0") - else: - print("Finalized: 1") ctrl.transcript += commit.delta ctrl.preview = ctrl.transcript + commit.preview @@ -1125,7 +1149,7 @@ def run(cfg): collector = AudioCollector(stream) #collector = LengthEnforcingAudioCollector(collector, 5.0) - #collector = NormalizingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) whisper = Whisper(collector, cfg) segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"], |
