diff options
Diffstat (limited to 'Scripts/transcribe_v2.py')
| -rw-r--r-- | Scripts/transcribe_v2.py | 50 |
1 files changed, 37 insertions, 13 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 2bf605d..889e1cf 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -217,11 +217,11 @@ class AudioCollector: return self.frames def dropAudioPrefix(self, dur_s: float) -> bytes: - n_bytes = int(dur_s * self.stream.FPS) * self.stream.FRAME_SZ + n_bytes = int(dur_s * AudioStream.FPS) * self.stream.FRAME_SZ n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def dropAudioPrefixByFrames(self, dur_frames: int) -> bytes: @@ -229,7 +229,7 @@ class AudioCollector: n_bytes = min(n_bytes, len(self.frames)) cut_portion = self.frames[:n_bytes] self.frames = self.frames[n_bytes:] - self.wall_ts = self.wall_ts + self.duration() + self.wall_ts += float(n_bytes / self.stream.FRAME_SZ) / self.stream.FPS return cut_portion def keepLast(self, dur_s: float) -> bytes: @@ -243,7 +243,7 @@ class AudioCollector: return cut_portion def duration(self): - return len(self.frames) / (self.stream.FPS * self.stream.FRAME_SZ) + return len(self.frames) / (AudioStream.FPS * self.stream.FRAME_SZ) def begin(self): return self.wall_ts @@ -486,9 +486,13 @@ class VadCommitter: delta = "" commit_audio = None latency_s = None + duration_s = self.collector.duration() + start_ts = self.collector.begin() if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() + duration_s = stable_cutoff / AudioStream.FPS + start_ts = self.collector.begin() commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) segments = self.whisper.transcribe(commit_audio) @@ -516,13 +520,15 @@ class VadCommitter: delta, preview, latency_s, - audio=audio) + audio=audio, + duration_s=duration_s, + start_ts=start_ts) def install_in_venv(pkgs: typing.List[str]) -> bool: pkgs_str = " ".join(pkgs) print(f"Installing {pkgs_str}") pip_proc = subprocess.Popen( - f"Resources/Python/python.exe -m pip install {pkgs_str}".split(), + f"Resources/Python/python.exe -m pip install {pkgs_str} --no-warn-script-location".split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) pip_stdout, pip_stderr = pip_proc.communicate() @@ -533,6 +539,8 @@ def install_in_venv(pkgs: typing.List[str]) -> bool: if pip_proc.returncode != 0: print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}", file=sys.stderr) + return False + return True class TranslationPlugin(StreamingPlugin): def __init__(self, cfg): @@ -847,6 +855,8 @@ def optimize(cfg, return optimized_params def transcriptionThread(ctrl: ThreadControl): + last_stable_commit = None + while ctrl.run_app: time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0); @@ -858,6 +868,23 @@ def transcriptionThread(ctrl: ThreadControl): commit = plugin.transform(commit) if len(commit.delta) > 0 or len(commit.preview) > 0: + # Avoid re-sending text after long pauses. User controls the length + # of the pause in the UI. + if ctrl.cfg["reset_after_silence_s"] > 0: + silence_duration = 0 + if last_stable_commit: + last_commit_end_ts = \ + last_stable_commit.start_ts + \ + last_stable_commit.duration_s + silence_duration = commit.start_ts - last_commit_end_ts + if silence_duration > ctrl.cfg["reset_after_silence_s"]: + print(f"Resetting transcript after {silence_duration}-second " + "silence", file=sys.stderr) + ctrl.transcript = "" + ctrl.preview = "" + if commit.delta: + last_stable_commit = commit + # Hard-cap displayed transcript length at 4k characters to prevent # runaway memory use in UI. Keep the full transcript to avoid # breaking OSC pager. @@ -870,21 +897,18 @@ def transcriptionThread(ctrl: ThreadControl): try: print(f"Transcript: {transcript}") except UnicodeEncodeError: - print("Failed to encode transcript - discarding delta") + print("Failed to encode transcript - discarding delta", + file=sys.stderr) continue try: print(f"Preview: {preview}") except UnicodeEncodeError: - print("Failed to encode preview - discarding") + print("Failed to encode preview - discarding", file=sys.stderr) if cfg["enable_debug_mode"]: print(f"commit latency: {commit.latency_s}", file=sys.stderr) print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr) - if len(commit.preview) > 0: - print("Finalized: 0") - else: - print("Finalized: 1") ctrl.transcript += commit.delta ctrl.preview = ctrl.transcript + commit.preview @@ -1125,7 +1149,7 @@ def run(cfg): collector = AudioCollector(stream) #collector = LengthEnforcingAudioCollector(collector, 5.0) - #collector = NormalizingAudioCollector(collector) + collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) whisper = Whisper(collector, cfg) segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"], |
