diff options
| author | yum <yum.food.vr@gmail.com> | 2023-09-09 22:26:09 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-09-09 22:32:35 -0700 |
| commit | ae866f553d3db67030e37ce315707d72982f4063 (patch) | |
| tree | 8b8977cad5ff9c443a86868efcad3d5e4fb612ac /Scripts | |
| parent | 286dcae5e087db817f3350cf442145107b25bc9c (diff) | |
Add UI for max speech duration
Also fix bug when not using previews. Audio buffer no longer grows
without bound while there's no speech.
Diffstat (limited to 'Scripts')
| -rw-r--r-- | Scripts/transcribe_v2.py | 29 |
1 files changed, 17 insertions, 12 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 81a4bf2..541ff23 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -314,9 +314,11 @@ class CompressingAudioCollector(AudioCollectorFilter): class AudioSegmenter: def __init__(self, - min_silence_ms=250): + min_silence_ms=250, + max_speech_s=5): self.vad_options = vad.VadOptions( - min_silence_duration_ms=min_silence_ms) + min_silence_duration_ms=min_silence_ms, + max_speech_duration_s=max_speech_s) pass def segmentAudio(self, audio: bytes): @@ -332,6 +334,7 @@ class AudioSegmenter: last_end = None segments = self.segmentAudio(audio) + for i in range(len(segments)): s = segments[i] #print(f"s: {s}") @@ -349,7 +352,8 @@ class AudioSegmenter: now = int(len(audio) / AudioStream.FRAME_SZ) #print(f"now: {now}") #print(f"min d: {min_delta_frames}") - if now - s['end'] > min_delta_frames: + delta_frames = now - s['end'] + if delta_frames > min_delta_frames: cutoff = now - int(min_delta_frames / 2) return (cutoff, len(segments) > 0) @@ -480,7 +484,7 @@ class VadCommitter: delta = "" commit_audio = None latency_s = None - if stable_cutoff: + if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff) @@ -497,13 +501,13 @@ class VadCommitter: #saveAudio(commit_audio, filename) preview = "" - if self.cfg["enable_previews"]: - if has_audio: - segments = self.whisper.transcribe(audio) - preview = "".join(s.transcript for s in segments) - else: - #print("VAD detects no audio, skip transcription", file=sys.stderr) - self.collector.keepLast(1.0) + if self.cfg["enable_previews"] and has_audio: + segments = self.whisper.transcribe(audio) + preview = "".join(s.transcript for s in segments) + + if not has_audio: + #print("VAD detects no audio, skip transcription", file=sys.stderr) + self.collector.keepLast(1.0) return TranscriptCommit( delta, @@ -907,7 +911,8 @@ def run(cfg): #collector = NormalizingAudioCollector(collector) collector = CompressingAudioCollector(collector) whisper = Whisper(collector, cfg) - segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"]) + segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"], + max_speech_s=cfg["max_speech_duration_s"]) committer = VadCommitter(cfg, collector, whisper, segmenter) pager = OscPager(cfg) |
