summary | refs | log | tree | commit | diff | stats
path: root/Scripts
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2023-09-09 22:26:09 -0700
committer: yum <yum.food.vr@gmail.com> 2023-09-09 22:32:35 -0700
commit: ae866f553d3db67030e37ce315707d72982f4063 (patch)
tree: 8b8977cad5ff9c443a86868efcad3d5e4fb612ac /Scripts
parent: 286dcae5e087db817f3350cf442145107b25bc9c (diff)
Add UI for max speech duration
Also fix a bug that occurred when previews are disabled: the audio buffer no longer grows without bound while there is no speech.
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/transcribe_v2.py 29
1 files changed, 17 insertions, 12 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 81a4bf2..541ff23 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -314,9 +314,11 @@ class CompressingAudioCollector(AudioCollectorFilter):
class AudioSegmenter:
def __init__(self,
- min_silence_ms=250):
+ min_silence_ms=250,
+ max_speech_s=5):
self.vad_options = vad.VadOptions(
- min_silence_duration_ms=min_silence_ms)
+ min_silence_duration_ms=min_silence_ms,
+ max_speech_duration_s=max_speech_s)
pass
def segmentAudio(self, audio: bytes):
@@ -332,6 +334,7 @@ class AudioSegmenter:
last_end = None
segments = self.segmentAudio(audio)
+
for i in range(len(segments)):
s = segments[i]
#print(f"s: {s}")
@@ -349,7 +352,8 @@ class AudioSegmenter:
now = int(len(audio) / AudioStream.FRAME_SZ)
#print(f"now: {now}")
#print(f"min d: {min_delta_frames}")
- if now - s['end'] > min_delta_frames:
+ delta_frames = now - s['end']
+ if delta_frames > min_delta_frames:
cutoff = now - int(min_delta_frames / 2)
return (cutoff, len(segments) > 0)
@@ -480,7 +484,7 @@ class VadCommitter:
delta = ""
commit_audio = None
latency_s = None
- if stable_cutoff:
+ if has_audio and stable_cutoff:
#print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
latency_s = self.collector.now() - self.collector.begin()
commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
@@ -497,13 +501,13 @@ class VadCommitter:
#saveAudio(commit_audio, filename)
preview = ""
- if self.cfg["enable_previews"]:
- if has_audio:
- segments = self.whisper.transcribe(audio)
- preview = "".join(s.transcript for s in segments)
- else:
- #print("VAD detects no audio, skip transcription", file=sys.stderr)
- self.collector.keepLast(1.0)
+ if self.cfg["enable_previews"] and has_audio:
+ segments = self.whisper.transcribe(audio)
+ preview = "".join(s.transcript for s in segments)
+
+ if not has_audio:
+ #print("VAD detects no audio, skip transcription", file=sys.stderr)
+ self.collector.keepLast(1.0)
return TranscriptCommit(
delta,
@@ -907,7 +911,8 @@ def run(cfg):
#collector = NormalizingAudioCollector(collector)
collector = CompressingAudioCollector(collector)
whisper = Whisper(collector, cfg)
- segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"])
+ segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"],
+ max_speech_s=cfg["max_speech_duration_s"])
committer = VadCommitter(cfg, collector, whisper, segmenter)
pager = OscPager(cfg)