From e58c718cb115c44ef3a546bea245e05e50d24c55 Mon Sep 17 00:00:00 2001 From: yum Date: Mon, 5 Feb 2024 17:40:37 -0800 Subject: Add another threshold to filter out common hallucinations The paper recommends filtering out segments with no_speech_prob > 0.6 and avg_logprob < -1. That bound is too loose for short-form audio, which is not guaranteed to contain speech. I already have a tighter bound: no_speech_prob > 0.6 and avg_logprob < -0.5 While listening to instrumental music I find that a lot of hallucinations still sneak past that bound, so I added a second bound: no_speech_prob > 0.15 and avg_logprob < -0.7 Basically, we now also filter out segments that look more like speech (lower no_speech_prob) but have a worse avg_logprob. It does not seem to produce false negatives, but it requires testing. Also: dial back the default max segment length (max_speech_duration_s) from 15 seconds to 10 seconds. This is based on performance observations on desktop. The same change raises the default reset_after_silence_s from 10 to 15 and turns off saving committed audio to .wav files. --- GUI/GUI/GUI/Config.cpp | 4 ++-- Scripts/transcribe_v2.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index a92502d..52f6f30 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -87,8 +87,8 @@ AppConfig::AppConfig(wxTextCtrl* out) enable_lock_at_spawn(true), gpu_idx(0), min_silence_duration_ms(250), - max_speech_duration_s(15), - reset_after_silence_s(10), + max_speech_duration_s(10), + reset_after_silence_s(15), transcription_loop_delay_ms(100), keybind("ctrl+x"), diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 65a0cf8..7dccc48 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -457,6 +457,20 @@ class Whisper: # Manual touchup. I see a decent number of hallucinations sneaking # in with high `no_speech_prob` and modest `avg_logprob`. 
if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5: + if cfg["enable_debug_mode"]: + print(f"Drop probable hallucination (case 1) " + + f"(text='{s.text}', " + + f"no_speech_prob={s.no_speech_prob}, " + + f"avg_logprob={s.avg_logprob})", file=sys.stderr) + continue + # Another touchup targeted at the vexatious "thanks for watching!" + # hallucination. + if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7: + if cfg["enable_debug_mode"]: + print(f"Drop probable hallucination (case 2) " + + f"(text='{s.text}', " + + f"no_speech_prob={s.no_speech_prob}, " + + f"avg_logprob={s.avg_logprob})", file=sys.stderr) continue if cfg["enable_debug_mode"]: print(f"s get: {s}") @@ -500,6 +514,7 @@ class VadCommitter: latency_s = None duration_s = self.collector.duration() start_ts = self.collector.begin() + if has_audio and stable_cutoff: #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr) latency_s = self.collector.now() - self.collector.begin() @@ -515,7 +530,7 @@ class VadCommitter: print(f"commit segment: {s}", file=sys.stderr) print(f"delta get: {delta}", file=sys.stderr) - if True: + if False: ts = datetime.fromtimestamp(self.collector.now() - latency_s) filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav" saveAudio(commit_audio, filename) -- cgit v1.2.3