summaryrefslogtreecommitdiffstats
path: root/Scripts/transcribe_v2.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2024-02-05 17:40:37 -0800
committeryum <yum.food.vr@gmail.com>2024-02-05 17:40:37 -0800
commite58c718cb115c44ef3a546bea245e05e50d24c55 (patch)
tree228f5cfddfd974a0567dde29bc199fde0f169f22 /Scripts/transcribe_v2.py
parentacccf8ebcff0f7cc2b26e45e497f8b12ab73d8e1 (diff)
Add another threshold to filter out common hallucinations
The paper recommends filtering out segments with no_speech_prob > 0.6 and avg_logprob < -1. This is too loose a bound for short-form audio, which is not guaranteed to contain speech. I already have a tighter bound: no_speech_prob > 0.6 and avg_logprob < -0.5. While listening to instrumental music, I find that a lot of hallucinations sneak past that bound, so I added a second bound: no_speech_prob > 0.15 and avg_logprob < -0.7. Basically, we filter out things that look like speech but have a worse avg_logprob. Seems not to have false negatives. Requires testing. Also: dial back the default max segment length from 15 seconds to 10 seconds. This is done based on performance observations on desktop.
Diffstat (limited to 'Scripts/transcribe_v2.py')
-rw-r--r--Scripts/transcribe_v2.py17
1 files changed, 16 insertions, 1 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 65a0cf8..7dccc48 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -457,6 +457,20 @@ class Whisper:
# Manual touchup. I see a decent number of hallucinations sneaking
# in with high `no_speech_prob` and modest `avg_logprob`.
if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+ if cfg["enable_debug_mode"]:
+ print(f"Drop probable hallucination (case 1) " +
+ f"(text='{s.text}', " +
+ f"no_speech_prob={s.no_speech_prob}, " +
+ f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+ continue
+ # Another touchup targeted at the vexatious "thanks for watching!"
+ # hallucination.
+ if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7:
+ if cfg["enable_debug_mode"]:
+ print(f"Drop probable hallucination (case 2) " +
+ f"(text='{s.text}', " +
+ f"no_speech_prob={s.no_speech_prob}, " +
+ f"avg_logprob={s.avg_logprob})", file=sys.stderr)
continue
if cfg["enable_debug_mode"]:
print(f"s get: {s}")
@@ -500,6 +514,7 @@ class VadCommitter:
latency_s = None
duration_s = self.collector.duration()
start_ts = self.collector.begin()
+
if has_audio and stable_cutoff:
#print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
latency_s = self.collector.now() - self.collector.begin()
@@ -515,7 +530,7 @@ class VadCommitter:
print(f"commit segment: {s}", file=sys.stderr)
print(f"delta get: {delta}", file=sys.stderr)
- if True:
+ if False:
ts = datetime.fromtimestamp(self.collector.now() - latency_s)
filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
saveAudio(commit_audio, filename)