From e58c718cb115c44ef3a546bea245e05e50d24c55 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Mon, 5 Feb 2024 17:40:37 -0800
Subject: Add another threshold to filter out common hallucinations

The Whisper paper recommends filtering out segments with
no_speech_prob > 0.6 and avg_logprob < -1. This is too loose a bound
for short-form audio, which is not guaranteed to contain speech.

I already have a tighter bound:

  no_speech_prob > 0.6 and avg_logprob < -0.5

While listening to instrumental music I find that a lot of
hallucinations sneak past that bound. So I added a second bound:

  no_speech_prob > 0.15 and avg_logprob < -0.7

Basically, we also filter out segments that look more like speech
(lower no_speech_prob) but have a worse avg_logprob. So far this does
not seem to produce false negatives, but it requires more testing.

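Also: dial back the default max segment length (max_speech_duration_s)
from 15 seconds to 10 seconds, based on performance observations on
desktop. This commit also raises the default reset_after_silence_s
from 10 seconds to 15 seconds and switches off the saveAudio dump of
committed audio in transcribe_v2.py.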
---
 GUI/GUI/GUI/Config.cpp   |  4 ++--
 Scripts/transcribe_v2.py | 17 ++++++++++++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index a92502d..52f6f30 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -87,8 +87,8 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	enable_lock_at_spawn(true),
 	gpu_idx(0),
 	min_silence_duration_ms(250),
-	max_speech_duration_s(15),
-	reset_after_silence_s(10),
+	max_speech_duration_s(10),
+	reset_after_silence_s(15),
 	transcription_loop_delay_ms(100),
 	keybind("ctrl+x"),
 
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 65a0cf8..7dccc48 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -457,6 +457,20 @@ class Whisper:
             # Manual touchup. I see a decent number of hallucinations sneaking
             # in with high `no_speech_prob` and modest `avg_logprob`.
             if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+                if cfg["enable_debug_mode"]:
+                    print(f"Drop probable hallucination (case 1) " +
+                            f"(text='{s.text}', " +
+                            f"no_speech_prob={s.no_speech_prob}, " +
+                            f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+                continue
+            # Another touchup targeted at the vexatious "thanks for watching!"
+            # hallucination.
+            if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7:
+                if cfg["enable_debug_mode"]:
+                    print(f"Drop probable hallucination (case 2) " +
+                            f"(text='{s.text}', " +
+                            f"no_speech_prob={s.no_speech_prob}, " +
+                            f"avg_logprob={s.avg_logprob})", file=sys.stderr)
                 continue
             if cfg["enable_debug_mode"]:
                 print(f"s get: {s}")
@@ -500,6 +514,7 @@ class VadCommitter:
         latency_s = None
         duration_s = self.collector.duration()
         start_ts = self.collector.begin()
+
         if has_audio and stable_cutoff:
             #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
             latency_s = self.collector.now() - self.collector.begin()
@@ -515,7 +530,7 @@ class VadCommitter:
                     print(f"commit segment: {s}", file=sys.stderr)
                 print(f"delta get: {delta}", file=sys.stderr)
 
-            if True:
+            if False:
                 ts = datetime.fromtimestamp(self.collector.now() - latency_s)
                 filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
                 saveAudio(commit_audio, filename)
-- 
cgit v1.2.3