From bce085367e546e5801a41f55f2d5e84e12cc8607 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Wed, 3 Sep 2025 16:07:40 -0700
Subject: Drop turbo; use old logic when no_speech ts available

---
 app/hallucination_filter.py | 84 ++++++++++++++++++++++++++++++---------------
 app/stt.py                  |  2 +-
 2 files changed, 57 insertions(+), 29 deletions(-)

(limited to 'app')

diff --git a/app/hallucination_filter.py b/app/hallucination_filter.py
index fa7b16a..1a80e62 100644
--- a/app/hallucination_filter.py
+++ b/app/hallucination_filter.py
@@ -11,36 +11,39 @@ import sys
 def count_syllables(word):
     """Count syllables in a word using pronouncing library with regex fallback."""
     phones = pronouncing.phones_for_word(word.lower())
+    if not phones:
+        return 0  # No pronunciation found for this word; count it as zero.
     return pronouncing.syllable_count(phones[0])
+
 def text_syllable_count(text):
     """Count total syllables in text."""
     words = re.findall(r'\b\w+\b', text)
     return sum(count_syllables(word) for word in words)
+
 class HallucinationFilter:
     """Filter for detecting hallucinated segments in speech-to-text output."""
-    def __init__(self, model_path: Path = None):
+    def __init__(self, cfg, model_path: Path = None):
         """
         Initialize the hallucination filter.
         Args:
             model_path: Optional path to the model file. If not provided,
                         uses the default path.
         """
+        self.cfg = cfg  # Read later for the "enable_debug_mode" flag.
         self.model = None
         self.threshold = None
         self.features = None
         # Get the project root directory
         app_root = Path(__file__).resolve().parent
         project_root = app_root.parent
-        # Use provided path or default
-        if model_path is None:
-            model_path = project_root / "Models" / "thankyou_filter_gb.pkl"
+        model_path = model_path or project_root / "Models" / "thankyou_filter_gb.pkl"
         # Try to load the model
-        log_err(f"Loading hallucination filter")
+        log(f"Loading hallucination filter")
         bundle = joblib.load(model_path)
         self.model = bundle["model"]
         self.threshold = bundle["threshold"]
         self.features = bundle["features"]
-        log_err(f"Loaded hallucination filter model from {model_path}")
+        log(f"Loaded hallucination filter model from {model_path}")
     def is_hallucination(self, segment) -> bool:
         """
         Check if a segment is likely a hallucination.
@@ -51,25 +54,50 @@ class HallucinationFilter:
         Returns:
             bool: True if the segment is likely a hallucination, False otherwise.
         """
-        # Calculate text-based features
-        text = getattr(segment, 'text', '')
-        duration = segment.audio_len_s
-        raw_duration = segment.end_ts - segment.start_ts
-        n_syllables = text_syllable_count(text)
-        sps = n_syllables / duration
-        raw_sps = n_syllables / raw_duration
-        duration_ratio = raw_duration / duration
-        X = pd.DataFrame([[
-            segment.avg_logprob,
-            segment.no_speech_prob,
-            segment.compression_ratio,
-            np.log1p(duration),
-            np.log1p(sps),
-            np.log1p(raw_duration),
-            np.log1p(raw_sps),
-            duration_ratio,
-            segment.avg_logprob * duration
-        ]], columns=self.features)
-        # Get probability
-        prob = self.model.predict_proba(X)[0, 1]
-        return prob >= self.threshold
+        s = segment  # Brevity
+
+        if s.no_speech_prob == 0:
+            # no_speech is not available (reported as exactly 0). Use the fancy
+            # classifier trained on my speech data (loaded into self.model).
+            text = s.transcript
+            duration = s.audio_len_s  # NOTE(review): assumed > 0; divisor below
+            raw_duration = s.end_ts - s.start_ts  # NOTE(review): assumed > 0 too
+            n_syllables = text_syllable_count(text)
+            sps = n_syllables / duration
+            raw_sps = n_syllables / raw_duration
+            duration_ratio = raw_duration / duration
+            X = pd.DataFrame([[
+                s.avg_logprob,
+                s.no_speech_prob,
+                s.compression_ratio,
+                np.log1p(duration),
+                np.log1p(sps),
+                np.log1p(raw_duration),
+                np.log1p(raw_sps),
+                duration_ratio,
+                s.avg_logprob * duration
+            ]], columns=self.features)
+            # Column 1 of predict_proba is compared against the bundled threshold.
+            prob = self.model.predict_proba(X)[0, 1]
+            return prob >= self.threshold
+
+        # If no_speech is set, use simpler filter.
+        if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+            if self.cfg["enable_debug_mode"]:
+                print(f"Drop probable hallucination (case 1) " +
+                        f"(text='{s.transcript}', " +
+                        f"no_speech_prob={s.no_speech_prob}, " +
+                        f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+            return True
+        # Another touchup targeted at the vexatious "thanks for watching!"
+        # hallucination. This triggers a lot when listening to
+        # instrumental/electronic music.
+        if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7:
+            if self.cfg["enable_debug_mode"]:
+                print(f"Drop probable hallucination (case 2) " +
+                        f"(text='{s.transcript}', " +
+                        f"no_speech_prob={s.no_speech_prob}, " +
+                        f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+            return True
+        return False
+
diff --git a/app/stt.py b/app/stt.py
index 9947bae..78da707 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -485,7 +485,7 @@ class Whisper:
         self.collector = collector
         self.model = None
         self.cfg = cfg
-        self.hallucination_filter = HallucinationFilter()
+        self.hallucination_filter = HallucinationFilter(cfg)
         self.segment_logger = segment_logger
 
         model_str = cfg["model"]
-- 
cgit v1.2.3