Drop turbo; use old logic when no_speech is available

author: yum <yum.food.vr@gmail.com> 2025-09-03 16:07:40 -0700
committer: yum <yum.food.vr@gmail.com> 2025-09-03 16:07:40 -0700
commit: bce085367e546e5801a41f55f2d5e84e12cc8607 (patch)
tree: 2a2232b237be570d594c31a0226737ea9e53bed5 /app
parent: 6815848fb8ed06b59b6d7e57096143f1f840e7db (diff)
2 files changed, 57 insertions, 29 deletions
diff --git a/app/hallucination_filter.py b/app/hallucination_filter.py
index fa7b16a..1a80e62 100644
--- a/app/hallucination_filter.py
+++ b/app/hallucination_filter.py
@@ -11,36 +11,39 @@ import sys
 def count_syllables(word):
     """Count syllables in a word using pronouncing library with regex fallback."""
     phones = pronouncing.phones_for_word(word.lower())
+    if len(phones) == 0:
+        return 0
     return pronouncing.syllable_count(phones[0])
+
 def text_syllable_count(text):
     """Count total syllables in text."""
     words = re.findall(r'\b\w+\b', text)
     return sum(count_syllables(word) for word in words)
+
 class HallucinationFilter:
     """Filter for detecting hallucinated segments in speech-to-text output."""
-    def __init__(self, model_path: Path = None):
+    def __init__(self, cfg, model_path: Path = None):
         """
         Initialize the hallucination filter.
         Args:
             model_path: Optional path to the model file. If not provided,
                        uses the default path.
         """
+        self.cfg = cfg
         self.model = None
         self.threshold = None
         self.features = None
         # Get the project root directory
         app_root = Path(__file__).resolve().parent
         project_root = app_root.parent
-        # Use provided path or default
-        if model_path is None:
-            model_path = project_root / "Models" / "thankyou_filter_gb.pkl"
+        model_path = project_root / "Models" / "thankyou_filter_gb.pkl"
         # Try to load the model
-        log_err(f"Loading hallucination filter")
+        log(f"Loading hallucination filter")
         bundle = joblib.load(model_path)
         self.model = bundle["model"]
         self.threshold = bundle["threshold"]
         self.features = bundle["features"]
-        log_err(f"Loaded hallucination filter model from {model_path}")
+        log(f"Loaded hallucination filter model from {model_path}")
     def is_hallucination(self, segment) -> bool:
         """
         Check if a segment is likely a hallucination.
@@ -51,25 +54,50 @@ class HallucinationFilter:
         Returns:
             bool: True if the segment is likely a hallucination, False otherwise.
         """
-        # Calculate text-based features
-        text = getattr(segment, 'text', '')
-        duration = segment.audio_len_s
-        raw_duration = segment.end_ts - segment.start_ts
-        n_syllables = text_syllable_count(text)
-        sps = n_syllables / duration
-        raw_sps = n_syllables / raw_duration
-        duration_ratio = raw_duration / duration
-        X = pd.DataFrame([[
-            segment.avg_logprob,
-            segment.no_speech_prob,
-            segment.compression_ratio,
-            np.log1p(duration),
-            np.log1p(sps),
-            np.log1p(raw_duration),
-            np.log1p(raw_sps),
-            duration_ratio,
-            segment.avg_logprob * duration
-        ]], columns=self.features)
-        # Get probability
-        prob = self.model.predict_proba(X)[0, 1]
-        return prob >= self.threshold
+        s = segment  # Brevity
+
+        if s.no_speech_prob == 0:
+            # no_speech_prob is exactly 0, which is treated as "not reported".
+            # Fall back to the fancy classifier trained on my speech data.
+            text = s.transcript
+            duration = s.audio_len_s
+            raw_duration = s.end_ts - s.start_ts
+            n_syllables = text_syllable_count(text)
+            sps = n_syllables / duration
+            raw_sps = n_syllables / raw_duration
+            duration_ratio = raw_duration / duration
+            X = pd.DataFrame([[
+                s.avg_logprob,
+                s.no_speech_prob,
+                s.compression_ratio,
+                np.log1p(duration),
+                np.log1p(sps),
+                np.log1p(raw_duration),
+                np.log1p(raw_sps),
+                duration_ratio,
+                s.avg_logprob * duration
+            ]], columns=self.features)
+            # Get probability
+            prob = self.model.predict_proba(X)[0, 1]
+            return prob >= self.threshold
+
+        # If no_speech is set, use simpler filter.
+        if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+            if self.cfg["enable_debug_mode"]:
+                print(f"Drop probable hallucination (case 1) " +
+                        f"(text='{s.text}', " +
+                        f"no_speech_prob={s.no_speech_prob}, " +
+                        f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+            return True
+        # Another touchup targeted at the vexatious "thanks for watching!"
+        # hallucination. This triggers a lot when listening to
+        # instrumental/electronic music.
+        if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7:
+            if self.cfg["enable_debug_mode"]:
+                print(f"Drop probable hallucination (case 2) " +
+                        f"(text='{s.text}', " +
+                        f"no_speech_prob={s.no_speech_prob}, " +
+                        f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+            return True
+        return False
+
diff --git a/app/stt.py b/app/stt.py
index 9947bae..78da707 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -485,7 +485,7 @@ class Whisper:
         self.collector = collector
         self.model = None
         self.cfg = cfg
-        self.hallucination_filter = HallucinationFilter()
+        self.hallucination_filter = HallucinationFilter(cfg)
         self.segment_logger = segment_logger
 
         model_str = cfg["model"]
author	yum <yum.food.vr@gmail.com>	2025-09-03 16:07:40 -0700
committer	yum <yum.food.vr@gmail.com>	2025-09-03 16:07:40 -0700
commit	bce085367e546e5801a41f55f2d5e84e12cc8607 (patch)
tree	2a2232b237be570d594c31a0226737ea9e53bed5 /app
parent	6815848fb8ed06b59b6d7e57096143f1f840e7db (diff)