From bce085367e546e5801a41f55f2d5e84e12cc8607 Mon Sep 17 00:00:00 2001 From: yum Date: Wed, 3 Sep 2025 16:07:40 -0700 Subject: Drop turbo; use old logic when no_speech ts available --- app/hallucination_filter.py | 84 ++++++++++++++++++++++++++++++--------------- app/stt.py | 2 +- 2 files changed, 57 insertions(+), 29 deletions(-) (limited to 'app') diff --git a/app/hallucination_filter.py b/app/hallucination_filter.py index fa7b16a..1a80e62 100644 --- a/app/hallucination_filter.py +++ b/app/hallucination_filter.py @@ -11,36 +11,39 @@ import sys def count_syllables(word): """Count syllables in a word using pronouncing library with regex fallback.""" phones = pronouncing.phones_for_word(word.lower()) + if len(phones) == 0: + return 0 return pronouncing.syllable_count(phones[0]) + def text_syllable_count(text): """Count total syllables in text.""" words = re.findall(r'\b\w+\b', text) return sum(count_syllables(word) for word in words) + class HallucinationFilter: """Filter for detecting hallucinated segments in speech-to-text output.""" - def __init__(self, model_path: Path = None): + def __init__(self, cfg, model_path: Path = None): """ Initialize the hallucination filter. Args: model_path: Optional path to the model file. If not provided, uses the default path. """ + self.cfg = cfg self.model = None self.threshold = None self.features = None # Get the project root directory app_root = Path(__file__).resolve().parent project_root = app_root.parent - # Use provided path or default - if model_path is None: - model_path = project_root / "Models" / "thankyou_filter_gb.pkl" + model_path = project_root / "Models" / "thankyou_filter_gb.pkl" # Try to load the model - log_err(f"Loading hallucination filter") + log(f"Loading hallucination filter") bundle = joblib.load(model_path) self.model = bundle["model"] self.threshold = bundle["threshold"] self.features = bundle["features"] - log_err(f"Loaded hallucination filter model from {model_path}") + log(f"Loaded hallucination filter model from {model_path}") def is_hallucination(self, segment) -> bool: """ Check if a segment is likely a hallucination. @@ -51,25 +54,50 @@ class HallucinationFilter: Returns: bool: True if the segment is likely a hallucination, False otherwise. """ - # Calculate text-based features - text = getattr(segment, 'text', '') - duration = segment.audio_len_s - raw_duration = segment.end_ts - segment.start_ts - n_syllables = text_syllable_count(text) - sps = n_syllables / duration - raw_sps = n_syllables / raw_duration - duration_ratio = raw_duration / duration - X = pd.DataFrame([[ - segment.avg_logprob, - segment.no_speech_prob, - segment.compression_ratio, - np.log1p(duration), - np.log1p(sps), - np.log1p(raw_duration), - np.log1p(raw_sps), - duration_ratio, - segment.avg_logprob * duration - ]], columns=self.features) - # Get probability - prob = self.model.predict_proba(X)[0, 1] - return prob >= self.threshold + s = segment # Brevity + + if s.no_speech_prob == 0: + # no_speech is not available. Use fancy classifier trained on my + # speech data. + text = s.transcript + duration = s.audio_len_s + raw_duration = s.end_ts - s.start_ts + n_syllables = text_syllable_count(text) + sps = n_syllables / duration + raw_sps = n_syllables / raw_duration + duration_ratio = raw_duration / duration + X = pd.DataFrame([[ + s.avg_logprob, + s.no_speech_prob, + s.compression_ratio, + np.log1p(duration), + np.log1p(sps), + np.log1p(raw_duration), + np.log1p(raw_sps), + duration_ratio, + s.avg_logprob * duration + ]], columns=self.features) + # Get probability + prob = self.model.predict_proba(X)[0, 1] + return prob >= self.threshold + + # If no_speech is set, use simpler filter. + if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5: + if self.cfg["enable_debug_mode"]: + print(f"Drop probable hallucination (case 1) " + + f"(text='{s.text}', " + + f"no_speech_prob={s.no_speech_prob}, " + + f"avg_logprob={s.avg_logprob})", file=sys.stderr) + return True + # Another touchup targeted at the vexatious "thanks for watching!" + # hallucination. This triggers a lot when listening to + # instrumental/electronic music. + if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7: + if self.cfg["enable_debug_mode"]: + print(f"Drop probable hallucination (case 2) " + + f"(text='{s.text}', " + + f"no_speech_prob={s.no_speech_prob}, " + + f"avg_logprob={s.avg_logprob})", file=sys.stderr) + return True + return False + diff --git a/app/stt.py b/app/stt.py index 9947bae..78da707 100644 --- a/app/stt.py +++ b/app/stt.py @@ -485,7 +485,7 @@ class Whisper: self.collector = collector self.model = None self.cfg = cfg - self.hallucination_filter = HallucinationFilter() + self.hallucination_filter = HallucinationFilter(cfg) self.segment_logger = segment_logger model_str = cfg["model"] -- cgit v1.2.3