summary | refs | log | tree | commit | diff | stats
path: root/app
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2025-09-03 16:07:40 -0700
committer: yum <yum.food.vr@gmail.com> 2025-09-03 16:07:40 -0700
commit: bce085367e546e5801a41f55f2d5e84e12cc8607 (patch)
tree: 2a2232b237be570d594c31a0226737ea9e53bed5 /app
parent: 6815848fb8ed06b59b6d7e57096143f1f840e7db (diff)
Drop turbo; use old logic when no_speech is available
Diffstat (limited to 'app')
-rw-r--r-- app/hallucination_filter.py 84
-rw-r--r-- app/stt.py 2
2 files changed, 57 insertions, 29 deletions
diff --git a/app/hallucination_filter.py b/app/hallucination_filter.py
index fa7b16a..1a80e62 100644
--- a/app/hallucination_filter.py
+++ b/app/hallucination_filter.py
@@ -11,36 +11,39 @@ import sys
def count_syllables(word):
"""Count syllables in a word using pronouncing library with regex fallback."""
phones = pronouncing.phones_for_word(word.lower())
+ if len(phones) == 0:
+ return 0
return pronouncing.syllable_count(phones[0])
+
def text_syllable_count(text):
"""Count total syllables in text."""
words = re.findall(r'\b\w+\b', text)
return sum(count_syllables(word) for word in words)
+
class HallucinationFilter:
"""Filter for detecting hallucinated segments in speech-to-text output."""
- def __init__(self, model_path: Path = None):
+ def __init__(self, cfg, model_path: Path = None):
"""
Initialize the hallucination filter.
Args:
model_path: Optional path to the model file. If not provided,
uses the default path.
"""
+ self.cfg = cfg
self.model = None
self.threshold = None
self.features = None
# Get the project root directory
app_root = Path(__file__).resolve().parent
project_root = app_root.parent
- # Use provided path or default
- if model_path is None:
- model_path = project_root / "Models" / "thankyou_filter_gb.pkl"
+ model_path = project_root / "Models" / "thankyou_filter_gb.pkl"
# Try to load the model
- log_err(f"Loading hallucination filter")
+ log(f"Loading hallucination filter")
bundle = joblib.load(model_path)
self.model = bundle["model"]
self.threshold = bundle["threshold"]
self.features = bundle["features"]
- log_err(f"Loaded hallucination filter model from {model_path}")
+ log(f"Loaded hallucination filter model from {model_path}")
def is_hallucination(self, segment) -> bool:
"""
Check if a segment is likely a hallucination.
@@ -51,25 +54,50 @@ class HallucinationFilter:
Returns:
bool: True if the segment is likely a hallucination, False otherwise.
"""
- # Calculate text-based features
- text = getattr(segment, 'text', '')
- duration = segment.audio_len_s
- raw_duration = segment.end_ts - segment.start_ts
- n_syllables = text_syllable_count(text)
- sps = n_syllables / duration
- raw_sps = n_syllables / raw_duration
- duration_ratio = raw_duration / duration
- X = pd.DataFrame([[
- segment.avg_logprob,
- segment.no_speech_prob,
- segment.compression_ratio,
- np.log1p(duration),
- np.log1p(sps),
- np.log1p(raw_duration),
- np.log1p(raw_sps),
- duration_ratio,
- segment.avg_logprob * duration
- ]], columns=self.features)
- # Get probability
- prob = self.model.predict_proba(X)[0, 1]
- return prob >= self.threshold
+ s = segment # Brevity
+
+ if s.no_speech_prob == 0:
+ # no_speech is not available. Use fancy classifier trained on my
+ # speech data.
+ text = s.transcript
+ duration = s.audio_len_s
+ raw_duration = s.end_ts - s.start_ts
+ n_syllables = text_syllable_count(text)
+ sps = n_syllables / duration
+ raw_sps = n_syllables / raw_duration
+ duration_ratio = raw_duration / duration
+ X = pd.DataFrame([[
+ s.avg_logprob,
+ s.no_speech_prob,
+ s.compression_ratio,
+ np.log1p(duration),
+ np.log1p(sps),
+ np.log1p(raw_duration),
+ np.log1p(raw_sps),
+ duration_ratio,
+ s.avg_logprob * duration
+ ]], columns=self.features)
+ # Get probability
+ prob = self.model.predict_proba(X)[0, 1]
+ return prob >= self.threshold
+
+ # If no_speech is set, use simpler filter.
+ if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+ if self.cfg["enable_debug_mode"]:
+ print(f"Drop probable hallucination (case 1) " +
+ f"(text='{s.text}', " +
+ f"no_speech_prob={s.no_speech_prob}, " +
+ f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+ return True
+ # Another touchup targeted at the vexatious "thanks for watching!"
+ # hallucination. This triggers a lot when listening to
+ # instrumental/electronic music.
+ if s.no_speech_prob > 0.15 and s.avg_logprob < -0.7:
+ if self.cfg["enable_debug_mode"]:
+ print(f"Drop probable hallucination (case 2) " +
+ f"(text='{s.text}', " +
+ f"no_speech_prob={s.no_speech_prob}, " +
+ f"avg_logprob={s.avg_logprob})", file=sys.stderr)
+ return True
+ return False
+
diff --git a/app/stt.py b/app/stt.py
index 9947bae..78da707 100644
--- a/app/stt.py
+++ b/app/stt.py
@@ -485,7 +485,7 @@ class Whisper:
self.collector = collector
self.model = None
self.cfg = cfg
- self.hallucination_filter = HallucinationFilter()
+ self.hallucination_filter = HallucinationFilter(cfg)
self.segment_logger = segment_logger
model_str = cfg["model"]