1 files changed, 164 insertions, 98 deletions
diff --git a/train_hallucination_filter.py b/train_hallucination_filter.py
index 446e893..dc3ce36 100644
--- a/train_hallucination_filter.py
+++ b/train_hallucination_filter.py
@@ -5,6 +5,7 @@ import re
 from pathlib import Path
 import numpy as np
 import pandas as pd
+import pronouncing
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import StandardScaler
@@ -13,25 +14,12 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report, confusion_matrix
 import joblib
 import warnings
-warnings.filterwarnings('ignore')
-
-try:
-    import pronouncing
-    HAS_PRONOUNCING = True
-except ImportError:
-    HAS_PRONOUNCING = False
-    print("Warning: pronouncing library not found. Using regex fallback for syllable counting.")
+from sklearn.model_selection import StratifiedKFold, cross_val_predict
 
 def count_syllables(word):
     """Count syllables in a word using pronouncing library with regex fallback."""
-    if HAS_PRONOUNCING:
-        phones = pronouncing.phones_for_word(word.lower())
-        if phones:
-            return pronouncing.syllable_count(phones[0])
-    
-    # Fallback: count vowel groups
-    vowel_groups = re.findall(r'[aeiouy]+', word, re.IGNORECASE)
-    return max(1, len(vowel_groups))
+    phones = pronouncing.phones_for_word(word.lower())
+    return pronouncing.syllable_count(phones[0])
 
 def text_syllable_count(text):
     """Count total syllables in text."""
@@ -41,6 +29,8 @@ def text_syllable_count(text):
 def load_segments(log_dir):
     """Load segments from JSON files."""
     segments = []
+    seen = set()  # To deduplicate identical segment metadata
+    num_dupes = 0
 
     for root, dirs, files in os.walk(log_dir):
         for file in files:
@@ -57,12 +47,12 @@ def load_segments(log_dir):
                     # Extract all available features
                     text = segment.get('text', '')
                     duration = segment.get('duration_sanity', 0)
-                    
+
                     # Calculate raw duration from timestamps
                     start_ts = segment.get('start_ts', 0)
                     end_ts = segment.get('end_ts', 0)
                     raw_duration = end_ts - start_ts
-                    
+
                     seg_data = {
                         'avg_logprob': segment.get('avg_logprob', 0),
                         'no_speech_prob': segment.get('no_speech_prob', 0),
@@ -73,34 +63,52 @@ def load_segments(log_dir):
                     }
 
                     # Add speech rate features
-                    n_words = len(re.findall(r'\b\w+\b', text))
-                    n_chars = len(text)
                     n_syllables = text_syllable_count(text)
-                    
-                    seg_data['n_words'] = n_words
-                    seg_data['n_syllables'] = n_syllables
-                    seg_data['n_chars'] = n_chars
-                    
-                    # Calculate rates (words/syllables/chars per second)
                     seg_data['sps'] = n_syllables / duration
-                    
-                    # Calculate raw speech rate (using timestamp-based duration)
+                    seg_data['log_sps'] = np.log1p(seg_data['sps'])
                     seg_data['raw_sps'] = n_syllables / raw_duration
+                    seg_data['log_raw_sps'] = np.log1p(seg_data['raw_sps'])
 
                     # Add derived features
                     seg_data['log_duration'] = np.log1p(duration)
                     seg_data['logprob_duration_interaction'] = seg_data['avg_logprob'] * duration
-                    seg_data['log_sps'] = np.log1p(seg_data['sps'])  # Log-scaled speech rate
                     seg_data['log_raw_duration'] = np.log1p(raw_duration)
                     seg_data['duration_ratio'] = raw_duration / duration if duration > 0 else 1.0
-                    seg_data['raw_log_sps'] = np.log1p(seg_data['raw_sps'])  # Log-scaled raw speech rate
+
+                    # Deduplicate: skip if this exact metadata already seen
+                    key = tuple(sorted(seg_data.items()))
+                    if key in seen:
+                        num_dupes += 1
+                        continue
+                    seen.add(key)
 
                     segments.append(seg_data)
             except Exception as e:
                 print(f"Error loading {file}: {e}")
 
+    print(f"Skipped {num_dupes} duplicate segments")
     return pd.DataFrame(segments)
 
+def log_seed_data(seeds_df, seed_type, label_desc):
+    """Log comprehensive data for seed segments."""
+    if len(seeds_df) == 0:
+        return
+
+    print(f"\n{seed_type} seeds ({label_desc}) - {len(seeds_df)} total:")
+    for i, (_, seg) in enumerate(seeds_df.head(10).iterrows(), 1):
+        print(f"  {i:3d}. SPS={seg['sps']:.2f}, Raw_SPS={seg['raw_sps']:.2f}, "
+              f"logprob={seg['avg_logprob']:.3f}, no_speech={seg['no_speech_prob']:.3f}, "
+              f"compression={seg['compression_ratio']:.2f}, duration={seg['duration_sanity']:.2f}s, "
+              f"raw_duration={seg['raw_duration']:.2f}s")
+        print(f"       Text: '{seg['text']}'")
+        print()
+
+    # Show statistics
+    print(f"\n{seed_type} seed statistics:")
+    for metric, col in [('SPS', 'sps'), ('Logprob', 'avg_logprob'), ('Compression', 'compression_ratio')]:
+        data = seeds_df[col]
+        print(f"  {metric}: mean={data.mean():.3f}, std={data.std():.3f}, min={data.min():.3f}, max={data.max():.3f}")
+
 def main():
     # Find logs directory
     log_dir = None
@@ -129,108 +137,159 @@ def main():
     print(f"Syllables per second: mean={df['sps'].mean():.2f}, std={df['sps'].std():.2f}, max={df['sps'].max():.2f}")
     print(f"Raw syllables per second: mean={df['raw_sps'].mean():.2f}, std={df['raw_sps'].std():.2f}, max={df['raw_sps'].max():.2f}")
     print(f"Duration ratio (raw/sanity): mean={df['duration_ratio'].mean():.2f}, std={df['duration_ratio'].std():.2f}")
-    
+
     # Step 1: Apply heuristic rules for seed labeling
     print("\nApplying heuristic rules for seed labeling...")
-    
+
     # Conservative positive seeds (likely hallucinations)
     h_pos = (
-        (df['avg_logprob'] < -0.8)           # This low of a logprob is almost always a hallucination
+        ((df['avg_logprob'] < -0.85)          # This low of a logprob is almost always a hallucination
         | (df['compression_ratio'] > 2.3)    # High compressibility is usually a hallucination
-        | (df['sps'] > 6)                    # No one speaks this fast
-        | (df['sps'] < 0.5)                  # No one speaks this slow
+        | (df['sps'] > 9))                    # No one speaks this fast
+        & df['text'].str.contains("Thank you", na=False)  # Hack. Nothing good enough to
     )
-    
+
     # Conservative negative seeds (likely valid)
     h_neg = (
         (df['avg_logprob'] > -0.5)          # solid confidence drop
         & (df['compression_ratio'] < 1.2)
-        & (df['sps'] < 6)
-        & (df['sps'] > 0.5)
+        & (df['sps'] < 9)
     )
-    
+
     # Create seed labels (NaN for unlabeled)
-    df['seed_label'] = np.where(h_pos, 1, 
+    df['seed_label'] = np.where(h_pos, 1,
                                 np.where(h_neg, 0, np.nan))
-    
+
     n_pos_seeds = (df['seed_label'] == 1).sum()
     n_neg_seeds = (df['seed_label'] == 0).sum()
     n_unlabeled = df['seed_label'].isna().sum()
-    
+
     print(f"Seed labeling results:")
     print(f"  Positive seeds (hallucinations): {n_pos_seeds} ({n_pos_seeds/len(df):.1%})")
     print(f"  Negative seeds (valid): {n_neg_seeds} ({n_neg_seeds/len(df):.1%})")
     print(f"  Unlabeled: {n_unlabeled} ({n_unlabeled/len(df):.1%})")
-    
+
     if n_pos_seeds == 0 or n_neg_seeds == 0:
         print("Warning: Not enough seed labels. Adjusting thresholds might help.")
         return
-    
-    # Show examples of positive seeds
-    pos_seeds = df[df['seed_label'] == 1].head(5)
-    if len(pos_seeds) > 0:
-        print(f"\nExample positive seeds (likely hallucinations):")
-        for _, seg in pos_seeds.iterrows():
-            print(f"  SPS={seg['sps']:.1f}, logprob={seg['avg_logprob']:.2f}, text='{seg['text'][:50]}...'")
-
-    # Define features
-    features = ['avg_logprob', 'duration_sanity', 'no_speech_prob',
-                'compression_ratio', 'log_duration', 'logprob_duration_interaction',
-                'sps', 'log_sps', 'raw_duration', 'log_raw_duration', 
-                'duration_ratio', 'raw_log_sps']
+
+    # Log all seed data
+    pos_seeds = df[df['seed_label'] == 1]
+    neg_seeds = df[df['seed_label'] == 0]
+
+    log_seed_data(pos_seeds, "Positive", "likely hallucinations")
+    log_seed_data(neg_seeds, "Negative", "likely valid")
+
+    # Define features (trimmed to remove redundant transformations)
+    features = [
+        'avg_logprob',
+        'no_speech_prob',
+        'compression_ratio',
+        'log_duration',
+        'log_sps',
+        'log_raw_duration',
+        'log_raw_sps',
+        'duration_ratio',
+        'logprob_duration_interaction'
+    ]
 
     X = df[features].values
-    
+
     # Step 2: Train kNN on seed labels
     print("\nTraining k-NN classifier on seed labels...")
-    
+
     labeled_mask = df['seed_label'].notna()
     X_seed = X[labeled_mask]
     y_seed = df.loc[labeled_mask, 'seed_label'].values.astype(int)
-    
+
+    # Auto-select k based on seed data size
+    n_seed_samples = len(X_seed)
+    optimal_k = min(max(int(np.sqrt(n_seed_samples)), 3), n_seed_samples // 2)
+    print(f"Using k={optimal_k} neighbors (from {n_seed_samples} seed samples)")
+
     # Create pipeline with scaling (important for kNN)
     knn_pipeline = Pipeline([
         ('scale', StandardScaler()),
         ('knn', KNeighborsClassifier(
-            n_neighbors=15,          # adjust based on dataset size
+            n_neighbors=optimal_k,
             weights='distance'       # closer neighbors weigh more
         ))
     ])
-    
+
+    # --- step 2: train k-NN on seeds -------------------------------
+    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+
+    # out-of-fold probas for the seeds
+    seed_scores = cross_val_predict(
+        knn_pipeline,                # pipeline defined earlier
+        X_seed, y_seed,
+        cv=cv,
+        method="predict_proba"
+    )[:, 1]
+
+    # store the scores for thresholding
+    df.loc[labeled_mask, 'knn_score'] = seed_scores
+
+    # finally fit on the full seed set before scoring the rest
     knn_pipeline.fit(X_seed, y_seed)
-    
-    # Step 3: Predict on all data
-    print("Propagating labels to unlabeled segments...")
-    
-    # Get probabilities for all segments
-    proba = knn_pipeline.predict_proba(X)[:, 1]  # probability of being hallucination
-    df['knn_score'] = proba
-    
-    # Choose threshold - use 95th percentile of negative seeds
-    neg_seed_scores = proba[df['seed_label'] == 0]
-    threshold = max(0.05, np.percentile(neg_seed_scores, 95))
-    
-    print(f"\nChosen threshold: {threshold:.3f}")
-    print(f"Based on 95th percentile of negative seed scores")
-    
-    # Apply threshold
-    df['is_hallucination'] = (proba >= threshold).astype(int)
-    
+    df.loc[~labeled_mask, 'knn_score'] = knn_pipeline.predict_proba(
+        X[~labeled_mask])[:, 1]
+
+    # --- step 2 done: kNN scores are in df['knn_score'] ---------------------
+
+    # Debug: how are the scores distributed?
+    for lbl, mask in {
+        "Positive seeds":  df['seed_label'] == 1,
+        "Negative seeds":  df['seed_label'] == 0,
+        "Un-labelled":     df['seed_label'].isna()
+    }.items():
+        scores = df.loc[mask, 'knn_score']
+        if scores.empty:
+            continue
+        print(f"{lbl:15s} | n={len(scores):4d}  min={scores.min():.3f}  "
+              f"25%={scores.quantile(.25):.3f}  median={scores.median():.3f}  "
+              f"75%={scores.quantile(.75):.3f}  max={scores.max():.3f}")
+    print()  # blank line for readability
+
+    # Step 3: derive a threshold from the seed scores
+    print("Applying threshold to segment scores...")
+
+    neg_seed_scores = df.loc[df['seed_label'] == 0, 'knn_score']
+    pos_seed_scores = df.loc[df['seed_label'] == 1, 'knn_score']
+
+    max_neg = neg_seed_scores.max()
+    min_pos = pos_seed_scores.min()
+
+    if min_pos > max_neg:
+        # clear separation – use the midpoint
+        threshold = (max_neg + min_pos) / 2
+        reason = "mid-point between max-neg and min-pos"
+    else:
+        # fallback to percentile rule, but ensure it’s >0
+        threshold = np.percentile(neg_seed_scores, 95)
+        if threshold <= 0:
+            threshold = 1e-3
+        reason = "95th percentile of negative seeds"
+
+    print(f"\nChosen threshold: {threshold:.3f}  ({reason})")
+
+    df['is_hallucination'] = (df['knn_score'] >= threshold).astype(int)
+
     # Print results
     n_hallucinations = df['is_hallucination'].sum()
     print(f"\nDetected hallucinations: {n_hallucinations} ({n_hallucinations/len(df):.1%})")
-    
+
     # Step 4: Train final gradient boosting model on kNN labels
     print("\nTraining final Gradient Boosting classifier...")
-    
+
     X_final = df[features]
     y_final = df['is_hallucination']
-    
+
     # Split data
     X_train, X_test, y_train, y_test = train_test_split(
         X_final, y_final, test_size=0.3, stratify=y_final, random_state=42
     )
-    
+
     # Train model
     model = GradientBoostingClassifier(
         n_estimators=80,
@@ -239,19 +298,19 @@ def main():
         random_state=42
     )
     model.fit(X_train, y_train)
-    
+
     # Evaluate
     y_pred = model.predict(X_test)
     y_proba_gb = model.predict_proba(X_test)[:, 1]
-    
+
     print("\nFinal Model Performance:")
     print(classification_report(y_test, y_pred))
-    
+
     # Confusion matrix
     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
     tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
     fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
-    
+
     print(f"\nDetection rate (TPR): {tpr:.1%}")
     print(f"False positive rate (FPR): {fpr:.1%}")
 
@@ -263,10 +322,22 @@ def main():
 
     # Show example detections
     hallucination_examples = df[df['is_hallucination'] == 1].head(10)
-    if len(hallucination_examples) > 0:
-        print(f"\nExample detected hallucinations:")
-        for _, seg in hallucination_examples.iterrows():
-            print(f"  Score={seg['knn_score']:.3f}, SPS={seg['sps']:.1f}, text='{seg['text'][:60]}...'")
+    print(f"\nExample detected hallucinations:")
+    for _, seg in hallucination_examples.iterrows():
+        print(f"  Score={seg['knn_score']:.3f}, text='{seg['text']}'")
+
+    non_hallucination_examples = df[df['is_hallucination'] == 0].head(10)
+    print(f"\nExample detected non-hallucinations:")
+    for _, seg in non_hallucination_examples.iterrows():
+        print(f"  Score={seg['knn_score']:.3f}, text='{seg['text']}'")
+
+    # --- after training the GB model ---
+    gb_scores = model.predict_proba(X_final)[:, 1]
+
+    # choose threshold on GB scores, e.g. same 95-percentile rule
+    neg_scores = gb_scores[df['seed_label'] == 0]
+    threshold = np.percentile(neg_scores, 95)
+    print(f"\nPost-training threshold: {threshold:.3f}")
 
     # Save model
     model_dir = Path("Models")
@@ -274,16 +345,11 @@ def main():
 
     model_bundle = {
         "model": model,
-        "threshold": 0.5,  # Using standard threshold since we trained on binary labels
+        "threshold": threshold,
         "features": features,
-        "heuristic_thresholds": {
-            "avg_logprob_high": -1.0,
-            "compression_ratio_high": 2.4,
-            "sps_high": 9.0
-        }
     }
 
-    output_path = model_dir / "hallucination_filter_gb.pkl"
+    output_path = model_dir / "thankyou_filter_gb.pkl"
     joblib.dump(model_bundle, output_path)
     print(f"\nModel saved to: {output_path}")