From 2793ac9dd31059f2fc29f7978bcb688a7de664ed Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Fri, 7 Jul 2023 01:57:56 -0700
Subject: Filter out segments based on avg_log_prob & no_speech_prob

Surprisingly, the threshold arguments to transcribe() (such as
log_prob_threshold) do not cause it to omit the offending segments from
the result, so we have to filter them out manually. Hallucinated phrases
generally have a low avg_log_prob, a high no_speech_prob, or both.
---
 Scripts/transcribe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'Scripts')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 39c6563..cebd70d 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -278,12 +278,15 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             audio,
             beam_size = 5,
             language = audio_state.language,
-            temperature = [0.0],
+            temperature = [0.0, 0.2, 0.4],
+            log_prob_threshold = -1.0,
             vad_filter = True,
             condition_on_previous_text = True,
             without_timestamps = False)
     ranges = []
     for s in segments:
+        if s.avg_log_prob < -1.0 or s.no_speech_prob > 0.6:
+            continue
         if audio_state.enable_debug_mode:
             print(f"Segment: {s}")
         ranges.append((s.start, s.end, s.text))
-- 
cgit v1.2.3