Improve transcription quality

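Apply the decoding heuristics described in the Whisper paper: beam search, plus rejecting results based on the no-speech probability, average log-probability, and compression ratio. This dramatically improves silence detection as well as overall transcription quality. I was able to read the entire demo script at speed without any serious transcription inaccuracies. Field testing is still TODO.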
author: yum <yum.food.vr@gmail.com> 2022-11-01 20:28:28 -0700
committer: yum <yum.food.vr@gmail.com> 2022-11-01 20:35:13 -0700
commit: b5ec06b14ec8085172535cf52f6633da226c8084 (patch)
tree: a1a4c467c12d92bfad8c80e6a9c08b35e6177416
parent: eb3fecd09f18233ba9e145a9092a7eb38ff965b8 (diff)
1 files changed, 24 insertions, 39 deletions
diff --git a/transcribe.py b/transcribe.py
index 4400999..be2275b 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -181,12 +181,21 @@ def transcribe(model, filename):
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
     #_, probs = model.detect_language(mel)
     #print(f"Detected language: {max(probs, key=probs.get)}")
-    options = whisper.DecodingOptions(language = "en")
+    options = whisper.DecodingOptions(language = "en",
+            beam_size = 3)
     result = whisper.decode(model, mel, options)
 
-    if result.no_speech_prob > 0.2:
+    if result.no_speech_prob > 0.15:
         print("no speech prob: {}".format(result.no_speech_prob))
-        return ""
+        return None
+
+    if result.avg_logprob < -1.0:
+        print("avg logprob: {}".format(result.avg_logprob))
+        return None
+
+    if result.compression_ratio > 2.4:
+        print("compression ratio: {}".format(result.compression_ratio))
+        return None
 
     return result.text
 
@@ -214,6 +223,8 @@ def transcribeAudio(audio_state, model):
             continue
 
         text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME)
+        if not text:
+            continue
 
         audio_state.text_lock.acquire()
 
@@ -229,44 +240,18 @@ def transcribeAudio(audio_state, model):
                 words = words[0:-1]
                 audio_state.display_paused = True
 
-        # We use a few heuristics to handle spurious mistranscriptions and to
-        # handle events where we trim off the start of the audio clip.
-        #   1. If we get 2 consecutive identical transcriptions, we commit to
-        #       the transcription. This reduces the number of
-        #       mistranscriptions by a lot.
-        #   2. If the last transcription is a prefix of the current one, we
-        #       immediately accept it, since the transcription is obviously
-        #       somewhat stable.
-        #   3. If the transcription is somewhat long and the
-        #       first few words change, we assume this is due to a
-        #       trim event and immediately accept the transcription.
-        candidate_words = ''.join(c for c in audio_state.text_candidate.lower() if (c.isalpha() or c == " ")).split()
-
-        candidate_words_are_prefix_of_text = False
-        if len(candidate_words) < len(words) and \
-                candidate_words == words[0:len(candidate_words)]:
-            candidate_words_are_prefix_of_text = True
-
-        commit_transcription = False
-        if words == candidate_words or candidate_words_are_prefix_of_text:
-            commit_transcription = True
-        elif len(words) >= 3 and len(candidate_words) >= 3 and \
-                words[0:3] != candidate_words[0:3]:
-            commit_transcription = True
-
         print("Transcription: {}".format(audio_state.text))
 
-        if commit_transcription:
-            old_text = audio_state.text
-            old_words = audio_state.text.split()
-            new_words = text.split()
-
-            audio_state.text = string_matcher.matchStringList(old_words, new_words)
-            if old_text != audio_state.text:
-                # We think the user said something, so  reset the amount of
-                # time we sleep between transcriptions to the minimum.
-                audio_state.transcribe_no_change_count = 0
-                audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
+        old_text = audio_state.text
+        old_words = audio_state.text.split()
+        new_words = text.split()
+
+        audio_state.text = string_matcher.matchStringList(old_words, new_words)
+        if old_text != audio_state.text:
+            # We think the user said something, so  reset the amount of
+            # time we sleep between transcriptions to the minimum.
+            audio_state.transcribe_no_change_count = 0
+            audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
         audio_state.text_candidate = text
author	yum <yum.food.vr@gmail.com>	2022-11-01 20:28:28 -0700
committer	yum <yum.food.vr@gmail.com>	2022-11-01 20:35:13 -0700
commit	b5ec06b14ec8085172535cf52f6633da226c8084 (patch)
tree	a1a4c467c12d92bfad8c80e6a9c08b35e6177416
parent	eb3fecd09f18233ba9e145a9092a7eb38ff965b8 (diff)