Quiet down transcribe.py

Also adjust the continuous transcription algorithm to use the leftmost minimum-distance match (the first window with the smallest Levenshtein distance) instead of the rightmost. This prevents some cases where we generate longer and longer text.
author: yum <yum.food.vr@gmail.com> 2022-10-20 18:41:01 -0700
committer: yum <yum.food.vr@gmail.com> 2022-10-20 18:41:01 -0700
commit: d7c225ab3fcad600e93e9464886702fd269fedd5 (patch)
tree: e1c90ab928895cd782819c1d6da19f59a33a5afe /transcribe.py
parent: 247f163efd46a58b2fbb5e7e26e0d141252dc651 (diff)
1 file changed, 6 insertions, 13 deletions
diff --git a/transcribe.py b/transcribe.py
index 20cd0ba..45b2a8e 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -131,8 +131,6 @@ def saveAudio(audio_state, filename):
     normalized = pydub_effects.normalize(raw)
     normalized.export(filename, format="wav")
 
-    print("audio save")
-
 def resetAudio(audio_state):
     audio_state.frames_lock.acquire()
     audio_state.frames = []
@@ -149,8 +147,8 @@ def transcribe(model, filename):
     options = whisper.DecodingOptions(language = "en")
     result = whisper.decode(model, mel, options)
 
-    print("no speech prob: {}".format(result.no_speech_prob))
     if result.no_speech_prob > 0.1:
+        print("no speech prob: {}".format(result.no_speech_prob))
         return ""
 
     return result.text
@@ -163,7 +161,6 @@ def transcribeAudio(audio_state, model):
             time.sleep(0.1)
             continue
 
-        print("Beginning transcription")
         text = transcribe(model, "audio.wav")
 
         audio_state.text_lock.acquire()
@@ -185,9 +182,7 @@ def transcribeAudio(audio_state, model):
         elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]:
             commit_transcription = True
 
-        print("TRANSCRIPTION")
-        print("Previous: {}".format(audio_state.text))
-        print("Current:  {}".format(text))
+        print("Transcription: {}".format(audio_state.text))
 
         if commit_transcription:
             window_size = 20
@@ -202,22 +197,20 @@ def transcribeAudio(audio_state, model):
                     new_slice = text[i:i + window_size]
                     #print("Consider slice {}".format(new_slice))
                     d = levenshtein_distance(old_slice, new_slice)
-                    if d <= best_match_d and d < window_size:
+                    if d < best_match_d and d < window_size:
                         best_match_i = i
                         best_match_d = d
                 if best_match_i == None:
                     audio_state.text = text
                 else:
-                    print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
-                    print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
-                    print("New suffix: {}".format(text[best_match_i:]))
-                    #new_text = old_text[0:max(len(old_text) - window_size, 0)]
+                    #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
+                    #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
+                    #print("New suffix: {}".format(text[best_match_i:]))
                     new_text = old_text[0:len(old_text) - window_size]
                     new_text += text[best_match_i:]
                     audio_state.text = new_text
             else:
                 audio_state.text = text
-                
 
         audio_state.text_candidate = text
author	yum <yum.food.vr@gmail.com>	2022-10-20 18:41:01 -0700
committer	yum <yum.food.vr@gmail.com>	2022-10-20 18:41:01 -0700
commit	d7c225ab3fcad600e93e9464886702fd269fedd5 (patch)
tree	e1c90ab928895cd782819c1d6da19f59a33a5afe /transcribe.py
parent	247f163efd46a58b2fbb5e7e26e0d141252dc651 (diff)