From d7c225ab3fcad600e93e9464886702fd269fedd5 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Thu, 20 Oct 2022 18:41:01 -0700
Subject: Quiet down transcribe.py

Also adjust the continuous transcription algorithm to use the leftmost
minimum-distance match instead of the rightmost. This prevents some cases
where we generate longer and longer text.
---
 osc_ctrl.py   |  4 ++--
 transcribe.py | 19 ++++++-------------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/osc_ctrl.py b/osc_ctrl.py
index 761ff78..5b4b3f6 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -262,13 +262,13 @@ def sendMessageLazy(client, msg, tx_state):
 
         if cell_msg == [state.encoding[' ']] * NUM_LAYERS:
             if empty_cells_sent >= tx_state.empty_cells_to_send_per_call:
-                print("empty cell budget exceeded")
+                #print("empty cell budget exceeded")
                 tx_state.last_msg_encoded = msg_encoded[0:cell_end]
                 return False
             empty_cells_sent += 1
         else:
             if nonempty_cells_sent >= tx_state.nonempty_cells_to_send_per_call:
-                print("nonempty cell budget exceeded")
+                #print("nonempty cell budget exceeded")
                 tx_state.last_msg_encoded = msg_encoded[0:cell_end]
                 return False
             nonempty_cells_sent += 1
diff --git a/transcribe.py b/transcribe.py
index 20cd0ba..45b2a8e 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -131,8 +131,6 @@ def saveAudio(audio_state, filename):
     normalized = pydub_effects.normalize(raw)
     normalized.export(filename, format="wav")
 
-    print("audio save")
-
 def resetAudio(audio_state):
     audio_state.frames_lock.acquire()
     audio_state.frames = []
@@ -149,8 +147,8 @@ def transcribe(model, filename):
     options = whisper.DecodingOptions(language = "en")
     result = whisper.decode(model, mel, options)
 
-    print("no speech prob: {}".format(result.no_speech_prob))
     if result.no_speech_prob > 0.1:
+        print("no speech prob: {}".format(result.no_speech_prob))
         return ""
 
     return result.text
@@ -163,7 +161,6 @@ def transcribeAudio(audio_state, model):
             time.sleep(0.1)
             continue
 
-        print("Beginning transcription")
         text = transcribe(model, "audio.wav")
 
         audio_state.text_lock.acquire()
@@ -185,9 +182,7 @@ def transcribeAudio(audio_state, model):
         elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]:
             commit_transcription = True
 
-        print("TRANSCRIPTION")
-        print("Previous: {}".format(audio_state.text))
-        print("Current:  {}".format(text))
+        print("Transcription: {}".format(audio_state.text))
 
         if commit_transcription:
             window_size = 20
@@ -202,22 +197,20 @@ def transcribeAudio(audio_state, model):
                     new_slice = text[i:i + window_size]
                     #print("Consider slice {}".format(new_slice))
                     d = levenshtein_distance(old_slice, new_slice)
-                    if d <= best_match_d and d < window_size:
+                    if d < best_match_d and d < window_size:
                         best_match_i = i
                         best_match_d = d
                 if best_match_i == None:
                     audio_state.text = text
                 else:
-                    print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
-                    print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
-                    print("New suffix: {}".format(text[best_match_i:]))
-                    #new_text = old_text[0:max(len(old_text) - window_size, 0)]
+                    #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
+                    #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
+                    #print("New suffix: {}".format(text[best_match_i:]))
                     new_text = old_text[0:len(old_text) - window_size]
                     new_text += text[best_match_i:]
                     audio_state.text = new_text
             else:
                 audio_state.text = text
-                
 
         audio_state.text_candidate = text
 
-- 
cgit v1.2.3