From d7c225ab3fcad600e93e9464886702fd269fedd5 Mon Sep 17 00:00:00 2001 From: yum Date: Thu, 20 Oct 2022 18:41:01 -0700 Subject: Quiet down transcribe.py Also adjust continuous transcription algorithm to use leftmost minimum instead of rightmost. This prevents some cases where we generate longer and longer text. --- osc_ctrl.py | 4 ++-- transcribe.py | 19 ++++++------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/osc_ctrl.py b/osc_ctrl.py index 761ff78..5b4b3f6 100644 --- a/osc_ctrl.py +++ b/osc_ctrl.py @@ -262,13 +262,13 @@ def sendMessageLazy(client, msg, tx_state): if cell_msg == [state.encoding[' ']] * NUM_LAYERS: if empty_cells_sent >= tx_state.empty_cells_to_send_per_call: - print("empty cell budget exceeded") + #print("empty cell budget exceeded") tx_state.last_msg_encoded = msg_encoded[0:cell_end] return False empty_cells_sent += 1 else: if nonempty_cells_sent >= tx_state.nonempty_cells_to_send_per_call: - print("nonempty cell budget exceeded") + #print("nonempty cell budget exceeded") tx_state.last_msg_encoded = msg_encoded[0:cell_end] return False nonempty_cells_sent += 1 diff --git a/transcribe.py b/transcribe.py index 20cd0ba..45b2a8e 100644 --- a/transcribe.py +++ b/transcribe.py @@ -131,8 +131,6 @@ def saveAudio(audio_state, filename): normalized = pydub_effects.normalize(raw) normalized.export(filename, format="wav") - print("audio save") - def resetAudio(audio_state): audio_state.frames_lock.acquire() audio_state.frames = [] @@ -149,8 +147,8 @@ def transcribe(model, filename): options = whisper.DecodingOptions(language = "en") result = whisper.decode(model, mel, options) - print("no speech prob: {}".format(result.no_speech_prob)) if result.no_speech_prob > 0.1: + print("no speech prob: {}".format(result.no_speech_prob)) return "" return result.text @@ -163,7 +161,6 @@ def transcribeAudio(audio_state, model): time.sleep(0.1) continue - print("Beginning transcription") text = transcribe(model, "audio.wav") audio_state.text_lock.acquire() @@ -185,9 +182,7 @@ def transcribeAudio(audio_state, model): elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]: commit_transcription = True - print("TRANSCRIPTION") - print("Previous: {}".format(audio_state.text)) - print("Current: {}".format(text)) + print("Transcription: {}".format(audio_state.text)) if commit_transcription: window_size = 20 @@ -202,22 +197,20 @@ def transcribeAudio(audio_state, model): new_slice = text[i:i + window_size] #print("Consider slice {}".format(new_slice)) d = levenshtein_distance(old_slice, new_slice) - if d <= best_match_d and d < window_size: + if d < best_match_d and d < window_size: best_match_i = i best_match_d = d if best_match_i == None: audio_state.text = text else: - print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size])) - print("Old prefix: {}".format(old_text[0:len(old_text) - window_size])) - print("New suffix: {}".format(text[best_match_i:])) - #new_text = old_text[0:max(len(old_text) - window_size, 0)] + #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size])) + #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size])) + #print("New suffix: {}".format(text[best_match_i:])) new_text = old_text[0:len(old_text) - window_size] new_text += text[best_match_i:] audio_state.text = new_text else: audio_state.text = text - audio_state.text_candidate = text -- cgit v1.2.3