diff options
| author | yum <yum.food.vr@gmail.com> | 2022-10-20 18:41:01 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-10-20 18:41:01 -0700 |
| commit | d7c225ab3fcad600e93e9464886702fd269fedd5 (patch) | |
| tree | e1c90ab928895cd782819c1d6da19f59a33a5afe | |
| parent | 247f163efd46a58b2fbb5e7e26e0d141252dc651 (diff) | |
Quiet down transcribe.py
Also adjust the continuous transcription algorithm to use the leftmost
minimum-distance (Levenshtein) overlap instead of the rightmost when
several candidate windows tie. This prevents some cases where we generate
longer and longer text.
| -rw-r--r-- | osc_ctrl.py | 4 | ||||
| -rw-r--r-- | transcribe.py | 19 |
2 files changed, 8 insertions, 15 deletions
diff --git a/osc_ctrl.py b/osc_ctrl.py index 761ff78..5b4b3f6 100644 --- a/osc_ctrl.py +++ b/osc_ctrl.py @@ -262,13 +262,13 @@ def sendMessageLazy(client, msg, tx_state): if cell_msg == [state.encoding[' ']] * NUM_LAYERS: if empty_cells_sent >= tx_state.empty_cells_to_send_per_call: - print("empty cell budget exceeded") + #print("empty cell budget exceeded") tx_state.last_msg_encoded = msg_encoded[0:cell_end] return False empty_cells_sent += 1 else: if nonempty_cells_sent >= tx_state.nonempty_cells_to_send_per_call: - print("nonempty cell budget exceeded") + #print("nonempty cell budget exceeded") tx_state.last_msg_encoded = msg_encoded[0:cell_end] return False nonempty_cells_sent += 1 diff --git a/transcribe.py b/transcribe.py index 20cd0ba..45b2a8e 100644 --- a/transcribe.py +++ b/transcribe.py @@ -131,8 +131,6 @@ def saveAudio(audio_state, filename): normalized = pydub_effects.normalize(raw) normalized.export(filename, format="wav") - print("audio save") - def resetAudio(audio_state): audio_state.frames_lock.acquire() audio_state.frames = [] @@ -149,8 +147,8 @@ def transcribe(model, filename): options = whisper.DecodingOptions(language = "en") result = whisper.decode(model, mel, options) - print("no speech prob: {}".format(result.no_speech_prob)) if result.no_speech_prob > 0.1: + print("no speech prob: {}".format(result.no_speech_prob)) return "" return result.text @@ -163,7 +161,6 @@ def transcribeAudio(audio_state, model): time.sleep(0.1) continue - print("Beginning transcription") text = transcribe(model, "audio.wav") audio_state.text_lock.acquire() @@ -185,9 +182,7 @@ def transcribeAudio(audio_state, model): elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]: commit_transcription = True - print("TRANSCRIPTION") - print("Previous: {}".format(audio_state.text)) - print("Current: {}".format(text)) + print("Transcription: {}".format(audio_state.text)) if commit_transcription: window_size = 20 @@ 
-202,22 +197,20 @@ def transcribeAudio(audio_state, model): new_slice = text[i:i + window_size] #print("Consider slice {}".format(new_slice)) d = levenshtein_distance(old_slice, new_slice) - if d <= best_match_d and d < window_size: + if d < best_match_d and d < window_size: best_match_i = i best_match_d = d if best_match_i == None: audio_state.text = text else: - print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size])) - print("Old prefix: {}".format(old_text[0:len(old_text) - window_size])) - print("New suffix: {}".format(text[best_match_i:])) - #new_text = old_text[0:max(len(old_text) - window_size, 0)] + #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size])) + #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size])) + #print("New suffix: {}".format(text[best_match_i:])) new_text = old_text[0:len(old_text) - window_size] new_text += text[best_match_i:] audio_state.text = new_text else: audio_state.text = text - audio_state.text_candidate = text |
