diff options
| author | yum <yum.food.vr@gmail.com> | 2022-10-16 23:48:15 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-10-17 00:24:57 -0700 |
| commit | 247f163efd46a58b2fbb5e7e26e0d141252dc651 (patch) | |
| tree | 97354f82193410360f1606d779b1facfaf1f4f90 | |
| parent | 6eb03b1e369286a3e0417b3229f50f00f5760e8a (diff) | |
Add continuous transcription mode
Algorithm:
* look at last 20 chars of last committed transcription
* scan new transcription using a 20-char sliding window
* find the spot where the Levenshtein distance to those chars is minimized
* stitch two messages together
Thus we're able to maintain a continuously growing transcription
without having to feed the AI more than 30 seconds of data at a
time. Seems to work reasonably well in bench tests.
Also fix silence detection. The AI exposes a probability that nothing
was said (no_speech_prob). Hand-pick a threshold of 0.1: anything above
it is treated as silence and yields an empty transcription. Sometimes
the AI still goes sicko mode with this setting, but going higher
occasionally results in no transcription.
| -rw-r--r-- | osc_ctrl.py | 2 | ||||
| -rw-r--r-- | transcribe.py | 57 |
2 files changed, 41 insertions, 18 deletions
diff --git a/osc_ctrl.py b/osc_ctrl.py index d80f055..761ff78 100644 --- a/osc_ctrl.py +++ b/osc_ctrl.py @@ -241,8 +241,6 @@ def resizeBoard(num_lines, tx_state, shrink_only): # This may take multiple calls to complete. Returns True once it's done. def sendMessageLazy(client, msg, tx_state): lines = splitMessage(msg) - #resizeBoard(len(lines), tx_state, shrink_only=False) - msg_encoded = encodeMessage(lines) msg_encoded_len = len(msg_encoded) diff --git a/transcribe.py b/transcribe.py index 9170152..20cd0ba 100644 --- a/transcribe.py +++ b/transcribe.py @@ -2,6 +2,8 @@ import argparse import copy +# python3 -m pip install python-Levenshtein +from Levenshtein import distance as levenshtein_distance import os import osc_ctrl # python3 -m pip install pydub @@ -142,11 +144,15 @@ def transcribe(model, filename): audio = whisper.load_audio(filename) audio = whisper.pad_or_trim(audio) mel = whisper.log_mel_spectrogram(audio).to(model.device) - _, probs = model.detect_language(mel) - print(f"Detected language: {max(probs, key=probs.get)}") - options = whisper.DecodingOptions() + #_, probs = model.detect_language(mel) + #print(f"Detected language: {max(probs, key=probs.get)}") + options = whisper.DecodingOptions(language = "en") result = whisper.decode(model, mel, options) + print("no speech prob: {}".format(result.no_speech_prob)) + if result.no_speech_prob > 0.1: + return "" + return result.text def transcribeAudio(audio_state, model): @@ -177,28 +183,48 @@ def transcribeAudio(audio_state, model): if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate): commit_transcription = True elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]: - audio_state.text = text commit_transcription = True - if commit_transcription: - old_len = len(audio_state.text_candidate) - new_len = len(text) - min_len = min(old_len, new_len) - overlap_fraction = 0.2 - overlap_len = int(0.2 * min_len) + 
print("TRANSCRIPTION") + print("Previous: {}".format(audio_state.text)) + print("Current: {}".format(text)) - if audio_state.text_candidate[old_len - overlap_len:old_len] == text_state[0:overlap_len] + if commit_transcription: + window_size = 20 + old_text = audio_state.text + if audio_state.text == text: + pass + elif len(text) >= window_size and len(old_text) >= window_size: + old_slice = old_text[len(old_text) - window_size:] + best_match_i = None + best_match_d = window_size * 1000 + for i in range(0, 1 + len(text) - window_size): + new_slice = text[i:i + window_size] + #print("Consider slice {}".format(new_slice)) + d = levenshtein_distance(old_slice, new_slice) + if d <= best_match_d and d < window_size: + best_match_i = i + best_match_d = d + if best_match_i == None: + audio_state.text = text + else: + print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size])) + print("Old prefix: {}".format(old_text[0:len(old_text) - window_size])) + print("New suffix: {}".format(text[best_match_i:])) + #new_text = old_text[0:max(len(old_text) - window_size, 0)] + new_text = old_text[0:len(old_text) - window_size] + new_text += text[best_match_i:] + audio_state.text = new_text + else: audio_state.text = text + + audio_state.text_candidate = text audio_state.text_lock.release() - print("Transcription: {}".format(audio_state.text)) - print("Candidate: {}".format(audio_state.text_candidate)) - # Pace this out time.sleep(0.05) - def sendAudio(audio_state): tx_state = osc_ctrl.OscTxState() while audio_state.send_audio == True: @@ -206,7 +232,6 @@ def sendAudio(audio_state): text = copy.deepcopy(audio_state.text) audio_state.text_lock.release() - print("here") osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state) # Pace this out |
