summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-10-16 23:48:15 -0700
committeryum <yum.food.vr@gmail.com>2022-10-17 00:24:57 -0700
commit247f163efd46a58b2fbb5e7e26e0d141252dc651 (patch)
tree97354f82193410360f1606d779b1facfaf1f4f90
parent6eb03b1e369286a3e0417b3229f50f00f5760e8a (diff)
Add continuous transcription mode
Algorithm: * look at last 20 chars of last committed transcription * scan new transcription using 10-char sliding window * find spot where distance is minimized * stitch two messages together Thus we're able to maintain a continuously growing transcription without having to feed the AI more than 30 seconds of data at a time. Seems to work reasonably well in bench tests. Also fix silence detection. AI exposes a probability that nothing was said. Hand-pick a probability of 0.1. Sometimes the AI still goes sicko mode with this setting but going higher occasionally results in no transcription.
-rw-r--r--osc_ctrl.py2
-rw-r--r--transcribe.py57
2 files changed, 41 insertions, 18 deletions
diff --git a/osc_ctrl.py b/osc_ctrl.py
index d80f055..761ff78 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -241,8 +241,6 @@ def resizeBoard(num_lines, tx_state, shrink_only):
# This may take multiple calls to complete. Returns True once it's done.
def sendMessageLazy(client, msg, tx_state):
lines = splitMessage(msg)
- #resizeBoard(len(lines), tx_state, shrink_only=False)
-
msg_encoded = encodeMessage(lines)
msg_encoded_len = len(msg_encoded)
diff --git a/transcribe.py b/transcribe.py
index 9170152..20cd0ba 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,6 +2,8 @@
import argparse
import copy
+# python3 -m pip install python-Levenshtein
+from Levenshtein import distance as levenshtein_distance
import os
import osc_ctrl
# python3 -m pip install pydub
@@ -142,11 +144,15 @@ def transcribe(model, filename):
audio = whisper.load_audio(filename)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
- _, probs = model.detect_language(mel)
- print(f"Detected language: {max(probs, key=probs.get)}")
- options = whisper.DecodingOptions()
+ #_, probs = model.detect_language(mel)
+ #print(f"Detected language: {max(probs, key=probs.get)}")
+ options = whisper.DecodingOptions(language = "en")
result = whisper.decode(model, mel, options)
+ print("no speech prob: {}".format(result.no_speech_prob))
+ if result.no_speech_prob > 0.1:
+ return ""
+
return result.text
def transcribeAudio(audio_state, model):
@@ -177,28 +183,48 @@ def transcribeAudio(audio_state, model):
if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate):
commit_transcription = True
elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]:
- audio_state.text = text
commit_transcription = True
- if commit_transcription:
- old_len = len(audio_state.text_candidate)
- new_len = len(text)
- min_len = min(old_len, new_len)
- overlap_fraction = 0.2
- overlap_len = int(0.2 * min_len)
+ print("TRANSCRIPTION")
+ print("Previous: {}".format(audio_state.text))
+ print("Current: {}".format(text))
- if audio_state.text_candidate[old_len - overlap_len:old_len] == text_state[0:overlap_len]
+ if commit_transcription:
+ window_size = 20
+ old_text = audio_state.text
+ if audio_state.text == text:
+ pass
+ elif len(text) >= window_size and len(old_text) >= window_size:
+ old_slice = old_text[len(old_text) - window_size:]
+ best_match_i = None
+ best_match_d = window_size * 1000
+ for i in range(0, 1 + len(text) - window_size):
+ new_slice = text[i:i + window_size]
+ #print("Consider slice {}".format(new_slice))
+ d = levenshtein_distance(old_slice, new_slice)
+ if d <= best_match_d and d < window_size:
+ best_match_i = i
+ best_match_d = d
+ if best_match_i == None:
+ audio_state.text = text
+ else:
+ print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
+ print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
+ print("New suffix: {}".format(text[best_match_i:]))
+ #new_text = old_text[0:max(len(old_text) - window_size, 0)]
+ new_text = old_text[0:len(old_text) - window_size]
+ new_text += text[best_match_i:]
+ audio_state.text = new_text
+ else:
audio_state.text = text
+
+
audio_state.text_candidate = text
audio_state.text_lock.release()
- print("Transcription: {}".format(audio_state.text))
- print("Candidate: {}".format(audio_state.text_candidate))
-
# Pace this out
time.sleep(0.05)
-
def sendAudio(audio_state):
tx_state = osc_ctrl.OscTxState()
while audio_state.send_audio == True:
@@ -206,7 +232,6 @@ def sendAudio(audio_state):
text = copy.deepcopy(audio_state.text)
audio_state.text_lock.release()
- print("here")
osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state)
# Pace this out