summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2022-10-20 18:41:01 -0700
committer: yum <yum.food.vr@gmail.com> 2022-10-20 18:41:01 -0700
commit: d7c225ab3fcad600e93e9464886702fd269fedd5 (patch)
tree: e1c90ab928895cd782819c1d6da19f59a33a5afe
parent: 247f163efd46a58b2fbb5e7e26e0d141252dc651 (diff)
Quiet down transcribe.py
Also adjust the continuous transcription algorithm to use the leftmost minimum-distance overlap match instead of the rightmost (ties in edit distance now keep the earliest position). This prevents some cases where we generate longer and longer text.
-rw-r--r-- osc_ctrl.py 4
-rw-r--r-- transcribe.py 19
2 files changed, 8 insertions, 15 deletions
diff --git a/osc_ctrl.py b/osc_ctrl.py
index 761ff78..5b4b3f6 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -262,13 +262,13 @@ def sendMessageLazy(client, msg, tx_state):
if cell_msg == [state.encoding[' ']] * NUM_LAYERS:
if empty_cells_sent >= tx_state.empty_cells_to_send_per_call:
- print("empty cell budget exceeded")
+ #print("empty cell budget exceeded")
tx_state.last_msg_encoded = msg_encoded[0:cell_end]
return False
empty_cells_sent += 1
else:
if nonempty_cells_sent >= tx_state.nonempty_cells_to_send_per_call:
- print("nonempty cell budget exceeded")
+ #print("nonempty cell budget exceeded")
tx_state.last_msg_encoded = msg_encoded[0:cell_end]
return False
nonempty_cells_sent += 1
diff --git a/transcribe.py b/transcribe.py
index 20cd0ba..45b2a8e 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -131,8 +131,6 @@ def saveAudio(audio_state, filename):
normalized = pydub_effects.normalize(raw)
normalized.export(filename, format="wav")
- print("audio save")
-
def resetAudio(audio_state):
audio_state.frames_lock.acquire()
audio_state.frames = []
@@ -149,8 +147,8 @@ def transcribe(model, filename):
options = whisper.DecodingOptions(language = "en")
result = whisper.decode(model, mel, options)
- print("no speech prob: {}".format(result.no_speech_prob))
if result.no_speech_prob > 0.1:
+ print("no speech prob: {}".format(result.no_speech_prob))
return ""
return result.text
@@ -163,7 +161,6 @@ def transcribeAudio(audio_state, model):
time.sleep(0.1)
continue
- print("Beginning transcription")
text = transcribe(model, "audio.wav")
audio_state.text_lock.acquire()
@@ -185,9 +182,7 @@ def transcribeAudio(audio_state, model):
elif len(text) > 30 and len(audio_state.text_candidate) >= 10 and text[0:10] != audio_state.text_candidate[0:10]:
commit_transcription = True
- print("TRANSCRIPTION")
- print("Previous: {}".format(audio_state.text))
- print("Current: {}".format(text))
+ print("Transcription: {}".format(audio_state.text))
if commit_transcription:
window_size = 20
@@ -202,22 +197,20 @@ def transcribeAudio(audio_state, model):
new_slice = text[i:i + window_size]
#print("Consider slice {}".format(new_slice))
d = levenshtein_distance(old_slice, new_slice)
- if d <= best_match_d and d < window_size:
+ if d < best_match_d and d < window_size:
best_match_i = i
best_match_d = d
if best_match_i == None:
audio_state.text = text
else:
- print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
- print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
- print("New suffix: {}".format(text[best_match_i:]))
- #new_text = old_text[0:max(len(old_text) - window_size, 0)]
+ #print("Best overlap: {}, {}".format(best_match_d, text[best_match_i:best_match_i + window_size]))
+ #print("Old prefix: {}".format(old_text[0:len(old_text) - window_size]))
+ #print("New suffix: {}".format(text[best_match_i:]))
new_text = old_text[0:len(old_text) - window_size]
new_text += text[best_match_i:]
audio_state.text = new_text
else:
audio_state.text = text
-
audio_state.text_candidate = text