diff options
| author | yum <yum.food.vr@gmail.com> | 2022-10-15 18:49:51 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-10-15 18:49:51 -0700 |
| commit | 68db1b25f8f29a487dab0953248e0effdff68567 (patch) | |
| tree | 7b5164580320ba115e52229c32f7a744fad3fa44 | |
| parent | eba68f4fb35078327b75e99c25100ec1154efb13 (diff) | |
Further improve transcribe.py responsiveness
Add a third heuristic. If the transcription is relatively long (more
than 30 characters) and its first 10 characters differ from those of the
previous transcription, immediately overwrite the committed text.
Because the transcription is long, it's a bit less likely to be a
complete mistranscription.
| -rw-r--r-- | transcribe.py | 25 |
1 file changed, 24 insertions, 1 deletion
diff --git a/transcribe.py b/transcribe.py index 8369c43..73b2c92 100644 --- a/transcribe.py +++ b/transcribe.py @@ -4,6 +4,9 @@ import argparse import copy import os import osc_ctrl +# python3 -m pip install pydub +from pydub import AudioSegment as pydub_AudioSegment +from pydub import effects as pydub_effects # python3 -m pip install pyaudio import pyaudio import sys @@ -120,6 +123,12 @@ def saveAudio(audio_state, filename): wf.writeframes(b''.join(frames)) wf.close() + # Normalize volume. This seems to make the neural net a little more + # consistent. + raw = pydub_AudioSegment.from_wav(filename) + normalized = pydub_effects.normalize(raw) + normalized.export(filename, format="wav") + print("audio save") def resetAudio(audio_state): @@ -153,8 +162,22 @@ def transcribeAudio(audio_state, model): audio_state.text_lock.acquire() + # We use a few heuristics to handle spurious mistranscriptions and to + # handle events where we trim off the start of the audio clip. + # 1. If we get 2 consecutive identical transcriptions, we commit to + # the transcription. This reduces the number of + # mistranscriptions by a lot. + # 2. If the last transcription is a prefix of the current one, we + # immediately accept it, since the transcription is obviously + # somewhat stable. + # 3. If the transcription is somewhat long and the + # first few characters change, we assume this is due to a + # trim event and immediately accept the transcription. if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate): audio_state.text = text + elif len(text) > 30 and text[0:10] != audio_state.text_candidate[0:10]: + audio_state.text = text + audio_state.text_candidate = text audio_state.text_lock.release() @@ -176,7 +199,7 @@ def sendAudio(audio_state): osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state) # Pace this out - time.sleep(0.05) + time.sleep(0.01) if __name__ == "__main__": parser = argparse.ArgumentParser() |
