diff options
Diffstat (limited to 'transcribe.py')
| -rw-r--r-- | transcribe.py | 36 |
1 files changed, 27 insertions, 9 deletions
diff --git a/transcribe.py b/transcribe.py index dc36541..8369c43 100644 --- a/transcribe.py +++ b/transcribe.py @@ -12,8 +12,7 @@ import time import wave # python3 -m pip install git+https://github.com/openai/whisper.git # python3 -m pip install torch -f https://download.pytorch.org/whl/torch_stable.html -from whisper import transcribe as whisper_transcribe -from whisper import load_model as whisper_load_model +import whisper class AudioState: CHUNK = 1024 @@ -24,9 +23,9 @@ class AudioState: # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. - MAX_LENGTH_S = 90 + MAX_LENGTH_S = 25 # The minimum length that recordAudio() will wait for before saving audio. - MIN_LENGTH_S = 3 + MIN_LENGTH_S = 1 # PyAudio object p = None @@ -38,6 +37,9 @@ class AudioState: frames_lock = threading.Lock() text = "" + # To improve temporal stability, we require two consecutive identical + # transcriptions before "committing" to a transcription. + text_candidate = "" text_lock = threading.Lock() record_audio = True @@ -118,6 +120,8 @@ def saveAudio(audio_state, filename): wf.writeframes(b''.join(frames)) wf.close() + print("audio save") + def resetAudio(audio_state): audio_state.frames_lock.acquire() audio_state.frames = [] @@ -125,8 +129,16 @@ def resetAudio(audio_state): # Transcribe the audio recorded in a file. def transcribe(model, filename): - result = whisper_transcribe(model=model, audio=filename, language="en") - return result["text"] + + audio = whisper.load_audio(filename) + audio = whisper.pad_or_trim(audio) + mel = whisper.log_mel_spectrogram(audio).to(model.device) + _, probs = model.detect_language(mel) + print(f"Detected language: {max(probs, key=probs.get)}") + options = whisper.DecodingOptions() + result = whisper.decode(model, mel, options) + + return result.text def transcribeAudio(audio_state, model): while audio_state.transcribe_audio == True: @@ -140,13 +152,18 @@ def transcribeAudio(audio_state, model): text = transcribe(model, "audio.wav") audio_state.text_lock.acquire() - audio_state.text = text + + if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate): + audio_state.text = text + audio_state.text_candidate = text + audio_state.text_lock.release() print("Transcription: {}".format(audio_state.text)) + print("Candidate: {}".format(audio_state.text_candidate)) # Pace this out - time.sleep(0.2) + time.sleep(0.05) def sendAudio(audio_state): tx_state = osc_ctrl.OscTxState() @@ -155,6 +172,7 @@ def sendAudio(audio_state): text = copy.deepcopy(audio_state.text) audio_state.text_lock.release() + print("here") osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state) # Pace this out @@ -179,7 +197,7 @@ if __name__ == "__main__": print("Safe to start talking") - model = whisper_load_model("base") + model = whisper.load_model("base") transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model]) transcribe_audio_thd.daemon = True |
