diff options
| author | yum <yum.food.vr@gmail.com> | 2022-10-24 23:08:11 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-10-24 23:13:47 -0700 |
| commit | 0102b725f60c91ca6d095c2b04de71db6d5b1fda (patch) | |
| tree | c87c84ad4f48577e90307ca2f2bf40baa18b6950 /transcribe.py | |
| parent | 08655f96dc798e3e129058a5e97c5aa7ff96e798 (diff) | |
Saying the word "clear" clears the board
While the board is clearing, you can keep talking; anything you say will be
rendered once the board finishes clearing.
* bugfix: STT only beeps when it's out
Diffstat (limited to 'transcribe.py')
| -rw-r--r-- | transcribe.py | 89 |
1 files changed, 80 insertions, 9 deletions
diff --git a/transcribe.py b/transcribe.py index 45b2a8e..fa3b166 100644 --- a/transcribe.py +++ b/transcribe.py @@ -32,6 +32,8 @@ class AudioState: # The minimum length that recordAudio() will wait for before saving audio. MIN_LENGTH_S = 1 + VOICE_AUDIO_FILENAME = "audio.wav" + # PyAudio object p = None @@ -46,10 +48,12 @@ class AudioState: # transcriptions before "committing" to a transcription. text_candidate = "" text_lock = threading.Lock() + clear_requested = False record_audio = True transcribe_audio = True send_audio = True + run_control_thread = True osc_client = osc_ctrl.getClient() @@ -131,15 +135,30 @@ def saveAudio(audio_state, filename): normalized = pydub_effects.normalize(raw) normalized.export(filename, format="wav") +def resetDiskAudioLocked(audio_state, filename): + wf = wave.open(filename, 'wb') + wf.setnchannels(audio_state.CHANNELS) + wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT)) + wf.setframerate(audio_state.RATE) + + wf.writeframes(b''.join([])) + wf.close() + +def resetAudioLocked(audio_state): + audio_state.frames = [] + def resetAudio(audio_state): audio_state.frames_lock.acquire() - audio_state.frames = [] + resetAudioLocked(audio_state) audio_state.frames_lock.release() # Transcribe the audio recorded in a file. 
def transcribe(model, filename): + audio_state.frames_lock.acquire() audio = whisper.load_audio(filename) + audio_state.frames_lock.release() + audio = whisper.pad_or_trim(audio) mel = whisper.log_mel_spectrogram(audio).to(model.device) #_, probs = model.detect_language(mel) @@ -155,16 +174,35 @@ def transcribe(model, filename): def transcribeAudio(audio_state, model): while audio_state.transcribe_audio == True: - saveAudio(audio_state, "audio.wav") + # Pace this out + time.sleep(0.05) - if not os.path.isfile("audio.wav"): + saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME) + + if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): time.sleep(0.1) continue - text = transcribe(model, "audio.wav") + text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME) audio_state.text_lock.acquire() + if audio_state.clear_requested: + audio_state.text = "" + audio_state.text_candidate = "" + audio_state.clear_requested = False + audio_state.text_lock.release() + continue + + words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() + print("words: {}".format(words)) + if len(words) > 0 and words[-1] == "clear": + audio_state.text = "" + audio_state.text_candidate = "" + audio_state.clear_requested = True + audio_state.text_lock.release() + continue + # We use a few heuristics to handle spurious mistranscriptions and to # handle events where we trim off the start of the audio clip. # 1. 
If we get 2 consecutive identical transcriptions, we commit to @@ -216,8 +254,6 @@ def transcribeAudio(audio_state, model): audio_state.text_lock.release() - # Pace this out - time.sleep(0.05) def sendAudio(audio_state): tx_state = osc_ctrl.OscTxState() while audio_state.send_audio == True: @@ -230,6 +266,34 @@ def sendAudio(audio_state): # Pace this out time.sleep(0.01) +def controlThread(audio_state): + while audio_state.run_control_thread: + time.sleep(0.1) + if audio_state.clear_requested: + print("here a") + audio_state.text_lock.acquire() + audio_state.frames_lock.acquire() + + if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): + # empty out the voice file + open(audio_state.VOICE_AUDIO_FILENAME, "w").close() + resetAudioLocked(audio_state) + resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME) + audio_state.clear_requested = False + + # Allow audio collection to resume now. If we don't do this, then + # any audio spoken while the board is slowly clearing will be lost. + audio_state.frames_lock.release() + + # Clearing can take a while, and the user might be talking in the + # meantime. So we drop audio state before clearing so the other + # threads can continue saving to it. + osc_ctrl.clear(audio_state.osc_client) + + audio_state.text_lock.release() + + print("here b") + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. 
Default: index") @@ -238,11 +302,12 @@ if __name__ == "__main__": if not args.mic: args.mic = "index" - if os.path.isfile("audio.wav"): - os.remove("audio.wav") - audio_state = getMicStream(args.mic) + if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): + # empty out the voice file + open(audio_state.VOICE_AUDIO_FILENAME, "w").close() + record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state]) record_audio_thd.daemon = True record_audio_thd.start() @@ -259,6 +324,10 @@ if __name__ == "__main__": send_audio_thd.daemon = True send_audio_thd.start() + control_thd = threading.Thread(target = controlThread, args = [audio_state]) + control_thd.daemon = True + control_thd.start() + print("Press enter to start a new message") for line in sys.stdin: resetAudio(audio_state) @@ -268,6 +337,8 @@ if __name__ == "__main__": print("Joining threads") audio_state.record_audio = False audio_state.transcribe_audio = False + audio_state.run_control_thread = False record_audio_thd.join() transcribe_audio_thd.join() + control_thd.join() |
