#!/usr/bin/env python3 import copy import fileinput import os import osc_ctrl # python3 -m pip install pyaudio import pyaudio import threading import time import wave # python3 -m pip install git+https://github.com/openai/whisper.git # python3 -m pip install torch -f https://download.pytorch.org/whl/torch_stable.html import whisper class AudioState: CHUNK = 1024 FORMAT = pyaudio.paInt16 CHANNELS = 1 # This matches the framerate expected by whisper. RATE = 16000 # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. MAX_LENGTH_S = 30 # PyAudio object p = None # PyAudio stream object stream = None frames = [] frames_lock = threading.Lock() text = "" text_lock = threading.Lock() record_audio = True transcribe_audio = True send_audio = True def getMicStream(): audio_state = AudioState() audio_state.p = pyaudio.PyAudio() info = audio_state.p.get_host_api_info_by_index(0) numdevices = info.get('deviceCount') print("Finding index mic...") got_match = False device_index = -1 while got_match == False: for i in range(0, numdevices): if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0: device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name') #print("Input Device id ", i, " - ", device_name) if "Digital Audio Interface" in device_name: print("Got match: {}".format(device_name)) device_index = i got_match = True if got_match == False: print("No match, sleeping") time.sleep(3) audio_state.stream = audio_state.p.open(format=audio_state.FORMAT, channels=audio_state.CHANNELS, rate=audio_state.RATE, input=True, frames_per_buffer=audio_state.CHUNK, input_device_index=device_index) return audio_state # Continuously records audio as long as audio_state.record_audio is set. def recordAudio(audio_state): print("Recording audio") while audio_state.record_audio: data = audio_state.stream.read(audio_state.CHUNK) audio_state.frames_lock.acquire() audio_state.frames.append(data) max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: audio_state.frames = audio_state.frames[-1 * max_frames :] audio_state.frames_lock.release() print("Done recording") # Saves audio. recordAudio() may continue running while this takes place. def saveAudio(audio_state, filename): wf = wave.open(filename, 'wb') wf.setnchannels(audio_state.CHANNELS) wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT)) wf.setframerate(audio_state.RATE) audio_state.frames_lock.acquire() frames = copy.deepcopy(audio_state.frames) audio_state.frames_lock.release() wf.writeframes(b''.join(frames)) wf.close() def resetAudio(audio_state): audio_state.frames_lock.acquire() audio_state.frames = [] audio_state.frames_lock.release() # Transcribe the audio recorded in a file. def transcribe(model, filename): print("Loading audio") audio = whisper.load_audio(filename) audio = whisper.pad_or_trim(audio) mel = whisper.log_mel_spectrogram(audio).to(model.device) options = whisper.DecodingOptions(language = "en") result = whisper.decode(model, mel, options) print("Transcribed text: {}".format(result.text)) return result.text def transcribeAudio(audio_state, model): while audio_state.transcribe_audio == True: print("Saving audio") saveAudio(audio_state, "audio.wav") print("Beginning transcription") text = transcribe(model, "audio.wav") audio_state.text_lock.acquire() audio_state.text = text audio_state.text_lock.release() # Pace this out time.sleep(0.2) def sendAudio(audio_state): tx_state = osc_ctrl.OscTxState() while audio_state.send_audio == True: audio_state.text_lock.acquire() text = copy.deepcopy(audio_state.text) audio_state.text_lock.release() osc_ctrl.sendMessageLazy(text, tx_state) # Pace this out time.sleep(0.05) if __name__ == "__main__": audio_state = getMicStream() record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state]) record_audio_thd.daemon = True record_audio_thd.start() print("Safe to start talking") model = whisper.load_model("base") transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model]) transcribe_audio_thd.daemon = True transcribe_audio_thd.start() send_audio_thd = threading.Thread(target = sendAudio, args = [audio_state]) send_audio_thd.daemon = True send_audio_thd.start() print("Press enter to start a new message") for line in fileinput.input(): resetAudio(audio_state) if "exit" in line or "quit" in line: break print("Joining threads") audio_state.record_audio = False audio_state.transcribe_audio = False record_audio_thd.join() transcribe_audio_thd.join()