diff options
| author | yum <yum.food.vr@gmail.com> | 2022-12-14 21:45:30 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-12-14 23:04:29 -0800 |
| commit | 8326dee0bf01956b450858212cbdba3403b32b0d (patch) | |
| tree | 3c218f1e3159543df9b935dd6a43685a0811107d | |
| parent | f9bcee6acf6f8a418d2f96368941fc05e4b72252 (diff) | |
Optimize transcription latency
Shave off ~500ms of latency that was caused by locking. Acquiring a
threading.Lock was taking hundreds of milliseconds, and the global
interpreter lock already prevents most crash-inducing race conditions,
so just remove the locks.
Avoid writing audio to disk, saving more time (and disk wear / IOPS).
Add basic profiling to transcribe().
Omit timestamps, since we don't use them (maybe we should!)
Shorten the noise indicator sounds (Noise_On.wav / Noise_Off.wav) to 350ms.
Whisper's tendency to repeat tokens causes certain transcriptions to
take many seconds. I haven't thought about how to fix this yet.
| -rw-r--r-- | Sounds/Noise_Off.wav | bin | 159822 -> 67278 bytes | |||
| -rw-r--r-- | Sounds/Noise_On.wav | bin | 90190 -> 67278 bytes | |||
| -rw-r--r-- | transcribe.py | 115 |
3 files changed, 34 insertions, 81 deletions
diff --git a/Sounds/Noise_Off.wav b/Sounds/Noise_Off.wav Binary files differindex 7c8a9bf..0d3843c 100644 --- a/Sounds/Noise_Off.wav +++ b/Sounds/Noise_Off.wav diff --git a/Sounds/Noise_On.wav b/Sounds/Noise_On.wav Binary files differindex 925b3ed..28c8f6b 100644 --- a/Sounds/Noise_On.wav +++ b/Sounds/Noise_On.wav diff --git a/transcribe.py b/transcribe.py index f9aee9f..62e6add 100644 --- a/transcribe.py +++ b/transcribe.py @@ -41,8 +41,6 @@ class AudioState: # The minimum length that recordAudio() will wait for before saving audio. MIN_LENGTH_S = 1 - VOICE_AUDIO_FILENAME = "audio.wav" - # PyAudio object p = None @@ -131,60 +129,19 @@ def recordAudio(audio_state): time.sleep(0.1) continue - audio_state.audio_lock.acquire() audio_state.frames.append(data) max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK) if len(audio_state.frames) > max_frames: audio_state.frames = audio_state.frames[-1 * max_frames :] - audio_state.audio_lock.release() print("Done recording") -# Saves audio. recordAudio() may continue running while this takes place. -def saveAudio(audio_state, filename): - min_frames = int(audio_state.RATE * audio_state.MIN_LENGTH_S / audio_state.CHUNK) - if len(audio_state.frames) < min_frames: - return - - wf = wave.open(filename, 'wb') - wf.setnchannels(audio_state.CHANNELS) - wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT)) - wf.setframerate(audio_state.RATE) - - audio_state.audio_lock.acquire() - frames = copy.deepcopy(audio_state.frames) - audio_state.audio_lock.release() - - wf.writeframes(b''.join(frames)) - wf.close() - - # Normalize volume. This seems to make the neural net a little more - # consistent. 
- raw = pydub_AudioSegment.from_wav(filename) - normalized = pydub_effects.normalize(raw) - normalized.export(filename, format="wav") - -def resetDiskAudioLocked(audio_state, filename): - if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): - # empty out the voice file - open(audio_state.VOICE_AUDIO_FILENAME, "w").close() - - wf = wave.open(filename, 'wb') - wf.setnchannels(audio_state.CHANNELS) - wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT)) - wf.setframerate(audio_state.RATE) - - wf.writeframes(b''.join([])) - wf.close() - def resetAudioLocked(audio_state): audio_state.frames = [] audio_state.transcribe_no_change_count = 0 audio_state.transcribe_sleep_duration = \ audio_state.transcribe_sleep_duration_min_s - resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME) - audio_state.committed_text = "" audio_state.text = "" @@ -199,14 +156,21 @@ def resetAudio(audio_state): audio_state.transcribe_lock.release() # Transcribe the audio recorded in a file. -def transcribe(audio_state, model, filename): +def transcribe(audio_state, model, frames): - audio_state.transcribe_lock.acquire() - audio = whisper.load_audio(filename) - audio_state.transcribe_lock.release() + start_time = time.time() + + frames = audio_state.frames + # Convert from signed 16-bit int [-32768, 32767] to signed 16-bit float on + # [-1, 1]. + # We should technically acquire a lock to protect frames, but this is + # really slow and in practice it doesn't make the app crash, so who cares. 
+ frames = np.asarray(audio_state.frames) + audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0 audio = whisper.pad_or_trim(audio, length = audio_state.RATE * audio_state.MAX_LENGTH_S_WHISPER) + mel = whisper.log_mel_spectrogram(audio).to(model.device) result = None @@ -215,29 +179,33 @@ def transcribe(audio_state, model, filename): for temp in (0.00,): print("temp: {}".format(temp)) options = whisper.DecodingOptions(language = audio_state.language, - beam_size = 5, temperature = temp) + beam_size = 5, temperature = temp, without_timestamps = True) result = whisper.decode(model, mel, options) if result.avg_logprob < -1.0: print("avg logprob: {}".format(result.avg_logprob)) + result = None continue if result.compression_ratio > 2.4: print("compression ratio: {}".format(result.compression_ratio)) + result = None continue if result.no_speech_prob > 0.60: print("no speech prob: {}".format(result.no_speech_prob)) - return None + result = None + continue - return result.text + result = result.text + break - return None + return result def transcribeAudio(audio_state, model): + last_transcribe_time = time.time() while audio_state.run_app == True: # Pace this out - print("sleep duration: {}".format(audio_state.transcribe_sleep_duration)) time.sleep(audio_state.transcribe_sleep_duration) # Increase sleep time. 
Code below will set sleep time back to minimum @@ -249,53 +217,44 @@ def transcribeAudio(audio_state, model): audio_state.transcribe_sleep_duration = min( audio_state.transcribe_sleep_duration_max_s, longer_sleep_dur) - print("next sleep duration: {}".format(audio_state.transcribe_sleep_duration)) - - saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME) - if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): - time.sleep(0.1) - continue - - text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME) + text = transcribe(audio_state, model, audio_state.frames) if not text: + print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) + last_transcribe_time = time.time() continue - audio_state.transcribe_lock.acquire() if audio_state.drop_transcription: audio_state.drop_transcription = False - audio_state.transcribe_lock.release() + print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time)) + last_transcribe_time = time.time() continue words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() - print("Transcription: {}".format(audio_state.text)) + now = time.time() + print("Transcription ({} seconds): {}".format( + now - last_transcribe_time, + audio_state.text)) + last_transcribe_time = now old_text = audio_state.text - #old_words = audio_state.text.split() - #new_words = text.split() audio_state.text = string_matcher.matchStrings(audio_state.text, text, window_size = 30) - #audio_state.text = text if old_text != audio_state.text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. 
audio_state.transcribe_no_change_count = 0 audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s - audio_state.transcribe_lock.release() - def sendAudio(audio_state): while audio_state.run_app == True: - audio_state.transcribe_lock.acquire() - text = audio_state.committed_text + " " + audio_state.text ret = osc_ctrl.sendMessageLazy(audio_state.osc_client, text, audio_state.tx_state) is_paging = (ret == osc_ctrl.SEND_MSG_LAZY_SENT_NON_EMPTY) osc_ctrl.indicatePaging(audio_state.osc_client, is_paging) - audio_state.transcribe_lock.release() # Pace this out time.sleep(0.01) @@ -325,31 +284,25 @@ def readControllerInput(audio_state): osc_ctrl.indicateSpeech(audio_state.osc_client, True) playsound(os.path.abspath("Sounds/Noise_On.wav")) - audio_state.transcribe_lock.acquire() - audio_state.audio_lock.acquire() resetAudioLocked(audio_state) resetDisplayLocked(audio_state) audio_state.drop_transcription = True audio_state.audio_paused = False - audio_state.audio_lock.release() - audio_state.transcribe_lock.release() - def transcribeLoop(mic: str, language: str): audio_state = getMicStream(mic) audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language] - if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): - # empty out the voice file - open(audio_state.VOICE_AUDIO_FILENAME, "w").close() - record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state]) record_audio_thd.daemon = True record_audio_thd.start() print("Safe to start talking") - model = whisper.load_model("base") + #model = whisper.load_model("tiny") + #model = whisper.load_model("base") + model = whisper.load_model("small") + #model = whisper.load_model("medium") transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model]) transcribe_audio_thd.daemon = True |
