summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-12-14 21:45:30 -0800
committeryum <yum.food.vr@gmail.com>2022-12-14 23:04:29 -0800
commit8326dee0bf01956b450858212cbdba3403b32b0d (patch)
tree3c218f1e3159543df9b935dd6a43685a0811107d
parentf9bcee6acf6f8a418d2f96368941fc05e4b72252 (diff)
Optimize transcription latency
Shave off ~500ms due to locking. Acquiring a threading.Lock takes hundreds of milliseconds and the global interpreter lock already takes care of most crashy race conditions, so just remove the locks. Avoid writing audio to disk, saving more time (and disk wear / IOPS). Add basic profiling to transcribe(). Omit timestamps, since we don't use them (maybe we should!). Shorten noise indicators to 350ms. The whisper behavior where it repeats tokens causes certain transcriptions to take many seconds. I haven't thought about how to fix this yet.
-rw-r--r--Sounds/Noise_Off.wavbin159822 -> 67278 bytes
-rw-r--r--Sounds/Noise_On.wavbin90190 -> 67278 bytes
-rw-r--r--transcribe.py115
3 files changed, 34 insertions, 81 deletions
diff --git a/Sounds/Noise_Off.wav b/Sounds/Noise_Off.wav
index 7c8a9bf..0d3843c 100644
--- a/Sounds/Noise_Off.wav
+++ b/Sounds/Noise_Off.wav
Binary files differ
diff --git a/Sounds/Noise_On.wav b/Sounds/Noise_On.wav
index 925b3ed..28c8f6b 100644
--- a/Sounds/Noise_On.wav
+++ b/Sounds/Noise_On.wav
Binary files differ
diff --git a/transcribe.py b/transcribe.py
index f9aee9f..62e6add 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -41,8 +41,6 @@ class AudioState:
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
- VOICE_AUDIO_FILENAME = "audio.wav"
-
# PyAudio object
p = None
@@ -131,60 +129,19 @@ def recordAudio(audio_state):
time.sleep(0.1)
continue
- audio_state.audio_lock.acquire()
audio_state.frames.append(data)
max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
audio_state.frames = audio_state.frames[-1 * max_frames :]
- audio_state.audio_lock.release()
print("Done recording")
-# Saves audio. recordAudio() may continue running while this takes place.
-def saveAudio(audio_state, filename):
- min_frames = int(audio_state.RATE * audio_state.MIN_LENGTH_S / audio_state.CHUNK)
- if len(audio_state.frames) < min_frames:
- return
-
- wf = wave.open(filename, 'wb')
- wf.setnchannels(audio_state.CHANNELS)
- wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
- wf.setframerate(audio_state.RATE)
-
- audio_state.audio_lock.acquire()
- frames = copy.deepcopy(audio_state.frames)
- audio_state.audio_lock.release()
-
- wf.writeframes(b''.join(frames))
- wf.close()
-
- # Normalize volume. This seems to make the neural net a little more
- # consistent.
- raw = pydub_AudioSegment.from_wav(filename)
- normalized = pydub_effects.normalize(raw)
- normalized.export(filename, format="wav")
-
-def resetDiskAudioLocked(audio_state, filename):
- if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- # empty out the voice file
- open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
-
- wf = wave.open(filename, 'wb')
- wf.setnchannels(audio_state.CHANNELS)
- wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
- wf.setframerate(audio_state.RATE)
-
- wf.writeframes(b''.join([]))
- wf.close()
-
def resetAudioLocked(audio_state):
audio_state.frames = []
audio_state.transcribe_no_change_count = 0
audio_state.transcribe_sleep_duration = \
audio_state.transcribe_sleep_duration_min_s
- resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
-
audio_state.committed_text = ""
audio_state.text = ""
@@ -199,14 +156,21 @@ def resetAudio(audio_state):
audio_state.transcribe_lock.release()
# Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, filename):
+def transcribe(audio_state, model, frames):
- audio_state.transcribe_lock.acquire()
- audio = whisper.load_audio(filename)
- audio_state.transcribe_lock.release()
+ start_time = time.time()
+
+ frames = audio_state.frames
+ # Convert from signed 16-bit int [-32768, 32767] to signed 16-bit float on
+ # [-1, 1].
+ # We should technically acquire a lock to protect frames, but this is
+ # really slow and in practice it doesn't make the app crash, so who cares.
+ frames = np.asarray(audio_state.frames)
+ audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0
audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
audio_state.MAX_LENGTH_S_WHISPER)
+
mel = whisper.log_mel_spectrogram(audio).to(model.device)
result = None
@@ -215,29 +179,33 @@ def transcribe(audio_state, model, filename):
for temp in (0.00,):
print("temp: {}".format(temp))
options = whisper.DecodingOptions(language = audio_state.language,
- beam_size = 5, temperature = temp)
+ beam_size = 5, temperature = temp, without_timestamps = True)
result = whisper.decode(model, mel, options)
if result.avg_logprob < -1.0:
print("avg logprob: {}".format(result.avg_logprob))
+ result = None
continue
if result.compression_ratio > 2.4:
print("compression ratio: {}".format(result.compression_ratio))
+ result = None
continue
if result.no_speech_prob > 0.60:
print("no speech prob: {}".format(result.no_speech_prob))
- return None
+ result = None
+ continue
- return result.text
+ result = result.text
+ break
- return None
+ return result
def transcribeAudio(audio_state, model):
+ last_transcribe_time = time.time()
while audio_state.run_app == True:
# Pace this out
- print("sleep duration: {}".format(audio_state.transcribe_sleep_duration))
time.sleep(audio_state.transcribe_sleep_duration)
# Increase sleep time. Code below will set sleep time back to minimum
@@ -249,53 +217,44 @@ def transcribeAudio(audio_state, model):
audio_state.transcribe_sleep_duration = min(
audio_state.transcribe_sleep_duration_max_s,
longer_sleep_dur)
- print("next sleep duration: {}".format(audio_state.transcribe_sleep_duration))
-
- saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME)
- if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- time.sleep(0.1)
- continue
-
- text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME)
+ text = transcribe(audio_state, model, audio_state.frames)
if not text:
+ print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
+ last_transcribe_time = time.time()
continue
- audio_state.transcribe_lock.acquire()
if audio_state.drop_transcription:
audio_state.drop_transcription = False
- audio_state.transcribe_lock.release()
+ print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
+ last_transcribe_time = time.time()
continue
words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
- print("Transcription: {}".format(audio_state.text))
+ now = time.time()
+ print("Transcription ({} seconds): {}".format(
+ now - last_transcribe_time,
+ audio_state.text))
+ last_transcribe_time = now
old_text = audio_state.text
- #old_words = audio_state.text.split()
- #new_words = text.split()
audio_state.text = string_matcher.matchStrings(audio_state.text,
text, window_size = 30)
- #audio_state.text = text
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.
audio_state.transcribe_no_change_count = 0
audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
- audio_state.transcribe_lock.release()
-
def sendAudio(audio_state):
while audio_state.run_app == True:
- audio_state.transcribe_lock.acquire()
-
text = audio_state.committed_text + " " + audio_state.text
ret = osc_ctrl.sendMessageLazy(audio_state.osc_client, text,
audio_state.tx_state)
is_paging = (ret == osc_ctrl.SEND_MSG_LAZY_SENT_NON_EMPTY)
osc_ctrl.indicatePaging(audio_state.osc_client, is_paging)
- audio_state.transcribe_lock.release()
# Pace this out
time.sleep(0.01)
@@ -325,31 +284,25 @@ def readControllerInput(audio_state):
osc_ctrl.indicateSpeech(audio_state.osc_client, True)
playsound(os.path.abspath("Sounds/Noise_On.wav"))
- audio_state.transcribe_lock.acquire()
- audio_state.audio_lock.acquire()
resetAudioLocked(audio_state)
resetDisplayLocked(audio_state)
audio_state.drop_transcription = True
audio_state.audio_paused = False
- audio_state.audio_lock.release()
- audio_state.transcribe_lock.release()
-
def transcribeLoop(mic: str, language: str):
audio_state = getMicStream(mic)
audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
- if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- # empty out the voice file
- open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
-
record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
record_audio_thd.daemon = True
record_audio_thd.start()
print("Safe to start talking")
- model = whisper.load_model("base")
+ #model = whisper.load_model("tiny")
+ #model = whisper.load_model("base")
+ model = whisper.load_model("small")
+ #model = whisper.load_model("medium")
transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model])
transcribe_audio_thd.daemon = True