summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Sounds/Noise_Off.wavbin159822 -> 67278 bytes
-rw-r--r--Sounds/Noise_On.wavbin90190 -> 67278 bytes
-rw-r--r--transcribe.py115
3 files changed, 34 insertions, 81 deletions
diff --git a/Sounds/Noise_Off.wav b/Sounds/Noise_Off.wav
index 7c8a9bf..0d3843c 100644
--- a/Sounds/Noise_Off.wav
+++ b/Sounds/Noise_Off.wav
Binary files differ
diff --git a/Sounds/Noise_On.wav b/Sounds/Noise_On.wav
index 925b3ed..28c8f6b 100644
--- a/Sounds/Noise_On.wav
+++ b/Sounds/Noise_On.wav
Binary files differ
diff --git a/transcribe.py b/transcribe.py
index f9aee9f..62e6add 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -41,8 +41,6 @@ class AudioState:
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
- VOICE_AUDIO_FILENAME = "audio.wav"
-
# PyAudio object
p = None
@@ -131,60 +129,19 @@ def recordAudio(audio_state):
time.sleep(0.1)
continue
- audio_state.audio_lock.acquire()
audio_state.frames.append(data)
max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
audio_state.frames = audio_state.frames[-1 * max_frames :]
- audio_state.audio_lock.release()
print("Done recording")
-# Saves audio. recordAudio() may continue running while this takes place.
-def saveAudio(audio_state, filename):
- min_frames = int(audio_state.RATE * audio_state.MIN_LENGTH_S / audio_state.CHUNK)
- if len(audio_state.frames) < min_frames:
- return
-
- wf = wave.open(filename, 'wb')
- wf.setnchannels(audio_state.CHANNELS)
- wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
- wf.setframerate(audio_state.RATE)
-
- audio_state.audio_lock.acquire()
- frames = copy.deepcopy(audio_state.frames)
- audio_state.audio_lock.release()
-
- wf.writeframes(b''.join(frames))
- wf.close()
-
- # Normalize volume. This seems to make the neural net a little more
- # consistent.
- raw = pydub_AudioSegment.from_wav(filename)
- normalized = pydub_effects.normalize(raw)
- normalized.export(filename, format="wav")
-
-def resetDiskAudioLocked(audio_state, filename):
- if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- # empty out the voice file
- open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
-
- wf = wave.open(filename, 'wb')
- wf.setnchannels(audio_state.CHANNELS)
- wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
- wf.setframerate(audio_state.RATE)
-
- wf.writeframes(b''.join([]))
- wf.close()
-
def resetAudioLocked(audio_state):
audio_state.frames = []
audio_state.transcribe_no_change_count = 0
audio_state.transcribe_sleep_duration = \
audio_state.transcribe_sleep_duration_min_s
- resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
-
audio_state.committed_text = ""
audio_state.text = ""
@@ -199,14 +156,21 @@ def resetAudio(audio_state):
audio_state.transcribe_lock.release()
# Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, filename):
+def transcribe(audio_state, model, frames):
- audio_state.transcribe_lock.acquire()
- audio = whisper.load_audio(filename)
- audio_state.transcribe_lock.release()
+ start_time = time.time()
+
+ frames = audio_state.frames
+ # Convert from signed 16-bit int [-32768, 32767] to signed 16-bit float on
+ # [-1, 1].
+ # We should technically acquire a lock to protect frames, but this is
+ # really slow and in practice it doesn't make the app crash, so who cares.
+ frames = np.asarray(audio_state.frames)
+ audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0
audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
audio_state.MAX_LENGTH_S_WHISPER)
+
mel = whisper.log_mel_spectrogram(audio).to(model.device)
result = None
@@ -215,29 +179,33 @@ def transcribe(audio_state, model, filename):
for temp in (0.00,):
print("temp: {}".format(temp))
options = whisper.DecodingOptions(language = audio_state.language,
- beam_size = 5, temperature = temp)
+ beam_size = 5, temperature = temp, without_timestamps = True)
result = whisper.decode(model, mel, options)
if result.avg_logprob < -1.0:
print("avg logprob: {}".format(result.avg_logprob))
+ result = None
continue
if result.compression_ratio > 2.4:
print("compression ratio: {}".format(result.compression_ratio))
+ result = None
continue
if result.no_speech_prob > 0.60:
print("no speech prob: {}".format(result.no_speech_prob))
- return None
+ result = None
+ continue
- return result.text
+ result = result.text
+ break
- return None
+ return result
def transcribeAudio(audio_state, model):
+ last_transcribe_time = time.time()
while audio_state.run_app == True:
# Pace this out
- print("sleep duration: {}".format(audio_state.transcribe_sleep_duration))
time.sleep(audio_state.transcribe_sleep_duration)
# Increase sleep time. Code below will set sleep time back to minimum
@@ -249,53 +217,44 @@ def transcribeAudio(audio_state, model):
audio_state.transcribe_sleep_duration = min(
audio_state.transcribe_sleep_duration_max_s,
longer_sleep_dur)
- print("next sleep duration: {}".format(audio_state.transcribe_sleep_duration))
-
- saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME)
- if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- time.sleep(0.1)
- continue
-
- text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME)
+ text = transcribe(audio_state, model, audio_state.frames)
if not text:
+ print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
+ last_transcribe_time = time.time()
continue
- audio_state.transcribe_lock.acquire()
if audio_state.drop_transcription:
audio_state.drop_transcription = False
- audio_state.transcribe_lock.release()
+ print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
+ last_transcribe_time = time.time()
continue
words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
- print("Transcription: {}".format(audio_state.text))
+ now = time.time()
+ print("Transcription ({} seconds): {}".format(
+ now - last_transcribe_time,
+ audio_state.text))
+ last_transcribe_time = now
old_text = audio_state.text
- #old_words = audio_state.text.split()
- #new_words = text.split()
audio_state.text = string_matcher.matchStrings(audio_state.text,
text, window_size = 30)
- #audio_state.text = text
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.
audio_state.transcribe_no_change_count = 0
audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
- audio_state.transcribe_lock.release()
-
def sendAudio(audio_state):
while audio_state.run_app == True:
- audio_state.transcribe_lock.acquire()
-
text = audio_state.committed_text + " " + audio_state.text
ret = osc_ctrl.sendMessageLazy(audio_state.osc_client, text,
audio_state.tx_state)
is_paging = (ret == osc_ctrl.SEND_MSG_LAZY_SENT_NON_EMPTY)
osc_ctrl.indicatePaging(audio_state.osc_client, is_paging)
- audio_state.transcribe_lock.release()
# Pace this out
time.sleep(0.01)
@@ -325,31 +284,25 @@ def readControllerInput(audio_state):
osc_ctrl.indicateSpeech(audio_state.osc_client, True)
playsound(os.path.abspath("Sounds/Noise_On.wav"))
- audio_state.transcribe_lock.acquire()
- audio_state.audio_lock.acquire()
resetAudioLocked(audio_state)
resetDisplayLocked(audio_state)
audio_state.drop_transcription = True
audio_state.audio_paused = False
- audio_state.audio_lock.release()
- audio_state.transcribe_lock.release()
-
def transcribeLoop(mic: str, language: str):
audio_state = getMicStream(mic)
audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
- if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
- # empty out the voice file
- open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
-
record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
record_audio_thd.daemon = True
record_audio_thd.start()
print("Safe to start talking")
- model = whisper.load_model("base")
+ #model = whisper.load_model("tiny")
+ #model = whisper.load_model("base")
+ model = whisper.load_model("small")
+ #model = whisper.load_model("medium")
transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model])
transcribe_audio_thd.daemon = True