summaryrefslogtreecommitdiffstats
path: root/transcribe.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-11-12 14:14:49 -0800
committeryum <yum.food.vr@gmail.com>2022-11-12 14:14:49 -0800
commit3b038d23ec7621e0164c1901b416bf77a27d8cf3 (patch)
treee92badc27b9a93bd7173de993da02a20a560a778 /transcribe.py
parent8a7858ad4e965f5410faa4ae5e7ad4f79a280d43 (diff)
Clicking the left joystick resets the board.
* Increase no speech probability threshold. This is what was preventing short transcriptions from working. We rely more on the avg logprob filter now. * Remove string matching logic from transcribe. Now when we get 2 consecutive identical transcriptions, we commit the transcription. This *could* cause words to get cut off but in practice it doesn't seem to happen. * Fix steamvr joystick click detection. Moving the joystick would also fire the event, which is not correct. * Combine locks in transcribe.py. * Remove "clear" vocal control. * osc_ctrl.clear() resets last_message_encoded * Remove osc_ctrl.sendMessage (unused)
Diffstat (limited to 'transcribe.py')
-rw-r--r--transcribe.py123
1 files changed, 82 insertions, 41 deletions
diff --git a/transcribe.py b/transcribe.py
index 2e4457e..1327515 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -2,7 +2,6 @@
import argparse
import copy
-import string_matcher
import os
import osc_ctrl
# python3 -m pip install pydub
@@ -12,6 +11,7 @@ from pydub import effects as pydub_effects
# python3 -m pip install pyaudio
# License: MIT.
import pyaudio
+import steamvr
import sys
import threading
import time
@@ -42,21 +42,29 @@ class AudioState:
# PyAudio stream object
stream = None
- frames = []
- frames_lock = threading.Lock()
-
text = ""
- text_lock = threading.Lock()
+ committed_text = ""
+ frames = []
+ # Locks access to `text`, `frames`, and audio stored on disk.
+ lock = threading.Lock()
- record_audio = True
- transcribe_audio = True
- send_audio = True
+ # Used to tell the threads when to stop.
+ run_app = True
transcribe_sleep_duration_min_s = 0.05
transcribe_sleep_duration_max_s = 1.50
transcribe_no_change_count = 0
transcribe_sleep_duration = transcribe_sleep_duration_min_s
- # The language the user is speaking in.
+
+ tx_state = osc_ctrl.OscTxState()
+
+ # The transcription thread transcribes without holding locks, then
+ # blocks on it. Thus we need some way to tell the transcription
+ # thread to drop that transcription.
+ drop_transcription = False
+
+ # The language the user is speaking in. Default is English but user may set
+ # this to whatever they want.
language = whisper.tokenizer.TO_LANGUAGE_CODE["english"]
# When the user says `over`, we stop displaying new transcriptions until
@@ -104,18 +112,18 @@ def getMicStream(which_mic):
return audio_state
-# Continuously records audio as long as audio_state.record_audio is set.
+# Continuously records audio as long as audio_state.run_app is set.
def recordAudio(audio_state):
print("Recording audio")
- while audio_state.record_audio:
+ while audio_state.run_app:
data = audio_state.stream.read(audio_state.CHUNK)
- audio_state.frames_lock.acquire()
+ audio_state.lock.acquire()
audio_state.frames.append(data)
max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
audio_state.frames = audio_state.frames[-1 * max_frames :]
- audio_state.frames_lock.release()
+ audio_state.lock.release()
print("Done recording")
@@ -130,9 +138,9 @@ def saveAudio(audio_state, filename):
wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
wf.setframerate(audio_state.RATE)
- audio_state.frames_lock.acquire()
+ audio_state.lock.acquire()
frames = copy.deepcopy(audio_state.frames)
- audio_state.frames_lock.release()
+ audio_state.lock.release()
wf.writeframes(b''.join(frames))
wf.close()
@@ -164,20 +172,23 @@ def resetAudioLocked(audio_state):
resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+ audio_state.committed_text = ""
audio_state.text = ""
- osc_ctrl.clear(audio_state.osc_client)
+
+def resetDisplayLocked(audio_state):
+ osc_ctrl.clear(audio_state.osc_client, audio_state.tx_state)
def resetAudio(audio_state):
- audio_state.frames_lock.acquire()
+ audio_state.lock.acquire()
resetAudioLocked(audio_state)
- audio_state.frames_lock.release()
+ audio_state.lock.release()
# Transcribe the audio recorded in a file.
def transcribe(audio_state, model, filename):
- audio_state.frames_lock.acquire()
+ audio_state.lock.acquire()
audio = whisper.load_audio(filename)
- audio_state.frames_lock.release()
+ audio_state.lock.release()
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
@@ -186,7 +197,7 @@ def transcribe(audio_state, model, filename):
beam_size = 5)
result = whisper.decode(model, mel, options)
- if result.no_speech_prob > 0.15:
+ if result.no_speech_prob > 0.60:
print("no speech prob: {}".format(result.no_speech_prob))
return None
@@ -201,7 +212,7 @@ def transcribe(audio_state, model, filename):
return result.text
def transcribeAudio(audio_state, model):
- while audio_state.transcribe_audio == True:
+ while audio_state.run_app == True:
# Pace this out
print("sleep duration: {}".format(audio_state.transcribe_sleep_duration))
time.sleep(audio_state.transcribe_sleep_duration)
@@ -226,18 +237,29 @@ def transcribeAudio(audio_state, model):
text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME)
if not text:
continue
+ audio_state.lock.acquire()
- audio_state.text_lock.acquire()
+ if audio_state.drop_transcription:
+ audio_state.drop_transcription = False
+ audio_state.lock.release()
+ continue
+
+ # Hack: two consecutive identical transcriptions get "committed".
+ if text == audio_state.text:
+ print("Commit!")
+ old_commit = audio_state.committed_text
+ resetAudioLocked(audio_state)
+ audio_state.committed_text = old_commit + " " + text
+ audio_state.lock.release()
+ continue
+ else:
+ print("text: {}".format(text))
+ print("audio_state.text: {}".format(audio_state.text))
words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
if len(words) > 0:
- if words[-1] == "clear":
- resetAudio(audio_state)
- audio_state.text_lock.release()
- audio_state.display_paused = False
- continue
- elif words[-1] == "over":
+ if words[-1] == "over":
words = words[0:-1]
audio_state.display_paused = True
@@ -247,32 +269,45 @@ def transcribeAudio(audio_state, model):
#old_words = audio_state.text.split()
#new_words = text.split()
- audio_state.text = string_matcher.matchStrings(audio_state.text,
- text, window_size = 5)
+ audio_state.text = text
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.
audio_state.transcribe_no_change_count = 0
audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
- audio_state.text_lock.release()
+ audio_state.lock.release()
def sendAudio(audio_state):
- tx_state = osc_ctrl.OscTxState()
-
- while audio_state.send_audio == True:
+ while audio_state.run_app == True:
if audio_state.display_paused:
time.sleep(0.1)
continue
- audio_state.text_lock.acquire()
- text = copy.deepcopy(audio_state.text)
- osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state)
- audio_state.text_lock.release()
+ audio_state.lock.acquire()
+ text = audio_state.committed_text + " " + audio_state.text
+ osc_ctrl.sendMessageLazy(audio_state.osc_client, text, audio_state.tx_state)
+ audio_state.lock.release()
# Pace this out
time.sleep(0.01)
+def readControllerInput(audio_state):
+ session = steamvr.SessionState()
+ while audio_state.run_app == True:
+ time.sleep(0.05)
+
+ event = steamvr.pollButtonPress(session)
+
+ if event == steamvr.EVENT_RISING_EDGE:
+ print("event get")
+ audio_state.lock.acquire()
+ resetAudioLocked(audio_state)
+ resetDisplayLocked(audio_state)
+ audio_state.drop_transcription = True
+ audio_state.display_paused = False
+ audio_state.lock.release()
+
def transcribeLoop(mic: str, language: str):
audio_state = getMicStream(mic)
audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
@@ -297,18 +332,24 @@ def transcribeLoop(mic: str, language: str):
send_audio_thd.daemon = True
send_audio_thd.start()
+ controller_input_thd = threading.Thread(target = readControllerInput, args = [audio_state])
+ controller_input_thd.daemon = True
+ controller_input_thd.start()
+
print("Press enter or say 'Clear' to start a new message. Say 'Over' to " +
"pause the display (saying 'Clear' resets it again).")
for line in sys.stdin:
resetAudio(audio_state)
+ resetDisplayLocked(audio_state)
if "exit" in line or "quit" in line:
break
print("Joining threads")
- audio_state.record_audio = False
- audio_state.transcribe_audio = False
+ audio_state.run_app = False
+ audio_state.run_app = False
record_audio_thd.join()
transcribe_audio_thd.join()
+ controller_input_thd.join()
if __name__ == "__main__":