summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- README.md 5
-rw-r--r-- libtastt.py 20
-rw-r--r-- osc_ctrl.py 4
-rw-r--r-- transcribe.py 89
4 files changed, 104 insertions, 14 deletions
diff --git a/README.md b/README.md
index 3d999fd..c5e58a0 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,11 @@ To use the STT:
layer. Something must be rethought to bring these numbers down.
3. Implement multicore YAML parsing. This will make working with large
animators much more practical.
+ 4. Transcription engine sleep interval increases exponentially up to 1-2
+ seconds, then jumps back to a short interval once speech is detected.
+ This should significantly cut down on idle resource consumption. Perhaps
+ there's even a more efficient way to detect the odds that anything is
+ being said, which we could use to gate transcription.
5. Bugfixes
1. The whisper STT says "Thank you." when there's no audio?
6. Shine
diff --git a/libtastt.py b/libtastt.py
index 12c95f0..e12f93f 100644
--- a/libtastt.py
+++ b/libtastt.py
@@ -337,11 +337,14 @@ def generateFXLayer(which_layer: int, anim: libunity.UnityAnimator, layer:
# Generic toggle adding utility.
# Generates the layer and parameter.
+# Returns a map containing the off and on states, as well as the
+# transitions between them.
def generateToggle(layer_name: str,
gen_anim_dir: str,
off_anim_basename: str,
on_anim_basename: str,
- anim: libunity.UnityAnimator):
+ anim: libunity.UnityAnimator) -> typing.Dict[str,
+ libunity.UnityDocument]:
layer = anim.addLayer(layer_name)
# For simplicity, use the layer name as the parameter name.
@@ -370,7 +373,13 @@ def generateToggle(layer_name: str,
anim.addTransitionBooleanCondition(on_state,
on_to_off_trans, parameter_name, False)
- pass
+ result = {}
+ result["off"] = off_state
+ result["on"] = on_state
+ result["off_to_on"] = off_to_on_trans
+ result["on_to_off"] = on_to_off_trans
+
+ return result
def generateFX(guid_map, gen_anim_dir):
anim = libunity.UnityAnimator()
@@ -382,11 +391,16 @@ def generateFX(guid_map, gen_anim_dir):
print("Generating layer {}/{}".format(which_layer, len(layers.items())), file=sys.stderr)
generateFXLayer(which_layer, anim, layer, gen_anim_dir)
- generateToggle(generate_utils.getSpeechNoiseToggleParam(),
+ states = generateToggle(
+ generate_utils.getSpeechNoiseToggleParam(),
"Animations/",
"TaSTT_Speech_Noise_Off.anim",
"TaSTT_Speech_Noise_On.anim",
anim)
+ # Enable beeping only if board is out.
+ anim.addTransitionBooleanCondition(states["off"],
+ states["off_to_on"], generate_utils.getToggleParam(), True)
+
generateToggle(generate_utils.getToggleParam(),
"Animations/",
"TaSTT_Toggle_Off.anim",
diff --git a/osc_ctrl.py b/osc_ctrl.py
index 4ef238e..4353939 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -327,8 +327,8 @@ def sendRawMessage(client, msg):
#print("Send cell {}".format(cell))
sendMessageCellDiscrete(client, cell_msg, cell)
-def clear():
- sendRawMessage([state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
+def clear(client):
+ sendRawMessage(client, [state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
diff --git a/transcribe.py b/transcribe.py
index 45b2a8e..fa3b166 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -32,6 +32,8 @@ class AudioState:
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
+ VOICE_AUDIO_FILENAME = "audio.wav"
+
# PyAudio object
p = None
@@ -46,10 +48,12 @@ class AudioState:
# transcriptions before "committing" to a transcription.
text_candidate = ""
text_lock = threading.Lock()
+ clear_requested = False
record_audio = True
transcribe_audio = True
send_audio = True
+ run_control_thread = True
osc_client = osc_ctrl.getClient()
@@ -131,15 +135,30 @@ def saveAudio(audio_state, filename):
normalized = pydub_effects.normalize(raw)
normalized.export(filename, format="wav")
+def resetDiskAudioLocked(audio_state, filename):
+ wf = wave.open(filename, 'wb')
+ wf.setnchannels(audio_state.CHANNELS)
+ wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
+ wf.setframerate(audio_state.RATE)
+
+ wf.writeframes(b''.join([]))
+ wf.close()
+
+def resetAudioLocked(audio_state):
+ audio_state.frames = []
+
def resetAudio(audio_state):
audio_state.frames_lock.acquire()
- audio_state.frames = []
+ resetAudioLocked(audio_state)
audio_state.frames_lock.release()
# Transcribe the audio recorded in a file.
def transcribe(model, filename):
+ audio_state.frames_lock.acquire()
audio = whisper.load_audio(filename)
+ audio_state.frames_lock.release()
+
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
#_, probs = model.detect_language(mel)
@@ -155,16 +174,35 @@ def transcribe(model, filename):
def transcribeAudio(audio_state, model):
while audio_state.transcribe_audio == True:
- saveAudio(audio_state, "audio.wav")
+ # Pace this out
+ time.sleep(0.05)
- if not os.path.isfile("audio.wav"):
+ saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+
+ if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
time.sleep(0.1)
continue
- text = transcribe(model, "audio.wav")
+ text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME)
audio_state.text_lock.acquire()
+ if audio_state.clear_requested:
+ audio_state.text = ""
+ audio_state.text_candidate = ""
+ audio_state.clear_requested = False
+ audio_state.text_lock.release()
+ continue
+
+ words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
+ print("words: {}".format(words))
+ if len(words) > 0 and words[-1] == "clear":
+ audio_state.text = ""
+ audio_state.text_candidate = ""
+ audio_state.clear_requested = True
+ audio_state.text_lock.release()
+ continue
+
# We use a few heuristics to handle spurious mistranscriptions and to
# handle events where we trim off the start of the audio clip.
# 1. If we get 2 consecutive identical transcriptions, we commit to
@@ -216,8 +254,6 @@ def transcribeAudio(audio_state, model):
audio_state.text_lock.release()
- # Pace this out
- time.sleep(0.05)
def sendAudio(audio_state):
tx_state = osc_ctrl.OscTxState()
while audio_state.send_audio == True:
@@ -230,6 +266,34 @@ def sendAudio(audio_state):
# Pace this out
time.sleep(0.01)
+def controlThread(audio_state):
+ while audio_state.run_control_thread:
+ time.sleep(0.1)
+ if audio_state.clear_requested:
+ print("here a")
+ audio_state.text_lock.acquire()
+ audio_state.frames_lock.acquire()
+
+ if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+ # empty out the voice file
+ open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+ resetAudioLocked(audio_state)
+ resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+ audio_state.clear_requested = False
+
+ # Allow audio collection to resume now. If we don't do this, then
+ # any audio spoken while the board is slowly clearing will be lost.
+ audio_state.frames_lock.release()
+
+ # Clearing can take a while, and the user might be talking in the
+ # meantime. So we drop audio state before clearing so the other
+ # threads can continue saving to it.
+ osc_ctrl.clear(audio_state.osc_client)
+
+ audio_state.text_lock.release()
+
+ print("here b")
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
@@ -238,11 +302,12 @@ if __name__ == "__main__":
if not args.mic:
args.mic = "index"
- if os.path.isfile("audio.wav"):
- os.remove("audio.wav")
-
audio_state = getMicStream(args.mic)
+ if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+ # empty out the voice file
+ open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+
record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
record_audio_thd.daemon = True
record_audio_thd.start()
@@ -259,6 +324,10 @@ if __name__ == "__main__":
send_audio_thd.daemon = True
send_audio_thd.start()
+ control_thd = threading.Thread(target = controlThread, args = [audio_state])
+ control_thd.daemon = True
+ control_thd.start()
+
print("Press enter to start a new message")
for line in sys.stdin:
resetAudio(audio_state)
@@ -268,6 +337,8 @@ if __name__ == "__main__":
print("Joining threads")
audio_state.record_audio = False
audio_state.transcribe_audio = False
+ audio_state.run_control_thread = False
record_audio_thd.join()
transcribe_audio_thd.join()
+ control_thd.join()