summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-10-24 23:08:11 -0700
committeryum <yum.food.vr@gmail.com>2022-10-24 23:13:47 -0700
commit0102b725f60c91ca6d095c2b04de71db6d5b1fda (patch)
treec87c84ad4f48577e90307ca2f2bf40baa18b6950
parent08655f96dc798e3e129058a5e97c5aa7ff96e798 (diff)
Saying the word "clear" clears the board
While the board is clearing, you can keep talking; whatever you say in the meantime is rendered once the board finishes clearing. * bugfix: the STT speech noise only beeps when the board is out
-rw-r--r--README.md5
-rw-r--r--libtastt.py20
-rw-r--r--osc_ctrl.py4
-rw-r--r--transcribe.py89
4 files changed, 104 insertions, 14 deletions
diff --git a/README.md b/README.md
index 3d999fd..c5e58a0 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,11 @@ To use the STT:
layer. Something must be rethought to bring these numbers down.
3. Implement multicore YAML parsing. This will make working with large
animators much more practical.
+ 4. Make the transcription engine's sleep interval grow exponentially up to
+ 1-2 seconds while idle, then drop back to a short interval once speech is
+ detected. This should significantly cut down on idle resource consumption.
+ There may be an even cheaper way to estimate the likelihood that anything
+ is being said, which we could use to gate transcription.
5. Bugfixes
1. The whisper STT says "Thank you." when there's no audio?
6. Shine
diff --git a/libtastt.py b/libtastt.py
index 12c95f0..e12f93f 100644
--- a/libtastt.py
+++ b/libtastt.py
@@ -337,11 +337,14 @@ def generateFXLayer(which_layer: int, anim: libunity.UnityAnimator, layer:
# Generic toggle adding utility.
# Generates the layer and parameter.
+# Returns a map containing the off and on states, as well as the
+# transitions between them.
def generateToggle(layer_name: str,
gen_anim_dir: str,
off_anim_basename: str,
on_anim_basename: str,
- anim: libunity.UnityAnimator):
+ anim: libunity.UnityAnimator) -> typing.Dict[str,
+ libunity.UnityDocument]:
layer = anim.addLayer(layer_name)
# For simplicity, use the layer name as the parameter name.
@@ -370,7 +373,13 @@ def generateToggle(layer_name: str,
anim.addTransitionBooleanCondition(on_state,
on_to_off_trans, parameter_name, False)
- pass
+ result = {}
+ result["off"] = off_state
+ result["on"] = on_state
+ result["off_to_on"] = off_to_on_trans
+ result["on_to_off"] = on_to_off_trans
+
+ return result
def generateFX(guid_map, gen_anim_dir):
anim = libunity.UnityAnimator()
@@ -382,11 +391,16 @@ def generateFX(guid_map, gen_anim_dir):
print("Generating layer {}/{}".format(which_layer, len(layers.items())), file=sys.stderr)
generateFXLayer(which_layer, anim, layer, gen_anim_dir)
- generateToggle(generate_utils.getSpeechNoiseToggleParam(),
+ states = generateToggle(
+ generate_utils.getSpeechNoiseToggleParam(),
"Animations/",
"TaSTT_Speech_Noise_Off.anim",
"TaSTT_Speech_Noise_On.anim",
anim)
+ # Enable beeping only if board is out.
+ anim.addTransitionBooleanCondition(states["off"],
+ states["off_to_on"], generate_utils.getToggleParam(), True)
+
generateToggle(generate_utils.getToggleParam(),
"Animations/",
"TaSTT_Toggle_Off.anim",
diff --git a/osc_ctrl.py b/osc_ctrl.py
index 4ef238e..4353939 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -327,8 +327,8 @@ def sendRawMessage(client, msg):
#print("Send cell {}".format(cell))
sendMessageCellDiscrete(client, cell_msg, cell)
-def clear():
- sendRawMessage([state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
+def clear(client):
+ sendRawMessage(client, [state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
diff --git a/transcribe.py b/transcribe.py
index 45b2a8e..fa3b166 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -32,6 +32,8 @@ class AudioState:
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
+ VOICE_AUDIO_FILENAME = "audio.wav"
+
# PyAudio object
p = None
@@ -46,10 +48,12 @@ class AudioState:
# transcriptions before "committing" to a transcription.
text_candidate = ""
text_lock = threading.Lock()
+ clear_requested = False
record_audio = True
transcribe_audio = True
send_audio = True
+ run_control_thread = True
osc_client = osc_ctrl.getClient()
@@ -131,15 +135,30 @@ def saveAudio(audio_state, filename):
normalized = pydub_effects.normalize(raw)
normalized.export(filename, format="wav")
+def resetDiskAudioLocked(audio_state, filename):
+ wf = wave.open(filename, 'wb')
+ wf.setnchannels(audio_state.CHANNELS)
+ wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
+ wf.setframerate(audio_state.RATE)
+
+ wf.writeframes(b''.join([]))
+ wf.close()
+
+def resetAudioLocked(audio_state):
+ audio_state.frames = []
+
def resetAudio(audio_state):
audio_state.frames_lock.acquire()
- audio_state.frames = []
+ resetAudioLocked(audio_state)
audio_state.frames_lock.release()
# Transcribe the audio recorded in a file.
def transcribe(model, filename):
+ audio_state.frames_lock.acquire()
audio = whisper.load_audio(filename)
+ audio_state.frames_lock.release()
+
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
#_, probs = model.detect_language(mel)
@@ -155,16 +174,35 @@ def transcribe(model, filename):
def transcribeAudio(audio_state, model):
while audio_state.transcribe_audio == True:
- saveAudio(audio_state, "audio.wav")
+ # Pace this out
+ time.sleep(0.05)
- if not os.path.isfile("audio.wav"):
+ saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+
+ if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
time.sleep(0.1)
continue
- text = transcribe(model, "audio.wav")
+ text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME)
audio_state.text_lock.acquire()
+ if audio_state.clear_requested:
+ audio_state.text = ""
+ audio_state.text_candidate = ""
+ audio_state.clear_requested = False
+ audio_state.text_lock.release()
+ continue
+
+ words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
+ print("words: {}".format(words))
+ if len(words) > 0 and words[-1] == "clear":
+ audio_state.text = ""
+ audio_state.text_candidate = ""
+ audio_state.clear_requested = True
+ audio_state.text_lock.release()
+ continue
+
# We use a few heuristics to handle spurious mistranscriptions and to
# handle events where we trim off the start of the audio clip.
# 1. If we get 2 consecutive identical transcriptions, we commit to
@@ -216,8 +254,6 @@ def transcribeAudio(audio_state, model):
audio_state.text_lock.release()
- # Pace this out
- time.sleep(0.05)
def sendAudio(audio_state):
tx_state = osc_ctrl.OscTxState()
while audio_state.send_audio == True:
@@ -230,6 +266,34 @@ def sendAudio(audio_state):
# Pace this out
time.sleep(0.01)
+def controlThread(audio_state):
+ while audio_state.run_control_thread:
+ time.sleep(0.1)
+ if audio_state.clear_requested:
+ print("here a")
+ audio_state.text_lock.acquire()
+ audio_state.frames_lock.acquire()
+
+ if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+ # empty out the voice file
+ open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+ resetAudioLocked(audio_state)
+ resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+ audio_state.clear_requested = False
+
+ # Allow audio collection to resume now. If we don't do this, then
+ # any audio spoken while the board is slowly clearing will be lost.
+ audio_state.frames_lock.release()
+
+ # Clearing can take a while, and the user might be talking in the
+ # meantime. So we drop audio state before clearing so the other
+ # threads can continue saving to it.
+ osc_ctrl.clear(audio_state.osc_client)
+
+ audio_state.text_lock.release()
+
+ print("here b")
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
@@ -238,11 +302,12 @@ if __name__ == "__main__":
if not args.mic:
args.mic = "index"
- if os.path.isfile("audio.wav"):
- os.remove("audio.wav")
-
audio_state = getMicStream(args.mic)
+ if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+ # empty out the voice file
+ open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+
record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
record_audio_thd.daemon = True
record_audio_thd.start()
@@ -259,6 +324,10 @@ if __name__ == "__main__":
send_audio_thd.daemon = True
send_audio_thd.start()
+ control_thd = threading.Thread(target = controlThread, args = [audio_state])
+ control_thd.daemon = True
+ control_thd.start()
+
print("Press enter to start a new message")
for line in sys.stdin:
resetAudio(audio_state)
@@ -268,6 +337,8 @@ if __name__ == "__main__":
print("Joining threads")
audio_state.record_audio = False
audio_state.transcribe_audio = False
+ audio_state.run_control_thread = False
record_audio_thd.join()
transcribe_audio_thd.join()
+ control_thd.join()