diff options
| author | yum <yum.food.vr@gmail.com> | 2022-10-24 23:08:11 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-10-24 23:13:47 -0700 |
| commit | 0102b725f60c91ca6d095c2b04de71db6d5b1fda (patch) | |
| tree | c87c84ad4f48577e90307ca2f2bf40baa18b6950 | |
| parent | 08655f96dc798e3e129058a5e97c5aa7ff96e798 (diff) | |
Saying the word "clear" clears the board
While the board is clearing, you can keep talking, and what you say will be
rendered when the board finishes clearing.
* bugfix: STT only beeps when the board is out
| -rw-r--r-- | README.md | 5 | ||||
| -rw-r--r-- | libtastt.py | 20 | ||||
| -rw-r--r-- | osc_ctrl.py | 4 | ||||
| -rw-r--r-- | transcribe.py | 89 |
4 files changed, 104 insertions, 14 deletions
@@ -164,6 +164,11 @@ To use the STT: layer. Something must be rethought to bring these numbers down. 3. Implement multicore YAML parsing. This will make working with large animators much more practical. + 4. Transcription engine sleep interval increases exponentially up to 1-2 + seconds, then jumps back to a short interval once speech is detected. + This should significantly cut down on idle resource consumption. Perhaps + there's even a more efficient way to detect the odds that anything is + being said, which we could use to gate transcription. 5. Bugfixes 1. The whisper STT says "Thank you." when there's no audio? 6. Shine diff --git a/libtastt.py b/libtastt.py index 12c95f0..e12f93f 100644 --- a/libtastt.py +++ b/libtastt.py @@ -337,11 +337,14 @@ def generateFXLayer(which_layer: int, anim: libunity.UnityAnimator, layer: # Generic toggle adding utility. # Generates the layer and parameter. +# Returns a map containing the off and on states, as well as the +# transitions between them. def generateToggle(layer_name: str, gen_anim_dir: str, off_anim_basename: str, on_anim_basename: str, - anim: libunity.UnityAnimator): + anim: libunity.UnityAnimator) -> typing.Dict[str, + libunity.UnityDocument]: layer = anim.addLayer(layer_name) # For simplicity, use the layer name as the parameter name. 
@@ -370,7 +373,13 @@ def generateToggle(layer_name: str, anim.addTransitionBooleanCondition(on_state, on_to_off_trans, parameter_name, False) - pass + result = {} + result["off"] = off_state + result["on"] = on_state + result["off_to_on"] = off_to_on_trans + result["on_to_off"] = on_to_off_trans + + return result def generateFX(guid_map, gen_anim_dir): anim = libunity.UnityAnimator() @@ -382,11 +391,16 @@ def generateFX(guid_map, gen_anim_dir): print("Generating layer {}/{}".format(which_layer, len(layers.items())), file=sys.stderr) generateFXLayer(which_layer, anim, layer, gen_anim_dir) - generateToggle(generate_utils.getSpeechNoiseToggleParam(), + states = generateToggle( + generate_utils.getSpeechNoiseToggleParam(), "Animations/", "TaSTT_Speech_Noise_Off.anim", "TaSTT_Speech_Noise_On.anim", anim) + # Enable beeping only if board is out. + anim.addTransitionBooleanCondition(states["off"], + states["off_to_on"], generate_utils.getToggleParam(), True) + generateToggle(generate_utils.getToggleParam(), "Animations/", "TaSTT_Toggle_Off.anim", diff --git a/osc_ctrl.py b/osc_ctrl.py index 4ef238e..4353939 100644 --- a/osc_ctrl.py +++ b/osc_ctrl.py @@ -327,8 +327,8 @@ def sendRawMessage(client, msg): #print("Send cell {}".format(cell)) sendMessageCellDiscrete(client, cell_msg, cell) -def clear(): - sendRawMessage([state.encoding[' ']] * BOARD_ROWS * BOARD_COLS) +def clear(client): + sendRawMessage(client, [state.encoding[' ']] * BOARD_ROWS * BOARD_COLS) if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/transcribe.py b/transcribe.py index 45b2a8e..fa3b166 100644 --- a/transcribe.py +++ b/transcribe.py @@ -32,6 +32,8 @@ class AudioState: # The minimum length that recordAudio() will wait for before saving audio. MIN_LENGTH_S = 1 + VOICE_AUDIO_FILENAME = "audio.wav" + # PyAudio object p = None @@ -46,10 +48,12 @@ class AudioState: # transcriptions before "committing" to a transcription. 
text_candidate = "" text_lock = threading.Lock() + clear_requested = False record_audio = True transcribe_audio = True send_audio = True + run_control_thread = True osc_client = osc_ctrl.getClient() @@ -131,15 +135,30 @@ def saveAudio(audio_state, filename): normalized = pydub_effects.normalize(raw) normalized.export(filename, format="wav") +def resetDiskAudioLocked(audio_state, filename): + wf = wave.open(filename, 'wb') + wf.setnchannels(audio_state.CHANNELS) + wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT)) + wf.setframerate(audio_state.RATE) + + wf.writeframes(b''.join([])) + wf.close() + +def resetAudioLocked(audio_state): + audio_state.frames = [] + def resetAudio(audio_state): audio_state.frames_lock.acquire() - audio_state.frames = [] + resetAudioLocked(audio_state) audio_state.frames_lock.release() # Transcribe the audio recorded in a file. def transcribe(model, filename): + audio_state.frames_lock.acquire() audio = whisper.load_audio(filename) + audio_state.frames_lock.release() + audio = whisper.pad_or_trim(audio) mel = whisper.log_mel_spectrogram(audio).to(model.device) #_, probs = model.detect_language(mel) @@ -155,16 +174,35 @@ def transcribe(model, filename): def transcribeAudio(audio_state, model): while audio_state.transcribe_audio == True: - saveAudio(audio_state, "audio.wav") + # Pace this out + time.sleep(0.05) - if not os.path.isfile("audio.wav"): + saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME) + + if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): time.sleep(0.1) continue - text = transcribe(model, "audio.wav") + text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME) audio_state.text_lock.acquire() + if audio_state.clear_requested: + audio_state.text = "" + audio_state.text_candidate = "" + audio_state.clear_requested = False + audio_state.text_lock.release() + continue + + words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() + print("words: {}".format(words)) + if len(words) > 0 
and words[-1] == "clear": + audio_state.text = "" + audio_state.text_candidate = "" + audio_state.clear_requested = True + audio_state.text_lock.release() + continue + # We use a few heuristics to handle spurious mistranscriptions and to # handle events where we trim off the start of the audio clip. # 1. If we get 2 consecutive identical transcriptions, we commit to @@ -216,8 +254,6 @@ def transcribeAudio(audio_state, model): audio_state.text_lock.release() - # Pace this out - time.sleep(0.05) def sendAudio(audio_state): tx_state = osc_ctrl.OscTxState() while audio_state.send_audio == True: @@ -230,6 +266,34 @@ def sendAudio(audio_state): # Pace this out time.sleep(0.01) +def controlThread(audio_state): + while audio_state.run_control_thread: + time.sleep(0.1) + if audio_state.clear_requested: + print("here a") + audio_state.text_lock.acquire() + audio_state.frames_lock.acquire() + + if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): + # empty out the voice file + open(audio_state.VOICE_AUDIO_FILENAME, "w").close() + resetAudioLocked(audio_state) + resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME) + audio_state.clear_requested = False + + # Allow audio collection to resume now. If we don't do this, then + # any audio spoken while the board is slowly clearing will be lost. + audio_state.frames_lock.release() + + # Clearing can take a while, and the user might be talking in the + # meantime. So we drop audio state before clearing so the other + # threads can continue saving to it. + osc_ctrl.clear(audio_state.osc_client) + + audio_state.text_lock.release() + + print("here b") + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. 
Default: index") @@ -238,11 +302,12 @@ if __name__ == "__main__": if not args.mic: args.mic = "index" - if os.path.isfile("audio.wav"): - os.remove("audio.wav") - audio_state = getMicStream(args.mic) + if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME): + # empty out the voice file + open(audio_state.VOICE_AUDIO_FILENAME, "w").close() + record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state]) record_audio_thd.daemon = True record_audio_thd.start() @@ -259,6 +324,10 @@ if __name__ == "__main__": send_audio_thd.daemon = True send_audio_thd.start() + control_thd = threading.Thread(target = controlThread, args = [audio_state]) + control_thd.daemon = True + control_thd.start() + print("Press enter to start a new message") for line in sys.stdin: resetAudio(audio_state) @@ -268,6 +337,8 @@ if __name__ == "__main__": print("Joining threads") audio_state.record_audio = False audio_state.transcribe_audio = False + audio_state.run_control_thread = False record_audio_thd.join() transcribe_audio_thd.join() + control_thd.join() |
