Saying the word "clear" clears the board

While the board is clearing, you can keep talking, and whatever you say will be rendered once the board finishes clearing. * bugfix: the STT only beeps when the board is out
author: yum <yum.food.vr@gmail.com> 2022-10-24 23:08:11 -0700
committer: yum <yum.food.vr@gmail.com> 2022-10-24 23:13:47 -0700
commit: 0102b725f60c91ca6d095c2b04de71db6d5b1fda (patch)
tree: c87c84ad4f48577e90307ca2f2bf40baa18b6950
parent: 08655f96dc798e3e129058a5e97c5aa7ff96e798 (diff)
4 files changed, 104 insertions, 14 deletions
diff --git a/README.md b/README.md
index 3d999fd..c5e58a0 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,11 @@ To use the STT:
       layer. Something must be rethought to bring these numbers down.
    3. Implement multicore YAML parsing. This will make working with large
       animators much more practical.
+   4. Transcription engine sleep interval increases exponentially up to 1-2
+      seconds, then jumps back to a short interval once speech is detected.
+      This should significantly cut down on idle resource consumption. Perhaps
+      there's even a more efficient way to detect the odds that anything is
+      being said, which we could use to gate transcription.
 5. Bugfixes
    1. The whisper STT says "Thank you." when there's no audio?
 6. Shine
diff --git a/libtastt.py b/libtastt.py
index 12c95f0..e12f93f 100644
--- a/libtastt.py
+++ b/libtastt.py
@@ -337,11 +337,14 @@ def generateFXLayer(which_layer: int, anim: libunity.UnityAnimator, layer:
 
 # Generic toggle adding utility.
 # Generates the layer and parameter.
+# Returns a map containing the off and on states, as well as the
+# transitions between them.
 def generateToggle(layer_name: str,
         gen_anim_dir: str,
         off_anim_basename: str,
         on_anim_basename: str,
-        anim: libunity.UnityAnimator):
+        anim: libunity.UnityAnimator) -> typing.Dict[str,
+                libunity.UnityDocument]:
     layer = anim.addLayer(layer_name)
 
     # For simplicity, use the layer name as the parameter name.
@@ -370,7 +373,13 @@ def generateToggle(layer_name: str,
     anim.addTransitionBooleanCondition(on_state,
             on_to_off_trans, parameter_name, False)
 
-    pass
+    result = {}
+    result["off"] = off_state
+    result["on"] = on_state
+    result["off_to_on"] = off_to_on_trans
+    result["on_to_off"] = on_to_off_trans
+
+    return result
 
 def generateFX(guid_map, gen_anim_dir):
     anim = libunity.UnityAnimator()
@@ -382,11 +391,16 @@ def generateFX(guid_map, gen_anim_dir):
         print("Generating layer {}/{}".format(which_layer, len(layers.items())), file=sys.stderr)
         generateFXLayer(which_layer, anim, layer, gen_anim_dir)
 
-    generateToggle(generate_utils.getSpeechNoiseToggleParam(),
+    states = generateToggle(
+            generate_utils.getSpeechNoiseToggleParam(),
             "Animations/",
             "TaSTT_Speech_Noise_Off.anim",
             "TaSTT_Speech_Noise_On.anim",
             anim)
+    # Enable beeping only if board is out.
+    anim.addTransitionBooleanCondition(states["off"],
+            states["off_to_on"], generate_utils.getToggleParam(), True)
+
     generateToggle(generate_utils.getToggleParam(),
             "Animations/",
             "TaSTT_Toggle_Off.anim",
diff --git a/osc_ctrl.py b/osc_ctrl.py
index 4ef238e..4353939 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -327,8 +327,8 @@ def sendRawMessage(client, msg):
         #print("Send cell {}".format(cell))
         sendMessageCellDiscrete(client, cell_msg, cell)
 
-def clear():
-    sendRawMessage([state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
+def clear(client):
+    sendRawMessage(client, [state.encoding[' ']] * BOARD_ROWS * BOARD_COLS)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/transcribe.py b/transcribe.py
index 45b2a8e..fa3b166 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -32,6 +32,8 @@ class AudioState:
     # The minimum length that recordAudio() will wait for before saving audio.
     MIN_LENGTH_S = 1
 
+    VOICE_AUDIO_FILENAME = "audio.wav"
+
     # PyAudio object
     p = None
 
@@ -46,10 +48,12 @@ class AudioState:
     # transcriptions before "committing" to a transcription.
     text_candidate = ""
     text_lock = threading.Lock()
+    clear_requested = False
 
     record_audio = True
     transcribe_audio = True
     send_audio = True
+    run_control_thread = True
 
     osc_client = osc_ctrl.getClient()
 
@@ -131,15 +135,30 @@ def saveAudio(audio_state, filename):
     normalized = pydub_effects.normalize(raw)
     normalized.export(filename, format="wav")
 
+def resetDiskAudioLocked(audio_state, filename):
+    wf = wave.open(filename, 'wb')
+    wf.setnchannels(audio_state.CHANNELS)
+    wf.setsampwidth(audio_state.p.get_sample_size(audio_state.FORMAT))
+    wf.setframerate(audio_state.RATE)
+
+    wf.writeframes(b''.join([]))
+    wf.close()
+
+def resetAudioLocked(audio_state):
+    audio_state.frames = []
+
 def resetAudio(audio_state):
     audio_state.frames_lock.acquire()
-    audio_state.frames = []
+    resetAudioLocked(audio_state)
     audio_state.frames_lock.release()
 
 # Transcribe the audio recorded in a file.
 def transcribe(model, filename):
 
+    audio_state.frames_lock.acquire()
     audio = whisper.load_audio(filename)
+    audio_state.frames_lock.release()
+
     audio = whisper.pad_or_trim(audio)
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
     #_, probs = model.detect_language(mel)
@@ -155,16 +174,35 @@ def transcribe(model, filename):
 
 def transcribeAudio(audio_state, model):
     while audio_state.transcribe_audio == True:
-        saveAudio(audio_state, "audio.wav")
+        # Pace this out
+        time.sleep(0.05)
 
-        if not os.path.isfile("audio.wav"):
+        saveAudio(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+
+        if not os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
             time.sleep(0.1)
             continue
 
-        text = transcribe(model, "audio.wav")
+        text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME)
 
         audio_state.text_lock.acquire()
 
+        if audio_state.clear_requested:
+            audio_state.text = ""
+            audio_state.text_candidate = ""
+            audio_state.clear_requested = False
+            audio_state.text_lock.release()
+            continue
+
+        words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
+        print("words: {}".format(words))
+        if len(words) > 0 and words[-1] == "clear":
+            audio_state.text = ""
+            audio_state.text_candidate = ""
+            audio_state.clear_requested = True
+            audio_state.text_lock.release()
+            continue
+
         # We use a few heuristics to handle spurious mistranscriptions and to
         # handle events where we trim off the start of the audio clip.
         #   1. If we get 2 consecutive identical transcriptions, we commit to
@@ -216,8 +254,6 @@ def transcribeAudio(audio_state, model):
 
         audio_state.text_lock.release()
 
-        # Pace this out
-        time.sleep(0.05)
 def sendAudio(audio_state):
     tx_state = osc_ctrl.OscTxState()
     while audio_state.send_audio == True:
@@ -230,6 +266,34 @@ def sendAudio(audio_state):
         # Pace this out
         time.sleep(0.01)
 
+def controlThread(audio_state):
+    while audio_state.run_control_thread:
+        time.sleep(0.1)
+        if audio_state.clear_requested:
+            print("here a")
+            audio_state.text_lock.acquire()
+            audio_state.frames_lock.acquire()
+
+            if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+                # empty out the voice file
+                open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+            resetAudioLocked(audio_state)
+            resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
+            audio_state.clear_requested = False
+
+            # Allow audio collection to resume now. If we don't do this, then
+            # any audio spoken while the board is slowly clearing will be lost.
+            audio_state.frames_lock.release()
+
+            # Clearing can take a while, and the user might be talking in the
+            # meantime. So we drop audio state before clearing so the other
+            # threads can continue saving to it.
+            osc_ctrl.clear(audio_state.osc_client)
+
+            audio_state.text_lock.release()
+
+            print("here b")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
@@ -238,11 +302,12 @@ if __name__ == "__main__":
     if not args.mic:
         args.mic = "index"
 
-    if os.path.isfile("audio.wav"):
-        os.remove("audio.wav")
-
     audio_state = getMicStream(args.mic)
 
+    if os.path.isfile(audio_state.VOICE_AUDIO_FILENAME):
+        # empty out the voice file
+        open(audio_state.VOICE_AUDIO_FILENAME, "w").close()
+
     record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
     record_audio_thd.daemon = True
     record_audio_thd.start()
@@ -259,6 +324,10 @@ if __name__ == "__main__":
     send_audio_thd.daemon = True
     send_audio_thd.start()
 
+    control_thd = threading.Thread(target = controlThread, args = [audio_state])
+    control_thd.daemon = True
+    control_thd.start()
+
     print("Press enter to start a new message")
     for line in sys.stdin:
         resetAudio(audio_state)
@@ -268,6 +337,8 @@ if __name__ == "__main__":
     print("Joining threads")
     audio_state.record_audio = False
     audio_state.transcribe_audio = False
+    audio_state.run_control_thread = False
     record_audio_thd.join()
     transcribe_audio_thd.join()
+    control_thd.join()
author	yum <yum.food.vr@gmail.com>	2022-10-24 23:08:11 -0700
committer	yum <yum.food.vr@gmail.com>	2022-10-24 23:13:47 -0700
commit	0102b725f60c91ca6d095c2b04de71db6d5b1fda (patch)
tree	c87c84ad4f48577e90307ca2f2bf40baa18b6950
parent	08655f96dc798e3e129058a5e97c5aa7ff96e798 (diff)