Add ability to type using STT

To use it, do a medium hold followed by a long hold. Keep the button held down during the long hold until you're done speaking. The transcription will be typed into the currently selected input field.

* Add more audio feedback.
* Make audio feedback play asynchronously so it doesn't slow down the controller input state machine as much.
author: yum <yum.food.vr@gmail.com> 2023-05-23 15:12:04 -0700
committer: yum <yum.food.vr@gmail.com> 2023-05-23 15:15:39 -0700
commit: 78de7f02bc364606b0939d66903f02b2f91c141b (patch)
tree: e90e441b45ccf777098d8e581bea082855f9e510
parent: 0f08da58a59a126f5307395e822fd140f15f8b36 (diff)
8 files changed, 45 insertions, 14 deletions
diff --git a/README.md b/README.md
index 057acb4..6b75998 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,10 @@ To build your own package from source, see GUI/README.md.
 
 Basic controls:
 * Short click to toggle transcription.
-* Long click to hide the text box.
-* Scale it up/down in the radial menu.
+* Medium click to hide the text box.
+* Hold to update text box without unlocking from worldspace.
+* Medium click + hold to type using STT.
+* Scale up/down in the radial menu.
 
 ## Design philosophy
 
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 422e9c0..b4b7198 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -8,13 +8,14 @@ from playsound import playsound
 
 import argparse
 import copy
-import os
-import osc_ctrl
 import generate_utils
 import keybind_event_machine
+import keyboard
 import langcodes
-import pyaudio
 import numpy as np
+import os
+import osc_ctrl
+import pyaudio
 import steamvr
 import string_matcher
 import sys
@@ -334,7 +335,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.audio_paused = True
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"))
+                playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                    block=False)
         elif state == PAUSE_STATE:
             state = RECORD_STATE
             if not use_builtin:
@@ -348,7 +350,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.audio_paused = False
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"))
+                playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
+                    block=False)
 
 def readControllerInput(audio_state, enable_local_beep: bool,
         use_builtin: bool, button: str):
@@ -371,7 +374,14 @@ def readControllerInput(audio_state, enable_local_beep: bool,
     hand_id = steamvr.hands[button.split()[0]]
     button_id = steamvr.buttons[button.split()[1]]
 
+    # Rough description of state machine:
+    #   Single short press: toggle transcription
+    #   Medium press: dismiss custom chatbox
+    #   Long press: update chatbox in place
+    #   Medium press + long press: type transcription
+
     last_rising = time.time()
+    last_medium_press_end = 0
     while audio_state.run_app == True:
         time.sleep(0.05)
 
@@ -390,7 +400,7 @@ def readControllerInput(audio_state, enable_local_beep: bool,
         elif event == steamvr.EVENT_FALLING_EDGE:
             now = time.time()
             if now - last_rising > 1.5:
-                # Very long hold: treat as the end of transcription.
+                # Long press: treat as the end of transcription.
                 state = PAUSE_STATE
                 if not use_builtin:
                     osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
@@ -398,13 +408,29 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                 audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
                 audio_state.audio_paused = True
 
+                if last_rising - last_medium_press_end < 1.0:
+                    # Type transcription
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/KB_Noise_Off_Quiet.wav"),
+                            block=False)
+                    keyboard.write(audio_state.text)
+                else:
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                            block=False)
+
             elif now - last_rising > 0.5:
-                # Long hold
+                # Medium press
+                last_medium_press_end = now
                 state = PAUSE_STATE
+
+                if enable_local_beep == 1:
+                    playsound(os.path.abspath("Resources/Sounds/Dismiss_Noise_Quiet.wav"),
+                        block=False)
+
                 if not use_builtin:
                     osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
                     osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
-                #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
 
                 resetAudioLocked(audio_state)
                 resetDisplayLocked(audio_state)
@@ -422,17 +448,20 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                     audio_state.audio_paused = True
 
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"))
+                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                            block=False)
                 elif state == PAUSE_STATE:
                     state = RECORD_STATE
+
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
+                            block=False)
+
                     if not use_builtin:
                         osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
                         osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
                         osc_ctrl.lockWorld(audio_state.osc_state.client, False)
 
-                    if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"))
-
 # model should correspond to one of the Whisper models defined in
 # whisper/__init__.py. Examples: tiny, base, small, medium.
 def transcribeLoop(mic: str, language: str, model: str,
diff --git a/Sounds/Dismiss_Noise.wav b/Sounds/Dismiss_Noise.wav
new file mode 100644
index 0000000..fe60f21
--- /dev/null
+++ b/Sounds/Dismiss_Noise.wav
diff --git a/Sounds/Dismiss_Noise_Quiet.wav b/Sounds/Dismiss_Noise_Quiet.wav
new file mode 100644
index 0000000..5c3b1cb
--- /dev/null
+++ b/Sounds/Dismiss_Noise_Quiet.wav
diff --git a/Sounds/KB_Noise_Off.wav b/Sounds/KB_Noise_Off.wav
new file mode 100644
index 0000000..64d9c6f
--- /dev/null
+++ b/Sounds/KB_Noise_Off.wav
diff --git a/Sounds/KB_Noise_Off_Quiet.wav b/Sounds/KB_Noise_Off_Quiet.wav
new file mode 100644
index 0000000..b965e6a
--- /dev/null
+++ b/Sounds/KB_Noise_Off_Quiet.wav
diff --git a/Sounds/KB_Noise_On.wav b/Sounds/KB_Noise_On.wav
new file mode 100644
index 0000000..a959041
--- /dev/null
+++ b/Sounds/KB_Noise_On.wav
diff --git a/Sounds/KB_Noise_On_Quiet.wav b/Sounds/KB_Noise_On_Quiet.wav
new file mode 100644
index 0000000..e49513e
--- /dev/null
+++ b/Sounds/KB_Noise_On_Quiet.wav
author	yum <yum.food.vr@gmail.com>	2023-05-23 15:12:04 -0700
committer	yum <yum.food.vr@gmail.com>	2023-05-23 15:15:39 -0700
commit	78de7f02bc364606b0939d66903f02b2f91c141b (patch)
tree	e90e441b45ccf777098d8e581bea082855f9e510
parent	0f08da58a59a126f5307395e822fd140f15f8b36 (diff)