From 78de7f02bc364606b0939d66903f02b2f91c141b Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Tue, 23 May 2023 15:12:04 -0700
Subject: Add ability to type using STT

To use it, do a medium press followed by a long press. Keep the long
press held until you're done speaking. The transcription will then be
typed into the currently selected input field.

* Add more audio feedback
* Make audio feedback play asynchronously so it doesn't slow down the
  controller input state machine as much.
---
 README.md                      |   6 +++--
 Scripts/transcribe.py          |  53 +++++++++++++++++++++++++++++++----------
 Sounds/Dismiss_Noise.wav       | Bin 0 -> 192078 bytes
 Sounds/Dismiss_Noise_Quiet.wav | Bin 0 -> 192078 bytes
 Sounds/KB_Noise_Off.wav        | Bin 0 -> 192078 bytes
 Sounds/KB_Noise_Off_Quiet.wav  | Bin 0 -> 192078 bytes
 Sounds/KB_Noise_On.wav         | Bin 0 -> 266318 bytes
 Sounds/KB_Noise_On_Quiet.wav   | Bin 0 -> 266318 bytes
 8 files changed, 45 insertions(+), 14 deletions(-)
 create mode 100644 Sounds/Dismiss_Noise.wav
 create mode 100644 Sounds/Dismiss_Noise_Quiet.wav
 create mode 100644 Sounds/KB_Noise_Off.wav
 create mode 100644 Sounds/KB_Noise_Off_Quiet.wav
 create mode 100644 Sounds/KB_Noise_On.wav
 create mode 100644 Sounds/KB_Noise_On_Quiet.wav

diff --git a/README.md b/README.md
index 057acb4..6b75998 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,10 @@ To build your own package from source, see GUI/README.md.
 
 Basic controls:
 * Short click to toggle transcription.
-* Long click to hide the text box.
-* Scale it up/down in the radial menu.
+* Medium click to hide the text box.
+* Hold to update text box without unlocking from worldspace.
+* Medium click, then hold, to type using speech-to-text (STT).
+* Scale up/down in the radial menu.
 
 ## Design philosophy
 
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 422e9c0..b4b7198 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -8,13 +8,14 @@ from playsound import playsound
 
 import argparse
 import copy
-import os
-import osc_ctrl
 import generate_utils
 import keybind_event_machine
+import keyboard
 import langcodes
-import pyaudio
 import numpy as np
+import os
+import osc_ctrl
+import pyaudio
 import steamvr
 import string_matcher
 import sys
@@ -334,7 +335,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.audio_paused = True
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"))
+                playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                    block=False)
         elif state == PAUSE_STATE:
             state = RECORD_STATE
             if not use_builtin:
@@ -348,7 +350,8 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.audio_paused = False
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"))
+                playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
+                    block=False)
 
 def readControllerInput(audio_state, enable_local_beep: bool,
         use_builtin: bool, button: str):
@@ -371,7 +374,14 @@ def readControllerInput(audio_state, enable_local_beep: bool,
     hand_id = steamvr.hands[button.split()[0]]
     button_id = steamvr.buttons[button.split()[1]]
 
+    # Rough description of state machine:
+    #   Single short press: toggle transcription
+    #   Medium press: dismiss custom chatbox
+    #   Long press: update chatbox in place
+    #   Medium press + long press: type transcription
+
     last_rising = time.time()
+    last_medium_press_end = 0
     while audio_state.run_app == True:
         time.sleep(0.05)
 
@@ -390,7 +400,7 @@ def readControllerInput(audio_state, enable_local_beep: bool,
         elif event == steamvr.EVENT_FALLING_EDGE:
             now = time.time()
             if now - last_rising > 1.5:
-                # Very long hold: treat as the end of transcription.
+                # Long press: treat as the end of transcription.
                 state = PAUSE_STATE
                 if not use_builtin:
                     osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
@@ -398,13 +408,29 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                 audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
                 audio_state.audio_paused = True
 
+                if last_rising - last_medium_press_end < 1.0:
+                    # Type transcription
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/KB_Noise_Off_Quiet.wav"),
+                            block=False)
+                    keyboard.write(audio_state.text)
+                else:
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                            block=False)
+
             elif now - last_rising > 0.5:
-                # Long hold
+                # Medium press
+                last_medium_press_end = now
                 state = PAUSE_STATE
+
+                if enable_local_beep == 1:
+                    playsound(os.path.abspath("Resources/Sounds/Dismiss_Noise_Quiet.wav"),
+                        block=False)
+
                 if not use_builtin:
                     osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
                     osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
-                #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
 
                 resetAudioLocked(audio_state)
                 resetDisplayLocked(audio_state)
@@ -422,17 +448,20 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                     audio_state.audio_paused = True
 
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"))
+                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
+                            block=False)
                 elif state == PAUSE_STATE:
                     state = RECORD_STATE
+
+                    if enable_local_beep == 1:
+                        playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
+                            block=False)
+
                     if not use_builtin:
                         osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
                         osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
                         osc_ctrl.lockWorld(audio_state.osc_state.client, False)
 
-                    if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"))
-
 # model should correspond to one of the Whisper models defined in
 # whisper/__init__.py. Examples: tiny, base, small, medium.
 def transcribeLoop(mic: str, language: str, model: str,
diff --git a/Sounds/Dismiss_Noise.wav b/Sounds/Dismiss_Noise.wav
new file mode 100644
index 0000000..fe60f21
Binary files /dev/null and b/Sounds/Dismiss_Noise.wav differ
diff --git a/Sounds/Dismiss_Noise_Quiet.wav b/Sounds/Dismiss_Noise_Quiet.wav
new file mode 100644
index 0000000..5c3b1cb
Binary files /dev/null and b/Sounds/Dismiss_Noise_Quiet.wav differ
diff --git a/Sounds/KB_Noise_Off.wav b/Sounds/KB_Noise_Off.wav
new file mode 100644
index 0000000..64d9c6f
Binary files /dev/null and b/Sounds/KB_Noise_Off.wav differ
diff --git a/Sounds/KB_Noise_Off_Quiet.wav b/Sounds/KB_Noise_Off_Quiet.wav
new file mode 100644
index 0000000..b965e6a
Binary files /dev/null and b/Sounds/KB_Noise_Off_Quiet.wav differ
diff --git a/Sounds/KB_Noise_On.wav b/Sounds/KB_Noise_On.wav
new file mode 100644
index 0000000..a959041
Binary files /dev/null and b/Sounds/KB_Noise_On.wav differ
diff --git a/Sounds/KB_Noise_On_Quiet.wav b/Sounds/KB_Noise_On_Quiet.wav
new file mode 100644
index 0000000..e49513e
Binary files /dev/null and b/Sounds/KB_Noise_On_Quiet.wav differ
-- 
cgit v1.2.3