Put audio feedback into its own thread

I think this improves the code structure of the controller input thread and leads to some deduplication, so I'm going to keep it. However, the intended purpose was to decrease lag when pressing buttons, and in that regard it failed. The lag goes all the way down to the input layer, implying that the input thread is not able to consistently run at its intended 100 Hz sample rate. I suspect that the Python global interpreter lock (GIL) is at fault. Since we can't realistically move all our functionality into one thread in a non-blocking model, I think multiprocessing is the logical choice going forward. Each thread in transcribe.py would become its own process, and the processes would communicate via pub/sub through some intermediary process sitting in the middle.
author: yum <yum.food.vr@gmail.com> 2023-08-25 12:50:59 -0700
committer: yum <yum.food.vr@gmail.com> 2023-08-25 12:50:59 -0700
commit: 302f7ba09f2ee115d0ee4b8f0841f6ffcd50ec57 (patch)
tree: 5c07175619a1e9d5e56a30f8d2fdd4e6bbde1623 /Scripts
parent: 9e43487c1bf62402e96cb6139b24cd8446515673 (diff)
2 files changed, 52 insertions, 17 deletions
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index bdc93a1..3a2cf42 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -9,7 +9,6 @@ pyopenxr
 pillow
 pyaudio
 python-osc
-playsound==1.2.2
 pyyaml
 sentence_splitter
 transformers>=4.21.0
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index e9873e9..4b00bd0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -5,7 +5,6 @@ from emotes_v2 import EmotesState
 from faster_whisper import WhisperModel
 from functools import partial
 from math import ceil, floor
-from playsound import playsound
 from profanity_filter import ProfanityFilter
 from sentence_splitter import split_text_into_sentences
 
@@ -30,6 +29,7 @@ import time
 import transformers
 import typing
 import wave
+import winsound
 
 class AudioState:
     def __init__(self):
@@ -86,6 +86,14 @@ class AudioState:
         # Locks access to `frames`, and audio stored on disk.
         self.audio_lock = threading.Lock()
 
+        # Audio events that should play. Input thread appends to this list,
+        # audio feedback thread drains it.
+        self.audio_events = []
+        self.AUDIO_EVENT_TOGGLE_ON = 1
+        self.AUDIO_EVENT_TOGGLE_OFF = 2
+        self.AUDIO_EVENT_DISMISS = 3
+        self.AUDIO_EVENT_UPDATE = 4
+
         # Used to tell the threads when to stop.
         self.run_app = True
 
@@ -532,8 +540,7 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.audio_paused = True
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
-                    block=False)
+                audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF)
         elif state == PAUSE_STATE:
             state = RECORD_STATE
             if not use_builtin:
@@ -554,8 +561,37 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             resetDisplayLocked(audio_state)
 
             if enable_local_beep == 1:
-                playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
-                    block=False)
+                audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_ON)
+
+def audioFeedbackThread(audio_state, enable_local_beep: bool,
+        use_builtin: bool, button: str):
+    with open(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"), "rb") as f:
+        waveform0 = f.read()
+    with open(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"), "rb") as f:
+        waveform1 = f.read()
+    with open(os.path.abspath("Resources/Sounds/Dismiss_Noise_Quiet.wav"), "rb") as f:
+        waveform2 = f.read()
+    with open(os.path.abspath("Resources/Sounds/KB_Noise_Off_Quiet.wav"), "rb") as f:
+        waveform3 = f.read()
+    while audio_state.run_app == True:
+        time.sleep(0.01)
+
+        if len(audio_state.audio_events) == 0:
+            continue
+
+        event = audio_state.audio_events[0]
+        audio_state.audio_events = audio_state.audio_events[1:]
+
+        waveform = waveform0
+        if event == audio_state.AUDIO_EVENT_TOGGLE_ON:
+            waveform = waveform0
+        elif event == audio_state.AUDIO_EVENT_TOGGLE_OFF:
+            waveform = waveform1
+        elif event == audio_state.AUDIO_EVENT_DISMISS:
+            waveform = waveform2
+        elif event == audio_state.AUDIO_EVENT_UPDATE:
+            waveform = waveform3
+        winsound.PlaySound(waveform, winsound.SND_MEMORY)
 
 def readControllerInput(audio_state, enable_local_beep: bool,
         use_builtin: bool, button: str):
@@ -578,7 +614,6 @@ def readControllerInput(audio_state, enable_local_beep: bool,
     button_generator = steamvr.pollButtonPress(hand=hand_id, button=button_id)
     while audio_state.run_app == True:
         time.sleep(0.01)
-
         event = next(button_generator)
 
         if event == steamvr.EVENT_RISING_EDGE:
@@ -603,13 +638,11 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                 if last_rising - last_medium_press_end < 1.0:
                     # Type transcription
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/KB_Noise_Off_Quiet.wav"),
-                            block=False)
+                        audio_state.audio_events.append(audio_state.AUDIO_EVENT_UPDATE)
                     keyboard.write(audio_state.filtered_text)
                 else:
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
-                            block=False)
+                        audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF)
 
             elif now - last_rising > 0.5:
                 # Medium press
@@ -617,8 +650,7 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                 state = PAUSE_STATE
 
                 if enable_local_beep == 1:
-                    playsound(os.path.abspath("Resources/Sounds/Dismiss_Noise_Quiet.wav"),
-                        block=False)
+                    audio_state.audio_events.append(audio_state.AUDIO_EVENT_DISMISS)
 
                 if not use_builtin:
                     osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
@@ -638,8 +670,7 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                     audio_state.audio_paused = True
 
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
-                            block=False)
+                        audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF)
                 elif state == PAUSE_STATE:
                     state = RECORD_STATE
                     if not use_builtin:
@@ -659,8 +690,7 @@ def readControllerInput(audio_state, enable_local_beep: bool,
                     resetDisplayLocked(audio_state)
 
                     if enable_local_beep == 1:
-                        playsound(os.path.abspath("Resources/Sounds/Noise_On_Quiet.wav"),
-                            block=False)
+                        audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_ON)
 
 # model should correspond to one of the Whisper models defined in
 # whisper/__init__.py. Examples: tiny, base, small, medium.
@@ -794,6 +824,11 @@ def transcribeLoop(mic: str,
     controller_input_thd.daemon = True
     controller_input_thd.start()
 
+    audio_feedback_thd = threading.Thread(target = audioFeedbackThread, args
+            = [audio_state, enable_local_beep, use_builtin, button])
+    audio_feedback_thd.daemon = True
+    audio_feedback_thd.start()
+
     keyboard_input_thd = threading.Thread(target = readKeyboardInput, args
             = [audio_state, enable_local_beep, use_builtin, keyboard_hotkey])
     keyboard_input_thd.daemon = True
@@ -815,6 +850,7 @@ def transcribeLoop(mic: str,
     audio_state.run_app = False
     transcribe_audio_thd.join()
     controller_input_thd.join()
+    audio_feedback_thd.join()
     keyboard_input_thd.join()
 
 if __name__ == "__main__":
author	yum <yum.food.vr@gmail.com>	2023-08-25 12:50:59 -0700
committer	yum <yum.food.vr@gmail.com>	2023-08-25 12:50:59 -0700
commit	302f7ba09f2ee115d0ee4b8f0841f6ffcd50ec57 (patch)
tree	5c07175619a1e9d5e56a30f8d2fdd4e6bbde1623 /Scripts
parent	9e43487c1bf62402e96cb6139b24cd8446515673 (diff)