From 3db4f81573d89f6ebefb5ec119c7d66affc1a4a0 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Thu, 31 Aug 2023 17:11:11 -0700
Subject: Bugfixes and tweaks

* Temporarily restore normal process priority. A UI option to set the STT
  priority is still in progress.
* Give audio indicator phonemes a 1/3 chance to do nothing. This makes the
  result sound a little better, in my opinion.
* Quiet down the SteamVR thread when SteamVR isn't running: log the init
  failure only once, and retry every second instead of every 5 seconds.
* Fix use of `button_id` and `hand_id` in steamvr.py
* Increase amount of silence allowed before transcript from 1 to 5
  seconds. You want enough buffer to allow for a few full transcripts,
  else you risk spuriously dropping audio.
* Enable background loading in audio metadata (required by vrc sdk)
---
 Scripts/osc_ctrl.py   |  2 +-
 Scripts/steamvr.py    | 13 ++++++++-----
 Scripts/transcribe.py | 25 +++++++++----------------
 3 files changed, 18 insertions(+), 22 deletions(-)

(limited to 'Scripts')

diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py
index 413e2ae..6fc706e 100644
--- a/Scripts/osc_ctrl.py
+++ b/Scripts/osc_ctrl.py
@@ -108,7 +108,7 @@ def pageMessage(osc_state: OscState, msg: str, estate: EmotesState) -> bool:
         letter_i += 1
     if len(sounds_to_make) > 0:
         for i in range(5):
-            if i+1 in sounds_to_make:
+            if i+1 in sounds_to_make and random.randint(1,3) != 1:
                 playAudio(osc_state, i+1, True)
             else:
                 playAudio(osc_state, i+1, False)
diff --git a/Scripts/steamvr.py b/Scripts/steamvr.py
index e0b59e3..da07134 100644
--- a/Scripts/steamvr.py
+++ b/Scripts/steamvr.py
@@ -30,19 +30,22 @@ def pollButtonPress(
     buttons["thumbstick"] = vr.k_EButton_Axis0
 
     system = None
+    first = True
     while not system:
         try:
             system = vr.init(vr.VRApplication_Background)
         except Exception as e:
-            print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr)
-            time.sleep(5)
+            if first:
+                print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr)
+            first = False
+            time.sleep(1)
     last_packet = 0
     event_high = False
 
     while True:
         time.sleep(0.01)
 
-        lh_idx = system.getTrackedDeviceIndexForControllerRole(hand_id)
+        lh_idx = system.getTrackedDeviceIndexForControllerRole(hands[hand])
         #print("left hand device idx: {}".format(lh_idx))
 
         got_state, state = system.getControllerState(lh_idx)
@@ -58,7 +61,7 @@ def pollButtonPress(
         # click, not movement.
         dead_zone_radius = 0.7
 
-        button_mask = (1 << button_id)
+        button_mask = (1 << buttons[button])
         ret = EVENT_NONE
         if (state.ulButtonPressed & button_mask) != 0 and\
                 (state.rAxis[0].x**2 + state.rAxis[0].y**2 < dead_zone_radius**2):
@@ -77,7 +80,7 @@ if __name__ == "__main__":
     while True:
         time.sleep(0.1)
 
-        event = pollButtonPress(session_state, hand_id = hands["left"], button_id = buttons["joystick"])
+        event = pollButtonPress(session_state)
         if event == EVENT_RISING_EDGE:
             print("rising edge")
         elif event == EVENT_FALLING_EDGE:
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index cea2da0..c718f73 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -170,10 +170,6 @@ def onAudioFramesAvailable(
     else:
         audio_state.commit_fuzz_threshold = 1
 
-    max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
-            audio_state.CHUNK)
-    if len(audio_state.frames) > max_frames:
-        audio_state.frames = audio_state.frames[-1 * max_frames:]
     if audio_state.drop_samples_till_i > 0:
         # Caller wants us to keep this many *whisper* samples, assuming that
         # we're getting one full frame every (1024 / 16KHz) seconds.
@@ -198,6 +194,11 @@ def onAudioFramesAvailable(
             audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
         audio_state.drop_samples_till_i = -1
 
+    max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
+            audio_state.CHUNK)
+    if len(audio_state.frames) > max_frames:
+        audio_state.frames = audio_state.frames[-1 * max_frames:]
+
     # Now enforce a minimum duration on frames. This reduces cases where the
     # STT hallucinates random things. In the Whisper paper, they enforce a
     # minimum audio buffer duration of 5.0 seconds, so I do the same here.
@@ -393,9 +394,9 @@ def transcribeAudio(audio_state,
             if audio_state.enable_debug_mode:
                 print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
-            # Prevent audio buffer from holding more than 1 second of silence
+            # Prevent audio buffer from holding more than a few seconds of silence
             # before real speech.
-            audio_state.MAX_LENGTH_S = 1
+            audio_state.MAX_LENGTH_S = 5
             continue
         else:
             audio_state.MAX_LENGTH_S = 300
@@ -731,14 +732,12 @@ def transcribeLoop(mic: str,
         window_duration_s: int,
         gpu_idx: int,
         keyboard_hotkey: str,
-        reset_on_toggle: bool,
-        commit_fuzz_threshold: int):
+        reset_on_toggle: bool):
     audio_state = getMicStream(mic)
     audio_state.whisper_language = language
     audio_state.language = langcodes.find(language).language
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
-    #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
     audio_state.enable_debug_mode = enable_debug_mode
     audio_state.enable_profanity_filter = enable_profanity_filter
 
@@ -905,7 +904,6 @@ if __name__ == "__main__":
     parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
     parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
     parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
-    parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
     parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.")
     args = parser.parse_args()
 
@@ -945,10 +943,6 @@ if __name__ == "__main__":
         print("--gpu_idx required", file=sys.stderr)
         sys.exit(1)
 
-    if not args.commit_fuzz_threshold:
-        print("--commit_fuzz_threshold required", file=sys.stderr)
-        sys.exit(1)
-
     args.gpu_idx = int(args.gpu_idx)
 
     window_duration_s = 120
@@ -1027,6 +1021,5 @@ if __name__ == "__main__":
             estate, window_duration_s,
             args.gpu_idx,
             args.keybind,
-            args.reset_on_toggle,
-            args.commit_fuzz_threshold)
+            args.reset_on_toggle)
 
-- 
cgit v1.2.3