From 3db4f81573d89f6ebefb5ec119c7d66affc1a4a0 Mon Sep 17 00:00:00 2001 From: yum Date: Thu, 31 Aug 2023 17:11:11 -0700 Subject: Bugfixes and tweaks * Temporarily restore normal process priority. Working on adding a UI option to set STT prio. * Give audio indicator phonemes a 1/3 chance to do nothing. Makes result sound a little better imo. * Quiet down steamVR thread when steamVR isn't running * Fix use of `button_id` and `hand_id` in steamvr.py * Increase amount of silence allowed before transcript from 1 to 5 seconds. You want enough buffer to allow for a few full transcripts, else you risk spuriously dropping audio. * Enable background loading in audio metadata (required by vrc sdk) --- Scripts/osc_ctrl.py | 2 +- Scripts/steamvr.py | 13 ++++++++----- Scripts/transcribe.py | 25 +++++++++---------------- 3 files changed, 18 insertions(+), 22 deletions(-) (limited to 'Scripts') diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py index 413e2ae..6fc706e 100644 --- a/Scripts/osc_ctrl.py +++ b/Scripts/osc_ctrl.py @@ -108,7 +108,7 @@ def pageMessage(osc_state: OscState, msg: str, estate: EmotesState) -> bool: letter_i += 1 if len(sounds_to_make) > 0: for i in range(5): - if i+1 in sounds_to_make: + if i+1 in sounds_to_make and random.randint(1,3) != 1: playAudio(osc_state, i+1, True) else: playAudio(osc_state, i+1, False) diff --git a/Scripts/steamvr.py b/Scripts/steamvr.py index e0b59e3..da07134 100644 --- a/Scripts/steamvr.py +++ b/Scripts/steamvr.py @@ -30,19 +30,22 @@ def pollButtonPress( buttons["thumbstick"] = vr.k_EButton_Axis0 system = None + first = True while not system: try: system = vr.init(vr.VRApplication_Background) except Exception as e: - print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr) - time.sleep(5) + if first: + print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr) + first = False + time.sleep(1) last_packet = 0 event_high = False while True: time.sleep(0.01) - lh_idx = system.getTrackedDeviceIndexForControllerRole(hand_id) + lh_idx = system.getTrackedDeviceIndexForControllerRole(hands[hand]) #print("left hand device idx: {}".format(lh_idx)) got_state, state = system.getControllerState(lh_idx) @@ -58,7 +61,7 @@ def pollButtonPress( # click, not movement. dead_zone_radius = 0.7 - button_mask = (1 << button_id) + button_mask = (1 << buttons[button]) ret = EVENT_NONE if (state.ulButtonPressed & button_mask) != 0 and\ (state.rAxis[0].x**2 + state.rAxis[0].y**2 < dead_zone_radius**2): @@ -77,7 +80,7 @@ if __name__ == "__main__": while True: time.sleep(0.1) - event = pollButtonPress(session_state, hand_id = hands["left"], button_id = buttons["joystick"]) + event = pollButtonPress(session_state) if event == EVENT_RISING_EDGE: print("rising edge") elif event == EVENT_FALLING_EDGE: diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index cea2da0..c718f73 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -170,10 +170,6 @@ def onAudioFramesAvailable( else: audio_state.commit_fuzz_threshold = 1 - max_frames = int(input_rate * audio_state.MAX_LENGTH_S / - audio_state.CHUNK) - if len(audio_state.frames) > max_frames: - audio_state.frames = audio_state.frames[-1 * max_frames:] if audio_state.drop_samples_till_i > 0: # Caller wants us to keep this many *whisper* samples, assuming that # we're getting one full frame every (1024 / 16KHz) seconds. @@ -198,6 +194,11 @@ def onAudioFramesAvailable( audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] audio_state.drop_samples_till_i = -1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / + audio_state.CHUNK) + if len(audio_state.frames) > max_frames: + audio_state.frames = audio_state.frames[-1 * max_frames:] + # Now enforce a minimum duration on frames. This reduces cases where the # STT hallucinates random things. In the Whisper paper, they enforce a # minimum audio buffer duration of 5.0 seconds, so I do the same here. @@ -393,9 +394,9 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() - # Prevent audio buffer from holding more than 1 second of silence + # Prevent audio buffer from holding more than a few seconds of silence # before real speech. - audio_state.MAX_LENGTH_S = 1 + audio_state.MAX_LENGTH_S = 5 continue else: audio_state.MAX_LENGTH_S = 300 @@ -731,14 +732,12 @@ def transcribeLoop(mic: str, window_duration_s: int, gpu_idx: int, keyboard_hotkey: str, - reset_on_toggle: bool, - commit_fuzz_threshold: int): + reset_on_toggle: bool): audio_state = getMicStream(mic) audio_state.whisper_language = language audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter @@ -905,7 +904,6 @@ if __name__ == "__main__": parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") - parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.") parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.") args = parser.parse_args() @@ -945,10 +943,6 @@ if __name__ == "__main__": print("--gpu_idx required", file=sys.stderr) sys.exit(1) - if not args.commit_fuzz_threshold: - print("--commit_fuzz_threshold required", file=sys.stderr) - sys.exit(1) - args.gpu_idx = int(args.gpu_idx) window_duration_s = 120 @@ -1027,6 +1021,5 @@ if __name__ == "__main__": estate, window_duration_s, args.gpu_idx, args.keybind, - args.reset_on_toggle, - args.commit_fuzz_threshold) + args.reset_on_toggle) -- cgit v1.2.3