diff options
| author | yum <yum.food.vr@gmail.com> | 2023-08-31 17:11:11 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-08-31 17:17:01 -0700 |
| commit | 3db4f81573d89f6ebefb5ec119c7d66affc1a4a0 (patch) | |
| tree | 202672af81898cfdf559dcfa3a2d89341584f25c /Scripts/transcribe.py | |
| parent | 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (diff) | |
Bugfixes and tweaks
* Temporarily restore normal process priority. Working on adding a UI
option to set the STT (speech-to-text) process priority.
* Give audio indicator phonemes a 1/3 chance to do nothing. This makes the
result sound a little better, in my opinion.
* Quiet down the SteamVR thread when SteamVR isn't running
* Fix use of `button_id` and `hand_id` in steamvr.py
* Increase the amount of silence allowed before a transcript from 1 to 5
seconds. The buffer must be long enough to hold a few full transcripts;
otherwise you risk spuriously dropping audio.
* Enable background loading in audio metadata (required by the VRC SDK)
Diffstat (limited to 'Scripts/transcribe.py')
| -rw-r--r-- | Scripts/transcribe.py | 25 |
1 files changed, 9 insertions, 16 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index cea2da0..c718f73 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -170,10 +170,6 @@ def onAudioFramesAvailable( else: audio_state.commit_fuzz_threshold = 1 - max_frames = int(input_rate * audio_state.MAX_LENGTH_S / - audio_state.CHUNK) - if len(audio_state.frames) > max_frames: - audio_state.frames = audio_state.frames[-1 * max_frames:] if audio_state.drop_samples_till_i > 0: # Caller wants us to keep this many *whisper* samples, assuming that # we're getting one full frame every (1024 / 16KHz) seconds. @@ -198,6 +194,11 @@ def onAudioFramesAvailable( audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] audio_state.drop_samples_till_i = -1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / + audio_state.CHUNK) + if len(audio_state.frames) > max_frames: + audio_state.frames = audio_state.frames[-1 * max_frames:] + # Now enforce a minimum duration on frames. This reduces cases where the # STT hallucinates random things. In the Whisper paper, they enforce a # minimum audio buffer duration of 5.0 seconds, so I do the same here. @@ -393,9 +394,9 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() - # Prevent audio buffer from holding more than 1 second of silence + # Prevent audio buffer from holding more than a few seconds of silence # before real speech. 
- audio_state.MAX_LENGTH_S = 1 + audio_state.MAX_LENGTH_S = 5 continue else: audio_state.MAX_LENGTH_S = 300 @@ -731,14 +732,12 @@ def transcribeLoop(mic: str, window_duration_s: int, gpu_idx: int, keyboard_hotkey: str, - reset_on_toggle: bool, - commit_fuzz_threshold: int): + reset_on_toggle: bool): audio_state = getMicStream(mic) audio_state.whisper_language = language audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter @@ -905,7 +904,6 @@ if __name__ == "__main__": parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") - parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.") parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.") args = parser.parse_args() @@ -945,10 +943,6 @@ if __name__ == "__main__": print("--gpu_idx required", file=sys.stderr) sys.exit(1) - if not args.commit_fuzz_threshold: - print("--commit_fuzz_threshold required", file=sys.stderr) - sys.exit(1) - args.gpu_idx = int(args.gpu_idx) window_duration_s = 120 @@ -1027,6 +1021,5 @@ if __name__ == "__main__": estate, window_duration_s, args.gpu_idx, args.keybind, - args.reset_on_toggle, - args.commit_fuzz_threshold) + args.reset_on_toggle) |
