diff options
Diffstat (limited to 'Scripts/transcribe.py')
| -rw-r--r-- | Scripts/transcribe.py | 25 |
1 file changed, 9 insertions, 16 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index cea2da0..c718f73 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -170,10 +170,6 @@ def onAudioFramesAvailable( else: audio_state.commit_fuzz_threshold = 1 - max_frames = int(input_rate * audio_state.MAX_LENGTH_S / - audio_state.CHUNK) - if len(audio_state.frames) > max_frames: - audio_state.frames = audio_state.frames[-1 * max_frames:] if audio_state.drop_samples_till_i > 0: # Caller wants us to keep this many *whisper* samples, assuming that # we're getting one full frame every (1024 / 16KHz) seconds. @@ -198,6 +194,11 @@ def onAudioFramesAvailable( audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:] audio_state.drop_samples_till_i = -1 + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / + audio_state.CHUNK) + if len(audio_state.frames) > max_frames: + audio_state.frames = audio_state.frames[-1 * max_frames:] + # Now enforce a minimum duration on frames. This reduces cases where the # STT hallucinates random things. In the Whisper paper, they enforce a # minimum audio buffer duration of 5.0 seconds, so I do the same here. @@ -393,9 +394,9 @@ def transcribeAudio(audio_state, if audio_state.enable_debug_mode: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() - # Prevent audio buffer from holding more than 1 second of silence + # Prevent audio buffer from holding more than a few seconds of silence # before real speech. 
- audio_state.MAX_LENGTH_S = 1 + audio_state.MAX_LENGTH_S = 5 continue else: audio_state.MAX_LENGTH_S = 300 @@ -731,14 +732,12 @@ def transcribeLoop(mic: str, window_duration_s: int, gpu_idx: int, keyboard_hotkey: str, - reset_on_toggle: bool, - commit_fuzz_threshold: int): + reset_on_toggle: bool): audio_state = getMicStream(mic) audio_state.whisper_language = language audio_state.language = langcodes.find(language).language audio_state.MAX_LENGTH_S = window_duration_s audio_state.reset_on_toggle = reset_on_toggle - #audio_state.commit_fuzz_threshold = commit_fuzz_threshold audio_state.enable_debug_mode = enable_debug_mode audio_state.enable_profanity_filter = enable_profanity_filter @@ -905,7 +904,6 @@ if __name__ == "__main__": parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.") parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s") parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.") - parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.") parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.") args = parser.parse_args() @@ -945,10 +943,6 @@ if __name__ == "__main__": print("--gpu_idx required", file=sys.stderr) sys.exit(1) - if not args.commit_fuzz_threshold: - print("--commit_fuzz_threshold required", file=sys.stderr) - sys.exit(1) - args.gpu_idx = int(args.gpu_idx) window_duration_s = 120 @@ -1027,6 +1021,5 @@ if __name__ == "__main__": estate, window_duration_s, args.gpu_idx, args.keybind, - args.reset_on_toggle, - args.commit_fuzz_threshold) + args.reset_on_toggle) |
