diff options
| author | yum <yum.food.vr@gmail.com> | 2023-02-19 14:15:30 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-02-22 21:49:22 -0800 |
| commit | cece1ee8f1b985c2a89adb661dd02c6d44787f67 (patch) | |
| tree | 51d2c3af43d567c385180f988b39d3cb00644b19 | |
| parent | 42f8275d84ad431c19b7c22ac702e58ac95d0680 (diff) | |
Apply previous window conditioning to decoding layer
Per the Whisper source code, this should result in better temporal
stability.
| -rw-r--r-- | Scripts/transcribe.py | 14 |
1 file changed, 9 insertions, 5 deletions
```diff
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index c4d7682..b6c4863 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -48,6 +48,7 @@ class AudioState:
 self.stream = None
 self.text = ""
+ self.tokens = []
 self.committed_text = ""
 self.frames = []
@@ -197,7 +198,7 @@ def resetAudio(audio_state):
 audio_state.transcribe_lock.release()
 # Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, frames, use_cpu: bool):
+def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []):
 start_time = time.time()
 frames = audio_state.frames
@@ -220,7 +221,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
 use_gpu = not use_cpu
 options = whisper.DecodingOptions(language = audio_state.language,
 beam_size = 5, temperature = temp, without_timestamps = True,
- fp16 = use_gpu)
+ fp16 = use_gpu, prompt = prev_tokens)
 result = whisper.decode(model, mel, options)
 if result.avg_logprob < -1.0:
@@ -238,7 +239,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
 result = None
 continue
- result = result.text
+ result = (result.text, result.tokens)
 break
 return result
@@ -266,11 +267,13 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
 audio_state.transcribe_sleep_duration_max_s,
 longer_sleep_dur)
- text = transcribe(audio_state, model, audio_state.frames, use_cpu)
- if not text:
+ result = transcribe(audio_state, model, audio_state.frames, use_cpu,
+ audio_state.tokens)
+ if not result:
 print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
 last_transcribe_time = time.time()
 continue
+ text, tokens = result
 if audio_state.drop_transcription:
 audio_state.drop_transcription = False
@@ -284,6 +287,7 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
 audio_state.text = string_matcher.matchStrings(audio_state.text,
 text, window_size = 25)
+ audio_state.tokens = tokens
 now = time.time()
 print("Transcription ({} seconds): {}".format(
```
