From 718319ee7b79d7cdbead5d765769b50c25e968f4 Mon Sep 17 00:00:00 2001 From: yum Date: Wed, 22 Feb 2023 21:49:58 -0800 Subject: Revert "Apply previous window conditioning to decoding layer" This reverts commit cece1ee8f1b985c2a89adb661dd02c6d44787f67. This does *not* in fact result in improved temporal stability. It makes makes things so unstable that even single-sentence messages fail to ever stabilize. --- Scripts/transcribe.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'Scripts') diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index b6c4863..c4d7682 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -48,7 +48,6 @@ class AudioState: self.stream = None self.text = "" - self.tokens = [] self.committed_text = "" self.frames = [] @@ -198,7 +197,7 @@ def resetAudio(audio_state): audio_state.transcribe_lock.release() # Transcribe the audio recorded in a file. -def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []): +def transcribe(audio_state, model, frames, use_cpu: bool): start_time = time.time() frames = audio_state.frames @@ -221,7 +220,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []): use_gpu = not use_cpu options = whisper.DecodingOptions(language = audio_state.language, beam_size = 5, temperature = temp, without_timestamps = True, - fp16 = use_gpu, prompt = prev_tokens) + fp16 = use_gpu) result = whisper.decode(model, mel, options) if result.avg_logprob < -1.0: @@ -239,7 +238,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []): result = None continue - result = (result.text, result.tokens) + result = result.text break return result @@ -267,13 +266,11 @@ def transcribeAudio(audio_state, model, use_cpu: bool): audio_state.transcribe_sleep_duration_max_s, longer_sleep_dur) - result = transcribe(audio_state, model, audio_state.frames, use_cpu, - audio_state.tokens) - if not result: + text = transcribe(audio_state, model, audio_state.frames, use_cpu) + if not text: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() continue - text, tokens = result if audio_state.drop_transcription: audio_state.drop_transcription = False @@ -287,7 +284,6 @@ def transcribeAudio(audio_state, model, use_cpu: bool): audio_state.text = string_matcher.matchStrings(audio_state.text, text, window_size = 25) - audio_state.tokens = tokens now = time.time() print("Transcription ({} seconds): {}".format( -- cgit v1.2.3