summaryrefslogtreecommitdiffstats
path: root/Scripts
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-02-22 21:49:58 -0800
committeryum <yum.food.vr@gmail.com>2023-02-22 21:54:39 -0800
commit718319ee7b79d7cdbead5d765769b50c25e968f4 (patch)
treeb07440950d806ddc3dec06502daec95307ca4f67 /Scripts
parent08e269d9c2a7faaa08b27e0fb63d50c20529a5c2 (diff)
Revert "Apply previous window conditioning to decoding layer"
This reverts commit cece1ee8f1b985c2a89adb661dd02c6d44787f67. This does *not* in fact result in improved temporal stability. It makes makes things so unstable that even single-sentence messages fail to ever stabilize.
Diffstat (limited to 'Scripts')
-rw-r--r--Scripts/transcribe.py14
1 files changed, 5 insertions, 9 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index b6c4863..c4d7682 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -48,7 +48,6 @@ class AudioState:
self.stream = None
self.text = ""
- self.tokens = []
self.committed_text = ""
self.frames = []
@@ -198,7 +197,7 @@ def resetAudio(audio_state):
audio_state.transcribe_lock.release()
# Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []):
+def transcribe(audio_state, model, frames, use_cpu: bool):
start_time = time.time()
frames = audio_state.frames
@@ -221,7 +220,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []):
use_gpu = not use_cpu
options = whisper.DecodingOptions(language = audio_state.language,
beam_size = 5, temperature = temp, without_timestamps = True,
- fp16 = use_gpu, prompt = prev_tokens)
+ fp16 = use_gpu)
result = whisper.decode(model, mel, options)
if result.avg_logprob < -1.0:
@@ -239,7 +238,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []):
result = None
continue
- result = (result.text, result.tokens)
+ result = result.text
break
return result
@@ -267,13 +266,11 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
audio_state.transcribe_sleep_duration_max_s,
longer_sleep_dur)
- result = transcribe(audio_state, model, audio_state.frames, use_cpu,
- audio_state.tokens)
- if not result:
+ text = transcribe(audio_state, model, audio_state.frames, use_cpu)
+ if not text:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
continue
- text, tokens = result
if audio_state.drop_transcription:
audio_state.drop_transcription = False
@@ -287,7 +284,6 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
audio_state.text = string_matcher.matchStrings(audio_state.text,
text, window_size = 25)
- audio_state.tokens = tokens
now = time.time()
print("Transcription ({} seconds): {}".format(