summary | refs | log | tree | commit | diff | stats
path: root/Scripts
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2023-02-19 14:15:30 -0800
committer: yum <yum.food.vr@gmail.com> 2023-02-22 21:49:22 -0800
commit: cece1ee8f1b985c2a89adb661dd02c6d44787f67 (patch)
tree: 51d2c3af43d567c385180f988b39d3cb00644b19 /Scripts
parent: 42f8275d84ad431c19b7c22ac702e58ac95d0680 (diff)
Apply previous window conditioning to decoding layer
Per the Whisper source code, this should result in better temporal stability.
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/transcribe.py 14
1 file changed, 9 insertions, 5 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index c4d7682..b6c4863 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -48,6 +48,7 @@ class AudioState:
self.stream = None
self.text = ""
+ self.tokens = []
self.committed_text = ""
self.frames = []
@@ -197,7 +198,7 @@ def resetAudio(audio_state):
audio_state.transcribe_lock.release()
# Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, frames, use_cpu: bool):
+def transcribe(audio_state, model, frames, use_cpu: bool, prev_tokens = []):
start_time = time.time()
frames = audio_state.frames
@@ -220,7 +221,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
use_gpu = not use_cpu
options = whisper.DecodingOptions(language = audio_state.language,
beam_size = 5, temperature = temp, without_timestamps = True,
- fp16 = use_gpu)
+ fp16 = use_gpu, prompt = prev_tokens)
result = whisper.decode(model, mel, options)
if result.avg_logprob < -1.0:
@@ -238,7 +239,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
result = None
continue
- result = result.text
+ result = (result.text, result.tokens)
break
return result
@@ -266,11 +267,13 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
audio_state.transcribe_sleep_duration_max_s,
longer_sleep_dur)
- text = transcribe(audio_state, model, audio_state.frames, use_cpu)
- if not text:
+ result = transcribe(audio_state, model, audio_state.frames, use_cpu,
+ audio_state.tokens)
+ if not result:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
continue
+ text, tokens = result
if audio_state.drop_transcription:
audio_state.drop_transcription = False
@@ -284,6 +287,7 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
audio_state.text = string_matcher.matchStrings(audio_state.text,
text, window_size = 25)
+ audio_state.tokens = tokens
now = time.time()
print("Transcription ({} seconds): {}".format(