summaryrefslogtreecommitdiffstats
path: root/Scripts/transcribe.py
diff options
context:
space:
mode:
Diffstat (limited to 'Scripts/transcribe.py')
-rw-r--r--Scripts/transcribe.py66
1 files changed, 55 insertions, 11 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index fe06631..9711d15 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -4,6 +4,7 @@ from datetime import datetime
from emotes_v2 import EmotesState
from faster_whisper import WhisperModel
from functools import partial
+from math import ceil
from playsound import playsound
from sentence_splitter import split_text_into_sentences
@@ -20,12 +21,12 @@ import os
import osc_ctrl
import pyaudio
import steamvr
-import string_matcher
import subprocess
import sys
import threading
import time
import transformers
+import typing
import wave
class AudioState:
@@ -48,8 +49,13 @@ class AudioState:
# PyAudio stream object
self.stream = None
+ self.committed_text = ""
self.text = ""
self.filtered_text = ""
+ # List of:
+ # List of tuples of:
+ # Segment start time, end time, and text
+ self.ranges_ls = []
self.frames = []
# Locks access to `text`.
@@ -189,6 +195,8 @@ def resetAudioLocked(audio_state):
audio_state.transcribe_sleep_duration_min_s
audio_state.text = ""
+ audio_state.preview_text = ""
+ audio_state.filtered_text = ""
def resetDisplayLocked(audio_state):
osc_ctrl.clear(audio_state.osc_state)
@@ -201,7 +209,10 @@ def resetAudio(audio_state):
audio_state.transcribe_lock.release()
# Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, frames, use_cpu: bool):
+# Returns two strings: committed text, and preview text.
+# Committed text is temporally stable. Preview text is *not* temporally stable,
+# but is lower latency than committed text.
+def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,str]:
start_time = time.time()
frames = audio_state.frames
@@ -217,9 +228,41 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
beam_size = 5,
language = audio_state.language,
vad_filter = True,
- without_timestamps = True)
+ condition_on_previous_text = True,
+ without_timestamps = False)
+ ranges = []
+ for s in segments:
+ #print(f"Segment: {s}")
+ ranges.append((s.start, s.end, s.text))
+ audio_state.ranges_ls.append(ranges)
+
+ committed_text = ""
+ if True:
+ # Tuple of (start time, end time, transcript)
+ first_segments = []
+ for ranges in audio_state.ranges_ls:
+ for segment in ranges:
+ first_segments.append(segment)
+ break
+ if len(first_segments) >= 3:
+ c0 = first_segments[-3]
+ c1 = first_segments[-2]
+ c2 = first_segments[-1]
+ #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
+ if c0 == c1 and c1 == c2:
+ # For simplicity, completely reset saved audio ranges.
+ audio_state.ranges_ls = []
+ committed_text = c2[2]
+ n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
+ del audio_state.frames[0:n_frames_to_drop]
+
+ preview_text = ""
+ for seg in ranges:
+ if seg[2] == committed_text:
+ continue
+ preview_text += seg[2]
- return "".join(s.text for s in segments)
+ return (committed_text, preview_text)
def transcribeAudio(audio_state,
model,
@@ -251,8 +294,8 @@ def transcribeAudio(audio_state,
audio_state.transcribe_sleep_duration_max_s,
longer_sleep_dur)
- text = transcribe(audio_state, model, audio_state.frames, use_cpu)
- if not text:
+ text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu)
+ if len(text) == 0 and len(preview_text) == 0:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
continue
@@ -260,23 +303,24 @@ def transcribeAudio(audio_state,
if audio_state.drop_transcription:
audio_state.drop_transcription = False
audio_state.text = ""
+ audio_state.preview_text = ""
audio_state.filtered_text = ""
print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
continue
old_text = audio_state.text
- audio_state.text = string_matcher.matchStrings(audio_state.text,
- text, window_size = 25)
+ audio_state.text += text
+ audio_state.preview_text = audio_state.text + preview_text
now = time.time()
print("Transcription ({} seconds): {}".format(
now - last_transcribe_time,
- audio_state.text))
+ audio_state.preview_text))
last_transcribe_time = now
# Translate if requested.
- translated = audio_state.text
+ translated = audio_state.preview_text
if audio_state.language_target:
whisper_lang = audio_state.whisper_language
nllb_lang = lang_compat.whisper_to_nllb[whisper_lang]
@@ -314,7 +358,7 @@ def transcribeAudio(audio_state,
filtered_text = filtered_text.lower()
audio_state.filtered_text = filtered_text
- if old_text != audio_state.text:
+ if old_text != audio_state.preview_text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.
audio_state.transcribe_no_change_count = 0