diff options
Diffstat (limited to 'Scripts/transcribe.py')
| -rw-r--r-- | Scripts/transcribe.py | 66 |
1 files changed, 55 insertions, 11 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index fe06631..9711d15 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -4,6 +4,7 @@ from datetime import datetime from emotes_v2 import EmotesState from faster_whisper import WhisperModel from functools import partial +from math import ceil from playsound import playsound from sentence_splitter import split_text_into_sentences @@ -20,12 +21,12 @@ import os import osc_ctrl import pyaudio import steamvr -import string_matcher import subprocess import sys import threading import time import transformers +import typing import wave class AudioState: @@ -48,8 +49,13 @@ class AudioState: # PyAudio stream object self.stream = None + self.committed_text = "" self.text = "" self.filtered_text = "" + # List of: + # List of tuples of: + # Segment start time, end time, and text + self.ranges_ls = [] self.frames = [] # Locks access to `text`. @@ -189,6 +195,8 @@ def resetAudioLocked(audio_state): audio_state.transcribe_sleep_duration_min_s audio_state.text = "" + audio_state.preview_text = "" + audio_state.filtered_text = "" def resetDisplayLocked(audio_state): osc_ctrl.clear(audio_state.osc_state) @@ -201,7 +209,10 @@ def resetAudio(audio_state): audio_state.transcribe_lock.release() # Transcribe the audio recorded in a file. -def transcribe(audio_state, model, frames, use_cpu: bool): +# Returns two strings: committed text, and preview text. +# Committed text is temporally stable. Preview text is *not* temporally stable, +# but is lower latency than committed text. +def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,str]: start_time = time.time() frames = audio_state.frames @@ -217,9 +228,41 @@ def transcribe(audio_state, model, frames, use_cpu: bool): beam_size = 5, language = audio_state.language, vad_filter = True, - without_timestamps = True) + condition_on_previous_text = True, + without_timestamps = False) + ranges = [] + for s in segments: + #print(f"Segment: {s}") + ranges.append((s.start, s.end, s.text)) + audio_state.ranges_ls.append(ranges) + + committed_text = "" + if True: + # Tuple of (start time, end time, transcript) + first_segments = [] + for ranges in audio_state.ranges_ls: + for segment in ranges: + first_segments.append(segment) + break + if len(first_segments) >= 3: + c0 = first_segments[-3] + c1 = first_segments[-2] + c2 = first_segments[-1] + #print(f"c0: {c0}, c1: {c1}, c2: {c2}") + if c0 == c1 and c1 == c2: + # For simplicity, completely reset saved audio ranges. + audio_state.ranges_ls = [] + committed_text = c2[2] + n_frames_to_drop = int(ceil(audio_state.RATE * c0[1])) + del audio_state.frames[0:n_frames_to_drop] + + preview_text = "" + for seg in ranges: + if seg[2] == committed_text: + continue + preview_text += seg[2] - return "".join(s.text for s in segments) + return (committed_text, preview_text) def transcribeAudio(audio_state, model, @@ -251,8 +294,8 @@ def transcribeAudio(audio_state, audio_state.transcribe_sleep_duration_max_s, longer_sleep_dur) - text = transcribe(audio_state, model, audio_state.frames, use_cpu) - if not text: + text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu) + if len(text) == 0 and len(preview_text) == 0: print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() continue @@ -260,23 +303,24 @@ def transcribeAudio(audio_state, if audio_state.drop_transcription: audio_state.drop_transcription = False audio_state.text = "" + audio_state.preview_text = "" audio_state.filtered_text = "" print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time)) last_transcribe_time = time.time() continue old_text = audio_state.text - audio_state.text = string_matcher.matchStrings(audio_state.text, - text, window_size = 25) + audio_state.text += text + audio_state.preview_text = audio_state.text + preview_text now = time.time() print("Transcription ({} seconds): {}".format( now - last_transcribe_time, - audio_state.text)) + audio_state.preview_text)) last_transcribe_time = now # Translate if requested. - translated = audio_state.text + translated = audio_state.preview_text if audio_state.language_target: whisper_lang = audio_state.whisper_language nllb_lang = lang_compat.whisper_to_nllb[whisper_lang] @@ -314,7 +358,7 @@ def transcribeAudio(audio_state, filtered_text = filtered_text.lower() audio_state.filtered_text = filtered_text - if old_text != audio_state.text: + if old_text != audio_state.preview_text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. audio_state.transcribe_no_change_count = 0 |
