From 8d0add86f66db5324f8b965b832aea7cc1361498 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Sat, 24 Jun 2023 18:02:37 -0700
Subject: Rework transcription commit logic

At the core of the speech-to-text (STT) pipeline, there's a loop which
uses Whisper to convert audio into a transcript. As you say something,
Whisper sees growing fragments of your sentence:

t0: "Hell"
t1: "Hello"
t2: "Hello, world!"

So we need some algorithm which takes these fragments and
accumulates them into an ever-growing transcript.

Previously I did this with fuzzy string matching. I'd find the region
where the two transcripts overlap and merge them to produce a longer
transcript. The big problem is that if there's no overlap, it's not
clear whether Whisper radically changed its mind about what was said,
or whether the user paused for a long time before saying something
new. So I'd have to reset the growing transcript.

Now I get segment timestamps from Whisper and wait until it returns the
same leading segment (identical start time, end time, and text) for 3
consecutive transcriptions. Once the segment stabilizes like this, I
commit its text and drop the audio it covers. This enables a temporally
stable, ever-growing transcript that's also quite accurate.

To prevent a latency regression, I also introduce the notion of "preview
text", which is a preview of an utterance that has not yet stabilized.
These previews do not contribute to the ever-growing transcript, but do
get fed through the rest of the app, so they show up in-game / in OBS.
Once they eventually stabilize, they get committed to the ever-growing
transcript.

This change is lightly tested!
---
 Scripts/transcribe.py | 66 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 11 deletions(-)

(limited to 'Scripts/transcribe.py')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index fe06631..9711d15 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -4,6 +4,7 @@ from datetime import datetime
 from emotes_v2 import EmotesState
 from faster_whisper import WhisperModel
 from functools import partial
+from math import ceil
 from playsound import playsound
 from sentence_splitter import split_text_into_sentences
 
@@ -20,12 +21,12 @@ import os
 import osc_ctrl
 import pyaudio
 import steamvr
-import string_matcher
 import subprocess
 import sys
 import threading
 import time
 import transformers
+import typing
 import wave
 
 class AudioState:
@@ -48,8 +49,13 @@ class AudioState:
         # PyAudio stream object
         self.stream = None
 
+        self.committed_text = ""
         self.text = ""
         self.filtered_text = ""
+        # List of:
+        #   List of tuples of:
+        #     Segment start time, end time, and text
+        self.ranges_ls = []
         self.frames = []
 
         # Locks access to `text`.
@@ -189,6 +195,8 @@ def resetAudioLocked(audio_state):
             audio_state.transcribe_sleep_duration_min_s
 
     audio_state.text = ""
+    audio_state.preview_text = ""
+    audio_state.filtered_text = ""
 
 def resetDisplayLocked(audio_state):
     osc_ctrl.clear(audio_state.osc_state)
@@ -201,7 +209,10 @@ def resetAudio(audio_state):
     audio_state.transcribe_lock.release()
 
 # Transcribe the audio recorded in a file.
-def transcribe(audio_state, model, frames, use_cpu: bool):
+# Returns two strings: committed text, and preview text.
+# Committed text is temporally stable. Preview text is *not* temporally stable,
+# but is lower latency than committed text.
+def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,str]:
     start_time = time.time()
 
     frames = audio_state.frames
@@ -217,9 +228,41 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
             beam_size = 5,
             language = audio_state.language,
             vad_filter = True,
-            without_timestamps = True)
+            condition_on_previous_text = True,
+            without_timestamps = False)
+    ranges = []
+    for s in segments:
+        #print(f"Segment: {s}")
+        ranges.append((s.start, s.end, s.text))
+    audio_state.ranges_ls.append(ranges)
+
+    committed_text = ""
+    if True:
+        # Tuple of (start time, end time, transcript)
+        first_segments = []
+        for ranges in audio_state.ranges_ls:
+            for segment in ranges:
+                first_segments.append(segment)
+                break
+        if len(first_segments) >= 3:
+            c0 = first_segments[-3]
+            c1 = first_segments[-2]
+            c2 = first_segments[-1]
+            #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
+            if c0 == c1 and c1 == c2:
+                # For simplicity, completely reset saved audio ranges.
+                audio_state.ranges_ls = []
+                committed_text = c2[2]
+                n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
+                del audio_state.frames[0:n_frames_to_drop]
+
+    preview_text = ""
+    for seg in ranges:
+        if seg[2] == committed_text:
+            continue
+        preview_text += seg[2]
 
-    return "".join(s.text for s in segments)
+    return (committed_text, preview_text)
 
 def transcribeAudio(audio_state,
         model,
@@ -251,8 +294,8 @@ def transcribeAudio(audio_state,
                     audio_state.transcribe_sleep_duration_max_s,
                     longer_sleep_dur)
 
-        text = transcribe(audio_state, model, audio_state.frames, use_cpu)
-        if not text:
+        text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu)
+        if len(text) == 0 and len(preview_text) == 0:
             print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
@@ -260,23 +303,24 @@ def transcribeAudio(audio_state,
         if audio_state.drop_transcription:
             audio_state.drop_transcription = False
             audio_state.text = ""
+            audio_state.preview_text = ""
             audio_state.filtered_text = ""
             print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
 
         old_text = audio_state.text
-        audio_state.text = string_matcher.matchStrings(audio_state.text,
-                text, window_size = 25)
+        audio_state.text += text
+        audio_state.preview_text = audio_state.text + preview_text
 
         now = time.time()
         print("Transcription ({} seconds): {}".format(
             now - last_transcribe_time,
-            audio_state.text))
+            audio_state.preview_text))
         last_transcribe_time = now
 
         # Translate if requested.
-        translated = audio_state.text
+        translated = audio_state.preview_text
         if audio_state.language_target:
             whisper_lang = audio_state.whisper_language
             nllb_lang = lang_compat.whisper_to_nllb[whisper_lang]
@@ -314,7 +358,7 @@ def transcribeAudio(audio_state,
             filtered_text = filtered_text.lower()
         audio_state.filtered_text = filtered_text
 
-        if old_text != audio_state.text:
+        if old_text != audio_state.preview_text:
             # We think the user said something, so  reset the amount of
             # time we sleep between transcriptions to the minimum.
             audio_state.transcribe_no_change_count = 0
-- 
cgit v1.2.3