Add browser source, hardcoded to port 8097

Transcription output now streams to localhost:8097. In OBS: * Create a browser source. * url: localhost:8097 * width: 2200 * height: 400 TODO: * Put behind toggle. * Create input field for port. Misc cleanup: * transcribe.py: Drop frames from audio capture thread instead of the transcription thread. Doing it the other way would result in occasional data loss.
author: yum <yum.food.vr@gmail.com> 2023-06-26 00:58:58 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-26 01:46:42 -0700
commit: 0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch)
tree: 057ea9d58991ab71cd4f6eb4dfb33693f8914e03 /Scripts
parent: e35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff)
1 files changed, 14 insertions, 6 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 9711d15..51d7e0a 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -57,6 +57,7 @@ class AudioState:
         #     Segment start time, end time, and text
         self.ranges_ls = []
         self.frames = []
+        self.drop_frames_till_i = -1
 
         # Locks access to `text`.
         self.transcribe_lock = threading.Lock()
@@ -131,6 +132,9 @@ def onAudioFramesAvailable(
             audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
         audio_state.frames = audio_state.frames[-1 * max_frames:]
+    if audio_state.drop_frames_till_i > 0:
+        audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1]
+        audio_state.drop_frames_till_i = -1
 
 
     return (frames, pyaudio.paContinue)
@@ -227,6 +231,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             audio,
             beam_size = 5,
             language = audio_state.language,
+            temperature = [0.0],
             vad_filter = True,
             condition_on_previous_text = True,
             without_timestamps = False)
@@ -244,17 +249,20 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             for segment in ranges:
                 first_segments.append(segment)
                 break
-        if len(first_segments) >= 3:
-            c0 = first_segments[-3]
+        if len(first_segments) >= 5:
+            # Hack: require convergence across many frames to give the
+            # algorithm a longer buffer to work with.
+            c0 = first_segments[-1]
             c1 = first_segments[-2]
-            c2 = first_segments[-1]
+            c2 = first_segments[-3]
+            c3 = first_segments[-4]
             #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
-            if c0 == c1 and c1 == c2:
+            if c0 == c1 and c1 == c2 and c2 == c3:
                 # For simplicity, completely reset saved audio ranges.
                 audio_state.ranges_ls = []
-                committed_text = c2[2]
+                committed_text = c0[2]
                 n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
-                del audio_state.frames[0:n_frames_to_drop]
+                audio_state.drop_frames_till_i = n_frames_to_drop
 
     preview_text = ""
     for seg in ranges:
author	yum <yum.food.vr@gmail.com>	2023-06-26 00:58:58 -0700
committer	yum <yum.food.vr@gmail.com>	2023-06-26 01:46:42 -0700
commit	0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch)
tree	057ea9d58991ab71cd4f6eb4dfb33693f8914e03 /Scripts
parent	e35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff)