Add UI for transcription loop delay

Allows users to directly modulate the performance-latency tradeoff. Also: * Bump up UI buffer to 1k lines. * Fix browser source reset. It now also resets preview text.
author: yum <yum.food.vr@gmail.com> 2023-09-10 00:28:03 -0700
committer: yum <yum.food.vr@gmail.com> 2023-09-10 00:29:04 -0700
commit: 4ee396584c348c11e0272f0c2842f6a5d3175586 (patch)
tree: 5f8854f1a2a67c5dd18bf7a7e29489172426b660 /Scripts
parent: 9924a141b0b1266671915be12e21df6c8f4c5366 (diff)
1 files changed, 16 insertions, 7 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index df333bc..f0e994f 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -367,7 +367,8 @@ class Segment:
             end_ts: float,
             wall_ts: float,
             avg_logprob: float,
-            no_speech_prob: float):
+            no_speech_prob: float,
+            compression_ratio: float):
         self.transcript = transcript
         # start_ts, end_ts are timestamps in seconds relative to `wall_ts`.
         self.start_ts = start_ts
@@ -377,6 +378,7 @@ class Segment:
         self.wall_ts = wall_ts
         self.avg_logprob = avg_logprob
         self.no_speech_prob = no_speech_prob
+        self.compression_ratio = compression_ratio
 
     def __str__(self):
         ts = f"(ts: {self.start_ts}-{self.end_ts}) "
@@ -438,11 +440,17 @@ class Whisper:
         for s in segments:
             # Manual touchup. I see a decent number of hallucinations sneaking
             # in with high `no_speech_prob` and modest `avg_logprob`.
-            if s.no_speech_prob > 0.8 and s.avg_logprob < -0.5:
+            if s.no_speech_prob > 0.6 and s.avg_logprob < -0.5:
+                continue
+            if cfg["enable_debug_mode"]:
+                print(f"s get: {s}")
+            if s.avg_logprob < -1.0:
+                continue
+            if s.compression_ratio > 2.4:
                 continue
             res.append(Segment(s.text, s.start, s.end,
                 self.collector.begin(),
-                s.avg_logprob, s.no_speech_prob))
+                s.avg_logprob, s.no_speech_prob, s.compression_ratio))
         return res
 
 class TranscriptCommit:
@@ -490,11 +498,12 @@ class VadCommitter:
             commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
 
             segments = self.whisper.transcribe(commit_audio)
-            for s in segments:
-                print(f"commit segment: {s}", file=sys.stderr)
             delta = ''.join(s.transcript for s in segments)
-            print(f"delta get: {delta}", file=sys.stderr)
             audio = self.collector.getAudio()
+            if cfg["enable_debug_mode"]:
+                for s in segments:
+                    print(f"commit segment: {s}", file=sys.stderr)
+                print(f"delta get: {delta}", file=sys.stderr)
 
             #ts = datetime.fromtimestamp(self.collector.now() - latency_s)
             #filename = str(ts.strftime('%Y_%m_%d__%H-%M-%S')) + ".wav"
@@ -665,7 +674,7 @@ def optimize(cfg,
 
 def transcriptionThread(ctrl: ThreadControl):
     while ctrl.run_app:
-        time.sleep(.005)
+        time.sleep(ctrl.cfg["transcription_loop_delay_ms"] / 1000.0);
 
         op = None
author	yum <yum.food.vr@gmail.com>	2023-09-10 00:28:03 -0700
committer	yum <yum.food.vr@gmail.com>	2023-09-10 00:29:04 -0700
commit	4ee396584c348c11e0272f0c2842f6a5d3175586 (patch)
tree	5f8854f1a2a67c5dd18bf7a7e29489172426b660 /Scripts
parent	9924a141b0b1266671915be12e21df6c8f4c5366 (diff)