From 68db1b25f8f29a487dab0953248e0effdff68567 Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Sat, 15 Oct 2022 18:49:51 -0700
Subject: Further improve transcribe.py responsiveness

Add a third heuristic. If the transcription is relatively long and the
first bit differs from the previous transcription, immediately
overwrite. Because the transcription is long, it's a bit less likely to
be a complete mistranscription.
---
 transcribe.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'transcribe.py')

diff --git a/transcribe.py b/transcribe.py
index 8369c43..73b2c92 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -4,6 +4,9 @@ import argparse
 import copy
 import os
 import osc_ctrl
+# python3 -m pip install pydub
+from pydub import AudioSegment as pydub_AudioSegment
+from pydub import effects as pydub_effects
 # python3 -m pip install pyaudio
 import pyaudio
 import sys
@@ -120,6 +123,12 @@ def saveAudio(audio_state, filename):
     wf.writeframes(b''.join(frames))
     wf.close()
 
+    # Normalize volume. This seems to make the neural net a little more
+    # consistent.
+    raw = pydub_AudioSegment.from_wav(filename)
+    normalized = pydub_effects.normalize(raw)
+    normalized.export(filename, format="wav")
+
     print("audio save")
 
 def resetAudio(audio_state):
@@ -153,8 +162,22 @@ def transcribeAudio(audio_state, model):
 
         audio_state.text_lock.acquire()
 
+        # We use a few heuristics to handle spurious mistranscriptions and to
+        # handle events where we trim off the start of the audio clip.
+        #   1. If we get 2 consecutive identical transcriptions, we commit to
+        #       the transcription. This reduces the number of
+        #       mistranscriptions by a lot.
+        #   2. If the last transcription is a prefix of the current one, we
+        #       immediately accept it, since the transcription is obviously
+        #       somewhat stable.
+        #   3. If the transcription is somewhat long and the
+        #       first few characters change, we assume this is due to a
+        #       trim event and immediately accept the transcription.
         if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate):
             audio_state.text = text
+        elif len(text) > 30 and text[0:10] != audio_state.text_candidate[0:10]:
+            audio_state.text = text
+
         audio_state.text_candidate = text
 
         audio_state.text_lock.release()
@@ -176,7 +199,7 @@ def sendAudio(audio_state):
         osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state)
 
         # Pace this out
-        time.sleep(0.05)
+        time.sleep(0.01)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-- 
cgit v1.2.3