From 7146acb9d4ad751fc5ced411a2990d0aad17d08f Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Sun, 6 Nov 2022 12:50:38 -0800
Subject: String matching no longer relies on spaces

Add a `matchStrings` function which does basically the same thing as
`matchStringList`, except that it doesn't split the input at space
boundaries. I think this should work better for Japanese and Chinese,
since they don't separate words with spaces.

Doesn't seem to cause any accuracy regressions for English.

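Also update the README, read the decoding language from a new
`AudioState.language` attribute (set to Japanese) instead of hard-coding
English, and remove the `text_candidate` field.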
---
 transcribe.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'transcribe.py')

diff --git a/transcribe.py b/transcribe.py
index 5d2897c..4014dc8 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -43,9 +43,6 @@ class AudioState:
     frames_lock = threading.Lock()
 
     text = ""
-    # To improve temporal stability, we require two consecutive identical
-    # transcriptions before "committing" to a transcription.
-    text_candidate = ""
     text_lock = threading.Lock()
 
     record_audio = True
@@ -56,6 +53,9 @@ class AudioState:
     transcribe_sleep_duration_max_s = 1.50
     transcribe_no_change_count = 0
     transcribe_sleep_duration = transcribe_sleep_duration_min_s
+    # The language the user is speaking in.
+    language = whisper.tokenizer.TO_LANGUAGE_CODE["japanese"]
+
     # When the user says `over`, we stop displaying new transcriptions until
     # they clear the board again.
     display_paused = False
@@ -162,7 +162,6 @@ def resetAudioLocked(audio_state):
     resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME)
 
     audio_state.text = ""
-    audio_state.text_candidate = ""
     osc_ctrl.clear(audio_state.osc_client)
 
 def resetAudio(audio_state):
@@ -171,7 +170,7 @@ def resetAudio(audio_state):
     audio_state.frames_lock.release()
 
 # Transcribe the audio recorded in a file.
-def transcribe(model, filename):
+def transcribe(audio_state, model, filename):
 
     audio_state.frames_lock.acquire()
     audio = whisper.load_audio(filename)
@@ -179,7 +178,8 @@ def transcribe(model, filename):
 
     audio = whisper.pad_or_trim(audio)
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-    options = whisper.DecodingOptions(language = "en",
+    #options = whisper.DecodingOptions(language = "en",
+    options = whisper.DecodingOptions(language = audio_state.language,
             beam_size = 5)
     result = whisper.decode(model, mel, options)
 
@@ -220,7 +220,7 @@ def transcribeAudio(audio_state, model):
             time.sleep(0.1)
             continue
 
-        text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME)
+        text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME)
         if not text:
             continue
 
@@ -241,18 +241,17 @@ def transcribeAudio(audio_state, model):
         print("Transcription: {}".format(audio_state.text))
 
         old_text = audio_state.text
-        old_words = audio_state.text.split()
-        new_words = text.split()
+        #old_words = audio_state.text.split()
+        #new_words = text.split()
 
-        audio_state.text = string_matcher.matchStringList(old_words, new_words)
+        audio_state.text = string_matcher.matchStrings(audio_state.text,
+                text, window_size = 5)
         if old_text != audio_state.text:
             # We think the user said something, so  reset the amount of
             # time we sleep between transcriptions to the minimum.
             audio_state.transcribe_no_change_count = 0
             audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
-        audio_state.text_candidate = text
-
         audio_state.text_lock.release()
 
 def sendAudio(audio_state):
-- 
cgit v1.2.3