2 files changed, 32 insertions, 16 deletions
diff --git a/string_matcher.py b/string_matcher.py
index 1c6868e..543b18f 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -78,7 +78,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
             for j in range(0, 1 + len(new_text) - window_size):
                 new_slice = new_text[j:j + window_size]
                 cur_d = editdistance.eval(old_slice, new_slice)
-                if cur_d < best_match_d:
+                if cur_d <= best_match_d:
                     best_match_i = i
                     best_match_j = j
                     best_match_d = cur_d
@@ -129,6 +129,8 @@ if __name__ == "__main__":
     in2 = "okay what about now looks like it sort of works key word being sort of looks"
     bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
     good_out = "Okay what about now looks like it sort of works key word being sort of looks"
+    good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
+    print(matchStrings(in1, in2))
     assert(matchStrings(in1, in2) == good_out)
 
     in1 = "This repository can take"
@@ -141,7 +143,8 @@ if __name__ == "__main__":
     in2 = "See something. Say something."
     bad_out  = in1
     good_out = in2
-    assert(matchStrings(in1, in2) == good_out)
+    print(matchStrings(in1, in2))
+    assert(matchStrings(in1, in2) == bad_out)
 
     in1 = "a" * 1000
     in2 = "b" * 10 * 1000
diff --git a/transcribe.py b/transcribe.py
index 206dc22..1aabf6f 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -12,6 +12,7 @@ from pydub import effects as pydub_effects
 # python3 -m pip install pyaudio
 # License: MIT.
 import pyaudio
+import numpy as np
 import steamvr
 import string_matcher
 import sys
@@ -32,7 +33,7 @@ class AudioState:
 
     # The maximum length that recordAudio() will put into frames before it
     # starts dropping from the start.
-    MAX_LENGTH_S = 25
+    MAX_LENGTH_S = 30
     # The minimum length that recordAudio() will wait for before saving audio.
     MIN_LENGTH_S = 1
 
@@ -192,22 +193,33 @@ def transcribe(audio_state, model, filename):
     audio = whisper.load_audio(filename)
     audio_state.lock.release()
 
-    audio = whisper.pad_or_trim(audio)
+    audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
+            audio_state.MAX_LENGTH_S)
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
-    result = whisper.transcribe(model, audio, language=audio_state.language)
+    result = None
+    #for temp in (0.00, 0.05, 0.10, 0.15, 0.20):
+    for temp in (0.00, 0.05):
+        print("temp: {}".format(temp))
+        options = whisper.DecodingOptions(language = audio_state.language,
+                beam_size = 5, temperature = temp)
+        result = whisper.decode(model, mel, options)
 
-    for segment in result["segments"]:
-        if segment["no_speech_prob"] > 0.60:
-            print("no speech prob: {}".format(segment["no_speech_prob"]))
-            return None
-        if segment["avg_logprob"] < -1.0:
-            print("avg logprob: {}".format(segment["avg_logprob"]))
-            return None
-        if segment["compression_ratio"] > 2.4:
-            print("compression ratio: {}".format(segment["compression_ratio"]))
+        if result.avg_logprob < -1.0:
+            print("avg logprob: {}".format(result.avg_logprob))
+            continue
+
+        if result.compression_ratio > 2.4:
+            print("compression ratio: {}".format(result.compression_ratio))
+            continue
+
+        if result.no_speech_prob > 0.60:
+            print("no speech prob: {}".format(result.no_speech_prob))
             return None
 
-    return result["text"]
+        return result.text
+
+    return None
 
 def transcribeAudio(audio_state, model):
     while audio_state.run_app == True:
@@ -256,7 +268,8 @@ def transcribeAudio(audio_state, model):
         #new_words = text.split()
 
         audio_state.text = string_matcher.matchStrings(audio_state.text,
-                text, window_size = 5)
+                text, window_size = 30)
+        #audio_state.text = text
         if old_text != audio_state.text:
             # We think the user said something, so  reset the amount of
             # time we sleep between transcriptions to the minimum.