summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- string_matcher.py 7
-rw-r--r-- transcribe.py 41
2 files changed, 32 insertions, 16 deletions
diff --git a/string_matcher.py b/string_matcher.py
index 1c6868e..543b18f 100644
--- a/string_matcher.py
+++ b/string_matcher.py
@@ -78,7 +78,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str:
for j in range(0, 1 + len(new_text) - window_size):
new_slice = new_text[j:j + window_size]
cur_d = editdistance.eval(old_slice, new_slice)
- if cur_d < best_match_d:
+ if cur_d <= best_match_d:
best_match_i = i
best_match_j = j
best_match_d = cur_d
@@ -129,6 +129,8 @@ if __name__ == "__main__":
in2 = "okay what about now looks like it sort of works key word being sort of looks"
bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks"
good_out = "Okay what about now looks like it sort of works key word being sort of looks"
+ good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks"
+ print(matchStrings(in1, in2))
assert(matchStrings(in1, in2) == good_out)
in1 = "This repository can take"
@@ -141,7 +143,8 @@ if __name__ == "__main__":
in2 = "See something. Say something."
bad_out = in1
good_out = in2
- assert(matchStrings(in1, in2) == good_out)
+ print(matchStrings(in1, in2))
+ assert(matchStrings(in1, in2) == bad_out)
in1 = "a" * 1000
in2 = "b" * 10 * 1000
diff --git a/transcribe.py b/transcribe.py
index 206dc22..1aabf6f 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -12,6 +12,7 @@ from pydub import effects as pydub_effects
# python3 -m pip install pyaudio
# License: MIT.
import pyaudio
+import numpy as np
import steamvr
import string_matcher
import sys
@@ -32,7 +33,7 @@ class AudioState:
# The maximum length that recordAudio() will put into frames before it
# starts dropping from the start.
- MAX_LENGTH_S = 25
+ MAX_LENGTH_S = 30
# The minimum length that recordAudio() will wait for before saving audio.
MIN_LENGTH_S = 1
@@ -192,22 +193,33 @@ def transcribe(audio_state, model, filename):
audio = whisper.load_audio(filename)
audio_state.lock.release()
- audio = whisper.pad_or_trim(audio)
+ audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
+ audio_state.MAX_LENGTH_S)
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
- result = whisper.transcribe(model, audio, language=audio_state.language)
+ result = None
+ #for temp in (0.00, 0.05, 0.10, 0.15, 0.20):
+ for temp in (0.00, 0.05):
+ print("temp: {}".format(temp))
+ options = whisper.DecodingOptions(language = audio_state.language,
+ beam_size = 5, temperature = temp)
+ result = whisper.decode(model, mel, options)
- for segment in result["segments"]:
- if segment["no_speech_prob"] > 0.60:
- print("no speech prob: {}".format(segment["no_speech_prob"]))
- return None
- if segment["avg_logprob"] < -1.0:
- print("avg logprob: {}".format(segment["avg_logprob"]))
- return None
- if segment["compression_ratio"] > 2.4:
- print("compression ratio: {}".format(segment["compression_ratio"]))
+ if result.avg_logprob < -1.0:
+ print("avg logprob: {}".format(result.avg_logprob))
+ continue
+
+ if result.compression_ratio > 2.4:
+ print("compression ratio: {}".format(result.compression_ratio))
+ continue
+
+ if result.no_speech_prob > 0.60:
+ print("no speech prob: {}".format(result.no_speech_prob))
return None
- return result["text"]
+ return result.text
+
+ return None
def transcribeAudio(audio_state, model):
while audio_state.run_app == True:
@@ -256,7 +268,8 @@ def transcribeAudio(audio_state, model):
#new_words = text.split()
audio_state.text = string_matcher.matchStrings(audio_state.text,
- text, window_size = 5)
+ text, window_size = 30)
+ #audio_state.text = text
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.