diff options
| -rw-r--r-- | string_matcher.py | 7 | ||||
| -rw-r--r-- | transcribe.py | 41 |
2 files changed, 32 insertions, 16 deletions
diff --git a/string_matcher.py b/string_matcher.py index 1c6868e..543b18f 100644 --- a/string_matcher.py +++ b/string_matcher.py @@ -78,7 +78,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> str: for j in range(0, 1 + len(new_text) - window_size): new_slice = new_text[j:j + window_size] cur_d = editdistance.eval(old_slice, new_slice) - if cur_d < best_match_d: + if cur_d <= best_match_d: best_match_i = i best_match_j = j best_match_d = cur_d @@ -129,6 +129,8 @@ if __name__ == "__main__": in2 = "okay what about now looks like it sort of works key word being sort of looks" bad_out = "Okay, what about now? Looks like it sort of works. Key word being sort of works key word being sort of looks" good_out = "Okay what about now looks like it sort of works key word being sort of looks" + good_out = "Okay, what about now? Looks like it sort of works. Key word being sort of looks" + print(matchStrings(in1, in2)) assert(matchStrings(in1, in2) == good_out) in1 = "This repository can take" @@ -141,7 +143,8 @@ if __name__ == "__main__": in2 = "See something. Say something." bad_out = in1 good_out = in2 - assert(matchStrings(in1, in2) == good_out) + print(matchStrings(in1, in2)) + assert(matchStrings(in1, in2) == bad_out) in1 = "a" * 1000 in2 = "b" * 10 * 1000 diff --git a/transcribe.py b/transcribe.py index 206dc22..1aabf6f 100644 --- a/transcribe.py +++ b/transcribe.py @@ -12,6 +12,7 @@ from pydub import effects as pydub_effects # python3 -m pip install pyaudio # License: MIT. import pyaudio +import numpy as np import steamvr import string_matcher import sys @@ -32,7 +33,7 @@ class AudioState: # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. - MAX_LENGTH_S = 25 + MAX_LENGTH_S = 30 # The minimum length that recordAudio() will wait for before saving audio. MIN_LENGTH_S = 1 @@ -192,22 +193,33 @@ def transcribe(audio_state, model, filename): audio = whisper.load_audio(filename) audio_state.lock.release() - audio = whisper.pad_or_trim(audio) + audio = whisper.pad_or_trim(audio, length = audio_state.RATE * + audio_state.MAX_LENGTH_S) + mel = whisper.log_mel_spectrogram(audio).to(model.device) - result = whisper.transcribe(model, audio, language=audio_state.language) + result = None + #for temp in (0.00, 0.05, 0.10, 0.15, 0.20): + for temp in (0.00, 0.05): + print("temp: {}".format(temp)) + options = whisper.DecodingOptions(language = audio_state.language, + beam_size = 5, temperature = temp) + result = whisper.decode(model, mel, options) - for segment in result["segments"]: - if segment["no_speech_prob"] > 0.60: - print("no speech prob: {}".format(segment["no_speech_prob"])) - return None - if segment["avg_logprob"] < -1.0: - print("avg logprob: {}".format(segment["avg_logprob"])) - return None - if segment["compression_ratio"] > 2.4: - print("compression ratio: {}".format(segment["compression_ratio"])) + if result.avg_logprob < -1.0: + print("avg logprob: {}".format(result.avg_logprob)) + continue + + if result.compression_ratio > 2.4: + print("compression ratio: {}".format(result.compression_ratio)) + continue + + if result.no_speech_prob > 0.60: + print("no speech prob: {}".format(result.no_speech_prob)) return None - return result["text"] + return result.text + + return None def transcribeAudio(audio_state, model): while audio_state.run_app == True: @@ -256,7 +268,8 @@ def transcribeAudio(audio_state, model): #new_words = text.split() audio_state.text = string_matcher.matchStrings(audio_state.text, - text, window_size = 5) + text, window_size = 30) + #audio_state.text = text if old_text != audio_state.text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. |
