diff options
| author | yum <yum.food.vr@gmail.com> | 2022-11-14 21:30:50 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-11-14 21:36:13 -0800 |
| commit | 2505a5cc486cd913db50a475e45c3701b9710282 (patch) | |
| tree | 86855b5772cc6400205926ed8d935227a574a7e6 | |
| parent | 9921697816c9f9473bac54444793f702e54d24a6 (diff) | |
Another transcription rework
After re-reading the paper, I noticed that they apply a couple of
optimizations I wasn't using. Use the top-level `whisper.transcribe`
method, which is a little slower but more accurate than the
`whisper.decode` call I was using.
Although this method is slower, its higher quality gives it better
temporal stability, which I think should make for an overall more
responsive UX: lower transcription quality means the paging layer has to
waste time updating earlier cells.
Also, drop the auto-commit stuff and go back to string stitching. I
think it's better to let the user manually commit. A rework of the hand
controls is probably coming soon.
Finally, update README.
| -rw-r--r-- | README.md | 3 | ||||
| -rw-r--r-- | transcribe.py | 51 |
2 files changed, 18 insertions, 36 deletions
@@ -157,6 +157,9 @@ To use the STT: 3. ~~Speech-to-text interface. Speak out loud, show in game.~~ DONE 4. Translation into non-English. Whisper natively supports translating N languages into English, but not the other way around. + 5. Display text in overlay. Enables (1) lower latency view of TaSTT's + transcription state; (2) checking transcriptions ahead of time; (3) + checking transcriptions without having to see the board in game. 4. Optimization 1. ~~Utilize the avatar 3.0 SDK's ability to drive parameters to reduce the total # of parameters (and therefore OSC messages & sync events). Note diff --git a/transcribe.py b/transcribe.py index c92825c..206dc22 100644 --- a/transcribe.py +++ b/transcribe.py @@ -13,6 +13,7 @@ from pydub import effects as pydub_effects # License: MIT. import pyaudio import steamvr +import string_matcher import sys import threading import time @@ -45,7 +46,6 @@ class AudioState: text = "" committed_text = "" - text_ts = datetime.now() frames = [] # Locks access to `text`, `frames`, and audio stored on disk. 
lock = threading.Lock() @@ -193,25 +193,21 @@ def transcribe(audio_state, model, filename): audio_state.lock.release() audio = whisper.pad_or_trim(audio) - mel = whisper.log_mel_spectrogram(audio).to(model.device) - #options = whisper.DecodingOptions(language = "en", - options = whisper.DecodingOptions(language = audio_state.language, - beam_size = 5) - result = whisper.decode(model, mel, options) - if result.no_speech_prob > 0.60: - print("no speech prob: {}".format(result.no_speech_prob)) - return None + result = whisper.transcribe(model, audio, language=audio_state.language) - if result.avg_logprob < -1.0: - print("avg logprob: {}".format(result.avg_logprob)) - return None + for segment in result["segments"]: + if segment["no_speech_prob"] > 0.60: + print("no speech prob: {}".format(segment["no_speech_prob"])) + return None + if segment["avg_logprob"] < -1.0: + print("avg logprob: {}".format(segment["avg_logprob"])) + return None + if segment["compression_ratio"] > 2.4: + print("compression ratio: {}".format(segment["compression_ratio"])) + return None - if result.compression_ratio > 2.4: - print("compression ratio: {}".format(result.compression_ratio)) - return None - - return result.text + return result["text"] def transcribeAudio(audio_state, model): while audio_state.run_app == True: @@ -246,24 +242,6 @@ def transcribeAudio(audio_state, model): audio_state.lock.release() continue - # Hack: transcriptions that remain the same for N seconds get - # committed. 
- now = datetime.now() - dt = now - audio_state.text_ts - dt_s = dt.seconds + float(dt.microseconds) / (1000 * 1000) - if dt_s >= 1 and text == audio_state.text: - print("Commit!") - old_commit = audio_state.committed_text - resetAudioLocked(audio_state) - audio_state.committed_text = old_commit + " " + text - audio_state.lock.release() - continue - else: - if text != audio_state.text: - audio_state.text_ts = now - print("text: {}".format(text)) - print("audio_state.text: {}".format(audio_state.text)) - words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() if len(words) > 0: @@ -277,7 +255,8 @@ def transcribeAudio(audio_state, model): #old_words = audio_state.text.split() #new_words = text.split() - audio_state.text = text + audio_state.text = string_matcher.matchStrings(audio_state.text, + text, window_size = 5) if old_text != audio_state.text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. |
