summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2022-11-14 21:30:50 -0800
committer: yum <yum.food.vr@gmail.com> 2022-11-14 21:36:13 -0800
commit: 2505a5cc486cd913db50a475e45c3701b9710282 (patch)
tree: 86855b5772cc6400205926ed8d935227a574a7e6
parent: 9921697816c9f9473bac54444793f702e54d24a6 (diff)
Another transcription rework
After re-reading the paper, I noticed that they apply a couple of optimizations I wasn't using. Switch to the top-level `whisper.transcribe` method, which is a little slower but more accurate than the `whisper.decode` path I was using. Although this method is slower, its higher quality gives better temporal stability, which I think should make for a more responsive UX overall: lower transcription quality means the paging layer has to waste time updating earlier cells. Also, drop the auto-commit logic and go back to string stitching; I think it's better to let the user commit manually. A rework of the hand controls is probably coming soon. Finally, update the README.
-rw-r--r-- README.md 3
-rw-r--r-- transcribe.py 51
2 files changed, 18 insertions, 36 deletions
diff --git a/README.md b/README.md
index a5e3ff8..885e424 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,9 @@ To use the STT:
3. ~~Speech-to-text interface. Speak out loud, show in game.~~ DONE
4. Translation into non-English. Whisper natively supports translating N
languages into English, but not the other way around.
+ 5. Display text in overlay. Enables (1) lower latency view of TaSTT's
+ transcription state; (2) checking transcriptions ahead of time; (3)
+ checking transcriptions without having to see the board in game.
4. Optimization
1. ~~Utilize the avatar 3.0 SDK's ability to drive parameters to reduce the
total # of parameters (and therefore OSC messages & sync events). Note
diff --git a/transcribe.py b/transcribe.py
index c92825c..206dc22 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -13,6 +13,7 @@ from pydub import effects as pydub_effects
# License: MIT.
import pyaudio
import steamvr
+import string_matcher
import sys
import threading
import time
@@ -45,7 +46,6 @@ class AudioState:
text = ""
committed_text = ""
- text_ts = datetime.now()
frames = []
# Locks access to `text`, `frames`, and audio stored on disk.
lock = threading.Lock()
@@ -193,25 +193,21 @@ def transcribe(audio_state, model, filename):
audio_state.lock.release()
audio = whisper.pad_or_trim(audio)
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
- #options = whisper.DecodingOptions(language = "en",
- options = whisper.DecodingOptions(language = audio_state.language,
- beam_size = 5)
- result = whisper.decode(model, mel, options)
- if result.no_speech_prob > 0.60:
- print("no speech prob: {}".format(result.no_speech_prob))
- return None
+ result = whisper.transcribe(model, audio, language=audio_state.language)
- if result.avg_logprob < -1.0:
- print("avg logprob: {}".format(result.avg_logprob))
- return None
+ for segment in result["segments"]:
+ if segment["no_speech_prob"] > 0.60:
+ print("no speech prob: {}".format(segment["no_speech_prob"]))
+ return None
+ if segment["avg_logprob"] < -1.0:
+ print("avg logprob: {}".format(segment["avg_logprob"]))
+ return None
+ if segment["compression_ratio"] > 2.4:
+ print("compression ratio: {}".format(segment["compression_ratio"]))
+ return None
- if result.compression_ratio > 2.4:
- print("compression ratio: {}".format(result.compression_ratio))
- return None
-
- return result.text
+ return result["text"]
def transcribeAudio(audio_state, model):
while audio_state.run_app == True:
@@ -246,24 +242,6 @@ def transcribeAudio(audio_state, model):
audio_state.lock.release()
continue
- # Hack: transcriptions that remain the same for N seconds get
- # committed.
- now = datetime.now()
- dt = now - audio_state.text_ts
- dt_s = dt.seconds + float(dt.microseconds) / (1000 * 1000)
- if dt_s >= 1 and text == audio_state.text:
- print("Commit!")
- old_commit = audio_state.committed_text
- resetAudioLocked(audio_state)
- audio_state.committed_text = old_commit + " " + text
- audio_state.lock.release()
- continue
- else:
- if text != audio_state.text:
- audio_state.text_ts = now
- print("text: {}".format(text))
- print("audio_state.text: {}".format(audio_state.text))
-
words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
if len(words) > 0:
@@ -277,7 +255,8 @@ def transcribeAudio(audio_state, model):
#old_words = audio_state.text.split()
#new_words = text.split()
- audio_state.text = text
+ audio_state.text = string_matcher.matchStrings(audio_state.text,
+ text, window_size = 5)
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.