diff options
| author | yum <yum.food.vr@gmail.com> | 2022-11-06 12:50:38 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-11-06 12:50:38 -0800 |
| commit | 7146acb9d4ad751fc5ced411a2990d0aad17d08f (patch) | |
| tree | 30d5f9f9a7f47bc4272fa9e9fff5c0226c376686 /transcribe.py | |
| parent | 3a123fb5cabdbdef4f1b98031ec90c42e1d6e911 (diff) | |
String matching no longer relies on spaces
Add a `matchStrings` function, which does basically the same thing as
`matchStringList` except that it doesn't split the input at space
boundaries. I think this should work better for Japanese and Chinese,
since they don't separate words with spaces.
This doesn't seem to cause any accuracy regressions for English.
Also update the README.
Diffstat (limited to 'transcribe.py')
| -rw-r--r-- | transcribe.py | 23 |
1 files changed, 11 insertions, 12 deletions
diff --git a/transcribe.py b/transcribe.py index 5d2897c..4014dc8 100644 --- a/transcribe.py +++ b/transcribe.py @@ -43,9 +43,6 @@ class AudioState: frames_lock = threading.Lock() text = "" - # To improve temporal stability, we require two consecutive identical - # transcriptions before "committing" to a transcription. - text_candidate = "" text_lock = threading.Lock() record_audio = True @@ -56,6 +53,9 @@ class AudioState: transcribe_sleep_duration_max_s = 1.50 transcribe_no_change_count = 0 transcribe_sleep_duration = transcribe_sleep_duration_min_s + # The language the user is speaking in. + language = whisper.tokenizer.TO_LANGUAGE_CODE["japanese"] + # When the user says `over`, we stop displaying new transcriptions until # they clear the board again. display_paused = False @@ -162,7 +162,6 @@ def resetAudioLocked(audio_state): resetDiskAudioLocked(audio_state, audio_state.VOICE_AUDIO_FILENAME) audio_state.text = "" - audio_state.text_candidate = "" osc_ctrl.clear(audio_state.osc_client) def resetAudio(audio_state): @@ -171,7 +170,7 @@ def resetAudio(audio_state): audio_state.frames_lock.release() # Transcribe the audio recorded in a file. 
-def transcribe(model, filename): +def transcribe(audio_state, model, filename): audio_state.frames_lock.acquire() audio = whisper.load_audio(filename) @@ -179,7 +178,8 @@ def transcribe(model, filename): audio = whisper.pad_or_trim(audio) mel = whisper.log_mel_spectrogram(audio).to(model.device) - options = whisper.DecodingOptions(language = "en", + #options = whisper.DecodingOptions(language = "en", + options = whisper.DecodingOptions(language = audio_state.language, beam_size = 5) result = whisper.decode(model, mel, options) @@ -220,7 +220,7 @@ def transcribeAudio(audio_state, model): time.sleep(0.1) continue - text = transcribe(model, audio_state.VOICE_AUDIO_FILENAME) + text = transcribe(audio_state, model, audio_state.VOICE_AUDIO_FILENAME) if not text: continue @@ -241,18 +241,17 @@ def transcribeAudio(audio_state, model): print("Transcription: {}".format(audio_state.text)) old_text = audio_state.text - old_words = audio_state.text.split() - new_words = text.split() + #old_words = audio_state.text.split() + #new_words = text.split() - audio_state.text = string_matcher.matchStringList(old_words, new_words) + audio_state.text = string_matcher.matchStrings(audio_state.text, + text, window_size = 5) if old_text != audio_state.text: # We think the user said something, so reset the amount of # time we sleep between transcriptions to the minimum. audio_state.transcribe_no_change_count = 0 audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s - audio_state.text_candidate = text - audio_state.text_lock.release() def sendAudio(audio_state): |
