From b4bb6524652e0f76834ca26a4afa232855ca1348 Mon Sep 17 00:00:00 2001 From: yum Date: Sun, 23 Apr 2023 20:52:36 -0700 Subject: Begin integrating faster-whisper This is a much faster, lower-VRAM reimplementation of Whisper in Python. Early testing is extremely promising: fast transcription speed, extremely low resource usage (CPU/RAM/VRAM), high accuracy. --- .gitignore | 1 + BrowserSource/index.html | 6 +++++- README.md | 4 ++-- Scripts/requirements.txt | 2 +- Scripts/transcribe.py | 53 ++++++++++++++++-------------------------------- TaSTT-Whisper | 2 +- 6 files changed, 27 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index e803dfb..0b41544 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ # Ignore vim swap files. *.sw[po] +*.dll diff --git a/BrowserSource/index.html b/BrowserSource/index.html index c422e4b..253b9ef 100644 --- a/BrowserSource/index.html +++ b/BrowserSource/index.html @@ -9,9 +9,13 @@
diff --git a/README.md b/README.md index 0af72bd..7e91e7b 100644 --- a/README.md +++ b/README.md @@ -120,8 +120,8 @@ reason or another: who want to sell closed-source software. 4. [I5UCC's VRCTextboxSTT](https://github.com/I5UCC/VRCTextboxSTT) makes KillFrenzy's AvatarText and Whisper kiss. It's the closest spiritual cousin - to this repository. There are two crucial differences: it's GPL not MIT, and - it doesn't abstract away the command line. + to this repository. The author has made incredible sustained progress on + the problem. Definitely take a look! 5. [VRCWizard's TTS-Voice-Wizard](https://github.com/VRCWizard/TTS-Voice-Wizard) also uses Whisper, but they rely on the C# interface to Const-Me's CUDA-enabled Whisper implementation. This implementation does not support diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt index c218302..043fb40 100644 --- a/Scripts/requirements.txt +++ b/Scripts/requirements.txt @@ -9,4 +9,4 @@ pyyaml --extra-index-url https://download.pytorch.org/whl/cu116 torch==1.13.1+cu116 -git+https://github.com/openai/whisper.git +faster-whisper diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 6793336..208bcd1 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -2,6 +2,7 @@ from datetime import datetime from emotes_v2 import EmotesState +from faster_whisper import WhisperModel from functools import partial from playsound import playsound @@ -18,7 +19,6 @@ import sys import threading import time import wave -import whisper class Config: def __init__(self): @@ -72,7 +72,7 @@ class AudioState: # The language the user is speaking in. Default is English but user may set # this to whatever they want. - self.language = whisper.tokenizer.TO_LANGUAGE_CODE["english"] + self.language = "en" self.audio_paused = False @@ -210,38 +210,19 @@ def transcribe(audio_state, model, frames, use_cpu: bool): frames = np.asarray(audio_state.frames) audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0 - audio = whisper.pad_or_trim(audio, length = audio_state.RATE * - audio_state.MAX_LENGTH_S_WHISPER) + segments, info = model.transcribe(audio, beam_size=5, + language=audio_state.language) - mel = whisper.log_mel_spectrogram(audio).to(model.device) - - result = None - #for temp in (0.00, 0.05, 0.10, 0.15, 0.20): - #for temp in (0.00, 0.05): - for temp in (0.00,): - use_gpu = not use_cpu - options = whisper.DecodingOptions(language = audio_state.language, - beam_size = 5, temperature = temp, without_timestamps = True, - fp16 = use_gpu) - result = whisper.decode(model, mel, options) - - if result.avg_logprob < -1.0: - print("avg logprob: {}".format(result.avg_logprob)) - result = None - continue - - if result.compression_ratio > 2.4: - print("compression ratio: {}".format(result.compression_ratio)) - result = None - continue - - if result.no_speech_prob > 0.60: - print("no speech prob: {}".format(result.no_speech_prob)) - result = None - continue + result = "" + for s in segments: + print(f" s: {s}") + print(f" s.text: {s.text}") + if (len(result) == 0): + result = str(s.text) + else: + result += " " + str(s.text) - result = result.text - break + print(f"Result: {result}") return result @@ -391,7 +372,7 @@ def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, use_cpu: bool, use_builtin: bool, button: str, estate: EmotesState): audio_state = getMicStream(mic) - audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language] + audio_state.language = language print("Safe to start talking") @@ -400,7 +381,7 @@ def transcribeLoop(mic: str, language: str, model: str, model_root = os.path.join(dname, "Models") print("Model {} will be saved to {}".format(model, model_root)) - model = whisper.load_model(model, download_root=model_root) + model = WhisperModel("large-v2", device="cuda", compute_type="float16") transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model, use_cpu]) transcribe_audio_thd.daemon = True @@ -442,7 +423,7 @@ if __name__ == "__main__": dname = os.path.dirname(abspath) dname = os.path.dirname(dname) dname = os.path.dirname(dname) - os.chdir(dname) + #os.chdir(dname) print(f"Set cwd to {os.getcwd()}") parser = argparse.ArgumentParser() @@ -468,7 +449,7 @@ if __name__ == "__main__": args.language = "english" if not args.model: - args.language = "base" + args.model = "base" if not args.bytes_per_char or not args.chars_per_sync: print("--bytes_per_char and --chars_per_sync required", file=sys.stderr) diff --git a/TaSTT-Whisper b/TaSTT-Whisper index aaa0188..5929750 160000 --- a/TaSTT-Whisper +++ b/TaSTT-Whisper @@ -1 +1 @@ -Subproject commit aaa0188da81056748ef8ffcd5ad86d6f4bffa6bd +Subproject commit 59297502afb8f61c1216c6d57d6cc18ab5b9f467 -- cgit v1.2.3