summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-04-23 20:52:36 -0700
committer yum <yum.food.vr@gmail.com> 2023-04-23 20:52:36 -0700
commit b4bb6524652e0f76834ca26a4afa232855ca1348 (patch)
tree b1aa7e7c329b69481393b28936ad4c82f7a6acd0
parent f5c1611f9cdf027f75c81576d17dfee8671d65ca (diff)
Begin integrating faster-whisper (tag: v0.11.0)
This is a much faster, lower-VRAM reimplementation of Whisper in Python. Early testing is extremely promising: fast transcription speed, extremely low resource usage (CPU/RAM/VRAM), high accuracy.
-rw-r--r-- .gitignore 1
-rw-r--r-- BrowserSource/index.html 6
-rw-r--r-- README.md 4
-rw-r--r-- Scripts/requirements.txt 2
-rw-r--r-- Scripts/transcribe.py 53
m--------- TaSTT-Whisper 0
6 files changed, 26 insertions, 40 deletions
diff --git a/.gitignore b/.gitignore
index e803dfb..0b41544 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
# Ignore vim swap files.
*.sw[po]
+*.dll
diff --git a/BrowserSource/index.html b/BrowserSource/index.html
index c422e4b..253b9ef 100644
--- a/BrowserSource/index.html
+++ b/BrowserSource/index.html
@@ -9,9 +9,13 @@
<style>
body {
font-family: 'Noto Sans Mono', monospace;
- font-size: 48px;
+ font-size: 96px;
font-weight: 700;
}
+ #transcript {
+ color: #fff;
+ -webkit-text-stroke: 1.0px #000;
+ }
</style>
<body>
<div id="transcript"></div>
diff --git a/README.md b/README.md
index 0af72bd..7e91e7b 100644
--- a/README.md
+++ b/README.md
@@ -120,8 +120,8 @@ reason or another:
who want to sell closed-source software.
4. [I5UCC's VRCTextboxSTT](https://github.com/I5UCC/VRCTextboxSTT) makes
KillFrenzy's AvatarText and Whisper kiss. It's the closest spiritual cousin
- to this repository. There are two crucial differences: it's GPL not MIT, and
- it doesn't abstract away the command line.
+ to this repository. The author has made incredible sustained progress on
+ the problem. Definitely take a look!
5. [VRCWizard's TTS-Voice-Wizard](https://github.com/VRCWizard/TTS-Voice-Wizard)
also uses Whisper, but they rely on the C# interface to Const-Me's
CUDA-enabled Whisper implementation. This implementation does not support
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index c218302..043fb40 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -9,4 +9,4 @@ pyyaml
--extra-index-url https://download.pytorch.org/whl/cu116
torch==1.13.1+cu116
-git+https://github.com/openai/whisper.git
+faster-whisper
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 6793336..208bcd1 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -2,6 +2,7 @@
from datetime import datetime
from emotes_v2 import EmotesState
+from faster_whisper import WhisperModel
from functools import partial
from playsound import playsound
@@ -18,7 +19,6 @@ import sys
import threading
import time
import wave
-import whisper
class Config:
def __init__(self):
@@ -72,7 +72,7 @@ class AudioState:
# The language the user is speaking in. Default is English but user may set
# this to whatever they want.
- self.language = whisper.tokenizer.TO_LANGUAGE_CODE["english"]
+ self.language = "en"
self.audio_paused = False
@@ -210,38 +210,19 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
frames = np.asarray(audio_state.frames)
audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0
- audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
- audio_state.MAX_LENGTH_S_WHISPER)
+ segments, info = model.transcribe(audio, beam_size=5,
+ language=audio_state.language)
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
- result = None
- #for temp in (0.00, 0.05, 0.10, 0.15, 0.20):
- #for temp in (0.00, 0.05):
- for temp in (0.00,):
- use_gpu = not use_cpu
- options = whisper.DecodingOptions(language = audio_state.language,
- beam_size = 5, temperature = temp, without_timestamps = True,
- fp16 = use_gpu)
- result = whisper.decode(model, mel, options)
-
- if result.avg_logprob < -1.0:
- print("avg logprob: {}".format(result.avg_logprob))
- result = None
- continue
-
- if result.compression_ratio > 2.4:
- print("compression ratio: {}".format(result.compression_ratio))
- result = None
- continue
-
- if result.no_speech_prob > 0.60:
- print("no speech prob: {}".format(result.no_speech_prob))
- result = None
- continue
+ result = ""
+ for s in segments:
+ print(f" s: {s}")
+ print(f" s.text: {s.text}")
+ if (len(result) == 0):
+ result = str(s.text)
+ else:
+ result += " " + str(s.text)
- result = result.text
- break
+ print(f"Result: {result}")
return result
@@ -391,7 +372,7 @@ def transcribeLoop(mic: str, language: str, model: str,
enable_local_beep: bool, use_cpu: bool, use_builtin: bool,
button: str, estate: EmotesState):
audio_state = getMicStream(mic)
- audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
+ audio_state.language = language
print("Safe to start talking")
@@ -400,7 +381,7 @@ def transcribeLoop(mic: str, language: str, model: str,
model_root = os.path.join(dname, "Models")
print("Model {} will be saved to {}".format(model, model_root))
- model = whisper.load_model(model, download_root=model_root)
+ model = WhisperModel("large-v2", device="cuda", compute_type="float16")
transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model, use_cpu])
transcribe_audio_thd.daemon = True
@@ -442,7 +423,7 @@ if __name__ == "__main__":
dname = os.path.dirname(abspath)
dname = os.path.dirname(dname)
dname = os.path.dirname(dname)
- os.chdir(dname)
+ #os.chdir(dname)
print(f"Set cwd to {os.getcwd()}")
parser = argparse.ArgumentParser()
@@ -468,7 +449,7 @@ if __name__ == "__main__":
args.language = "english"
if not args.model:
- args.language = "base"
+ args.model = "base"
if not args.bytes_per_char or not args.chars_per_sync:
print("--bytes_per_char and --chars_per_sync required", file=sys.stderr)
diff --git a/TaSTT-Whisper b/TaSTT-Whisper
-Subproject aaa0188da81056748ef8ffcd5ad86d6f4bffa6b
+Subproject 59297502afb8f61c1216c6d57d6cc18ab5b9f46