Begin integrating faster-whisper (tag: v0.11.0)

faster-whisper is a much faster, lower-VRAM reimplementation of Whisper in Python. Early testing is extremely promising: fast transcription speed, extremely low resource usage (CPU/RAM/VRAM), and high accuracy.
author: yum <yum.food.vr@gmail.com> 2023-04-23 20:52:36 -0700
committer: yum <yum.food.vr@gmail.com> 2023-04-23 20:52:36 -0700
commit: b4bb6524652e0f76834ca26a4afa232855ca1348 (patch)
tree: b1aa7e7c329b69481393b28936ad4c82f7a6acd0
parent: f5c1611f9cdf027f75c81576d17dfee8671d65ca (diff)
6 files changed, 26 insertions, 40 deletions
diff --git a/.gitignore b/.gitignore
index e803dfb..0b41544 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 # Ignore vim swap files.
 *.sw[po]
+*.dll
diff --git a/BrowserSource/index.html b/BrowserSource/index.html
index c422e4b..253b9ef 100644
--- a/BrowserSource/index.html
+++ b/BrowserSource/index.html
@@ -9,9 +9,13 @@
   <style>
     body {
       font-family: 'Noto Sans Mono', monospace;
-      font-size: 48px;
+      font-size: 96px;
       font-weight: 700;
     }
+    #transcript {
+      color: #fff;
+      -webkit-text-stroke: 1.0px #000;
+    }
   </style>
   <body>
     <div id="transcript"></div>
diff --git a/README.md b/README.md
index 0af72bd..7e91e7b 100644
--- a/README.md
+++ b/README.md
@@ -120,8 +120,8 @@ reason or another:
    who want to sell closed-source software.
 4. [I5UCC's VRCTextboxSTT](https://github.com/I5UCC/VRCTextboxSTT) makes
    KillFrenzy's AvatarText and Whisper kiss. It's the closest spiritual cousin
-   to this repository. There are two crucial differences: it's GPL not MIT, and
-   it doesn't abstract away the command line.
+   to this repository. The author has made incredible sustained progress on
+   the problem. Definitely take a look!
 5. [VRCWizard's TTS-Voice-Wizard](https://github.com/VRCWizard/TTS-Voice-Wizard)
    also uses Whisper, but they rely on the C# interface to Const-Me's
    CUDA-enabled Whisper implementation. This implementation does not support
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index c218302..043fb40 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -9,4 +9,4 @@ pyyaml
 
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch==1.13.1+cu116
-git+https://github.com/openai/whisper.git
+faster-whisper
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 6793336..208bcd1 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -2,6 +2,7 @@
 
 from datetime import datetime
 from emotes_v2 import EmotesState
+from faster_whisper import WhisperModel
 from functools import partial
 from playsound import playsound
 
@@ -18,7 +19,6 @@ import sys
 import threading
 import time
 import wave
-import whisper
 
 class Config:
     def __init__(self):
@@ -72,7 +72,7 @@ class AudioState:
 
         # The language the user is speaking in. Default is English but user may set
         # this to whatever they want.
-        self.language = whisper.tokenizer.TO_LANGUAGE_CODE["english"]
+        self.language = "en"
 
         self.audio_paused = False
 
@@ -210,38 +210,19 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
     frames = np.asarray(audio_state.frames)
     audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0
 
-    audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
-            audio_state.MAX_LENGTH_S_WHISPER)
+    segments, info = model.transcribe(audio, beam_size=5,
+            language=audio_state.language)
 
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-    result = None
-    #for temp in (0.00, 0.05, 0.10, 0.15, 0.20):
-    #for temp in (0.00, 0.05):
-    for temp in (0.00,):
-        use_gpu = not use_cpu
-        options = whisper.DecodingOptions(language = audio_state.language,
-                beam_size = 5, temperature = temp, without_timestamps = True,
-                fp16 = use_gpu)
-        result = whisper.decode(model, mel, options)
-
-        if result.avg_logprob < -1.0:
-            print("avg logprob: {}".format(result.avg_logprob))
-            result = None
-            continue
-
-        if result.compression_ratio > 2.4:
-            print("compression ratio: {}".format(result.compression_ratio))
-            result = None
-            continue
-
-        if result.no_speech_prob > 0.60:
-            print("no speech prob: {}".format(result.no_speech_prob))
-            result = None
-            continue
+    result = ""
+    for s in segments:
+        print(f"  s: {s}")
+        print(f"  s.text: {s.text}")
+        if (len(result) == 0):
+            result = str(s.text)
+        else:
+            result += " " + str(s.text)
 
-        result = result.text
-        break
+    print(f"Result: {result}")
 
     return result
 
@@ -391,7 +372,7 @@ def transcribeLoop(mic: str, language: str, model: str,
         enable_local_beep: bool, use_cpu: bool, use_builtin: bool,
         button: str, estate: EmotesState):
     audio_state = getMicStream(mic)
-    audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
+    audio_state.language = language
 
     print("Safe to start talking")
 
@@ -400,7 +381,7 @@ def transcribeLoop(mic: str, language: str, model: str,
     model_root = os.path.join(dname, "Models")
 
     print("Model {} will be saved to {}".format(model, model_root))
-    model = whisper.load_model(model, download_root=model_root)
+    model = WhisperModel("large-v2", device="cuda", compute_type="float16")
 
     transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model, use_cpu])
     transcribe_audio_thd.daemon = True
@@ -442,7 +423,7 @@ if __name__ == "__main__":
     dname = os.path.dirname(abspath)
     dname = os.path.dirname(dname)
     dname = os.path.dirname(dname)
-    os.chdir(dname)
+    #os.chdir(dname)
     print(f"Set cwd to {os.getcwd()}")
 
     parser = argparse.ArgumentParser()
@@ -468,7 +449,7 @@ if __name__ == "__main__":
         args.language = "english"
 
     if not args.model:
-        args.language = "base"
+        args.model = "base"
 
     if not args.bytes_per_char or not args.chars_per_sync:
         print("--bytes_per_char and --chars_per_sync required", file=sys.stderr)
diff --git a/TaSTT-Whisper b/TaSTT-Whisper
-Subproject aaa0188da81056748ef8ffcd5ad86d6f4bffa6b
+Subproject 59297502afb8f61c1216c6d57d6cc18ab5b9f46
author	yum <yum.food.vr@gmail.com>	2023-04-23 20:52:36 -0700
committer	yum <yum.food.vr@gmail.com>	2023-04-23 20:52:36 -0700
commit	b4bb6524652e0f76834ca26a4afa232855ca1348 (patch)
tree	b1aa7e7c329b69481393b28936ad4c82f7a6acd0
parent	f5c1611f9cdf027f75c81576d17dfee8671d65ca (diff)