Finish integrating faster-whisper

I'm able to use the new code to show text in game. Not yet play-tested.
author: yum <yum.food.vr@gmail.com> 2023-04-24 18:08:16 -0700
committer: yum <yum.food.vr@gmail.com> 2023-04-24 18:14:22 -0700
commit: 89c4fa29e5810fac7c75cb5edda1565320a5b8a9 (patch)
tree: ab422edb4ce8b1eaab377a28f852ba936cf7cc10
parent: b4bb6524652e0f76834ca26a4afa232855ca1348 (diff)
7 files changed, 108 insertions, 46 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index f45aa45..f35f95c 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -66,7 +66,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	language("english"),
 	model("base.en"),
 	button("left joystick"),
-	window_duration("15"),
+	window_duration("120"),
 
 	enable_local_beep(true),
 	use_cpu(false),
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index c9b7281..1195540 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -250,6 +250,8 @@ namespace {
         "small",
         "medium.en",
         "medium",
+        "large-v1",
+        "large-v2",
     };
     const size_t kNumModelChoices = sizeof(kModelChoices) / sizeof(kModelChoices[0]);
     constexpr int kModelDefault = 2;  // base.en
@@ -2123,7 +2125,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     const int max_rows = 10;
     const int max_cols = 240;
     const int min_window_duration_s = 10;
-    const int max_window_duration_s = 28;
+    const int max_window_duration_s = 300;
     if (rows < 0 || rows > max_rows ||
         cols < 0 || cols > max_cols ||
         window_duration < min_window_duration_s ||
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index ae0fa49..3a7cc5d 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -296,6 +296,24 @@ bool PythonWrapper::InvokeCommandWithArgs(const std::string& cmd,
 				return false;
 			}
 		}
+
+		// Add scripts to PATH
+		std::filesystem::path dll_bin = (std::filesystem::current_path() /
+			"Resources/Scripts").lexically_normal();
+		if (env.find(dll_bin.string()) == std::string::npos) {
+			env += ";" + dll_bin.string();
+
+			// Add updated PATH to current process's environment
+			if (!SetEnvironmentVariableA("PATH", env.c_str())) {
+				std::ostringstream err_oss;
+				err_oss << "Error while executing python command \""
+					<< cmd_oss.str()
+					<< "\": Failed to add python scripts to PATH: "
+					<< GetWin32ErrMsg() << std::endl;
+				out_cb("", err_oss.str());
+				return false;
+			}
+		}
 	}
 
 	std::string cmd_str = cmd_oss.str();
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 1b7fa6d..e25473a 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -1,10 +1,12 @@
 param(
   [switch]$skip_zip = $false,
-  [string]$release = "Release"
+  [string]$release = "Release",
+  [bool]$install_pip = $true  # [bool], not [string]: the string "False" is non-empty and so truthy in `if ($install_pip)`
 )
 
 echo "Skip zip: $skip_zip"
 echo "Release: $release"
+echo "Install pip: $install_pip"
 
 $PSDefaultParameterValues['Out-File:Encoding'] = 'utf8'
 
@@ -50,7 +52,7 @@ if (-Not (Test-Path $pip_path)) {
   mv $PIP_FILE $pip_path
 }
 
-if ($true) {
+if ($install_pip) {
   ./Python/python.exe Python/get-pip.py
 
   echo "Installing future"
@@ -77,6 +79,39 @@ if (-Not (Test-Path $git_dir)) {
   Read-Host -Prompt "Press enter once PortableGit is installed at $pwd\PortableGit"
 }
 
+$nvidia_dir = "nvidia_dll"
+
+if (-Not (Test-Path $nvidia_dir)) {
+  echo "Fetching CUDNN dependencies"
+  # One-time download: this whole block is skipped when $nvidia_dir already exists.
+  mkdir $nvidia_dir
+  pushd $nvidia_dir > $null
+
+  $ZLIB_URL = "https://drive.google.com/uc?export=download&id=1NpWU83JVOWG0tJtFK7ObygTbOasGWZpI"
+  Invoke-WebRequest $ZLIB_URL -OutFile "zlibwapi.dll"
+
+  # NVIDIA only offers these DLLs behind a login, which makes them hard
+  # for end users to download, so they are rehosted here.
+  # TODO check hashes.
+  echo "Fetching NVIDIA dll 1/4 (600MB)"
+  $CUDNN_CNN_INFER_URL = "https://drive.google.com/uc?export=download&confirm=yes&id=1Px7SGEOM8uAJNxxMGBSwo4sEE8H7GzkB"
+  Invoke-WebRequest $CUDNN_CNN_INFER_URL -OutFile "cudnn_cnn_infer64_8.dll"
+
+  echo "Fetching NVIDIA dll 2/4 (80MB)"
+  $CUDNN_OPS_INFER_URL = "https://drive.google.com/uc?export=download&confirm=yes&id=1mw6Ds1x-4G_GtSzM-GM8y27E9vpQRi_P"
+  Invoke-WebRequest $CUDNN_OPS_INFER_URL -OutFile "cudnn_ops_infer64_8.dll"
+
+  echo "Fetching NVIDIA dll 3/4 (80MB)"
+  $CUBLAS_64_DLL = "https://drive.google.com/uc?export=download&confirm=yes&id=1bflxDt83inYM0N2N0ebD1tw0Jh9la33R"
+  Invoke-WebRequest $CUBLAS_64_DLL -OutFile "cublas64_11.dll"
+
+  echo "Fetching NVIDIA dll 4/4 (150MB)"
+  $CUBLAS_LT64_DLL = "https://drive.google.com/uc?export=download&confirm=yes&id=1fQuVgpkbI8tNPTwueEeiLCSDzqSSGldI"
+  Invoke-WebRequest $CUBLAS_LT64_DLL -OutFile "cublasLt64_11.dll"
+
+  popd > $null
+}
+
 mkdir $install_dir > $null
 mkdir $install_dir/Resources > $null
 cp -Recurse ../Animations TaSTT/Resources/Animations
@@ -87,6 +122,7 @@ cp -Recurse ../Images TaSTT/Resources/Images
 cp -Recurse Python TaSTT/Resources/Python
 cp -Recurse PortableGit TaSTT/Resources/PortableGit
 cp -Recurse ../Scripts TaSTT/Resources/Scripts
+cp $nvidia_dir/*.dll TaSTT/Resources/Scripts/
 cp -Recurse ../Shaders TaSTT/Resources/Shaders
 cp -Recurse ../Sounds TaSTT/Resources/Sounds
 cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets
@@ -94,9 +130,9 @@ cp -Recurse ../BrowserSource TaSTT/Resources/BrowserSource
 cp GUI/x64/$release/GUI.exe TaSTT/TaSTT.exe
 cp ../"TaSTT-Whisper"/x64/Release/Whisper.dll TaSTT/Whisper.dll
 mkdir TaSTT/Resources/Models
-#cp $WHISPER_CHECKPOINT_FILE TaSTT/Resources/Models/
 
 if (-Not $skip_zip) {
-  Compress-Archive -Path "$install_dir" -DestinationPath "$install_dir.zip" -Force
+  # Compress-Archive shits the bed if the input is larger than 2GB.
+  & "C:\Program Files\7-Zip\7z.exe" a -tzip "$install_dir.zip" "$install_dir" -mx=9
 }
 
diff --git a/README.md b/README.md
index 7e91e7b..f4e2aff 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Basic controls:
 
 ## Features
 
+* Works with the built-in chatbox (usable with public avatars!)
 * Customizable board resolution, [up to ridiculous sizes](https://www.youtube.com/watch?v=u5h-ivkwS0M).
 * Lighweight design:
   * Custom textbox requires as few as 65 parameter bits
@@ -55,7 +56,6 @@ Basic controls:
     metallic, roughness, and emission are all implemented.
   * Border width and rounding are customizable.
   * Shader supports physically based shading: smoothness, metallic, and emissive.
-* Works with the built-in chatbox (usable with public avatars!)
 * Many optional quality-of-life features:
   * Audio feedback: hear distinct beeps when transcription starts and stops.
   * May also enable in-game noise indicator, to grab others' attention.
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index 043fb40..725ee47 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -1,5 +1,7 @@
 editdistance
 future==0.18.2
+langcodes
+language-data
 openvr
 pillow
 pyaudio
@@ -9,4 +11,5 @@ pyyaml
 
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch==1.13.1+cu116
-faster-whisper
+
+faster-whisper@https://github.com/guillaumekln/faster-whisper/archive/358d373691c95205021bd4bbf28cde7ce4d10030.tar.gz
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 208bcd1..4d3169f 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -11,22 +11,15 @@ import copy
 import os
 import osc_ctrl
 import generate_utils
+import langcodes
 import pyaudio
 import numpy as np
 import steamvr
-import string_matcher
 import sys
 import threading
 import time
 import wave
 
-class Config:
-    def __init__(self):
-        # The maximum length that recordAudio() will put into frames before it
-        # starts dropping from the start.
-        self.MAX_LENGTH_S = 10
-config = Config()
-
 class AudioState:
     def __init__(self):
         self.CHUNK = 1024
@@ -37,7 +30,7 @@ class AudioState:
 
         # The maximum length that recordAudio() will put into frames before it
         # starts dropping from the start.
-        self.MAX_LENGTH_S_WHISPER = 30
+        self.MAX_LENGTH_S = 30
         # The minimum length that recordAudio() will wait for before saving audio.
         self.MIN_LENGTH_S = 1
 
@@ -120,9 +113,11 @@ def onAudioFramesAvailable(
 
     audio_state.frames.append(decimated)
 
-    max_frames = int(input_rate * config.MAX_LENGTH_S / audio_state.CHUNK)
+    max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
+            audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
-        audio_state.frames = audio_state.frames[-1 * max_frames :]
+        audio_state.frames = audio_state.frames[-1 * max_frames:]
+
 
     return (frames, pyaudio.paContinue)
 
@@ -210,21 +205,14 @@ def transcribe(audio_state, model, frames, use_cpu: bool):
     frames = np.asarray(audio_state.frames)
     audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0
 
-    segments, info = model.transcribe(audio, beam_size=5,
-            language=audio_state.language)
-
-    result = ""
-    for s in segments:
-        print(f"  s: {s}")
-        print(f"  s.text: {s.text}")
-        if (len(result) == 0):
-            result = str(s.text)
-        else:
-            result += " " + str(s.text)
+    segments, info = model.transcribe(
+            audio,
+            beam_size = 5,
+            language = audio_state.language,
+            vad_filter = True,
+            without_timestamps = True)
 
-    print(f"Result: {result}")
-
-    return result
+    return "".join(s.text for s in segments)
 
 def transcribeAudio(audio_state, model, use_cpu: bool):
     last_transcribe_time = time.time()
@@ -261,12 +249,8 @@ def transcribeAudio(audio_state, model, use_cpu: bool):
             last_transcribe_time = time.time()
             continue
 
-        words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
-
-        old_text = audio_state.text
-
-        audio_state.text = string_matcher.matchStrings(audio_state.text,
-                text, window_size = 25)
+        old_text = copy.copy(audio_state.text)
+        audio_state.text = text
 
         now = time.time()
         print("Transcription ({} seconds): {}".format(
@@ -370,18 +354,32 @@ def readControllerInput(audio_state, enable_local_beep: bool,
 # whisper/__init__.py. Examples: tiny, base, small, medium.
 def transcribeLoop(mic: str, language: str, model: str,
         enable_local_beep: bool, use_cpu: bool, use_builtin: bool,
-        button: str, estate: EmotesState):
+        button: str, estate: EmotesState,
+        window_duration_s: int):
     audio_state = getMicStream(mic)
-    audio_state.language = language
+    audio_state.language = langcodes.find(language).language
+    audio_state.MAX_LENGTH_S = window_duration_s
 
     print("Safe to start talking")
 
     abspath = os.path.abspath(__file__)
     dname = os.path.dirname(abspath)
-    model_root = os.path.join(dname, "Models")
+    model_root = os.path.join(dname, "Models", model)
 
     print("Model {} will be saved to {}".format(model, model_root))
-    model = WhisperModel("large-v2", device="cuda", compute_type="float16")
+
+    model_device = "cuda"
+    if use_cpu:
+        model_device = "cpu"
+
+    download_it = os.path.exists(model_root)
+    if download_it:
+        model = model_root
+    model = WhisperModel(model,
+            device=model_device,
+            compute_type="int8",
+            download_root=model_root,
+            local_files_only=download_it)
 
     transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model, use_cpu])
     transcribe_audio_thd.daemon = True
@@ -429,7 +427,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
     parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.")
-    parser.add_argument("--model", type=str, help="Which AI model to use. Ex: tiny, base, small, medium")
+    parser.add_argument("--model", type=str, help="Which AI model to use. \
+            Options: tiny, tiny.en, base, base.en, small, small.en, \
+            medium, medium.en, large-v1, large-v2")
     parser.add_argument("--bytes_per_char", type=str, help="The number of bytes to use to represent each character")
     parser.add_argument("--chars_per_sync", type=str, help="The number of characters to send on each sync event")
     parser.add_argument("--enable_local_beep", type=int, help="Whether to play a local auditory indicator when transcription starts/stops.")
@@ -467,8 +467,9 @@ if __name__ == "__main__":
         print("--emotes_pickle required", file=sys.stderr)
         sys.exit(1)
 
+    window_duration_s = 120
     if args.window_duration_s:
-        config.MAX_LENGTH_S = int(args.window_duration_s)
+        window_duration_s = int(args.window_duration_s)
 
     if args.cpu == 1:
         args.cpu = True
@@ -488,6 +489,8 @@ if __name__ == "__main__":
     generate_utils.config.BOARD_ROWS = int(args.rows)
     generate_utils.config.BOARD_COLS = int(args.cols)
 
+    print(f"PATH: {os.environ['PATH']}")
+
     transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep,
-            args.cpu, args.use_builtin, args.button, estate)
+            args.cpu, args.use_builtin, args.button, estate, window_duration_s)
author	yum <yum.food.vr@gmail.com>	2023-04-24 18:08:16 -0700
committer	yum <yum.food.vr@gmail.com>	2023-04-24 18:14:22 -0700
commit	89c4fa29e5810fac7c75cb5edda1565320a5b8a9 (patch)
tree	ab422edb4ce8b1eaab377a28f852ba936cf7cc10
parent	b4bb6524652e0f76834ca26a4afa232855ca1348 (diff)