diff options
| author | yum <yum.food.vr@gmail.com> | 2023-04-24 18:08:16 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-04-24 18:14:22 -0700 |
| commit | 89c4fa29e5810fac7c75cb5edda1565320a5b8a9 (patch) | |
| tree | ab422edb4ce8b1eaab377a28f852ba936cf7cc10 | |
| parent | b4bb6524652e0f76834ca26a4afa232855ca1348 (diff) | |
~Finish integrating faster-whisper
I'm able to use the new code to show text in game. Not yet play-tested.
| -rw-r--r-- | GUI/GUI/GUI/Config.cpp | 2 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 4 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.cpp | 18 | ||||
| -rw-r--r-- | GUI/package.ps1 | 44 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | Scripts/requirements.txt | 5 | ||||
| -rw-r--r-- | Scripts/transcribe.py | 79 |
7 files changed, 108 insertions, 46 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp index f45aa45..f35f95c 100644 --- a/GUI/GUI/GUI/Config.cpp +++ b/GUI/GUI/GUI/Config.cpp @@ -66,7 +66,7 @@ AppConfig::AppConfig(wxTextCtrl* out) language("english"),
model("base.en"),
button("left joystick"),
- window_duration("15"),
+ window_duration("120"),
enable_local_beep(true),
use_cpu(false),
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index c9b7281..1195540 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -250,6 +250,8 @@ namespace { "small",
"medium.en",
"medium",
+ "large-v1",
+ "large-v2",
};
const size_t kNumModelChoices = sizeof(kModelChoices) / sizeof(kModelChoices[0]);
constexpr int kModelDefault = 2; // base.en
@@ -2123,7 +2125,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { const int max_rows = 10;
const int max_cols = 240;
const int min_window_duration_s = 10;
- const int max_window_duration_s = 28;
+ const int max_window_duration_s = 300;
if (rows < 0 || rows > max_rows ||
cols < 0 || cols > max_cols ||
window_duration < min_window_duration_s ||
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index ae0fa49..3a7cc5d 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -296,6 +296,24 @@ bool PythonWrapper::InvokeCommandWithArgs(const std::string& cmd, return false; } } + + // Add scripts to PATH + std::filesystem::path dll_bin = (std::filesystem::current_path() / + "Resources/Scripts").lexically_normal(); + if (env.find(dll_bin.string()) == std::string::npos) { + env += ";" + dll_bin.string(); + + // Add updated PATH to current process's environment + if (!SetEnvironmentVariableA("PATH", env.c_str())) { + std::ostringstream err_oss; + err_oss << "Error while executing python command \"" + << cmd_oss.str() + << "\": Failed to add python scripts to PATH: " + << GetWin32ErrMsg() << std::endl; + out_cb("", err_oss.str()); + return false; + } + } } std::string cmd_str = cmd_oss.str(); diff --git a/GUI/package.ps1 b/GUI/package.ps1 index 1b7fa6d..e25473a 100644 --- a/GUI/package.ps1 +++ b/GUI/package.ps1 @@ -1,10 +1,12 @@ param(
[switch]$skip_zip = $false,
- [string]$release = "Release"
+ [string]$release = "Release",
+ [string]$install_pip = $true
)
echo "Skip zip: $skip_zip"
echo "Release: $release"
+echo "Install pip: $install_pip"
$PSDefaultParameterValues['Out-File:Encoding'] = 'utf8'
@@ -50,7 +52,7 @@ if (-Not (Test-Path $pip_path)) { mv $PIP_FILE $pip_path
}
-if ($true) {
+if ($install_pip) {
./Python/python.exe Python/get-pip.py
echo "Installing future"
@@ -77,6 +79,39 @@ if (-Not (Test-Path $git_dir)) { Read-Host -Prompt "Press enter once PortableGit is installed at $pwd\PortableGit"
}
+$nvidia_dir = "nvidia_dll"
+
+if (-Not (Test-Path $nvidia_dir)) {
+ echo "Fetching CUDNN dependencies"
+
+ mkdir $nvidia_dir
+ pushd $nvidia_dir > $null
+
+ $ZLIB_URL = "https://drive.google.com/uc?export=download&id=1NpWU83JVOWG0tJtFK7ObygTbOasGWZpI"
+ Invoke-WebRequest $ZLIB_URL -OutFile "zlibwapi.dll"
+
+ # NVIDIA puts these files behind a login, which makes them very painful
+ # for end users to download, so I rehosted them.
+ # TODO: verify the hashes of the downloaded files.
+ echo "Fetching NVIDIA dll 1/4 (600MB)"
+ $CUDNN_CNN_INFER_URL = "https://drive.google.com/uc?export=download&confirm=yes&id=1Px7SGEOM8uAJNxxMGBSwo4sEE8H7GzkB"
+ Invoke-WebRequest $CUDNN_CNN_INFER_URL -OutFile "cudnn_cnn_infer64_8.dll"
+
+ echo "Fetching NVIDIA dll 2/4 (80MB)"
+ $CUDNN_OPS_INFER_URL = "https://drive.google.com/uc?export=download&confirm=yes&id=1mw6Ds1x-4G_GtSzM-GM8y27E9vpQRi_P"
+ Invoke-WebRequest $CUDNN_OPS_INFER_URL -OutFile "cudnn_ops_infer64_8.dll"
+
+ echo "Fetching NVIDIA dll 3/4 (80MB)"
+ $CUBLAS_64_DLL = "https://drive.google.com/uc?export=download&confirm=yes&id=1bflxDt83inYM0N2N0ebD1tw0Jh9la33R"
+ Invoke-WebRequest $CUBLAS_64_DLL -OutFile "cublas64_11.dll"
+
+ echo "Fetching NVIDIA dll 4/4 (150MB)"
+ $CUBLAS_LT64_DLL = "https://drive.google.com/uc?export=download&confirm=yes&id=1fQuVgpkbI8tNPTwueEeiLCSDzqSSGldI"
+ Invoke-WebRequest $CUBLAS_Lt64_DLL -OutFile "cublasLt64_11.dll"
+
+ popd > $null
+}
+
mkdir $install_dir > $null
mkdir $install_dir/Resources > $null
cp -Recurse ../Animations TaSTT/Resources/Animations
@@ -87,6 +122,7 @@ cp -Recurse ../Images TaSTT/Resources/Images cp -Recurse Python TaSTT/Resources/Python
cp -Recurse PortableGit TaSTT/Resources/PortableGit
cp -Recurse ../Scripts TaSTT/Resources/Scripts
+cp $nvidia_dir/*.dll TaSTT/Resources/Scripts/
cp -Recurse ../Shaders TaSTT/Resources/Shaders
cp -Recurse ../Sounds TaSTT/Resources/Sounds
cp -Recurse ../UnityAssets TaSTT/Resources/UnityAssets
@@ -94,9 +130,9 @@ cp -Recurse ../BrowserSource TaSTT/Resources/BrowserSource cp GUI/x64/$release/GUI.exe TaSTT/TaSTT.exe
cp ../"TaSTT-Whisper"/x64/Release/Whisper.dll TaSTT/Whisper.dll
mkdir TaSTT/Resources/Models
-#cp $WHISPER_CHECKPOINT_FILE TaSTT/Resources/Models/
if (-Not $skip_zip) {
- Compress-Archive -Path "$install_dir" -DestinationPath "$install_dir.zip" -Force
+ # Compress-Archive fails if the input is larger than 2GB, so use 7-Zip instead.
+ & "C:\Program Files\7-Zip\7z.exe" a -tzip "$install_dir.zip" "$install_dir" -mx=9
}
@@ -37,6 +37,7 @@ Basic controls: ## Features +* Works with the built-in chatbox (usable with public avatars!) * Customizable board resolution, [up to ridiculous sizes](https://www.youtube.com/watch?v=u5h-ivkwS0M). * Lighweight design: * Custom textbox requires as few as 65 parameter bits @@ -55,7 +56,6 @@ Basic controls: metallic, roughness, and emission are all implemented. * Border width and rounding are customizable. * Shader supports physically based shading: smoothness, metallic, and emissive. -* Works with the built-in chatbox (usable with public avatars!) * Many optional quality-of-life features: * Audio feedback: hear distinct beeps when transcription starts and stops. * May also enable in-game noise indicator, to grab others' attention. diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt index 043fb40..725ee47 100644 --- a/Scripts/requirements.txt +++ b/Scripts/requirements.txt @@ -1,5 +1,7 @@ editdistance future==0.18.2 +langcodes +language-data openvr pillow pyaudio @@ -9,4 +11,5 @@ pyyaml --extra-index-url https://download.pytorch.org/whl/cu116 torch==1.13.1+cu116 -faster-whisper + +faster-whisper@https://github.com/guillaumekln/faster-whisper/archive/358d373691c95205021bd4bbf28cde7ce4d10030.tar.gz diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 208bcd1..4d3169f 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -11,22 +11,15 @@ import copy import os import osc_ctrl import generate_utils +import langcodes import pyaudio import numpy as np import steamvr -import string_matcher import sys import threading import time import wave -class Config: - def __init__(self): - # The maximum length that recordAudio() will put into frames before it - # starts dropping from the start. - self.MAX_LENGTH_S = 10 -config = Config() - class AudioState: def __init__(self): self.CHUNK = 1024 @@ -37,7 +30,7 @@ class AudioState: # The maximum length that recordAudio() will put into frames before it # starts dropping from the start. 
- self.MAX_LENGTH_S_WHISPER = 30 + self.MAX_LENGTH_S = 30 # The minimum length that recordAudio() will wait for before saving audio. self.MIN_LENGTH_S = 1 @@ -120,9 +113,11 @@ def onAudioFramesAvailable( audio_state.frames.append(decimated) - max_frames = int(input_rate * config.MAX_LENGTH_S / audio_state.CHUNK) + max_frames = int(input_rate * audio_state.MAX_LENGTH_S / + audio_state.CHUNK) if len(audio_state.frames) > max_frames: - audio_state.frames = audio_state.frames[-1 * max_frames :] + audio_state.frames = audio_state.frames[-1 * max_frames:] + return (frames, pyaudio.paContinue) @@ -210,21 +205,14 @@ def transcribe(audio_state, model, frames, use_cpu: bool): frames = np.asarray(audio_state.frames) audio = np.frombuffer(frames, np.int16).flatten().astype(np.float32) / 32768.0 - segments, info = model.transcribe(audio, beam_size=5, - language=audio_state.language) - - result = "" - for s in segments: - print(f" s: {s}") - print(f" s.text: {s.text}") - if (len(result) == 0): - result = str(s.text) - else: - result += " " + str(s.text) + segments, info = model.transcribe( + audio, + beam_size = 5, + language = audio_state.language, + vad_filter = True, + without_timestamps = True) - print(f"Result: {result}") - - return result + return "".join(s.text for s in segments) def transcribeAudio(audio_state, model, use_cpu: bool): last_transcribe_time = time.time() @@ -261,12 +249,8 @@ def transcribeAudio(audio_state, model, use_cpu: bool): last_transcribe_time = time.time() continue - words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() - - old_text = audio_state.text - - audio_state.text = string_matcher.matchStrings(audio_state.text, - text, window_size = 25) + old_text = copy.copy(audio_state.text) + audio_state.text = text now = time.time() print("Transcription ({} seconds): {}".format( @@ -370,18 +354,32 @@ def readControllerInput(audio_state, enable_local_beep: bool, # whisper/__init__.py. Examples: tiny, base, small, medium. 
def transcribeLoop(mic: str, language: str, model: str, enable_local_beep: bool, use_cpu: bool, use_builtin: bool, - button: str, estate: EmotesState): + button: str, estate: EmotesState, + window_duration_s: int): audio_state = getMicStream(mic) - audio_state.language = language + audio_state.language = langcodes.find(language).language + audio_state.MAX_LENGTH_S = window_duration_s print("Safe to start talking") abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) - model_root = os.path.join(dname, "Models") + model_root = os.path.join(dname, "Models", model) print("Model {} will be saved to {}".format(model, model_root)) - model = WhisperModel("large-v2", device="cuda", compute_type="float16") + + model_device = "cuda" + if use_cpu: + model_device = "cpu" + + download_it = os.path.exists(model_root) + if download_it: + model = model_root + model = WhisperModel(model, + device=model_device, + compute_type="int8", + download_root=model_root, + local_files_only=download_it) transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model, use_cpu]) transcribe_audio_thd.daemon = True @@ -429,7 +427,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index") parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.") - parser.add_argument("--model", type=str, help="Which AI model to use. Ex: tiny, base, small, medium") + parser.add_argument("--model", type=str, help="Which AI model to use. 
\ + Options: tiny, tiny.en, base, base.en, small, small.en, \ + medium, medium.en, large-v1, large-v2") parser.add_argument("--bytes_per_char", type=str, help="The number of bytes to use to represent each character") parser.add_argument("--chars_per_sync", type=str, help="The number of characters to send on each sync event") parser.add_argument("--enable_local_beep", type=int, help="Whether to play a local auditory indicator when transcription starts/stops.") @@ -467,8 +467,9 @@ if __name__ == "__main__": print("--emotes_pickle required", file=sys.stderr) sys.exit(1) + window_duration_s = 120 if args.window_duration_s: - config.MAX_LENGTH_S = int(args.window_duration_s) + window_duration_s = int(args.window_duration_s) if args.cpu == 1: args.cpu = True @@ -488,6 +489,8 @@ if __name__ == "__main__": generate_utils.config.BOARD_ROWS = int(args.rows) generate_utils.config.BOARD_COLS = int(args.cols) + print(f"PATH: {os.environ['PATH']}") + transcribeLoop(args.mic, args.language, args.model, args.enable_local_beep, - args.cpu, args.use_builtin, args.button, estate) + args.cpu, args.use_builtin, args.button, estate, window_duration_s) |
