diff options
| author | yum <yum.food.vr@gmail.com> | 2023-11-07 15:05:29 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-11-07 15:05:29 -0800 |
| commit | dbb2f72792e2af3ff220313f84bf76a9a1ddbeb4 (patch) | |
| tree | 2725c8caba7bcfb8c474ea64e1abcad4d8d06e29 /Scripts/transcribe_v2.py | |
| parent | add7bd8ef86ec21cd1327eb45bcb739aa54f7db8 (diff) | |
Add distilled whisper-medium model
I converted distil-whisper-medium.en to CTranslate2 format and uploaded
it to Hugging Face. This model is exceptionally fast and light compared
to the non-distilled version, at the cost of some accuracy.
Diffstat (limited to 'Scripts/transcribe_v2.py')
| -rw-r--r-- | Scripts/transcribe_v2.py | 20 |
1 file changed, 17 insertions, 3 deletions
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 889e1cf..4ae17d1 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -3,6 +3,7 @@ from datetime import datetime from emotes_v2 import EmotesState from faster_whisper import WhisperModel from functools import partial +from huggingface_hub import hf_hub_download from profanity_filter import ProfanityFilter from pydub import AudioSegment from sentence_splitter import split_text_into_sentences @@ -410,7 +411,8 @@ class Whisper: my_dir = os.path.dirname(abspath) parent_dir = os.path.dirname(my_dir) - model_root = os.path.join(parent_dir, "Models", cfg["model"]) + model_str = cfg["model"] + model_root = os.path.join(parent_dir, "Models", model_str) print(f"Model {cfg['model']} will be saved to {model_root}", file=sys.stderr) @@ -419,13 +421,21 @@ class Whisper: model_device = "cpu" download_it = os.path.exists(model_root) - model_str = cfg["model"] + model_str = model_str if download_it: + if '/' in model_str: + hf_hub_download(repo_id=model_str, filename='model.bin', + local_dir=model_root) + hf_hub_download(repo_id=model_str, filename='vocabulary.json', + local_dir=model_root) + hf_hub_download(repo_id=model_str, filename='config.json', + local_dir=model_root) + model_str = model_root self.model = WhisperModel(model_str, device = model_device, device_index = cfg["gpu_idx"], - compute_type = "int8", + compute_type = "float16", download_root = model_root, local_files_only = download_it) @@ -437,6 +447,7 @@ class Whisper: audio = np.frombuffer(frames, dtype=np.int16).flatten().astype(np.float32) / 32768.0 + t0 = time.time() segments, info = self.model.transcribe( audio, language = langcodes.find(self.cfg["language"]).language, @@ -458,6 +469,9 @@ class Whisper: res.append(Segment(s.text, s.start, s.end, self.collector.begin(), s.avg_logprob, s.no_speech_prob, s.compression_ratio)) + t1 = time.time() + if cfg["enable_debug_mode"]: + print(f"Transcription latency (s): {t1 - t0}") 
return res def saveAudio(audio: bytes, path: str): |
