summaryrefslogtreecommitdiffstats
path: root/Scripts
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2022-12-18 19:11:47 -0800
committeryum <yum.food.vr@gmail.com>2022-12-18 19:11:47 -0800
commitb7b2b112a106138d99dda3f259620b350c896f1a (patch)
treeec30bc9daeb0fc4b75c488f675cf70d55f0a1f85 /Scripts
parent79f1b48042cbb724892301afdee842fb33ab2b37 (diff)
Add ability to select model
* icon now works when pinned to taskbar * add model selection * add script to dump mic devices * whisper models now download into the virtual environment
Diffstat (limited to 'Scripts')
-rw-r--r--Scripts/dump_mic_devices.py7
-rw-r--r--Scripts/emotes.py1
-rw-r--r--Scripts/transcribe.py31
3 files changed, 28 insertions, 11 deletions
diff --git a/Scripts/dump_mic_devices.py b/Scripts/dump_mic_devices.py
new file mode 100644
index 0000000..2b712cf
--- /dev/null
+++ b/Scripts/dump_mic_devices.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+from transcribe import dumpMicDevices
+
+if __name__ == "__main__":
+ dumpMicDevices()
+
diff --git a/Scripts/emotes.py b/Scripts/emotes.py
index b922fdf..0a4ed01 100644
--- a/Scripts/emotes.py
+++ b/Scripts/emotes.py
@@ -31,6 +31,7 @@ IMG_TEX_DATA.append(("Images/Emotes/based.png", "based"))
IMG_TEX_DATA.append(("Images/Emotes/chad.png", "chad"))
IMG_TEX_DATA.append(("Images/Emotes/aware.png", "aware"))
IMG_TEX_DATA.append(("Images/Emotes/girl.png", "girl"))
+IMG_TEX_DATA = []  # NOTE(review): rebinding here discards every entry appended above, so the loop below sees an empty list -- confirm this is intentional
IMG_TEX_KEYWORD_TO_COORD = {}
for i in range(0, len(IMG_TEX_DATA)):
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 48426e2..0f7ae37 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -77,13 +77,14 @@ class AudioState:
osc_client = osc_ctrl.getClient()
-def dumpMicDevices(audio_state):
- info = audio_state.p.get_host_api_info_by_index(0)
+def dumpMicDevices():
+ p = pyaudio.PyAudio()
+ info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
- if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
- device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name')
+ if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
+ device_name = p.get_device_info_by_host_api_device_index(0, i).get('name')
print("Input Device id ", i, " - ", device_name)
def onAudioFramesAvailable(
@@ -119,7 +120,7 @@ def getMicStream(which_mic):
audio_state.p = pyaudio.PyAudio()
print("Finding mic {}...".format(which_mic))
- dumpMicDevices(audio_state)
+ dumpMicDevices()
got_match = False
device_index = -1
focusrite_str = "Focusrite"
@@ -318,16 +319,20 @@ def readControllerInput(audio_state):
audio_state.drop_transcription = True
audio_state.audio_paused = False
-def transcribeLoop(mic: str, language: str):
+# model should correspond to one of the Whisper models defined in
+# whisper/__init__.py. Examples: tiny, base, small, medium.
+def transcribeLoop(mic: str, language: str, model: str):
audio_state = getMicStream(mic)
audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
print("Safe to start talking")
- #model = whisper.load_model("tiny")
- #model = whisper.load_model("base")
- model = whisper.load_model("small")
- #model = whisper.load_model("medium")
+ abspath = os.path.abspath(__file__)
+ dname = os.path.dirname(abspath)
+ model_root = os.path.join(dname, "Models")
+
+ print("Model {} will be saved to {}".format(model, model_root))
+ model = whisper.load_model(model, download_root=model_root)
transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model])
transcribe_audio_thd.daemon = True
@@ -369,6 +374,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.")
+ parser.add_argument("--model", type=str, help="Which AI model to use. Ex: tiny, base, small, medium")
args = parser.parse_args()
if not args.mic:
@@ -377,5 +383,8 @@ if __name__ == "__main__":
if not args.language:
args.language = "english"
- transcribeLoop(args.mic, args.language)
+ if not args.model:  # --model omitted: fall back to the "base" Whisper model
+ args.model = "base"
+
+ transcribeLoop(args.mic, args.language, args.model)