summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2022-12-18 14:46:53 -0800
committer: yum <yum.food.vr@gmail.com> 2022-12-18 15:08:28 -0800
commit: 79f1b48042cbb724892301afdee842fb33ab2b37 (patch)
tree: 46e588b1a12115892ab45e9853915b32c4bd15d6
parent: 03fbf0e8ca409fe4c26e246286a975724ad0994b (diff)
GUI: Add mic, language selection
Users can now select their mic & spoken language in the GUI.

* pyaudio now samples at the mic rate, fixing an issue where frames would drop. We downsample in the callback by dropping frames.
* add Sounds folder to package
-rw-r--r-- GUI/GUI/GUI/Frame.cpp 174
-rw-r--r-- GUI/GUI/GUI/Frame.h 2
-rw-r--r-- GUI/GUI/GUI/PythonWrapper.cpp 10
-rw-r--r-- GUI/GUI/GUI/PythonWrapper.h 4
-rw-r--r-- GUI/package.ps1 1
-rw-r--r-- Scripts/transcribe.py 100
6 files changed, 234 insertions, 57 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index db2bdd7..55112db 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -13,20 +13,145 @@ namespace {
ID_PY_APP_START_BUTTON,
ID_PY_APP_STOP_BUTTON,
ID_PY_OUT,
+ ID_PY_APP_MIC,
+ ID_PY_APP_LANG,
};
-};
+
+ const wxString kMicChoices[] = {
+ "index",
+ "focusrite",
+ // ok now this is epic
+ "0",
+ "1",
+ "2",
+ "3",
+ "4",
+ "5",
+ "6",
+ "7",
+ "8",
+ "9",
+ };
+ const size_t kNumMicChoices = sizeof(kMicChoices) / sizeof(kMicChoices[0]);
+
+ // lifted from whisper/tokenizer.py
+ const wxString kLangChoices[] = {
+ "english",
+ "chinese",
+ "german",
+ "spanish",
+ "russian",
+ "korean",
+ "french",
+ "japanese",
+ "portuguese",
+ "turkish",
+ "polish",
+ "catalan",
+ "dutch",
+ "arabic",
+ "swedish",
+ "italian",
+ "indonesian",
+ "hindi",
+ "finnish",
+ "vietnamese",
+ "hebrew",
+ "ukrainian",
+ "greek",
+ "malay",
+ "czech",
+ "romanian",
+ "danish",
+ "hungarian",
+ "tamil",
+ "norwegian",
+ "thai",
+ "urdu",
+ "croatian",
+ "bulgarian",
+ "lithuanian",
+ "latin",
+ "maori",
+ "malayalam",
+ "welsh",
+ "slovak",
+ "telugu",
+ "persian",
+ "latvian",
+ "bengali",
+ "serbian",
+ "azerbaijani",
+ "slovenian",
+ "kannada",
+ "estonian",
+ "macedonian",
+ "breton",
+ "basque",
+ "icelandic",
+ "armenian",
+ "nepali",
+ "mongolian",
+ "bosnian",
+ "kazakh",
+ "albanian",
+ "swahili",
+ "galician",
+ "marathi",
+ "punjabi",
+ "sinhala",
+ "khmer",
+ "shona",
+ "yoruba",
+ "somali",
+ "afrikaans",
+ "occitan",
+ "georgian",
+ "belarusian",
+ "tajik",
+ "sindhi",
+ "gujarati",
+ "amharic",
+ "yiddish",
+ "lao",
+ "uzbek",
+ "faroese",
+ "haitian creole",
+ "pashto",
+ "turkmen",
+ "nynorsk",
+ "maltese",
+ "sanskrit",
+ "luxembourgish",
+ "myanmar",
+ "tibetan",
+ "tagalog",
+ "malagasy",
+ "assamese",
+ "tatar",
+ "hawaiian",
+ "lingala",
+ "hausa",
+ "bashkir",
+ "javanese",
+ "sundanese"
+ };
+ const size_t kNumLangChoices = sizeof(kLangChoices) / sizeof(kLangChoices[0]);
+} // namespace
Frame::Frame()
- : wxFrame(nullptr, wxID_ANY, "TaSTT"),
- py_panel_(this, ID_PY_PANEL),
- py_panel_sizer_(wxVERTICAL),
- py_version_button_(&py_panel_, ID_PY_VERSION_BUTTON, "Check embedded Python version"),
- py_setup_button_(&py_panel_, ID_PY_SETUP_BUTTON, "Set up Python virtual environment"),
- py_app_start_button_(&py_panel_, ID_PY_APP_START_BUTTON, "Begin transcribing"),
- py_app_stop_button_(&py_panel_, ID_PY_APP_STOP_BUTTON, "Stop transcribing"),
- py_out_(&py_panel_, ID_PY_OUT, wxEmptyString, wxDefaultPosition,
- wxSize(/*x_px=*/480, /*y_px=*/160), wxTE_MULTILINE),
- py_app_(nullptr)
+ : wxFrame(nullptr, wxID_ANY, "TaSTT"),
+ py_panel_(this, ID_PY_PANEL),
+ py_panel_sizer_(wxVERTICAL),
+ py_version_button_(&py_panel_, ID_PY_VERSION_BUTTON, "Check embedded Python version"),
+ py_setup_button_(&py_panel_, ID_PY_SETUP_BUTTON, "Set up Python virtual environment"),
+ py_app_start_button_(&py_panel_, ID_PY_APP_START_BUTTON, "Begin transcribing"),
+ py_app_stop_button_(&py_panel_, ID_PY_APP_STOP_BUTTON, "Stop transcribing"),
+ py_out_(&py_panel_, ID_PY_OUT, wxEmptyString, wxDefaultPosition,
+ wxSize(/*x_px=*/480, /*y_px=*/160), wxTE_MULTILINE),
+ py_app_(nullptr),
+ py_app_mic_(&py_panel_, ID_PY_APP_MIC, wxDefaultPosition, wxDefaultSize, kNumMicChoices, kMicChoices),
+ py_app_lang_(&py_panel_, ID_PY_APP_LANG, wxDefaultPosition, wxDefaultSize, kNumLangChoices, kLangChoices)
{
Bind(wxEVT_MENU, &Frame::OnExit, this, wxID_EXIT);
Bind(wxEVT_BUTTON, &Frame::OnGetPythonVersion, this, ID_PY_VERSION_BUTTON);
@@ -41,10 +166,14 @@ Frame::Frame()
wxSize py_out_size(/*x=*/80, /*y=*/20);
py_out_.SetSize(py_out_size);
+ py_app_mic_.SetSelection(0);
+ py_app_lang_.SetSelection(0);
py_panel_.SetSizer(&py_panel_sizer_);
py_panel_sizer_.Add(&py_version_button_);
py_panel_sizer_.Add(&py_setup_button_);
+ py_panel_sizer_.Add(&py_app_mic_);
+ py_panel_sizer_.Add(&py_app_lang_);
py_panel_sizer_.Add(&py_app_start_button_);
py_panel_sizer_.Add(&py_app_stop_button_);
py_panel_sizer_.Add(&py_out_);
@@ -75,7 +204,7 @@ void Frame::OnSetupPython(wxCommandEvent& event)
{
std::string py_out;
std::ostringstream py_out_oss;
- py_out_oss << "Installing pip" << std::endl;
+ py_out_oss << " Installing pip" << std::endl;
py_out_.AppendText(py_out_oss.str());
if (!py.InstallPip(&py_out)) {
std::ostringstream py_out_oss;
@@ -85,22 +214,20 @@ void Frame::OnSetupPython(wxCommandEvent& event)
}
const std::vector<std::string> pip_deps{
+ "openvr",
"pillow",
- "pydub",
"pyaudio",
+ "python-osc",
"playsound==1.2.2",
"torch --extra-index-url https://download.pytorch.org/whl/cu116",
"git+https://github.com/openai/whisper.git",
- "openvr",
"editdistance",
- "pydub",
- "python-osc",
};
for (const auto& pip_dep : pip_deps) {
{
std::ostringstream py_out_oss;
- py_out_oss << "Installing " << pip_dep << std::endl;
+ py_out_oss << " Installing " << pip_dep << std::endl;
py_out_.AppendText(py_out_oss.str());
}
std::string py_out;
@@ -137,7 +264,18 @@ void Frame::OnAppStart(wxCommandEvent& event) {
return;
};
- wxProcess* p = py.StartApp(std::move(cb));
+ int which_mic = py_app_mic_.GetSelection();
+ if (which_mic == wxNOT_FOUND) {
+ which_mic = 0;
+ }
+ int which_lang = py_app_lang_.GetSelection();
+ if (which_lang == wxNOT_FOUND) {
+ which_lang = 0;
+ }
+
+ wxProcess* p = py.StartApp(std::move(cb),
+ kMicChoices[which_mic].ToStdString(),
+ kLangChoices[which_lang].ToStdString());
if (!p) {
py_out_.AppendText("Failed to launch transcription engine\n");
return;
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 414d2b3..8132cce 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -22,6 +22,8 @@ private:
wxButton py_app_start_button_;
wxButton py_app_stop_button_;
wxTextCtrl py_out_;
+ wxChoice py_app_mic_;
+ wxChoice py_app_lang_;
wxProcess* py_app_;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index 7270ab5..53fcc06 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -85,8 +85,14 @@ bool PythonWrapper::InstallPip(std::string* out) {
return InvokeWithArgs({ pip_path }, out);
}
-wxProcess* PythonWrapper::StartApp(std::function<void(wxProcess* proc, int ret)>&& exit_callback) {
- return InvokeAsyncWithArgs({ "Resources/Scripts/transcribe.py" },
+wxProcess* PythonWrapper::StartApp(
+ std::function<void(wxProcess* proc, int ret)>&& exit_callback,
+ const std::string& mic, const std::string& lang) {
+ return InvokeAsyncWithArgs({
+ "Resources/Scripts/transcribe.py",
+ "--mic", mic,
+ "--lang", lang,
+ },
std::move(exit_callback));
}
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 4407b5e..0fa3c94 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -34,6 +34,8 @@ public:
// Execute get-pip.py.
bool InstallPip(std::string* out);
- wxProcess* StartApp(std::function<void(wxProcess* proc, int ret)>&& exit_callback);
+ wxProcess* StartApp(
+ std::function<void(wxProcess* proc, int ret)>&& exit_callback,
+ const std::string& mic, const std::string& lang);
};
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 0c2cec0..0346e67 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -9,5 +9,6 @@ mkdir $install_dir/Resources > $null
cp ../Images/logo.png TaSTT/Resources
cp -Recurse ../Python TaSTT/Resources/Python
cp -Recurse ../Scripts TaSTT/Resources/Scripts
+cp -Recurse ../Sounds TaSTT/Resources/Sounds
cp GUI/x64/Release/GUI.exe TaSTT/TaSTT.exe
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 62e6add..48426e2 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -5,10 +5,7 @@ import copy
from datetime import datetime
import os
import osc_ctrl
-# python3 -m pip install pydub
-# License: MIT.
-from pydub import AudioSegment as pydub_AudioSegment
-from pydub import effects as pydub_effects
+from functools import partial
# python3 -m pip install pyaudio
# License: MIT.
import pyaudio
@@ -80,11 +77,49 @@ class AudioState:
osc_client = osc_ctrl.getClient()
+def dumpMicDevices(audio_state):
+ info = audio_state.p.get_host_api_info_by_index(0)
+ numdevices = info.get('deviceCount')
+
+ for i in range(0, numdevices):
+ if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
+ device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name')
+ print("Input Device id ", i, " - ", device_name)
+
+def onAudioFramesAvailable(
+ audio_state,
+ input_rate,
+ frames,
+ frame_count,
+ time_info,
+ status_flags):
+
+ # Reduce sample rate from mic rate to Whisper rate by dropping frames.
+ decimated = b''
+ frame_len = int(len(frames) / frame_count)
+ next_frame = 0.0
+ keep_every = float(input_rate) / audio_state.RATE
+ i = 0
+ for i in range(0, frame_count):
+ if i >= next_frame:
+ decimated += frames[i*frame_len:(i+1)*frame_len]
+ next_frame += keep_every
+ i += 1
+
+ audio_state.frames.append(decimated)
+
+ max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
+ if len(audio_state.frames) > max_frames:
+ audio_state.frames = audio_state.frames[-1 * max_frames :]
+
+ return (frames, pyaudio.paContinue)
+
def getMicStream(which_mic):
audio_state = AudioState()
audio_state.p = pyaudio.PyAudio()
- print("Finding index mic...")
+ print("Finding mic {}...".format(which_mic))
+ dumpMicDevices(audio_state)
got_match = False
device_index = -1
focusrite_str = "Focusrite"
@@ -94,15 +129,16 @@ def getMicStream(which_mic):
elif which_mic == "focusrite":
target_str = focusrite_str
else:
- raise Exception("Unrecognized mic requested: {}".format(which_mic))
+ print("Mic {} requested, treating it as a numerical device ID".format(which_mic))
+ device_index = int(which_mic)
+ got_match = True
+
while got_match == False:
info = audio_state.p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
-
for i in range(0, numdevices):
if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name')
- print("Input Device id ", i, " - ", device_name)
if target_str in device_name:
print("Got match: {}".format(device_name))
device_index = i
@@ -112,29 +148,22 @@ def getMicStream(which_mic):
print("No match, sleeping")
time.sleep(3)
- audio_state.stream = audio_state.p.open(format=audio_state.FORMAT,
- channels=audio_state.CHANNELS, rate=audio_state.RATE,
- input=True, frames_per_buffer=audio_state.CHUNK,
- input_device_index=device_index)
-
- return audio_state
+ info = audio_state.p.get_device_info_by_host_api_device_index(0, device_index)
+ input_rate = int(info['defaultSampleRate'])
+ print("input rate: {}".format(input_rate))
-# Continuously records audio as long as audio_state.run_app is set.
-def recordAudio(audio_state):
- print("Recording audio")
- while audio_state.run_app:
- data = audio_state.stream.read(audio_state.CHUNK)
+ # Bind audio_state to onAudioFramesAvailable
+ callback = partial(onAudioFramesAvailable, audio_state, input_rate)
- if audio_state.audio_paused:
- time.sleep(0.1)
- continue
+ audio_state.stream = audio_state.p.open(format=audio_state.FORMAT,
+ channels=audio_state.CHANNELS, rate=input_rate,
+ input=True, frames_per_buffer=audio_state.CHUNK,
+ input_device_index=device_index,
+ stream_callback=callback)
- audio_state.frames.append(data)
- max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
- if len(audio_state.frames) > max_frames:
- audio_state.frames = audio_state.frames[-1 * max_frames :]
+ audio_state.stream.start_stream()
- print("Done recording")
+ return audio_state
def resetAudioLocked(audio_state):
audio_state.frames = []
@@ -241,7 +270,7 @@ def transcribeAudio(audio_state, model):
old_text = audio_state.text
audio_state.text = string_matcher.matchStrings(audio_state.text,
- text, window_size = 30)
+ text, window_size = 20)
if old_text != audio_state.text:
# We think the user said something, so reset the amount of
# time we sleep between transcriptions to the minimum.
@@ -276,13 +305,13 @@ def readControllerInput(audio_state):
if state == RECORD_STATE:
state = PAUSE_STATE
osc_ctrl.indicateSpeech(audio_state.osc_client, False)
- playsound(os.path.abspath("Sounds/Noise_Off.wav"))
+ playsound(os.path.abspath("../Sounds/Noise_Off.wav"))
audio_state.audio_paused = True
elif state == PAUSE_STATE:
state = RECORD_STATE
osc_ctrl.indicateSpeech(audio_state.osc_client, True)
- playsound(os.path.abspath("Sounds/Noise_On.wav"))
+ playsound(os.path.abspath("../Sounds/Noise_On.wav"))
resetAudioLocked(audio_state)
resetDisplayLocked(audio_state)
@@ -293,10 +322,6 @@ def transcribeLoop(mic: str, language: str):
audio_state = getMicStream(mic)
audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
- record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
- record_audio_thd.daemon = True
- record_audio_thd.start()
-
print("Safe to start talking")
#model = whisper.load_model("tiny")
@@ -331,13 +356,16 @@ def transcribeLoop(mic: str, language: str):
print("Joining threads")
audio_state.run_app = False
- audio_state.run_app = False
- record_audio_thd.join()
transcribe_audio_thd.join()
controller_input_thd.join()
if __name__ == "__main__":
+ # Set cwd to the directory holding the script
+ abspath = os.path.abspath(__file__)
+ dname = os.path.dirname(abspath)
+ os.chdir(dname)
+
parser = argparse.ArgumentParser()
parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.")