GUI: Add mic, language selection

Users can now select their mic & spoken language in the GUI.

* pyaudio now samples at the mic rate, fixing an issue where frames would drop. We downsample in the callback by dropping frames.
* add Sounds folder to package
author: yum <yum.food.vr@gmail.com> 2022-12-18 14:46:53 -0800
committer: yum <yum.food.vr@gmail.com> 2022-12-18 15:08:28 -0800
commit: 79f1b48042cbb724892301afdee842fb33ab2b37 (patch)
tree: 46e588b1a12115892ab45e9853915b32c4bd15d6
parent: 03fbf0e8ca409fe4c26e246286a975724ad0994b (diff)
6 files changed, 234 insertions, 57 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index db2bdd7..55112db 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -13,20 +13,145 @@ namespace {
         ID_PY_APP_START_BUTTON,
         ID_PY_APP_STOP_BUTTON,
         ID_PY_OUT,
+        ID_PY_APP_MIC,
+        ID_PY_APP_LANG,
     };
-};
+
+    const wxString kMicChoices[] = {
+        "index",
+        "focusrite",
+        // ok now this is epic
+        "0",
+        "1",
+        "2",
+        "3",
+        "4",
+        "5",
+        "6",
+        "7",
+        "8",
+        "9",
+    };
+    const size_t kNumMicChoices = sizeof(kMicChoices) / sizeof(kMicChoices[0]);
+
+    // lifted from whisper/tokenizer.py
+	const wxString kLangChoices[] = {
+	    "english",
+		"chinese",
+		"german",
+		"spanish",
+		"russian",
+		"korean",
+		"french",
+		"japanese",
+		"portuguese",
+		"turkish",
+		"polish",
+		"catalan",
+		"dutch",
+		"arabic",
+		"swedish",
+		"italian",
+		"indonesian",
+		"hindi",
+		"finnish",
+		"vietnamese",
+		"hebrew",
+		"ukrainian",
+		"greek",
+		"malay",
+		"czech",
+		"romanian",
+		"danish",
+		"hungarian",
+		"tamil",
+		"norwegian",
+		"thai",
+		"urdu",
+		"croatian",
+		"bulgarian",
+		"lithuanian",
+		"latin",
+		"maori",
+		"malayalam",
+		"welsh",
+		"slovak",
+		"telugu",
+		"persian",
+		"latvian",
+		"bengali",
+		"serbian",
+		"azerbaijani",
+		"slovenian",
+		"kannada",
+		"estonian",
+		"macedonian",
+		"breton",
+		"basque",
+		"icelandic",
+		"armenian",
+		"nepali",
+		"mongolian",
+		"bosnian",
+		"kazakh",
+		"albanian",
+		"swahili",
+		"galician",
+		"marathi",
+		"punjabi",
+		"sinhala",
+		"khmer",
+		"shona",
+		"yoruba",
+		"somali",
+		"afrikaans",
+		"occitan",
+		"georgian",
+		"belarusian",
+		"tajik",
+		"sindhi",
+		"gujarati",
+		"amharic",
+		"yiddish",
+		"lao",
+		"uzbek",
+		"faroese",
+		"haitian creole",
+		"pashto",
+		"turkmen",
+		"nynorsk",
+		"maltese",
+		"sanskrit",
+		"luxembourgish",
+		"myanmar",
+		"tibetan",
+		"tagalog",
+		"malagasy",
+		"assamese",
+		"tatar",
+		"hawaiian",
+		"lingala",
+		"hausa",
+		"bashkir",
+		"javanese",
+		"sundanese"
+	};
+    const size_t kNumLangChoices = sizeof(kLangChoices) / sizeof(kLangChoices[0]);
+}  // namespace
 
 Frame::Frame()
-	: wxFrame(nullptr, wxID_ANY, "TaSTT"),
-	py_panel_(this, ID_PY_PANEL),
-	py_panel_sizer_(wxVERTICAL),
-	py_version_button_(&py_panel_, ID_PY_VERSION_BUTTON, "Check embedded Python version"),
-	py_setup_button_(&py_panel_, ID_PY_SETUP_BUTTON, "Set up Python virtual environment"),
-	py_app_start_button_(&py_panel_, ID_PY_APP_START_BUTTON, "Begin transcribing"),
-	py_app_stop_button_(&py_panel_, ID_PY_APP_STOP_BUTTON, "Stop transcribing"),
-	py_out_(&py_panel_, ID_PY_OUT, wxEmptyString, wxDefaultPosition,
-		wxSize(/*x_px=*/480, /*y_px=*/160), wxTE_MULTILINE),
-	py_app_(nullptr)
+    : wxFrame(nullptr, wxID_ANY, "TaSTT"),
+    py_panel_(this, ID_PY_PANEL),
+    py_panel_sizer_(wxVERTICAL),
+    py_version_button_(&py_panel_, ID_PY_VERSION_BUTTON, "Check embedded Python version"),
+    py_setup_button_(&py_panel_, ID_PY_SETUP_BUTTON, "Set up Python virtual environment"),
+    py_app_start_button_(&py_panel_, ID_PY_APP_START_BUTTON, "Begin transcribing"),
+    py_app_stop_button_(&py_panel_, ID_PY_APP_STOP_BUTTON, "Stop transcribing"),
+    py_out_(&py_panel_, ID_PY_OUT, wxEmptyString, wxDefaultPosition,
+        wxSize(/*x_px=*/480, /*y_px=*/160), wxTE_MULTILINE),
+    py_app_(nullptr),
+    py_app_mic_(&py_panel_, ID_PY_APP_MIC, wxDefaultPosition, wxDefaultSize, kNumMicChoices, kMicChoices),
+    py_app_lang_(&py_panel_, ID_PY_APP_LANG, wxDefaultPosition, wxDefaultSize, kNumLangChoices, kLangChoices)
 {
 	Bind(wxEVT_MENU, &Frame::OnExit, this, wxID_EXIT);
 	Bind(wxEVT_BUTTON, &Frame::OnGetPythonVersion, this, ID_PY_VERSION_BUTTON);
@@ -41,10 +166,14 @@ Frame::Frame()
 
     wxSize py_out_size(/*x=*/80, /*y=*/20);
     py_out_.SetSize(py_out_size);
+    py_app_mic_.SetSelection(0);
+    py_app_lang_.SetSelection(0);
 
 	py_panel_.SetSizer(&py_panel_sizer_);
     py_panel_sizer_.Add(&py_version_button_);
     py_panel_sizer_.Add(&py_setup_button_);
+    py_panel_sizer_.Add(&py_app_mic_);
+    py_panel_sizer_.Add(&py_app_lang_);
     py_panel_sizer_.Add(&py_app_start_button_);
     py_panel_sizer_.Add(&py_app_stop_button_);
     py_panel_sizer_.Add(&py_out_);
@@ -75,7 +204,7 @@ void Frame::OnSetupPython(wxCommandEvent& event)
     {
         std::string py_out;
         std::ostringstream py_out_oss;
-        py_out_oss << "Installing pip" << std::endl;
+        py_out_oss << "  Installing pip" << std::endl;
         py_out_.AppendText(py_out_oss.str());
         if (!py.InstallPip(&py_out)) {
             std::ostringstream py_out_oss;
@@ -85,22 +214,20 @@ void Frame::OnSetupPython(wxCommandEvent& event)
     }
 
     const std::vector<std::string> pip_deps{
+        "openvr",
         "pillow",
-        "pydub",
         "pyaudio",
+        "python-osc",
         "playsound==1.2.2",
         "torch --extra-index-url https://download.pytorch.org/whl/cu116",
         "git+https://github.com/openai/whisper.git",
-        "openvr",
         "editdistance",
-        "pydub",
-        "python-osc",
     };
 
     for (const auto& pip_dep : pip_deps) {
         {
             std::ostringstream py_out_oss;
-            py_out_oss << "Installing " << pip_dep << std::endl;
+            py_out_oss << "  Installing " << pip_dep << std::endl;
             py_out_.AppendText(py_out_oss.str());
         }
         std::string py_out;
@@ -137,7 +264,18 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 		return;
     };
 
-    wxProcess* p = py.StartApp(std::move(cb));
+    int which_mic = py_app_mic_.GetSelection();
+    if (which_mic == wxNOT_FOUND) {
+        which_mic = 0;
+    }
+    int which_lang = py_app_lang_.GetSelection();
+    if (which_lang == wxNOT_FOUND) {
+        which_lang = 0;
+    }
+
+    wxProcess* p = py.StartApp(std::move(cb),
+        kMicChoices[which_mic].ToStdString(),
+        kLangChoices[which_lang].ToStdString());
     if (!p) {
         py_out_.AppendText("Failed to launch transcription engine\n");
         return;
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 414d2b3..8132cce 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -22,6 +22,8 @@ private:
     wxButton py_app_start_button_;
     wxButton py_app_stop_button_;
     wxTextCtrl py_out_;
+    wxChoice py_app_mic_;
+    wxChoice py_app_lang_;
 
     wxProcess* py_app_;
 
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index 7270ab5..53fcc06 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -85,8 +85,14 @@ bool PythonWrapper::InstallPip(std::string* out) {
     return InvokeWithArgs({ pip_path }, out);
 }
 
-wxProcess* PythonWrapper::StartApp(std::function<void(wxProcess* proc, int ret)>&& exit_callback) {
-	return InvokeAsyncWithArgs({ "Resources/Scripts/transcribe.py" },
+wxProcess* PythonWrapper::StartApp(
+	std::function<void(wxProcess* proc, int ret)>&& exit_callback,
+	const std::string& mic, const std::string& lang) {
+	return InvokeAsyncWithArgs({
+		"Resources/Scripts/transcribe.py",
+		"--mic", mic,
+		"--lang", lang,
+		},
 		std::move(exit_callback));
 }
 
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 4407b5e..0fa3c94 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -34,6 +34,8 @@ public:
 	// Execute get-pip.py.
 	bool InstallPip(std::string* out);
 
-	wxProcess* StartApp(std::function<void(wxProcess* proc, int ret)>&& exit_callback);
+	wxProcess* StartApp(
+		std::function<void(wxProcess* proc, int ret)>&& exit_callback,
+		const std::string& mic, const std::string& lang);
 };
 
diff --git a/GUI/package.ps1 b/GUI/package.ps1
index 0c2cec0..0346e67 100644
--- a/GUI/package.ps1
+++ b/GUI/package.ps1
@@ -9,5 +9,6 @@ mkdir $install_dir/Resources > $null
 cp ../Images/logo.png TaSTT/Resources
 cp -Recurse ../Python TaSTT/Resources/Python
 cp -Recurse ../Scripts TaSTT/Resources/Scripts
+cp -Recurse ../Sounds TaSTT/Resources/Sounds
 cp GUI/x64/Release/GUI.exe TaSTT/TaSTT.exe
 
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 62e6add..48426e2 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -5,10 +5,7 @@ import copy
 from datetime import datetime
 import os
 import osc_ctrl
-# python3 -m pip install pydub
-# License: MIT.
-from pydub import AudioSegment as pydub_AudioSegment
-from pydub import effects as pydub_effects
+from functools import partial
 # python3 -m pip install pyaudio
 # License: MIT.
 import pyaudio
@@ -80,11 +77,49 @@ class AudioState:
 
     osc_client = osc_ctrl.getClient()
 
+def dumpMicDevices(audio_state):
+    info = audio_state.p.get_host_api_info_by_index(0)
+    numdevices = info.get('deviceCount')
+
+    for i in range(0, numdevices):
+        if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
+            device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name')
+            print("Input Device id ", i, " - ", device_name)
+
+def onAudioFramesAvailable(
+        audio_state,
+        input_rate,
+        frames,
+        frame_count,
+        time_info,
+        status_flags):
+
+    # Reduce sample rate from mic rate to Whisper rate by dropping frames.
+    decimated = b''
+    frame_len = int(len(frames) / frame_count)
+    next_frame = 0.0
+    keep_every = float(input_rate) / audio_state.RATE
+    i = 0
+    for i in range(0, frame_count):
+        if i >= next_frame:
+            decimated += frames[i*frame_len:(i+1)*frame_len]
+            next_frame += keep_every
+        i += 1
+
+    audio_state.frames.append(decimated)
+
+    max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
+    if len(audio_state.frames) > max_frames:
+        audio_state.frames = audio_state.frames[-1 * max_frames :]
+
+    return (frames, pyaudio.paContinue)
+
 def getMicStream(which_mic):
     audio_state = AudioState()
     audio_state.p = pyaudio.PyAudio()
 
-    print("Finding index mic...")
+    print("Finding mic {}...".format(which_mic))
+    dumpMicDevices(audio_state)
     got_match = False
     device_index = -1
     focusrite_str = "Focusrite"
@@ -94,15 +129,16 @@ def getMicStream(which_mic):
     elif which_mic == "focusrite":
         target_str = focusrite_str
     else:
-        raise Exception("Unrecognized mic requested: {}".format(which_mic))
+        print("Mic {} requested, treating it as a numerical device ID".format(which_mic))
+        device_index = int(which_mic)
+        got_match = True
+
     while got_match == False:
         info = audio_state.p.get_host_api_info_by_index(0)
         numdevices = info.get('deviceCount')
-
         for i in range(0, numdevices):
             if (audio_state.p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
                 device_name = audio_state.p.get_device_info_by_host_api_device_index(0, i).get('name')
-                print("Input Device id ", i, " - ", device_name)
                 if target_str in device_name:
                     print("Got match: {}".format(device_name))
                     device_index = i
@@ -112,29 +148,22 @@ def getMicStream(which_mic):
             print("No match, sleeping")
             time.sleep(3)
 
-    audio_state.stream = audio_state.p.open(format=audio_state.FORMAT,
-            channels=audio_state.CHANNELS, rate=audio_state.RATE,
-            input=True, frames_per_buffer=audio_state.CHUNK,
-            input_device_index=device_index)
-
-    return audio_state
+    info = audio_state.p.get_device_info_by_host_api_device_index(0, device_index)
+    input_rate = int(info['defaultSampleRate'])
+    print("input rate: {}".format(input_rate))
 
-# Continuously records audio as long as audio_state.run_app is set.
-def recordAudio(audio_state):
-    print("Recording audio")
-    while audio_state.run_app:
-        data = audio_state.stream.read(audio_state.CHUNK)
+    # Bind audio_state to onAudioFramesAvailable
+    callback = partial(onAudioFramesAvailable, audio_state, input_rate)
 
-        if audio_state.audio_paused:
-            time.sleep(0.1)
-            continue
+    audio_state.stream = audio_state.p.open(format=audio_state.FORMAT,
+            channels=audio_state.CHANNELS, rate=input_rate,
+            input=True, frames_per_buffer=audio_state.CHUNK,
+            input_device_index=device_index,
+            stream_callback=callback)
 
-        audio_state.frames.append(data)
-        max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
-        if len(audio_state.frames) > max_frames:
-            audio_state.frames = audio_state.frames[-1 * max_frames :]
+    audio_state.stream.start_stream()
 
-    print("Done recording")
+    return audio_state
 
 def resetAudioLocked(audio_state):
     audio_state.frames = []
@@ -241,7 +270,7 @@ def transcribeAudio(audio_state, model):
         old_text = audio_state.text
 
         audio_state.text = string_matcher.matchStrings(audio_state.text,
-                text, window_size = 30)
+                text, window_size = 20)
         if old_text != audio_state.text:
             # We think the user said something, so  reset the amount of
             # time we sleep between transcriptions to the minimum.
@@ -276,13 +305,13 @@ def readControllerInput(audio_state):
             if state == RECORD_STATE:
                 state = PAUSE_STATE
                 osc_ctrl.indicateSpeech(audio_state.osc_client, False)
-                playsound(os.path.abspath("Sounds/Noise_Off.wav"))
+                playsound(os.path.abspath("../Sounds/Noise_Off.wav"))
 
                 audio_state.audio_paused = True
             elif state == PAUSE_STATE:
                 state = RECORD_STATE
                 osc_ctrl.indicateSpeech(audio_state.osc_client, True)
-                playsound(os.path.abspath("Sounds/Noise_On.wav"))
+                playsound(os.path.abspath("../Sounds/Noise_On.wav"))
 
                 resetAudioLocked(audio_state)
                 resetDisplayLocked(audio_state)
@@ -293,10 +322,6 @@ def transcribeLoop(mic: str, language: str):
     audio_state = getMicStream(mic)
     audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]
 
-    record_audio_thd = threading.Thread(target = recordAudio, args = [audio_state])
-    record_audio_thd.daemon = True
-    record_audio_thd.start()
-
     print("Safe to start talking")
 
     #model = whisper.load_model("tiny")
@@ -331,13 +356,16 @@ def transcribeLoop(mic: str, language: str):
 
     print("Joining threads")
     audio_state.run_app = False
-    audio_state.run_app = False
-    record_audio_thd.join()
     transcribe_audio_thd.join()
     controller_input_thd.join()
 
 
 if __name__ == "__main__":
+    # Set cwd to the directory holding the script
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
     parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.")
author	yum <yum.food.vr@gmail.com>	2022-12-18 14:46:53 -0800
committer	yum <yum.food.vr@gmail.com>	2022-12-18 15:08:28 -0800
commit	79f1b48042cbb724892301afdee842fb33ab2b37 (patch)
tree	46e588b1a12115892ab45e9853915b32c4bd15d6
parent	03fbf0e8ca409fe4c26e246286a975724ad0994b (diff)