Wire transcribe_v2.py into GUI

Also:
* Enable SO_REUSEADDR on browser src socket.
* Temporarily add evaluation dependencies to requirements.txt.
* Fix browser src. It's now looking for a prefix that the Python app actually uses.
author: yum <yum.food.vr@gmail.com> 2023-09-03 19:29:44 -0700
committer: yum <yum.food.vr@gmail.com> 2023-09-03 19:29:44 -0700
commit: 6020bc056d8992523ae62feb4edfbae10b169880 (patch)
tree: 06d970506dc28c68c26f713f29d7e687c6593efc
parent: ae5db8b21e7db2ab9941cca47a5d57352d3bb1fa (diff)
5 files changed, 23 insertions, 11 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index a4c186b..9a781e8 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -2295,7 +2295,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 				transcript_.SetFinalized(false);
 			}
 
-			std::regex pattern("^Transcription \\(([0-9]*\\.[0-9]+) seconds\\):");
+            std::regex pattern("^Transcript: ");
 			if (std::regex_search(out_line, pattern)) {
 				std::string filtered_transcript = std::regex_replace(out_line, pattern, "");
 				filtered_transcript.erase(std::remove_if(filtered_transcript.begin(), filtered_transcript.end(), [](char c) {
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index d98a5d4..7141037 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -480,7 +480,7 @@ std::future<bool> PythonWrapper::StartApp(
 
 				return InvokeWithArgs({
 					"-u",  // Unbuffered output
-					"Resources/Scripts/transcribe.py",
+					"Resources/Scripts/transcribe_v2.py",
 					"--config", config_path,
 					},
 					std::move(out_cb),
diff --git a/GUI/GUI/GUI/WebServer.cpp b/GUI/GUI/GUI/WebServer.cpp
index ba7eecd..2b589c6 100644
--- a/GUI/GUI/GUI/WebServer.cpp
+++ b/GUI/GUI/GUI/WebServer.cpp
@@ -70,6 +70,12 @@ namespace WebServer {
 			return false;
 		}
 
+		int optval = 1;
+		if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&optval, sizeof(optval)) == SOCKET_ERROR) {
+			Log(out_, "Failed to setsockopt(SO_REUSEADDR): {}", WSAGetLastError());
+			return 1;
+		}
+
 		u_long enable_nonblock = 1;
 		if (ioctlsocket(sock, FIONBIO, &enable_nonblock) == SOCKET_ERROR) {
 			Log(out_, "Failed to enable non-blocking socket: {}\n", WSAGetLastError());
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt
index cba3d15..d694f4a 100644
--- a/Scripts/requirements.txt
+++ b/Scripts/requirements.txt
@@ -13,3 +13,6 @@ python-osc
 pyyaml
 sentence_splitter
 transformers>=4.21.0
+
+git+https://github.com/openai/whisper.git 
+scipy
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 3f924dd..9812535 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -50,7 +50,7 @@ class DiskStream(AudioStream):
         else:
             raise NotImplementedError(f"Requested file type {path} " + \
                     "is not supported")
-        print(f"Loading audio data")
+        print(f"Loading audio data", file=sys.stderr)
         audio = AudioSegment.from_file(path, format=fmt)
         audio = audio.set_channels(1)
         # TODO(yum) replace manual decimation code with this!
@@ -60,7 +60,7 @@ class DiskStream(AudioStream):
 
         self.frames = frames
 
-        print(f"Loaded data")
+        print(f"Loaded data", file=sys.stderr)
 
     def getSamples(self) -> bytes:
         # Give out samples at a fixed rate to minimize
@@ -86,7 +86,7 @@ class MicStream(AudioStream):
         # If set, incoming frames are simply discarded.
         self.paused = False
 
-        print(f"Finding mic {which_mic}")
+        print(f"Finding mic {which_mic}", file=sys.stderr)
         self.dumpMicDevices()
 
         got_match = False
@@ -423,7 +423,8 @@ class FuzzyRepeatCommitter:
                 return TranscriptCommit("", preview, None)
             s0 = self.candidates[0]
             if s.wall_ts != s0.wall_ts:
-                print("Frames dropped, committer resetting candidates")
+                print("Frames dropped, committer resetting candidates",
+                        file=sys.stderr)
                 self.candidates = []
                 return TranscriptCommit("", preview, None)
             self.candidates.append(s)
@@ -462,7 +463,8 @@ class FuzzyRepeatCommitter:
         # Got a candidate! Commit it and return.
         self.candidates = []
         latency_s = self.collector.now() - (candidate.wall_ts + candidate.start_ts)
-        self.collector.dropAudioPrefix(candidate.end_ts)
+        # Measured to slightly improve performance in benchmark.
+        self.collector.dropAudioPrefix(candidate.end_ts + 0.10)
 
         return TranscriptCommit(candidate.transcript, preview, latency_s,
                 thresh_at_commit = edit_thresh)
@@ -627,7 +629,7 @@ def transcriptionThread(ctrl: ThreadControl):
         ctrl.transcript += commit.delta
 
         if len(commit.delta):
-            print(f"{ctrl.transcript}")
+            print(f"Transcript: {ctrl.transcript}")
             if cfg["enable_debug_mode"]:
                 print(f"commit latency: {commit.latency_s}", file=sys.stderr)
                 print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr)
@@ -715,7 +717,7 @@ def vrInputThread(ctrl: ThreadControl):
                         #audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF)
                         pass
                 elif state == PAUSE_STATE:
-                    print("RECORDING")
+                    print("RECORDING", file=sys.stderr)
                     state = RECORD_STATE
                     if not ctrl.cfg["use_builtin"]:
                         ctrl.pager.toggleBoard(True)
@@ -723,12 +725,13 @@ def vrInputThread(ctrl: ThreadControl):
                         ctrl.pager.ellipsis(True)
                     if ctrl.cfg["reset_on_toggle"]:
                         if ctrl.cfg["enable_debug_mode"]:
-                            print("Toggle detected, dropping transcript (3)")
+                            print("Toggle detected, dropping transcript (3)",
+                                    file=sys.stderr)
                         ctrl.transcript = ""
                         #audio_state.drop_transcription = True
                     else:
                         if ctrl.cfg["enable_debug_mode"]:
-                            print("Toggle detected, committing preview text (3)")
+                            print("Toggle detected, committing preview text (3)", file=sys.stderr)
                         #audio_state.text += audio_state.preview_text
 
                     ctrl.stream.pause(False)
author	yum <yum.food.vr@gmail.com>	2023-09-03 19:29:44 -0700
committer	yum <yum.food.vr@gmail.com>	2023-09-03 19:29:44 -0700
commit	6020bc056d8992523ae62feb4edfbae10b169880 (patch)
tree	06d970506dc28c68c26f713f29d7e687c6593efc
parent	ae5db8b21e7db2ab9941cca47a5d57352d3bb1fa (diff)