diff options
| author | yum <yum.food.vr@gmail.com> | 2023-09-03 19:29:44 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-09-03 19:29:44 -0700 |
| commit | 6020bc056d8992523ae62feb4edfbae10b169880 (patch) | |
| tree | 06d970506dc28c68c26f713f29d7e687c6593efc | |
| parent | ae5db8b21e7db2ab9941cca47a5d57352d3bb1fa (diff) | |
Wire transcribe_v2.py into GUI
Also:
* Enable SO_REUSEADDR on browser src socket
* Temporarily add evaluation dependencies to requirements.txt
* Fix browser src. It's now looking for a prefix that the python app
actually uses.
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 2 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.cpp | 2 | ||||
| -rw-r--r-- | GUI/GUI/GUI/WebServer.cpp | 6 | ||||
| -rw-r--r-- | Scripts/requirements.txt | 3 | ||||
| -rw-r--r-- | Scripts/transcribe_v2.py | 21 |
5 files changed, 23 insertions, 11 deletions
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index a4c186b..9a781e8 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -2295,7 +2295,7 @@ void Frame::OnAppStart(wxCommandEvent& event) { transcript_.SetFinalized(false);
}
- std::regex pattern("^Transcription \\(([0-9]*\\.[0-9]+) seconds\\):");
+ std::regex pattern("^Transcript: ");
if (std::regex_search(out_line, pattern)) {
std::string filtered_transcript = std::regex_replace(out_line, pattern, "");
filtered_transcript.erase(std::remove_if(filtered_transcript.begin(), filtered_transcript.end(), [](char c) {
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index d98a5d4..7141037 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -480,7 +480,7 @@ std::future<bool> PythonWrapper::StartApp( return InvokeWithArgs({ "-u", // Unbuffered output - "Resources/Scripts/transcribe.py", + "Resources/Scripts/transcribe_v2.py", "--config", config_path, }, std::move(out_cb), diff --git a/GUI/GUI/GUI/WebServer.cpp b/GUI/GUI/GUI/WebServer.cpp index ba7eecd..2b589c6 100644 --- a/GUI/GUI/GUI/WebServer.cpp +++ b/GUI/GUI/GUI/WebServer.cpp @@ -70,6 +70,12 @@ namespace WebServer { return false;
}
+ int optval = 1;  // Non-zero value switches the SO_REUSEADDR option on.
+ if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<const char*>(&optval), sizeof(optval)) == SOCKET_ERROR) {
+ Log(out_, "Failed to setsockopt(SO_REUSEADDR): {}\n", WSAGetLastError());
+ return false;  // Signal failure like the preceding error path; `return 1` would convert to true.
+ }
+
u_long enable_nonblock = 1;
if (ioctlsocket(sock, FIONBIO, &enable_nonblock) == SOCKET_ERROR) {
Log(out_, "Failed to enable non-blocking socket: {}\n", WSAGetLastError());
diff --git a/Scripts/requirements.txt b/Scripts/requirements.txt index cba3d15..d694f4a 100644 --- a/Scripts/requirements.txt +++ b/Scripts/requirements.txt @@ -13,3 +13,6 @@ python-osc pyyaml sentence_splitter transformers>=4.21.0 + +git+https://github.com/openai/whisper.git +scipy diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py index 3f924dd..9812535 100644 --- a/Scripts/transcribe_v2.py +++ b/Scripts/transcribe_v2.py @@ -50,7 +50,7 @@ class DiskStream(AudioStream): else: raise NotImplementedError(f"Requested file type {path} " + \ "is not supported") - print(f"Loading audio data") + print(f"Loading audio data", file=sys.stderr) audio = AudioSegment.from_file(path, format=fmt) audio = audio.set_channels(1) # TODO(yum) replace manual decimation code with this! @@ -60,7 +60,7 @@ class DiskStream(AudioStream): self.frames = frames - print(f"Loaded data") + print(f"Loaded data", file=sys.stderr) def getSamples(self) -> bytes: # Give out samples at a fixed rate to minimize @@ -86,7 +86,7 @@ class MicStream(AudioStream): # If set, incoming frames are simply discarded. self.paused = False - print(f"Finding mic {which_mic}") + print(f"Finding mic {which_mic}", file=sys.stderr) self.dumpMicDevices() got_match = False @@ -423,7 +423,8 @@ class FuzzyRepeatCommitter: return TranscriptCommit("", preview, None) s0 = self.candidates[0] if s.wall_ts != s0.wall_ts: - print("Frames dropped, committer resetting candidates") + print("Frames dropped, committer resetting candidates", + file=sys.stderr) self.candidates = [] return TranscriptCommit("", preview, None) self.candidates.append(s) @@ -462,7 +463,8 @@ class FuzzyRepeatCommitter: # Got a candidate! Commit it and return. self.candidates = [] latency_s = self.collector.now() - (candidate.wall_ts + candidate.start_ts) - self.collector.dropAudioPrefix(candidate.end_ts) + # Measured to slightly improve performance in benchmark. 
+ self.collector.dropAudioPrefix(candidate.end_ts + 0.10) return TranscriptCommit(candidate.transcript, preview, latency_s, thresh_at_commit = edit_thresh) @@ -627,7 +629,7 @@ def transcriptionThread(ctrl: ThreadControl): ctrl.transcript += commit.delta if len(commit.delta): - print(f"{ctrl.transcript}") + print(f"Transcript: {ctrl.transcript}") if cfg["enable_debug_mode"]: print(f"commit latency: {commit.latency_s}", file=sys.stderr) print(f"commit thresh: {commit.thresh_at_commit}", file=sys.stderr) @@ -715,7 +717,7 @@ def vrInputThread(ctrl: ThreadControl): #audio_state.audio_events.append(audio_state.AUDIO_EVENT_TOGGLE_OFF) pass elif state == PAUSE_STATE: - print("RECORDING") + print("RECORDING", file=sys.stderr) state = RECORD_STATE if not ctrl.cfg["use_builtin"]: ctrl.pager.toggleBoard(True) @@ -723,12 +725,13 @@ def vrInputThread(ctrl: ThreadControl): ctrl.pager.ellipsis(True) if ctrl.cfg["reset_on_toggle"]: if ctrl.cfg["enable_debug_mode"]: - print("Toggle detected, dropping transcript (3)") + print("Toggle detected, dropping transcript (3)", + file=sys.stderr) ctrl.transcript = "" #audio_state.drop_transcription = True else: if ctrl.cfg["enable_debug_mode"]: - print("Toggle detected, committing preview text (3)") + print("Toggle detected, committing preview text (3)", file=sys.stderr) #audio_state.text += audio_state.preview_text ctrl.stream.pause(False) |
