Add browser source, hardcoded to port 8097

Transcription output now streams to localhost:8097.

In OBS:
* Create a browser source.
* url: localhost:8097
* width: 2200
* height: 400

TODO:
* Put behind toggle.
* Create input field for port.

Misc cleanup:
* transcribe.py: Drop frames from audio capture thread instead of the transcription thread. Doing it the other way would result in occasional data loss.
author: yum <yum.food.vr@gmail.com> 2023-06-26 00:58:58 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-26 01:46:42 -0700
commit: 0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch)
tree: 057ea9d58991ab71cd4f6eb4dfb33693f8914e03
parent: e35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff)
9 files changed, 70 insertions, 11 deletions
diff --git a/BrowserSource/index.html b/BrowserSource/index.html
index 253b9ef..d3ab777 100644
--- a/BrowserSource/index.html
+++ b/BrowserSource/index.html
@@ -10,11 +10,12 @@
     body {
       font-family: 'Noto Sans Mono', monospace;
       font-size: 96px;
-      font-weight: 700;
+      font-weight: 3200;
+      color: #000;
     }
     #transcript {
-      color: #fff;
-      -webkit-text-stroke: 1.0px #000;
+      color: #89CFF0;
+      -webkit-text-stroke: 3.0px #000;
     }
   </style>
   <body>
diff --git a/GUI/GUI/GUI/BrowserSource.cpp b/GUI/GUI/GUI/BrowserSource.cpp
index c43f1a0..62e3e43 100644
--- a/GUI/GUI/GUI/BrowserSource.cpp
+++ b/GUI/GUI/GUI/BrowserSource.cpp
@@ -61,6 +61,8 @@ void BrowserSource::Run(volatile bool* run)
 			resp_oss << "}";
 			payload = resp_oss.str();
 			type = WebServer::JSON;
+
+			//Log(out_, "Serving transcript to port {}: {}\n", port_, transcript_oss.str());
 		});
 
 	if (!ws.Run(run)) {
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 6bce67e..d39222b 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -1,3 +1,4 @@
+#include "BrowserSource.h"
 #include "Frame.h"
 #include "Logging.h"
 #include "PythonWrapper.h"
@@ -6,6 +7,7 @@
 
 #include <filesystem>
 #include <fstream>
+#include <regex>
 #include <string>
 #include <vector>
 #include <wx/filepicker.h>
@@ -553,6 +555,11 @@ Frame::Frame()
 	}
 	{
 		auto p = std::promise<bool>();
+		obs_app_ = p.get_future();
+		p.set_value(true);
+	}
+	{
+		auto p = std::promise<bool>();
 		unity_app_ = p.get_future();
 		p.set_value(true);
 	}
@@ -1930,6 +1937,12 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 		return;
 	}
 
+    status = obs_app_.wait_for(std::chrono::seconds(0));
+    if (status != std::future_status::ready) {
+		Log(transcribe_out_, "Transcription engine (OBS server) already running\n");
+		return;
+	}
+
 	Log(transcribe_out_, "Launching transcription engine\n");
 
     int which_mic = py_app_mic_->GetSelection();
@@ -2043,6 +2056,16 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     auto out_cb = [&](const std::string& out, const std::string& err) {
         Log(transcribe_out_, "{}", out);
         Log(transcribe_out_, "{}", err);
+
+		std::regex pattern("^Transcription \\(([0-9]*\\.[0-9]+) seconds\\):");
+        if (std::regex_search(out, pattern)) {
+            std::string filtered_transcript = std::regex_replace(out, pattern, "");
+            filtered_transcript.erase(std::remove_if(filtered_transcript.begin(), filtered_transcript.end(), [](char c) {
+                return c == '\n' || c == '\r';
+                }), filtered_transcript.end());
+            Log(transcribe_out_, "Got transcription line! Transcript: \"{}\"", filtered_transcript);
+            transcript_.Set(std::move(filtered_transcript));
+        }
     };
     auto in_cb = [&](std::string& in) {};
     auto run_cb = [&]() {
@@ -2052,8 +2075,17 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     auto prestart_cb = [this]() -> void {
         EnsureVirtualEnv(/*block=*/true);
     };
-    py_app_ = std::move(PythonWrapper::StartApp(*app_c_, std::move(out_cb),
-        std::move(in_cb), std::move(run_cb), std::move(prestart_cb)));
+
+	// TODO(yum) parameterize port
+    obs_app_ = std::async(std::launch::async,
+        [&]() -> bool {
+			BrowserSource browser_src(8097, transcribe_out_, &transcript_);
+            browser_src.Run(&run_py_app_);
+            return true;
+        });
+    py_app_ = std::move(PythonWrapper::StartApp(*app_c_, transcribe_out_,
+        std::move(out_cb), std::move(in_cb), std::move(run_cb),
+        std::move(prestart_cb)));
     Log(transcribe_out_, "py app valid: {}\n", py_app_.valid());
 }
 
@@ -2065,6 +2097,8 @@ void Frame::OnAppStop() {
     }
     run_py_app_ = false;
     py_app_.wait();
+    obs_app_.wait();
+    transcript_.Clear();
 	Log(transcribe_out_, "Stopped transcription engine\n");
 }
 
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 94f2a68..904df4f 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -8,6 +8,7 @@
 #endif
 
 #include "Config.h"
+#include "Transcript.h"
 
 #include <future>
 #include <memory>
@@ -93,6 +94,8 @@ private:
     wxCheckBox* whisper_enable_browser_src_;
 
     std::future<bool> py_app_;
+    std::future<bool> obs_app_;
+    Transcript transcript_;
     bool run_py_app_;
     std::future<bool> unity_app_;
     std::future<bool> dump_mics_;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index a061e34..037d961 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -448,16 +448,19 @@ bool PythonWrapper::InstallPip(
 
 std::future<bool> PythonWrapper::StartApp(
 		const AppConfig& config,
+		wxTextCtrl *out,
 		const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
 		const std::function<void(std::string& in)>&& in_cb,
 		const std::function<bool()>&& run_cb,
 		const std::function<void()>&& prestart_cb) {
+
 	return std::move(std::async(std::launch::async,
 		[&](
 			const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
 			const std::function<void(std::string& in)>&& in_cb,
 			const std::function<bool()>&& run_cb) -> bool {
 				prestart_cb();
+
 				return InvokeWithArgs({
 					"-u",  // Unbuffered output
 					"Resources/Scripts/transcribe.py",
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 17f5e1d..6366247 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -73,6 +73,7 @@ namespace PythonWrapper
 	// app restarts.
 	std::future<bool> StartApp(
 		const AppConfig& config,
+		wxTextCtrl *out,
 		const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
 		const std::function<void(std::string& in)>&& in_cb = [](std::string&) {},
 		const std::function<bool()>&& run_cb = []() { return true; },
diff --git a/GUI/GUI/GUI/Transcript.cpp b/GUI/GUI/GUI/Transcript.cpp
index 30f1f76..9ef607f 100644
--- a/GUI/GUI/GUI/Transcript.cpp
+++ b/GUI/GUI/GUI/Transcript.cpp
@@ -5,6 +5,12 @@ void Transcript::Append(std::string&& segment) {
 	segments_.push_back(std::move(segment));
 }
 
+void Transcript::Set(std::string&& segment) {
+	std::scoped_lock l(mu_);
+	segments_.clear();
+	segments_.push_back(std::move(segment));
+}
+
 void Transcript::Clear() {
 	std::scoped_lock l(mu_);
 	segments_.clear();
diff --git a/GUI/GUI/GUI/Transcript.h b/GUI/GUI/GUI/Transcript.h
index 09858b0..fae2bad 100644
--- a/GUI/GUI/GUI/Transcript.h
+++ b/GUI/GUI/GUI/Transcript.h
@@ -10,6 +10,7 @@ public:
 	Transcript() = default;
 
 	void Append(std::string&& segment);
+	void Set(std::string&& segment);
 	void Clear();
 
 	std::vector<std::string> Get();
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 9711d15..51d7e0a 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -57,6 +57,7 @@ class AudioState:
         #     Segment start time, end time, and text
         self.ranges_ls = []
         self.frames = []
+        self.drop_frames_till_i = -1
 
         # Locks access to `text`.
         self.transcribe_lock = threading.Lock()
@@ -131,6 +132,9 @@ def onAudioFramesAvailable(
             audio_state.CHUNK)
     if len(audio_state.frames) > max_frames:
         audio_state.frames = audio_state.frames[-1 * max_frames:]
+    if audio_state.drop_frames_till_i > 0:
+        audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1]
+        audio_state.drop_frames_till_i = -1
 
 
     return (frames, pyaudio.paContinue)
@@ -227,6 +231,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             audio,
             beam_size = 5,
             language = audio_state.language,
+            temperature = [0.0],
             vad_filter = True,
             condition_on_previous_text = True,
             without_timestamps = False)
@@ -244,17 +249,20 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             for segment in ranges:
                 first_segments.append(segment)
                 break
-        if len(first_segments) >= 3:
-            c0 = first_segments[-3]
+        if len(first_segments) >= 5:
+            # Hack: require convergence across many frames to give the
+            # algorithm a longer buffer to work with.
+            c0 = first_segments[-1]
             c1 = first_segments[-2]
-            c2 = first_segments[-1]
+            c2 = first_segments[-3]
+            c3 = first_segments[-4]
             #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
-            if c0 == c1 and c1 == c2:
+            if c0 == c1 and c1 == c2 and c2 == c3:
                 # For simplicity, completely reset saved audio ranges.
                 audio_state.ranges_ls = []
-                committed_text = c2[2]
+                committed_text = c0[2]
                 n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
-                del audio_state.frames[0:n_frames_to_drop]
+                audio_state.drop_frames_till_i = n_frames_to_drop
 
     preview_text = ""
     for seg in ranges:
author	yum <yum.food.vr@gmail.com>	2023-06-26 00:58:58 -0700
committer	yum <yum.food.vr@gmail.com>	2023-06-26 01:46:42 -0700
commit	0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch)
tree	057ea9d58991ab71cd4f6eb4dfb33693f8914e03
parent	e35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff)