diff options
| author | yum <yum.food.vr@gmail.com> | 2023-06-26 00:58:58 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-06-26 01:46:42 -0700 |
| commit | 0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch) | |
| tree | 057ea9d58991ab71cd4f6eb4dfb33693f8914e03 | |
| parent | e35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff) | |
Add browser source, hardcoded to port 8097
Transcription output now streams to localhost:8097.
In OBS:
* Create a browser source.
* url: localhost:8097
* width: 2200
* height: 400
TODO:
* Put the browser source behind a toggle.
* Add an input field for the port.
Misc cleanup:
* transcribe.py: Drop frames in the audio capture thread instead of in the
transcription thread. Dropping them in the transcription thread would
result in occasional data loss.
| -rw-r--r-- | BrowserSource/index.html | 7 | ||||
| -rw-r--r-- | GUI/GUI/GUI/BrowserSource.cpp | 2 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.cpp | 38 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Frame.h | 3 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.cpp | 3 | ||||
| -rw-r--r-- | GUI/GUI/GUI/PythonWrapper.h | 1 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Transcript.cpp | 6 | ||||
| -rw-r--r-- | GUI/GUI/GUI/Transcript.h | 1 | ||||
| -rw-r--r-- | Scripts/transcribe.py | 20 |
9 files changed, 70 insertions, 11 deletions
diff --git a/BrowserSource/index.html b/BrowserSource/index.html index 253b9ef..d3ab777 100644 --- a/BrowserSource/index.html +++ b/BrowserSource/index.html @@ -10,11 +10,12 @@ body { font-family: 'Noto Sans Mono', monospace; font-size: 96px; - font-weight: 700; + font-weight: 3200; + color: #000; } #transcript { - color: #fff; - -webkit-text-stroke: 1.0px #000; + color: #89CFF0; + -webkit-text-stroke: 3.0px #000; } </style> <body> diff --git a/GUI/GUI/GUI/BrowserSource.cpp b/GUI/GUI/GUI/BrowserSource.cpp index c43f1a0..62e3e43 100644 --- a/GUI/GUI/GUI/BrowserSource.cpp +++ b/GUI/GUI/GUI/BrowserSource.cpp @@ -61,6 +61,8 @@ void BrowserSource::Run(volatile bool* run) resp_oss << "}";
payload = resp_oss.str();
type = WebServer::JSON;
+
+ //Log(out_, "Serving transcript to port {}: {}\n", port_, transcript_oss.str());
});
if (!ws.Run(run)) {
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp index 6bce67e..d39222b 100644 --- a/GUI/GUI/GUI/Frame.cpp +++ b/GUI/GUI/GUI/Frame.cpp @@ -1,3 +1,4 @@ +#include "BrowserSource.h"
#include "Frame.h"
#include "Logging.h"
#include "PythonWrapper.h"
@@ -6,6 +7,7 @@ #include <filesystem>
#include <fstream>
+#include <regex>
#include <string>
#include <vector>
#include <wx/filepicker.h>
@@ -553,6 +555,11 @@ Frame::Frame() }
{
auto p = std::promise<bool>();
+ obs_app_ = p.get_future();
+ p.set_value(true);
+ }
+ {
+ auto p = std::promise<bool>();
unity_app_ = p.get_future();
p.set_value(true);
}
@@ -1930,6 +1937,12 @@ void Frame::OnAppStart(wxCommandEvent& event) { return;
}
+ status = obs_app_.wait_for(std::chrono::seconds(0));
+ if (status != std::future_status::ready) {
+ Log(transcribe_out_, "Transcription engine (OBS server) already running\n");
+ return;
+ }
+
Log(transcribe_out_, "Launching transcription engine\n");
int which_mic = py_app_mic_->GetSelection();
@@ -2043,6 +2056,16 @@ void Frame::OnAppStart(wxCommandEvent& event) { auto out_cb = [&](const std::string& out, const std::string& err) {
Log(transcribe_out_, "{}", out);
Log(transcribe_out_, "{}", err);
+
+ std::regex pattern("^Transcription \\(([0-9]*\\.[0-9]+) seconds\\):");
+ if (std::regex_search(out, pattern)) {
+ std::string filtered_transcript = std::regex_replace(out, pattern, "");
+ filtered_transcript.erase(std::remove_if(filtered_transcript.begin(), filtered_transcript.end(), [](char c) {
+ return c == '\n' || c == '\r';
+ }), filtered_transcript.end());
+ Log(transcribe_out_, "Got transcription line! Transcript: \"{}\"", filtered_transcript);
+ transcript_.Set(std::move(filtered_transcript));
+ }
};
auto in_cb = [&](std::string& in) {};
auto run_cb = [&]() {
@@ -2052,8 +2075,17 @@ void Frame::OnAppStart(wxCommandEvent& event) { auto prestart_cb = [this]() -> void {
EnsureVirtualEnv(/*block=*/true);
};
- py_app_ = std::move(PythonWrapper::StartApp(*app_c_, std::move(out_cb),
- std::move(in_cb), std::move(run_cb), std::move(prestart_cb)));
+
+ // TODO(yum) parameterize port
+ obs_app_ = std::async(std::launch::async,
+ [&]() -> bool {
+ BrowserSource browser_src(8097, transcribe_out_, &transcript_);
+ browser_src.Run(&run_py_app_);
+ return true;
+ });
+ py_app_ = std::move(PythonWrapper::StartApp(*app_c_, transcribe_out_,
+ std::move(out_cb), std::move(in_cb), std::move(run_cb),
+ std::move(prestart_cb)));
Log(transcribe_out_, "py app valid: {}\n", py_app_.valid());
}
@@ -2065,6 +2097,8 @@ void Frame::OnAppStop() { }
run_py_app_ = false;
py_app_.wait();
+ obs_app_.wait();
+ transcript_.Clear();
Log(transcribe_out_, "Stopped transcription engine\n");
}
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h index 94f2a68..904df4f 100644 --- a/GUI/GUI/GUI/Frame.h +++ b/GUI/GUI/GUI/Frame.h @@ -8,6 +8,7 @@ #endif
#include "Config.h"
+#include "Transcript.h"
#include <future>
#include <memory>
@@ -93,6 +94,8 @@ private: wxCheckBox* whisper_enable_browser_src_;
std::future<bool> py_app_;
+ std::future<bool> obs_app_;
+ Transcript transcript_;
bool run_py_app_;
std::future<bool> unity_app_;
std::future<bool> dump_mics_;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp index a061e34..037d961 100644 --- a/GUI/GUI/GUI/PythonWrapper.cpp +++ b/GUI/GUI/GUI/PythonWrapper.cpp @@ -448,16 +448,19 @@ bool PythonWrapper::InstallPip( std::future<bool> PythonWrapper::StartApp( const AppConfig& config, + wxTextCtrl *out, const std::function<void(const std::string& out, const std::string& err)>&& out_cb, const std::function<void(std::string& in)>&& in_cb, const std::function<bool()>&& run_cb, const std::function<void()>&& prestart_cb) { + return std::move(std::async(std::launch::async, [&]( const std::function<void(const std::string& out, const std::string& err)>&& out_cb, const std::function<void(std::string& in)>&& in_cb, const std::function<bool()>&& run_cb) -> bool { prestart_cb(); + return InvokeWithArgs({ "-u", // Unbuffered output "Resources/Scripts/transcribe.py", diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h index 17f5e1d..6366247 100644 --- a/GUI/GUI/GUI/PythonWrapper.h +++ b/GUI/GUI/GUI/PythonWrapper.h @@ -73,6 +73,7 @@ namespace PythonWrapper // app restarts. std::future<bool> StartApp( const AppConfig& config, + wxTextCtrl *out, const std::function<void(const std::string& out, const std::string& err)>&& out_cb, const std::function<void(std::string& in)>&& in_cb = [](std::string&) {}, const std::function<bool()>&& run_cb = []() { return true; }, diff --git a/GUI/GUI/GUI/Transcript.cpp b/GUI/GUI/GUI/Transcript.cpp index 30f1f76..9ef607f 100644 --- a/GUI/GUI/GUI/Transcript.cpp +++ b/GUI/GUI/GUI/Transcript.cpp @@ -5,6 +5,12 @@ void Transcript::Append(std::string&& segment) { segments_.push_back(std::move(segment));
}
+void Transcript::Set(std::string&& segment) {
+ std::scoped_lock l(mu_);
+ segments_.clear();
+ segments_.push_back(std::move(segment));
+}
+
void Transcript::Clear() {
std::scoped_lock l(mu_);
segments_.clear();
diff --git a/GUI/GUI/GUI/Transcript.h b/GUI/GUI/GUI/Transcript.h index 09858b0..fae2bad 100644 --- a/GUI/GUI/GUI/Transcript.h +++ b/GUI/GUI/GUI/Transcript.h @@ -10,6 +10,7 @@ public: Transcript() = default;
void Append(std::string&& segment);
+ void Set(std::string&& segment);
void Clear();
std::vector<std::string> Get();
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index 9711d15..51d7e0a 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -57,6 +57,7 @@ class AudioState: # Segment start time, end time, and text self.ranges_ls = [] self.frames = [] + self.drop_frames_till_i = -1 # Locks access to `text`. self.transcribe_lock = threading.Lock() @@ -131,6 +132,9 @@ def onAudioFramesAvailable( audio_state.CHUNK) if len(audio_state.frames) > max_frames: audio_state.frames = audio_state.frames[-1 * max_frames:] + if audio_state.drop_frames_till_i > 0: + audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1] + audio_state.drop_frames_till_i = -1 return (frames, pyaudio.paContinue) @@ -227,6 +231,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st audio, beam_size = 5, language = audio_state.language, + temperature = [0.0], vad_filter = True, condition_on_previous_text = True, without_timestamps = False) @@ -244,17 +249,20 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st for segment in ranges: first_segments.append(segment) break - if len(first_segments) >= 3: - c0 = first_segments[-3] + if len(first_segments) >= 5: + # Hack: require convergence across many frames to give the + # algorithm a longer buffer to work with. + c0 = first_segments[-1] c1 = first_segments[-2] - c2 = first_segments[-1] + c2 = first_segments[-3] + c3 = first_segments[-4] #print(f"c0: {c0}, c1: {c1}, c2: {c2}") - if c0 == c1 and c1 == c2: + if c0 == c1 and c1 == c2 and c2 == c3: # For simplicity, completely reset saved audio ranges. audio_state.ranges_ls = [] - committed_text = c2[2] + committed_text = c0[2] n_frames_to_drop = int(ceil(audio_state.RATE * c0[1])) - del audio_state.frames[0:n_frames_to_drop] + audio_state.drop_frames_till_i = n_frames_to_drop preview_text = "" for seg in ranges: |
