summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-06-26 00:58:58 -0700
committeryum <yum.food.vr@gmail.com>2023-06-26 01:46:42 -0700
commit0ed379f2c99ac5c126a6f101965ef1eaa58c017b (patch)
tree057ea9d58991ab71cd4f6eb4dfb33693f8914e03
parente35abb6b9a04c6bebd6875901dbf0671f5cd860d (diff)
Add browser source, hardcoded to port 8097
Transcription output now streams to localhost:8097.

In OBS:
* Create a browser source.
* url: localhost:8097
* width: 2200
* height: 400

TODO:
* Put behind toggle.
* Create input field for port.

Misc cleanup:
* transcribe.py: Drop frames from the audio capture thread instead of the transcription thread. Doing it the other way would result in occasional data loss.
-rw-r--r--BrowserSource/index.html7
-rw-r--r--GUI/GUI/GUI/BrowserSource.cpp2
-rw-r--r--GUI/GUI/GUI/Frame.cpp38
-rw-r--r--GUI/GUI/GUI/Frame.h3
-rw-r--r--GUI/GUI/GUI/PythonWrapper.cpp3
-rw-r--r--GUI/GUI/GUI/PythonWrapper.h1
-rw-r--r--GUI/GUI/GUI/Transcript.cpp6
-rw-r--r--GUI/GUI/GUI/Transcript.h1
-rw-r--r--Scripts/transcribe.py20
9 files changed, 70 insertions, 11 deletions
diff --git a/BrowserSource/index.html b/BrowserSource/index.html
index 253b9ef..d3ab777 100644
--- a/BrowserSource/index.html
+++ b/BrowserSource/index.html
@@ -10,11 +10,12 @@
body {
font-family: 'Noto Sans Mono', monospace;
font-size: 96px;
- font-weight: 700;
+ font-weight: 900; /* heaviest valid weight; CSS only accepts 1-1000, so 3200 would be discarded as invalid */
+ color: #000;
}
#transcript {
- color: #fff;
- -webkit-text-stroke: 1.0px #000;
+ color: #89CFF0;
+ -webkit-text-stroke: 3.0px #000;
}
</style>
<body>
diff --git a/GUI/GUI/GUI/BrowserSource.cpp b/GUI/GUI/GUI/BrowserSource.cpp
index c43f1a0..62e3e43 100644
--- a/GUI/GUI/GUI/BrowserSource.cpp
+++ b/GUI/GUI/GUI/BrowserSource.cpp
@@ -61,6 +61,8 @@ void BrowserSource::Run(volatile bool* run)
resp_oss << "}";
payload = resp_oss.str();
type = WebServer::JSON;
+
+ //Log(out_, "Serving transcript to port {}: {}\n", port_, transcript_oss.str());
});
if (!ws.Run(run)) {
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index 6bce67e..d39222b 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -1,3 +1,4 @@
+#include "BrowserSource.h"
#include "Frame.h"
#include "Logging.h"
#include "PythonWrapper.h"
@@ -6,6 +7,7 @@
#include <filesystem>
#include <fstream>
+#include <regex>
#include <string>
#include <vector>
#include <wx/filepicker.h>
@@ -553,6 +555,11 @@ Frame::Frame()
}
{
auto p = std::promise<bool>();
+ obs_app_ = p.get_future();
+ p.set_value(true);
+ }
+ {
+ auto p = std::promise<bool>();
unity_app_ = p.get_future();
p.set_value(true);
}
@@ -1930,6 +1937,12 @@ void Frame::OnAppStart(wxCommandEvent& event) {
return;
}
+ status = obs_app_.wait_for(std::chrono::seconds(0));
+ if (status != std::future_status::ready) {
+ Log(transcribe_out_, "Transcription engine (OBS server) already running\n");
+ return;
+ }
+
Log(transcribe_out_, "Launching transcription engine\n");
int which_mic = py_app_mic_->GetSelection();
@@ -2043,6 +2056,16 @@ void Frame::OnAppStart(wxCommandEvent& event) {
auto out_cb = [&](const std::string& out, const std::string& err) {
Log(transcribe_out_, "{}", out);
Log(transcribe_out_, "{}", err);
+
+ std::regex pattern("^Transcription \\(([0-9]*\\.[0-9]+) seconds\\):");
+ if (std::regex_search(out, pattern)) {
+ std::string filtered_transcript = std::regex_replace(out, pattern, "");
+ filtered_transcript.erase(std::remove_if(filtered_transcript.begin(), filtered_transcript.end(), [](char c) {
+ return c == '\n' || c == '\r';
+ }), filtered_transcript.end());
+ Log(transcribe_out_, "Got transcription line! Transcript: \"{}\"", filtered_transcript);
+ transcript_.Set(std::move(filtered_transcript));
+ }
};
auto in_cb = [&](std::string& in) {};
auto run_cb = [&]() {
@@ -2052,8 +2075,17 @@ void Frame::OnAppStart(wxCommandEvent& event) {
auto prestart_cb = [this]() -> void {
EnsureVirtualEnv(/*block=*/true);
};
- py_app_ = std::move(PythonWrapper::StartApp(*app_c_, std::move(out_cb),
- std::move(in_cb), std::move(run_cb), std::move(prestart_cb)));
+
+ // TODO(yum) parameterize port
+ obs_app_ = std::async(std::launch::async,
+ [&]() -> bool {
+ BrowserSource browser_src(8097, transcribe_out_, &transcript_);
+ browser_src.Run(&run_py_app_);
+ return true;
+ });
+ py_app_ = std::move(PythonWrapper::StartApp(*app_c_, transcribe_out_,
+ std::move(out_cb), std::move(in_cb), std::move(run_cb),
+ std::move(prestart_cb)));
Log(transcribe_out_, "py app valid: {}\n", py_app_.valid());
}
@@ -2065,6 +2097,8 @@ void Frame::OnAppStop() {
}
run_py_app_ = false;
py_app_.wait();
+ obs_app_.wait();
+ transcript_.Clear();
Log(transcribe_out_, "Stopped transcription engine\n");
}
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 94f2a68..904df4f 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -8,6 +8,7 @@
#endif
#include "Config.h"
+#include "Transcript.h"
#include <future>
#include <memory>
@@ -93,6 +94,8 @@ private:
wxCheckBox* whisper_enable_browser_src_;
std::future<bool> py_app_;
+ std::future<bool> obs_app_;
+ Transcript transcript_;
bool run_py_app_;
std::future<bool> unity_app_;
std::future<bool> dump_mics_;
diff --git a/GUI/GUI/GUI/PythonWrapper.cpp b/GUI/GUI/GUI/PythonWrapper.cpp
index a061e34..037d961 100644
--- a/GUI/GUI/GUI/PythonWrapper.cpp
+++ b/GUI/GUI/GUI/PythonWrapper.cpp
@@ -448,16 +448,19 @@ bool PythonWrapper::InstallPip(
std::future<bool> PythonWrapper::StartApp(
const AppConfig& config,
+ wxTextCtrl *out,
const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
const std::function<void(std::string& in)>&& in_cb,
const std::function<bool()>&& run_cb,
const std::function<void()>&& prestart_cb) {
+
return std::move(std::async(std::launch::async,
[&](
const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
const std::function<void(std::string& in)>&& in_cb,
const std::function<bool()>&& run_cb) -> bool {
prestart_cb();
+
return InvokeWithArgs({
"-u", // Unbuffered output
"Resources/Scripts/transcribe.py",
diff --git a/GUI/GUI/GUI/PythonWrapper.h b/GUI/GUI/GUI/PythonWrapper.h
index 17f5e1d..6366247 100644
--- a/GUI/GUI/GUI/PythonWrapper.h
+++ b/GUI/GUI/GUI/PythonWrapper.h
@@ -73,6 +73,7 @@ namespace PythonWrapper
// app restarts.
std::future<bool> StartApp(
const AppConfig& config,
+ wxTextCtrl *out,
const std::function<void(const std::string& out, const std::string& err)>&& out_cb,
const std::function<void(std::string& in)>&& in_cb = [](std::string&) {},
const std::function<bool()>&& run_cb = []() { return true; },
diff --git a/GUI/GUI/GUI/Transcript.cpp b/GUI/GUI/GUI/Transcript.cpp
index 30f1f76..9ef607f 100644
--- a/GUI/GUI/GUI/Transcript.cpp
+++ b/GUI/GUI/GUI/Transcript.cpp
@@ -5,6 +5,12 @@ void Transcript::Append(std::string&& segment) {
segments_.push_back(std::move(segment));
}
+void Transcript::Set(std::string&& segment) {  // Replaces the stored segments with this one segment.
+ std::scoped_lock l(mu_);  // Holds mu_ across both the clear and the push below.
+ segments_.clear();  // Discard everything stored so far.
+ segments_.push_back(std::move(segment));  // `segment` is moved from; afterwards it is the only entry.
+}
+
void Transcript::Clear() {
std::scoped_lock l(mu_);
segments_.clear();
diff --git a/GUI/GUI/GUI/Transcript.h b/GUI/GUI/GUI/Transcript.h
index 09858b0..fae2bad 100644
--- a/GUI/GUI/GUI/Transcript.h
+++ b/GUI/GUI/GUI/Transcript.h
@@ -10,6 +10,7 @@ public:
Transcript() = default;
void Append(std::string&& segment);
+ void Set(std::string&& segment);
void Clear();
std::vector<std::string> Get();
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 9711d15..51d7e0a 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -57,6 +57,7 @@ class AudioState:
# Segment start time, end time, and text
self.ranges_ls = []
self.frames = []
+ self.drop_frames_till_i = -1
# Locks access to `text`.
self.transcribe_lock = threading.Lock()
@@ -131,6 +132,9 @@ def onAudioFramesAvailable(
audio_state.CHUNK)
if len(audio_state.frames) > max_frames:
audio_state.frames = audio_state.frames[-1 * max_frames:]
+ if audio_state.drop_frames_till_i > 0:
+ audio_state.frames = audio_state.frames[audio_state.drop_frames_till_i:-1]
+ audio_state.drop_frames_till_i = -1
return (frames, pyaudio.paContinue)
@@ -227,6 +231,7 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
audio,
beam_size = 5,
language = audio_state.language,
+ temperature = [0.0],
vad_filter = True,
condition_on_previous_text = True,
without_timestamps = False)
@@ -244,17 +249,20 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
for segment in ranges:
first_segments.append(segment)
break
- if len(first_segments) >= 3:
- c0 = first_segments[-3]
+ if len(first_segments) >= 5:
+ # Hack: require convergence across many frames to give the
+ # algorithm a longer buffer to work with.
+ c0 = first_segments[-1]
c1 = first_segments[-2]
- c2 = first_segments[-1]
+ c2 = first_segments[-3]
+ c3 = first_segments[-4]
#print(f"c0: {c0}, c1: {c1}, c2: {c2}")
- if c0 == c1 and c1 == c2:
+ if c0 == c1 and c1 == c2 and c2 == c3:
# For simplicity, completely reset saved audio ranges.
audio_state.ranges_ls = []
- committed_text = c2[2]
+ committed_text = c0[2]
n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
- del audio_state.frames[0:n_frames_to_drop]
+ audio_state.drop_frames_till_i = n_frames_to_drop
preview_text = ""
for seg in ranges: