Add UI for max speech duration

Also fix a bug that occurred when previews are disabled: the audio buffer no longer grows without bound while there is no speech.
author: yum <yum.food.vr@gmail.com> 2023-09-09 22:26:09 -0700
committer: yum <yum.food.vr@gmail.com> 2023-09-09 22:32:35 -0700
commit: ae866f553d3db67030e37ce315707d72982f4063 (patch)
tree: 8b8977cad5ff9c443a86868efcad3d5e4fb612ac
parent: 286dcae5e087db817f3350cf442145107b25bc9c (diff)
5 files changed, 45 insertions, 12 deletions
diff --git a/GUI/GUI/GUI/Config.cpp b/GUI/GUI/GUI/Config.cpp
index 1fc1aee..dd3d55c 100644
--- a/GUI/GUI/GUI/Config.cpp
+++ b/GUI/GUI/GUI/Config.cpp
@@ -86,6 +86,7 @@ AppConfig::AppConfig(wxTextCtrl* out)
 	enable_lock_at_spawn(true),
 	gpu_idx(0),
 	min_silence_duration_ms(250),
+	max_speech_duration_s(5),
 	keybind("ctrl+x"),
 
 	chars_per_sync(8),
@@ -129,6 +130,7 @@ bool AppConfig::Serialize(const std::filesystem::path& path) {
 	cm.Set("enable_lock_at_spawn", enable_lock_at_spawn);
 	cm.Set("gpu_idx", gpu_idx);
 	cm.Set("min_silence_duration_ms", min_silence_duration_ms);
+	cm.Set("max_speech_duration_s", max_speech_duration_s);
 	cm.Set("keybind", keybind);
 
 	cm.Set("chars_per_sync", chars_per_sync);
@@ -185,6 +187,7 @@ bool AppConfig::Deserialize(const std::filesystem::path& path) {
 	cm.Get("enable_lock_at_spawn", c.enable_lock_at_spawn);
 	cm.Get("gpu_idx", c.gpu_idx);
 	cm.Get("min_silence_duration_ms", c.min_silence_duration_ms);
+	cm.Get("max_speech_duration_s", c.max_speech_duration_s);
 	cm.Get("keybind", c.keybind);
 
 	cm.Get("chars_per_sync", c.chars_per_sync);
diff --git a/GUI/GUI/GUI/Config.h b/GUI/GUI/GUI/Config.h
index 808cf9e..a366090 100644
--- a/GUI/GUI/GUI/Config.h
+++ b/GUI/GUI/GUI/Config.h
@@ -72,6 +72,7 @@ public:
 	bool enable_lock_at_spawn;
 	int gpu_idx;
 	int min_silence_duration_ms;
+	int max_speech_duration_s;
 	std::string keybind;
 
 	// Unity and transcription shared settings.
diff --git a/GUI/GUI/GUI/Frame.cpp b/GUI/GUI/GUI/Frame.cpp
index d110a0c..f2fb140 100644
--- a/GUI/GUI/GUI/Frame.cpp
+++ b/GUI/GUI/GUI/Frame.cpp
@@ -76,6 +76,7 @@ namespace {
         ID_PY_APP_COLS,
         ID_PY_APP_GPU_IDX,
         ID_PY_APP_MIN_SILENCE_DURATION_MS,
+        ID_PY_APP_MAX_SPEECH_DURATION_S,
         ID_PY_APP_KEYBIND,
         ID_PY_APP_BROWSER_SRC_PORT,
         ID_PY_APP_COMMIT_FUZZ_THRESHOLD,
@@ -763,6 +764,17 @@ Frame::Frame()
                         "used to segment speech.");
 					py_app_min_silence_duration_ms_ = py_app_min_silence_duration_ms;
 
+					auto* py_app_max_speech_duration_s = new wxTextCtrl(
+                        py_app_config_panel_pairs, ID_PY_APP_MAX_SPEECH_DURATION_S,
+                        std::to_string(app_c_->max_speech_duration_s), wxDefaultPosition,
+						wxDefaultSize, /*style=*/0);
+                    py_app_max_speech_duration_s->SetToolTip(
+                        "The maximum duration, in seconds, of any segment of "
+                        "speech. Continuous speech longer than this is split "
+                        "at the last pause lasting longer than 100 "
+                        "milliseconds.");
+					py_app_max_speech_duration_s_ = py_app_max_speech_duration_s;
+
 					auto* py_app_keybind = new wxTextCtrl(
 						py_app_config_panel_pairs, ID_PY_APP_KEYBIND,
 						app_c_->keybind, wxDefaultPosition,
@@ -853,6 +865,11 @@ Frame::Frame()
                         /*flags=*/wxEXPAND);
 
                     sizer->Add(new wxStaticText(py_app_config_panel_pairs,
+                        wxID_ANY, /*label=*/"Maximum speech duration (s):"));
+                    sizer->Add(py_app_max_speech_duration_s, /*proportion=*/0,
+                        /*flags=*/wxEXPAND);
+
+                    sizer->Add(new wxStaticText(py_app_config_panel_pairs,
                         wxID_ANY, /*label=*/"Browser source port:"));
                     sizer->Add(py_app_browser_src_port, /*proportion=*/0,
                         /*flags=*/wxEXPAND);
@@ -1550,6 +1567,10 @@ void Frame::ApplyConfigToInputFields()
 	py_app_min_silence_duration_ms->Clear();
 	py_app_min_silence_duration_ms->AppendText(std::to_string(app_c_->min_silence_duration_ms));
 
+	auto* py_app_max_speech_duration_s = static_cast<wxTextCtrl*>(FindWindowById(ID_PY_APP_MAX_SPEECH_DURATION_S));
+	py_app_max_speech_duration_s->Clear();
+	py_app_max_speech_duration_s->AppendText(std::to_string(app_c_->max_speech_duration_s));
+
     auto* py_app_enable_local_beep = static_cast<wxCheckBox*>(FindWindowById(ID_PY_APP_ENABLE_LOCAL_BEEP));
     py_app_enable_local_beep->SetValue(app_c_->enable_local_beep);
 
@@ -2304,6 +2325,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
 	ASSIGN_OR_RETURN_VOID(int, bytes_per_char, stoiInRange(transcribe_out_, kBytesPerChar[bytes_per_char_idx].ToStdString(), "bytes_per_char", 1, 2));
 	ASSIGN_OR_RETURN_VOID(int, gpu_idx, stoiInRange(transcribe_out_, py_app_gpu_idx_->GetValue().ToStdString(), "gpu_idx", 0, 10));
 	ASSIGN_OR_RETURN_VOID(int, min_silence_duration_ms, stoiInRange(transcribe_out_, py_app_min_silence_duration_ms_->GetValue().ToStdString(), "min_silence_duration_ms", 50, 5000));
+	ASSIGN_OR_RETURN_VOID(int, max_speech_duration_s, stoiInRange(transcribe_out_, py_app_max_speech_duration_s_->GetValue().ToStdString(), "max_speech_duration_s", 1, 30));
 	ASSIGN_OR_RETURN_VOID(int, browser_src_port, stoiInRange(transcribe_out_, py_app_browser_src_port_->GetValue().ToStdString(), "browser_src_port", 1024, 65535));
 
     std::string keybind = py_app_keybind_->GetValue().ToStdString();
@@ -2334,6 +2356,7 @@ void Frame::OnAppStart(wxCommandEvent& event) {
     app_c_->enable_lock_at_spawn = enable_lock_at_spawn;
     app_c_->gpu_idx = gpu_idx;
     app_c_->min_silence_duration_ms = min_silence_duration_ms;
+    app_c_->max_speech_duration_s = max_speech_duration_s;
     app_c_->keybind = keybind;
     app_c_->Serialize(AppConfig::kConfigPath);
 
diff --git a/GUI/GUI/GUI/Frame.h b/GUI/GUI/GUI/Frame.h
index 72ba6c4..615726a 100644
--- a/GUI/GUI/GUI/Frame.h
+++ b/GUI/GUI/GUI/Frame.h
@@ -39,6 +39,7 @@ private:
     wxTextCtrl* py_app_cols_;
     wxTextCtrl* py_app_gpu_idx_;
     wxTextCtrl* py_app_min_silence_duration_ms_;
+    wxTextCtrl* py_app_max_speech_duration_s_;
     wxTextCtrl* py_app_keybind_;
     wxTextCtrl* py_app_browser_src_port_;
     wxTextCtrl* py_app_commit_fuzz_threshold_;
diff --git a/Scripts/transcribe_v2.py b/Scripts/transcribe_v2.py
index 81a4bf2..541ff23 100644
--- a/Scripts/transcribe_v2.py
+++ b/Scripts/transcribe_v2.py
@@ -314,9 +314,11 @@ class CompressingAudioCollector(AudioCollectorFilter):
 
 class AudioSegmenter:
     def __init__(self,
-            min_silence_ms=250):
+            min_silence_ms=250,
+            max_speech_s=5):
         self.vad_options = vad.VadOptions(
-                min_silence_duration_ms=min_silence_ms)
+                min_silence_duration_ms=min_silence_ms,
+                max_speech_duration_s=max_speech_s)
         pass
 
     def segmentAudio(self, audio: bytes):
@@ -332,6 +334,7 @@ class AudioSegmenter:
 
         last_end = None
         segments = self.segmentAudio(audio)
+
         for i in range(len(segments)):
             s = segments[i]
             #print(f"s: {s}")
@@ -349,7 +352,8 @@ class AudioSegmenter:
                 now = int(len(audio) / AudioStream.FRAME_SZ)
                 #print(f"now: {now}")
                 #print(f"min d: {min_delta_frames}")
-                if now - s['end'] > min_delta_frames:
+                delta_frames = now - s['end']
+                if delta_frames > min_delta_frames:
                     cutoff = now - int(min_delta_frames / 2)
 
         return (cutoff, len(segments) > 0)
@@ -480,7 +484,7 @@ class VadCommitter:
         delta = ""
         commit_audio = None
         latency_s = None
-        if stable_cutoff:
+        if has_audio and stable_cutoff:
             #print(f"stable cutoff get: {stable_cutoff}", file=sys.stderr)
             latency_s = self.collector.now() - self.collector.begin()
             commit_audio = self.collector.dropAudioPrefixByFrames(stable_cutoff)
@@ -497,13 +501,13 @@ class VadCommitter:
             #saveAudio(commit_audio, filename)
 
         preview = ""
-        if self.cfg["enable_previews"]:
-            if has_audio:
-                segments = self.whisper.transcribe(audio)
-                preview = "".join(s.transcript for s in segments)
-            else:
-                #print("VAD detects no audio, skip transcription", file=sys.stderr)
-                self.collector.keepLast(1.0)
+        if self.cfg["enable_previews"] and has_audio:
+            segments = self.whisper.transcribe(audio)
+            preview = "".join(s.transcript for s in segments)
+
+        if not has_audio:
+            #print("VAD detects no audio, skip transcription", file=sys.stderr)
+            self.collector.keepLast(1.0)
 
         return TranscriptCommit(
                 delta,
@@ -907,7 +911,8 @@ def run(cfg):
     #collector = NormalizingAudioCollector(collector)
     collector = CompressingAudioCollector(collector)
     whisper = Whisper(collector, cfg)
-    segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"])
+    segmenter = AudioSegmenter(min_silence_ms=cfg["min_silence_duration_ms"],
+            max_speech_s=cfg["max_speech_duration_s"])
     committer = VadCommitter(cfg, collector, whisper, segmenter)
     pager = OscPager(cfg)
author	yum <yum.food.vr@gmail.com>	2023-09-09 22:26:09 -0700
committer	yum <yum.food.vr@gmail.com>	2023-09-09 22:32:35 -0700
commit	ae866f553d3db67030e37ce315707d72982f4063 (patch)
tree	8b8977cad5ff9c443a86868efcad3d5e4fb612ac
parent	286dcae5e087db817f3350cf442145107b25bc9c (diff)