Add ability to preserve transcript while using push to talk

This is useful when streaming. Occasionally the STT can get into a bad state, and manually segmenting clears it up. However doing so would clear your accumulated transcript, which isn't always desired. Add ability to preserve the transcript. A small wrinkle: the new commit logic requires N consecutive identical windows before committing. To make this feature play nicely with it, I had to forcibly commit any preview text that hasn't yet been committed. Failing to do this would usually cause short utterances / the most recently said stuff to get wiped out.
author: yum <yum.food.vr@gmail.com> 2023-06-27 15:16:06 -0700
committer: yum <yum.food.vr@gmail.com> 2023-06-27 15:16:06 -0700
commit: cf75998dab6db1b1d21ca06bde18a56b5e896937 (patch)
tree: 6ad77c5e32ad8f06b3a0ff507d1efa9f84c8b2a6 /Scripts/transcribe.py
parent: 8ff761153b7de23a9556f2af179ab3bf4b9849a5 (diff)
1 files changed, 36 insertions, 15 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 8fe6190..694fd0b 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -49,9 +49,15 @@ class AudioState:
         # PyAudio stream object
         self.stream = None
 
-        self.committed_text = ""
+        self.preview_text = ""
         self.text = ""
         self.filtered_text = ""
+
+        # If set to true, then the transcript strings (`text` and friends) will
+        # be reset whenever transcription is toggled on. At time of writing,
+        # this only applies to keyboard controls.
+        self.reset_on_toggle = True
+
         # List of:
         #   List of tuples of:
         #     Segment start time, end time, and text
@@ -198,20 +204,15 @@ def resetAudioLocked(audio_state):
     audio_state.transcribe_sleep_duration = \
             audio_state.transcribe_sleep_duration_min_s
 
-    audio_state.text = ""
-    audio_state.preview_text = ""
-    audio_state.filtered_text = ""
+    if audio_state.reset_on_toggle:
+        print("resetAudioLocked resetting text")
+        audio_state.text = ""
+        audio_state.preview_text = ""
+        audio_state.filtered_text = ""
 
 def resetDisplayLocked(audio_state):
     osc_ctrl.clear(audio_state.osc_state)
 
-def resetAudio(audio_state):
-    audio_state.transcribe_lock.acquire()
-    audio_state.audio_lock.acquire()
-    resetAudioLocked(audio_state)
-    audio_state.audio_lock.release()
-    audio_state.transcribe_lock.release()
-
 # Transcribe the audio recorded in a file.
 # Returns two strings: committed text, and preview text.
 # Committed text is temporally stable. Preview text is *not* temporally stable,
@@ -422,7 +423,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
             #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
 
-            audio_state.drop_transcription = True
+            if audio_state.reset_on_toggle:
+                print("Toggle detected, dropping transcript (-2)")
+                audio_state.drop_transcription = True
+            else:
+                print("Toggle detected, committing preview text (2)")
+                audio_state.text += audio_state.preview_text
             audio_state.audio_paused = True
             resetAudioLocked(audio_state)
             resetDisplayLocked(audio_state)
@@ -448,7 +454,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
                 osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
                 osc_ctrl.lockWorld(audio_state.osc_state.client, False)
-            audio_state.drop_transcription = True
+            if audio_state.reset_on_toggle:
+                print("Toggle detected, dropping transcript (2)")
+                audio_state.drop_transcription = True
+            else:
+                print("Toggle detected, committing preview text (2)")
+                audio_state.text += audio_state.preview_text
             audio_state.audio_paused = False
 
             resetAudioLocked(audio_state)
@@ -585,11 +596,13 @@ def transcribeLoop(mic: str,
         estate: EmotesState,
         window_duration_s: int,
         gpu_idx: int,
-        keyboard_hotkey: str):
+        keyboard_hotkey: str,
+        reset_on_toggle: bool):
     audio_state = getMicStream(mic)
     audio_state.whisper_language = language
     audio_state.language = langcodes.find(language).language
     audio_state.MAX_LENGTH_S = window_duration_s
+    audio_state.reset_on_toggle = reset_on_toggle
 
     lang_bits = language_target.split(" | ")
     if len(lang_bits) == 2:
@@ -748,6 +761,7 @@ if __name__ == "__main__":
     parser.add_argument("--emotes_pickle", type=str, help="The path to emotes pickle. See emotes_v2.py for details.")
     parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
     parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
+    parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
     args = parser.parse_args()
 
     if not args.mic:
@@ -796,6 +810,11 @@ if __name__ == "__main__":
     else:
         args.cpu = False
 
+    if args.reset_on_toggle == 1:
+        args.reset_on_toggle = True
+    else:
+        args.reset_on_toggle = False
+
     if args.use_builtin == 1:
         args.use_builtin = True
     else:
@@ -844,5 +863,7 @@ if __name__ == "__main__":
             args.enable_lowercase_filter,
             args.button,
             estate, window_duration_s,
-            args.gpu_idx, args.keybind)
+            args.gpu_idx,
+            args.keybind,
+            args.reset_on_toggle)
author	yum <yum.food.vr@gmail.com>	2023-06-27 15:16:06 -0700
committer	yum <yum.food.vr@gmail.com>	2023-06-27 15:16:06 -0700
commit	cf75998dab6db1b1d21ca06bde18a56b5e896937 (patch)
tree	6ad77c5e32ad8f06b3a0ff507d1efa9f84c8b2a6 /Scripts/transcribe.py
parent	8ff761153b7de23a9556f2af179ab3bf4b9849a5 (diff)