From ff7eb3c212195af71cd0ce4a3cd0c9a081d6ebda Mon Sep 17 00:00:00 2001
From: yum <yum.food.vr@gmail.com>
Date: Wed, 28 Jun 2023 20:35:10 -0700
Subject: Add toggle for debug mode

Most diagnostic output printed during transcription is now suppressed by
default. Users can enable more verbose output by toggling
`Enable debug mode`.

Bugfix: Toggling transcription off no longer resets the audio state.
Previously the reset frequently caused the last few words spoken to be
lost.
---
 Scripts/transcribe.py | 55 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

(limited to 'Scripts/transcribe.py')

diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index cc1944c..28b6ca0 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -38,6 +38,9 @@ class AudioState:
         # This matches the framerate expected by whisper.
         self.RATE = 16000
 
+        # If set, print additional information to stdout while transcribing.
+        self.enable_debug_mode = False
+
         # The maximum length that recordAudio() will put into frames before it
         # starts dropping from the start.
         self.MAX_LENGTH_S = 300
@@ -211,7 +214,8 @@ def resetAudioLocked(audio_state):
             audio_state.transcribe_sleep_duration_min_s
 
     if audio_state.reset_on_toggle:
-        print("resetAudioLocked resetting text")
+        if audio_state.enable_debug_mode:
+            print("resetAudioLocked resetting text")
         audio_state.text = ""
         audio_state.preview_text = ""
         audio_state.filtered_text = ""
@@ -244,7 +248,8 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
             without_timestamps = False)
     ranges = []
     for s in segments:
-        #print(f"Segment: {s}")
+        if audio_state.enable_debug_mode:
+            print(f"Segment: {s}")
         ranges.append((s.start, s.end, s.text))
     audio_state.ranges_ls.append(ranges)
 
@@ -270,13 +275,14 @@ def transcribe(audio_state, model, frames, use_cpu: bool) -> typing.Tuple[str,st
 
             max_edit = audio_state.commit_fuzz_threshold
 
-            #print(f"c0: {c0}, c1: {c1}, c2: {c2}")
-            #if c0 == c1 and c1 == c2 and c2 == c3:
+            if audio_state.enable_debug_mode:
+                print(f"c0: {c0}, c1: {c1}, c2: {c2}")
             if c0_c1_d < max_edit and c1_c2_d < max_edit and c2_c3_d < max_edit:
                 # For simplicity, completely reset saved audio ranges.
                 audio_state.ranges_ls = []
                 committed_text = c0[2]
-                print(f"Dropping frames until {c0[1]}")
+                if audio_state.enable_debug_mode:
+                    print(f"Dropping frames until {c0[1]}")
                 n_frames_to_drop = int(ceil(audio_state.RATE * c0[1]))
                 audio_state.drop_frames_till_i = n_frames_to_drop
 
@@ -320,7 +326,8 @@ def transcribeAudio(audio_state,
 
         text, preview_text = transcribe(audio_state, model, audio_state.frames, use_cpu)
         if len(text) == 0 and len(preview_text) == 0:
-            print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
+            if audio_state.enable_debug_mode:
+                print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
 
@@ -329,7 +336,8 @@ def transcribeAudio(audio_state,
             audio_state.text = ""
             audio_state.preview_text = ""
             audio_state.filtered_text = ""
-            print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
+            if audio_state.enable_debug_mode:
+                print("drop transcription ({} seconds)".format(time.time() - last_transcribe_time))
             last_transcribe_time = time.time()
             continue
 
@@ -436,13 +444,14 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             if not use_builtin:
                 osc_ctrl.indicateSpeech(audio_state.osc_state.client, False)
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
-            #playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
 
             if audio_state.reset_on_toggle:
-                print("Toggle detected, dropping transcript (-2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, dropping transcript (1)")
                 audio_state.drop_transcription = True
             else:
-                print("Toggle detected, committing preview text (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, committing preview text (1)")
                 audio_state.text += audio_state.preview_text
             audio_state.audio_paused = True
             resetAudioLocked(audio_state)
@@ -458,7 +467,6 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
             audio_state.transcribe_sleep_duration = audio_state.transcribe_sleep_duration_min_s
 
             audio_state.audio_paused = True
-            resetAudioLocked(audio_state)
 
             if enable_local_beep == 1:
                 playsound(os.path.abspath("Resources/Sounds/Noise_Off_Quiet.wav"),
@@ -470,10 +478,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
                 osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
                 osc_ctrl.lockWorld(audio_state.osc_state.client, False)
             if audio_state.reset_on_toggle:
-                print("Toggle detected, dropping transcript (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, dropping transcript (2)")
                 audio_state.drop_transcription = True
             else:
-                print("Toggle detected, committing preview text (2)")
+                if audio_state.enable_debug_mode:
+                    print("Toggle detected, committing preview text (2)")
                 audio_state.text += audio_state.preview_text
             audio_state.audio_paused = False
 
@@ -492,7 +502,8 @@ def readControllerInput(audio_state, enable_local_beep: bool,
         try:
             session = steamvr.SessionState()
         except:
-            print("steamvr is off, no controller input")
+            if audio_state.enable_debug_mode:
+                print("steamvr is off, no controller input")
             session = None
             time.sleep(5)
 
@@ -607,6 +618,7 @@ def transcribeLoop(mic: str,
         remove_trailing_period: bool,
         enable_uppercase_filter: bool,
         enable_lowercase_filter: bool,
+        enable_debug_mode: bool,
         button: str,
         estate: EmotesState,
         window_duration_s: int,
@@ -620,6 +632,7 @@ def transcribeLoop(mic: str,
     audio_state.MAX_LENGTH_S = window_duration_s
     audio_state.reset_on_toggle = reset_on_toggle
     audio_state.commit_fuzz_threshold = commit_fuzz_threshold
+    audio_state.enable_debug_mode = enable_debug_mode
 
     lang_bits = language_target.split(" | ")
     if len(lang_bits) == 2:
@@ -721,7 +734,6 @@ def transcribeLoop(mic: str,
     keyboard_input_thd.daemon = True
     keyboard_input_thd.start()
 
-    print("Press enter to start a new message.")
     for line in sys.stdin:
         audio_state.transcribe_lock.acquire()
         audio_state.audio_lock.acquire()
@@ -745,12 +757,6 @@ if __name__ == "__main__":
 
     print("args: {}".format(" ".join(sys.argv)))
 
-    # Set cwd to TaSTT/
-    abspath = os.path.abspath(__file__)
-    dname = os.path.dirname(abspath)
-    dname = os.path.dirname(dname)
-    dname = os.path.dirname(dname)
-    #os.chdir(dname)
     print(f"Set cwd to {os.getcwd()}")
 
     parser = argparse.ArgumentParser()
@@ -780,6 +786,7 @@ if __name__ == "__main__":
     parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
     parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
     parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
+    parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.")
     args = parser.parse_args()
 
     if not args.mic:
@@ -863,6 +870,11 @@ if __name__ == "__main__":
     else:
         args.enable_lowercase_filter = False
 
+    if args.enable_debug_mode == 1:
+        args.enable_debug_mode = True
+    else:
+        args.enable_debug_mode = False
+
     estate = EmotesState()
     estate.load(args.emotes_pickle)
 
@@ -884,6 +896,7 @@ if __name__ == "__main__":
             args.remove_trailing_period,
             args.enable_uppercase_filter,
             args.enable_lowercase_filter,
+            args.enable_debug_mode,
             args.button,
             estate, window_duration_s,
             args.gpu_idx,
-- 
cgit v1.2.3