Rework input controls

Press the joystick once to start recording and again to stop. When you start recording, any previous text on the board is cleared. Add two visual indicators: one to indicate speech, and another to indicate that audio is paging.
author: yum <yum.food.vr@gmail.com> 2022-11-22 15:36:19 -0800
committer: yum <yum.food.vr@gmail.com> 2022-11-22 18:13:18 -0800
commit: bd8b63a357bb374f5875f0fedf2d677589419810 (patch)
tree: 0bb804459ebb7ad50e2a817bd842afd946339b30 /transcribe.py
parent: 27a67666c320bed3b4a18e415eb9702b03f8f0b5 (diff)
1 files changed, 31 insertions, 24 deletions
diff --git a/transcribe.py b/transcribe.py
index 1aabf6f..9290bdc 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -70,9 +70,7 @@ class AudioState:
     # this to whatever they want.
     language = whisper.tokenizer.TO_LANGUAGE_CODE["english"]
 
-    # When the user says `over`, we stop displaying new transcriptions until
-    # they clear the board again.
-    display_paused = False
+    audio_paused = False
 
     osc_client = osc_ctrl.getClient()
 
@@ -121,6 +119,10 @@ def recordAudio(audio_state):
     while audio_state.run_app:
         data = audio_state.stream.read(audio_state.CHUNK)
 
+        if audio_state.audio_paused:
+            time.sleep(0.1)
+            continue
+
         audio_state.lock.acquire()
         audio_state.frames.append(data)
         max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
@@ -199,7 +201,8 @@ def transcribe(audio_state, model, filename):
 
     result = None
     #for temp in (0.00, 0.05, 0.10, 0.15, 0.20):
-    for temp in (0.00, 0.05):
+    #for temp in (0.00, 0.05):
+    for temp in (0.00,):
         print("temp: {}".format(temp))
         options = whisper.DecodingOptions(language = audio_state.language,
                 beam_size = 5, temperature = temp)
@@ -256,11 +259,6 @@ def transcribeAudio(audio_state, model):
 
         words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split()
 
-        if len(words) > 0:
-            if words[-1] == "over":
-                words = words[0:-1]
-                audio_state.display_paused = True
-
         print("Transcription: {}".format(audio_state.text))
 
         old_text = audio_state.text
@@ -280,14 +278,12 @@ def transcribeAudio(audio_state, model):
 
 def sendAudio(audio_state):
     while audio_state.run_app == True:
-        if audio_state.display_paused:
-            time.sleep(0.1)
-            continue
-
         audio_state.lock.acquire()
 
         text = audio_state.committed_text + " " + audio_state.text
-        osc_ctrl.sendMessageLazy(audio_state.osc_client, text, audio_state.tx_state)
+        is_paging = not osc_ctrl.sendMessageLazy(audio_state.osc_client, text,
+                audio_state.tx_state)
+        osc_ctrl.indicatePaging(audio_state.osc_client, is_paging)
         audio_state.lock.release()
 
         # Pace this out
@@ -295,19 +291,31 @@ def sendAudio(audio_state):
 
 def readControllerInput(audio_state):
     session = steamvr.SessionState()
+    RECORD_STATE = 0
+    PAUSE_STATE = 1
+    state = PAUSE_STATE
     while audio_state.run_app == True:
         time.sleep(0.05)
 
         event = steamvr.pollButtonPress(session)
 
         if event == steamvr.EVENT_RISING_EDGE:
-            print("event get")
-            audio_state.lock.acquire()
-            resetAudioLocked(audio_state)
-            resetDisplayLocked(audio_state)
-            audio_state.drop_transcription = True
-            audio_state.display_paused = False
-            audio_state.lock.release()
+            if state == RECORD_STATE:
+                state = PAUSE_STATE
+                osc_ctrl.indicateSpeech(audio_state.osc_client, False)
+
+                audio_state.audio_paused = True
+            elif state == PAUSE_STATE:
+                state = RECORD_STATE
+                osc_ctrl.indicateSpeech(audio_state.osc_client, True)
+
+                audio_state.lock.acquire()
+                resetAudioLocked(audio_state)
+                resetDisplayLocked(audio_state)
+                audio_state.drop_transcription = True
+                audio_state.audio_paused = False
+                audio_state.lock.release()
+
 
 def transcribeLoop(mic: str, language: str):
     audio_state = getMicStream(mic)
@@ -337,14 +345,13 @@ def transcribeLoop(mic: str, language: str):
     controller_input_thd.daemon = True
     controller_input_thd.start()
 
-    print("Press enter or say 'Clear' to start a new message. Say 'Over' to " +
-            "pause the display (saying 'Clear' resets it again).")
+    print("Press enter to start a new message.")
     for line in sys.stdin:
         audio_state.lock.acquire()
         resetAudioLocked(audio_state)
         resetDisplayLocked(audio_state)
         audio_state.drop_transcription = True
-        audio_state.display_paused = False
+        audio_state.audio_paused = False
         audio_state.lock.release()
         if "exit" in line or "quit" in line:
             break
author	yum <yum.food.vr@gmail.com>	2022-11-22 15:36:19 -0800
committer	yum <yum.food.vr@gmail.com>	2022-11-22 18:13:18 -0800
commit	bd8b63a357bb374f5875f0fedf2d677589419810 (patch)
tree	0bb804459ebb7ad50e2a817bd842afd946339b30 /transcribe.py
parent	27a67666c320bed3b4a18e415eb9702b03f8f0b5 (diff)