summaryrefslogtreecommitdiffstats
path: root/Scripts/transcribe.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-06-27 15:16:06 -0700
committeryum <yum.food.vr@gmail.com>2023-06-27 15:16:06 -0700
commitcf75998dab6db1b1d21ca06bde18a56b5e896937 (patch)
tree6ad77c5e32ad8f06b3a0ff507d1efa9f84c8b2a6 /Scripts/transcribe.py
parent8ff761153b7de23a9556f2af179ab3bf4b9849a5 (diff)
Add ability to preserve transcript while using push to talk
This is useful when streaming. Occasionally the STT can get into a bad state, and manually segmenting clears it up. However, doing so would also clear your accumulated transcript, which isn't always desired, so add the ability to preserve the transcript. A small wrinkle: the new commit logic requires N consecutive identical windows before committing. To make this feature play nicely with it, I had to forcibly commit any preview text that hasn't yet been committed when transcription is toggled. Failing to do this would usually cause short utterances and the most recently spoken words to get wiped out.
Diffstat (limited to 'Scripts/transcribe.py')
-rw-r--r--Scripts/transcribe.py51
1 files changed, 36 insertions, 15 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index 8fe6190..694fd0b 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -49,9 +49,15 @@ class AudioState:
# PyAudio stream object
self.stream = None
- self.committed_text = ""
+ self.preview_text = ""
self.text = ""
self.filtered_text = ""
+
+ # If set to true, then the transcript strings (`text` and friends) will
+ # be reset whenever transcription is toggled on. At time of writing,
+ # this only applies to keyboard controls.
+ self.reset_on_toggle = True
+
# List of:
# List of tuples of:
# Segment start time, end time, and text
@@ -198,20 +204,15 @@ def resetAudioLocked(audio_state):
audio_state.transcribe_sleep_duration = \
audio_state.transcribe_sleep_duration_min_s
- audio_state.text = ""
- audio_state.preview_text = ""
- audio_state.filtered_text = ""
+ if audio_state.reset_on_toggle:
+ print("resetAudioLocked resetting text")
+ audio_state.text = ""
+ audio_state.preview_text = ""
+ audio_state.filtered_text = ""
def resetDisplayLocked(audio_state):
osc_ctrl.clear(audio_state.osc_state)
-def resetAudio(audio_state):
- audio_state.transcribe_lock.acquire()
- audio_state.audio_lock.acquire()
- resetAudioLocked(audio_state)
- audio_state.audio_lock.release()
- audio_state.transcribe_lock.release()
-
# Transcribe the audio recorded in a file.
# Returns two strings: committed text, and preview text.
# Committed text is temporally stable. Preview text is *not* temporally stable,
@@ -422,7 +423,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
osc_ctrl.toggleBoard(audio_state.osc_state.client, False)
#playsound(os.path.abspath("../Sounds/Noise_Off_Quiet.wav"))
- audio_state.drop_transcription = True
+ if audio_state.reset_on_toggle:
+ print("Toggle detected, dropping transcript (-2)")
+ audio_state.drop_transcription = True
+ else:
+ print("Toggle detected, committing preview text (2)")
+ audio_state.text += audio_state.preview_text
audio_state.audio_paused = True
resetAudioLocked(audio_state)
resetDisplayLocked(audio_state)
@@ -448,7 +454,12 @@ def readKeyboardInput(audio_state, enable_local_beep: bool,
osc_ctrl.indicateSpeech(audio_state.osc_state.client, True)
osc_ctrl.toggleBoard(audio_state.osc_state.client, True)
osc_ctrl.lockWorld(audio_state.osc_state.client, False)
- audio_state.drop_transcription = True
+ if audio_state.reset_on_toggle:
+ print("Toggle detected, dropping transcript (2)")
+ audio_state.drop_transcription = True
+ else:
+ print("Toggle detected, committing preview text (2)")
+ audio_state.text += audio_state.preview_text
audio_state.audio_paused = False
resetAudioLocked(audio_state)
@@ -585,11 +596,13 @@ def transcribeLoop(mic: str,
estate: EmotesState,
window_duration_s: int,
gpu_idx: int,
- keyboard_hotkey: str):
+ keyboard_hotkey: str,
+ reset_on_toggle: bool):
audio_state = getMicStream(mic)
audio_state.whisper_language = language
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
+ audio_state.reset_on_toggle = reset_on_toggle
lang_bits = language_target.split(" | ")
if len(lang_bits) == 2:
@@ -748,6 +761,7 @@ if __name__ == "__main__":
parser.add_argument("--emotes_pickle", type=str, help="The path to emotes pickle. See emotes_v2.py for details.")
parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
+ parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
args = parser.parse_args()
if not args.mic:
@@ -796,6 +810,11 @@ if __name__ == "__main__":
else:
args.cpu = False
+ if args.reset_on_toggle == 1:
+ args.reset_on_toggle = True
+ else:
+ args.reset_on_toggle = False
+
if args.use_builtin == 1:
args.use_builtin = True
else:
@@ -844,5 +863,7 @@ if __name__ == "__main__":
args.enable_lowercase_filter,
args.button,
estate, window_duration_s,
- args.gpu_idx, args.keybind)
+ args.gpu_idx,
+ args.keybind,
+ args.reset_on_toggle)