diff options
| author | yum <yum.food.vr@gmail.com> | 2022-12-27 16:07:10 -0800 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2022-12-27 16:07:10 -0800 |
| commit | f48ae0fffcd06f3cddd6cfc99b4c3d3a18c20038 (patch) | |
| tree | 16c22867f2d7c51ac50efd74f042c615f8f35d0e /Scripts/transcribe.py | |
| parent | 3659518cb0ba5e8298d13215441a18ad8b275465 (diff) | |
Encapsulate paging & text wrapping logic
Define proper interfaces for paging and text wrapping. Simplify osc_ctrl,
temporarily dropping support for emotes (they were broken anyway).
* Bugfix: Japanese text no longer crashes transcribe.py, but it still doesn't
show up in the wxTextCtrl.
Diffstat (limited to 'Scripts/transcribe.py')
| -rw-r--r-- | Scripts/transcribe.py | 108 |
1 file changed, 55 insertions, 53 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py index ba7d8fe..2729331 100644 --- a/Scripts/transcribe.py +++ b/Scripts/transcribe.py @@ -26,57 +26,58 @@ import wave import whisper class AudioState: - CHUNK = 1024 - FORMAT = pyaudio.paInt16 - CHANNELS = 1 - # This matches the framerate expected by whisper. - RATE = 16000 + def __init__(self): + self.CHUNK = 1024 + self.FORMAT = pyaudio.paInt16 + self.CHANNELS = 1 + # This matches the framerate expected by whisper. + self.RATE = 16000 - # The maximum length that recordAudio() will put into frames before it - # starts dropping from the start. - MAX_LENGTH_S = 10 - MAX_LENGTH_S_WHISPER = 30 - # The minimum length that recordAudio() will wait for before saving audio. - MIN_LENGTH_S = 1 + # The maximum length that recordAudio() will put into frames before it + # starts dropping from the start. + self.MAX_LENGTH_S = 10 + self.MAX_LENGTH_S_WHISPER = 30 + # The minimum length that recordAudio() will wait for before saving audio. + self.MIN_LENGTH_S = 1 - # PyAudio object - p = None + # PyAudio object + self.p = None - # PyAudio stream object - stream = None + # PyAudio stream object + self.stream = None - text = "" - committed_text = "" - frames = [] + self.text = "" + self.committed_text = "" + self.frames = [] - # Locks access to `text`. - transcribe_lock = threading.Lock() + # Locks access to `text`. + self.transcribe_lock = threading.Lock() - # Locks access to `frames`, and audio stored on disk. - audio_lock = threading.Lock() + # Locks access to `frames`, and audio stored on disk. + self.audio_lock = threading.Lock() - # Used to tell the threads when to stop. - run_app = True + # Used to tell the threads when to stop. 
+ self.run_app = True - transcribe_sleep_duration_min_s = 0.05 - transcribe_sleep_duration_max_s = 5.00 - transcribe_no_change_count = 0 - transcribe_sleep_duration = transcribe_sleep_duration_min_s + self.transcribe_sleep_duration_min_s = 0.05 + self.transcribe_sleep_duration_max_s = 5.00 + self.transcribe_no_change_count = 0 + self.transcribe_sleep_duration = self.transcribe_sleep_duration_min_s - tx_state = osc_ctrl.OscTxState() + # The transcription thread transcribes without holding locks, then + # blocks on it. Thus we need some way to tell the transcription + # thread to drop that transcription. + self.drop_transcription = False - # The transcription thread transcribes without holding locks, then - # blocks on it. Thus we need some way to tell the transcription - # thread to drop that transcription. - drop_transcription = False + # The language the user is speaking in. Default is English but user may set + # this to whatever they want. + self.language = whisper.tokenizer.TO_LANGUAGE_CODE["english"] - # The language the user is speaking in. Default is English but user may set - # this to whatever they want. 
- language = whisper.tokenizer.TO_LANGUAGE_CODE["english"] + self.audio_paused = False - audio_paused = False - - osc_client = osc_ctrl.getClient() + self.osc_state = osc_ctrl.OscState(generate_utils.config.CHARS_PER_SYNC, + generate_utils.config.BOARD_ROWS, + generate_utils.config.BOARD_COLS) def sleepInterruptible(self, dur_s, stride_ms = 5): dur_ms = dur_s * 1000.0 @@ -185,7 +186,7 @@ def resetAudioLocked(audio_state): audio_state.text = "" def resetDisplayLocked(audio_state): - osc_ctrl.clear(audio_state.osc_client, audio_state.tx_state) + osc_ctrl.clear(audio_state.osc_state) def resetAudio(audio_state): audio_state.transcribe_lock.acquire() @@ -292,10 +293,9 @@ def transcribeAudio(audio_state, model): def sendAudio(audio_state): while audio_state.run_app == True: text = audio_state.committed_text + " " + audio_state.text - ret = osc_ctrl.sendMessageLazy(audio_state.osc_client, text, - audio_state.tx_state) - is_paging = (ret == osc_ctrl.SEND_MSG_LAZY_SENT_NON_EMPTY) - osc_ctrl.indicatePaging(audio_state.osc_client, is_paging) + ret = osc_ctrl.pageMessage(audio_state.osc_state, text) + is_paging = (ret == False) + osc_ctrl.indicatePaging(audio_state.osc_state.client, is_paging) # Pace this out time.sleep(0.01) @@ -305,8 +305,8 @@ def readControllerInput(audio_state): RECORD_STATE = 0 PAUSE_STATE = 1 state = PAUSE_STATE - osc_ctrl.indicateSpeech(audio_state.osc_client, False) - osc_ctrl.indicatePaging(audio_state.osc_client, False) + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.indicatePaging(audio_state.osc_state.client, False) last_rising = time.time() while audio_state.run_app == True: @@ -321,8 +321,8 @@ def readControllerInput(audio_state): if now - last_rising > 0.5: # Long hold state = PAUSE_STATE - osc_ctrl.indicateSpeech(audio_state.osc_client, False) - osc_ctrl.toggleBoard(audio_state.osc_client, False) + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.toggleBoard(audio_state.osc_state.client, False) 
#playsound(os.path.abspath("../Sounds/Noise_Off.wav")) resetAudioLocked(audio_state) @@ -333,17 +333,17 @@ def readControllerInput(audio_state): # Short hold if state == RECORD_STATE: state = PAUSE_STATE - osc_ctrl.indicateSpeech(audio_state.osc_client, False) - osc_ctrl.lockWorld(audio_state.osc_client, True) + osc_ctrl.indicateSpeech(audio_state.osc_state.client, False) + osc_ctrl.lockWorld(audio_state.osc_state.client, True) audio_state.audio_paused = True playsound(os.path.abspath("../Sounds/Noise_Off.wav")) elif state == PAUSE_STATE: state = RECORD_STATE - osc_ctrl.indicateSpeech(audio_state.osc_client, True) - osc_ctrl.toggleBoard(audio_state.osc_client, True) - osc_ctrl.lockWorld(audio_state.osc_client, False) + osc_ctrl.indicateSpeech(audio_state.osc_state.client, True) + osc_ctrl.toggleBoard(audio_state.osc_state.client, True) + osc_ctrl.lockWorld(audio_state.osc_state.client, False) resetAudioLocked(audio_state) resetDisplayLocked(audio_state) @@ -398,6 +398,8 @@ def transcribeLoop(mic: str, language: str, model: str): controller_input_thd.join() if __name__ == "__main__": + sys.stdout.reconfigure(encoding="utf-8") + # Set cwd to the directory holding the script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) |
