summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--osc_ctrl.py8
-rw-r--r--transcribe.py36
2 files changed, 35 insertions, 9 deletions
diff --git a/osc_ctrl.py b/osc_ctrl.py
index 40cb7d1..0352e6f 100644
--- a/osc_ctrl.py
+++ b/osc_ctrl.py
@@ -161,6 +161,7 @@ class OscTxState:
# The message last sent to the board.
last_msg_encoded = []
empty_cells_to_send_per_call = 1
+ nonempty_cells_to_send_per_call = 2
# 0 indicates it's closed. 1 indicates half size. 2 indicates full size.
board_size = 0
@@ -246,6 +247,7 @@ def sendMessageLazy(client, msg, tx_state):
msg_encoded_len = len(msg_encoded)
empty_cells_sent = 0
+ nonempty_cells_sent = 0
n_cells = ceil(msg_encoded_len / NUM_LAYERS)
for cell in range(0, n_cells):
cell_begin = cell * NUM_LAYERS
@@ -272,6 +274,12 @@ def sendMessageLazy(client, msg, tx_state):
tx_state.last_msg_encoded = msg_encoded[0:cell_end]
return False
empty_cells_sent += 1
+ else:
+ if nonempty_cells_sent >= tx_state.nonempty_cells_to_send_per_call:
+ print("nonempty cell budget exceeded")
+ tx_state.last_msg_encoded = msg_encoded[0:cell_end]
+ return False
+ nonempty_cells_sent += 1
sendMessageCellDiscrete(client, cell_msg, cell)
diff --git a/transcribe.py b/transcribe.py
index dc36541..8369c43 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -12,8 +12,7 @@ import time
import wave
# python3 -m pip install git+https://github.com/openai/whisper.git
# python3 -m pip install torch -f https://download.pytorch.org/whl/torch_stable.html
-from whisper import transcribe as whisper_transcribe
-from whisper import load_model as whisper_load_model
+import whisper
class AudioState:
CHUNK = 1024
@@ -24,9 +23,9 @@ class AudioState:
# The maximum length that recordAudio() will put into frames before it
# starts dropping from the start.
- MAX_LENGTH_S = 90
+ MAX_LENGTH_S = 25
# The minimum length that recordAudio() will wait for before saving audio.
- MIN_LENGTH_S = 3
+ MIN_LENGTH_S = 1
# PyAudio object
p = None
@@ -38,6 +37,9 @@ class AudioState:
frames_lock = threading.Lock()
text = ""
+ # To improve temporal stability, we require two consecutive identical
+ # transcriptions before "committing" to a transcription.
+ text_candidate = ""
text_lock = threading.Lock()
record_audio = True
@@ -118,6 +120,8 @@ def saveAudio(audio_state, filename):
wf.writeframes(b''.join(frames))
wf.close()
+ print("audio save")
+
def resetAudio(audio_state):
audio_state.frames_lock.acquire()
audio_state.frames = []
@@ -125,8 +129,16 @@ def resetAudio(audio_state):
# Transcribe the audio recorded in a file.
def transcribe(model, filename):
- result = whisper_transcribe(model=model, audio=filename, language="en")
- return result["text"]
+
+ audio = whisper.load_audio(filename)
+ audio = whisper.pad_or_trim(audio)
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
+ _, probs = model.detect_language(mel)
+ print(f"Detected language: {max(probs, key=probs.get)}")
+ options = whisper.DecodingOptions()
+ result = whisper.decode(model, mel, options)
+
+ return result.text
def transcribeAudio(audio_state, model):
while audio_state.transcribe_audio == True:
@@ -140,13 +152,18 @@ def transcribeAudio(audio_state, model):
text = transcribe(model, "audio.wav")
audio_state.text_lock.acquire()
- audio_state.text = text
+
+ if text == audio_state.text_candidate or text.startswith(audio_state.text_candidate):
+ audio_state.text = text
+ audio_state.text_candidate = text
+
audio_state.text_lock.release()
print("Transcription: {}".format(audio_state.text))
+ print("Candidate: {}".format(audio_state.text_candidate))
# Pace this out
- time.sleep(0.2)
+ time.sleep(0.05)
def sendAudio(audio_state):
tx_state = osc_ctrl.OscTxState()
@@ -155,6 +172,7 @@ def sendAudio(audio_state):
text = copy.deepcopy(audio_state.text)
audio_state.text_lock.release()
+ print("here")
osc_ctrl.sendMessageLazy(audio_state.osc_client, text, tx_state)
# Pace this out
@@ -179,7 +197,7 @@ if __name__ == "__main__":
print("Safe to start talking")
- model = whisper_load_model("base")
+ model = whisper.load_model("base")
transcribe_audio_thd = threading.Thread(target = transcribeAudio, args = [audio_state, model])
transcribe_audio_thd.daemon = True