summary | refs | log | tree | commit | diff | stats
path: root/Scripts/transcribe.py
diff options
context:
space:
mode:
Diffstat (limited to 'Scripts/transcribe.py')
-rw-r--r-- Scripts/transcribe.py 25
1 file changed, 9 insertions, 16 deletions
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index cea2da0..c718f73 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -170,10 +170,6 @@ def onAudioFramesAvailable(
else:
audio_state.commit_fuzz_threshold = 1
- max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
- audio_state.CHUNK)
- if len(audio_state.frames) > max_frames:
- audio_state.frames = audio_state.frames[-1 * max_frames:]
if audio_state.drop_samples_till_i > 0:
# Caller wants us to keep this many *whisper* samples, assuming that
# we're getting one full frame every (1024 / 16KHz) seconds.
@@ -198,6 +194,11 @@ def onAudioFramesAvailable(
audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
audio_state.drop_samples_till_i = -1
+ max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
+ audio_state.CHUNK)
+ if len(audio_state.frames) > max_frames:
+ audio_state.frames = audio_state.frames[-1 * max_frames:]
+
# Now enforce a minimum duration on frames. This reduces cases where the
# STT hallucinates random things. In the Whisper paper, they enforce a
# minimum audio buffer duration of 5.0 seconds, so I do the same here.
@@ -393,9 +394,9 @@ def transcribeAudio(audio_state,
if audio_state.enable_debug_mode:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
- # Prevent audio buffer from holding more than 1 second of silence
+ # Prevent audio buffer from holding more than a few seconds of silence
# before real speech.
- audio_state.MAX_LENGTH_S = 1
+ audio_state.MAX_LENGTH_S = 5
continue
else:
audio_state.MAX_LENGTH_S = 300
@@ -731,14 +732,12 @@ def transcribeLoop(mic: str,
window_duration_s: int,
gpu_idx: int,
keyboard_hotkey: str,
- reset_on_toggle: bool,
- commit_fuzz_threshold: int):
+ reset_on_toggle: bool):
audio_state = getMicStream(mic)
audio_state.whisper_language = language
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
audio_state.reset_on_toggle = reset_on_toggle
- #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
audio_state.enable_debug_mode = enable_debug_mode
audio_state.enable_profanity_filter = enable_profanity_filter
@@ -905,7 +904,6 @@ if __name__ == "__main__":
parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
- parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.")
args = parser.parse_args()
@@ -945,10 +943,6 @@ if __name__ == "__main__":
print("--gpu_idx required", file=sys.stderr)
sys.exit(1)
- if not args.commit_fuzz_threshold:
- print("--commit_fuzz_threshold required", file=sys.stderr)
- sys.exit(1)
-
args.gpu_idx = int(args.gpu_idx)
window_duration_s = 120
@@ -1027,6 +1021,5 @@ if __name__ == "__main__":
estate, window_duration_s,
args.gpu_idx,
args.keybind,
- args.reset_on_toggle,
- args.commit_fuzz_threshold)
+ args.reset_on_toggle)