summaryrefslogtreecommitdiffstats
path: root/app.py
diff options
context:
space:
mode:
Diffstat (limited to 'app.py')
-rw-r--r--app.py170
1 files changed, 167 insertions, 3 deletions
diff --git a/app.py b/app.py
index 064aef3..21fbbf7 100644
--- a/app.py
+++ b/app.py
@@ -6,6 +6,7 @@ import math
import numpy as np
import os
import pyaudio
+import subprocess
import sys
import time
import typing
@@ -157,6 +158,36 @@ class MicStream(AudioStream):
result = b''.join(chunks)
return result
class DiskStream(AudioStream):
    """Audio stream backed by an audio file on disk.

    The whole file is decoded once in the constructor, downmixed to one
    channel, resampled to 16 kHz and kept in memory as raw signed 16-bit
    samples. The data is handed out exactly once by `getSamples`.
    """

    def __init__(self, path: str):
        """Decode the audio file at `path` into memory.

        Args:
            path: Path to a .mp3 or .wav file (extension match is
                case-insensitive).

        Raises:
            NotImplementedError: If `path` has any other extension.
        """
        lowered = path.lower()
        if lowered.endswith(".mp3"):
            fmt = "mp3"
        elif lowered.endswith(".wav"):
            fmt = "wav"
        else:
            raise NotImplementedError(f"Requested file type {path} " + \
                    "is not supported")
        print("Loading audio data", file=sys.stderr)
        audio = AudioSegment.from_file(path, format=fmt)
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        # Force 16-bit samples so the int16 conversion below cannot truncate
        # files stored with a wider sample format.
        audio = audio.set_sample_width(2)
        samples = np.asarray(audio.get_array_of_samples(), dtype=np.int16)

        self.frames = samples.tobytes()
        self.fps = 16000

    def getSamples(self) -> bytes:
        """Return all not-yet-consumed audio bytes and empty the buffer.

        The first call returns the entire decoded file; later calls return
        b''.
        """
        frames = self.frames
        self.frames = b''
        return frames
+
class AudioCollector:
def __init__(self, stream: AudioStream):
self.stream = stream
@@ -365,7 +396,7 @@ class AppControl:
run = True
app_ctrl = AppControl()
-def recordMeDaddy(
+def recordAudio(
mic_device: str,
min_volume: float = -1.3,
max_volume: float = -0.8
@@ -383,7 +414,7 @@ def recordMeDaddy(
#collector_hd = NormalizingAudioCollector(collector_hd)
collector_hd = CompressingAudioCollector(collector_hd)
- min_silence_ms = 1000
+ min_silence_ms = 250
max_speech_s = 30
segmenter = AudioSegmenter(
min_silence_ms=min_silence_ms,
@@ -438,6 +469,104 @@ def recordMeDaddy(
collector_hd.keepLast(1.0)
print("Stopped recording")
class Segment:
    """A piece of transcribed audio together with its timing and scores."""

    def __init__(self,
            transcript: str,
            start_ts: float,
            end_ts: float,
            wall_ts: float,
            avg_logprob: float,
            no_speech_prob: float,
            compression_ratio: float):
        """Store the transcript and its metadata unchanged.

        Args:
            transcript: The transcribed text.
            start_ts: Start of the segment, in seconds relative to `wall_ts`.
            end_ts: End of the segment, in seconds relative to `wall_ts`.
            wall_ts: The time.time() at which the oldest audio sample leading
                to this transcript was collected.
            avg_logprob: Stored as given (presumably the recognizer's average
                log probability -- confirm against the producer).
            no_speech_prob: Stored as given (presumably the recognizer's
                no-speech probability -- confirm against the producer).
            compression_ratio: Stored as given; not included in `__str__`.
        """
        self.transcript = transcript
        # start_ts, end_ts are timestamps in seconds relative to `wall_ts`.
        self.start_ts = start_ts
        self.end_ts = end_ts
        # wall_ts is the time.time() at which the oldest audio sample leading
        # to this transcript was collected.
        self.wall_ts = wall_ts
        self.avg_logprob = avg_logprob
        self.no_speech_prob = no_speech_prob
        self.compression_ratio = compression_ratio

    def __str__(self):
        """Render the transcript followed by its timestamps and scores.

        Wall-clock times are shown as HH:MM:SS in UTC.
        """
        # Imported locally so this method does not depend on how (or whether)
        # the module imports datetime at file level.
        from datetime import datetime, timezone

        def utc_clock(offset_s: float) -> str:
            # Timezone-aware replacement for datetime.utcfromtimestamp, which
            # is deprecated as of Python 3.12; the formatted result is the
            # same.
            moment = datetime.fromtimestamp(offset_s + self.wall_ts,
                    tz=timezone.utc)
            return moment.strftime('%H:%M:%S')

        ts = f"(ts: {self.start_ts}-{self.end_ts}) "
        wall_ts = (f"(wall ts: {utc_clock(self.start_ts)}-"
                f"{utc_clock(self.end_ts)}) ")
        no_speech = f"(no_speech: {self.no_speech_prob}) "
        avg_logprob = f"(avg_logprob: {self.avg_logprob}) "
        return f"{self.transcript} " + ts + wall_ts + no_speech + avg_logprob
+
def pipInstall(pkgs: typing.List[str]) -> bool:
    """Install `pkgs` using pip from the bundled ./Python/python.exe.

    Args:
        pkgs: Requirement strings. Each entry is split on whitespace, so an
            entry may carry several pip arguments (for example an option
            followed by its value).

    Returns:
        True if there was nothing to install or pip exited with status 0;
        False if pip could not be launched or exited with a non-zero status.
        pip's stdout and stderr are echoed to this process's stderr.
    """
    pkgs_str = " ".join(pkgs)
    pip_args = pkgs_str.split()
    if not pip_args:
        # Running `pip install` with no requirement is an error; treat an
        # empty request as trivially satisfied instead.
        print("No packages to install", file=sys.stderr)
        return True

    print(f"Installing {pkgs_str}")
    env = os.environ.copy()
    # cwd is set at top of __main__. We set PATH to ensure that installed
    # Python packages have access to any binaries that come with them.
    scripts_dir = os.path.join(os.getcwd(), "Python", "Scripts")
    env["PATH"] = scripts_dir + os.pathsep + env.get("PATH", "")

    cmd = ["./Python/python.exe", "-m", "pip", "install",
            *pip_args, "--no-warn-script-location"]
    try:
        pip_proc = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env)
    except OSError as err:
        # E.g. the bundled interpreter is missing. The contract is a bool
        # result, so report the failure instead of raising.
        print(f"`pip install {pkgs_str}` could not be started: {err}",
                file=sys.stderr)
        return False

    # pip's output is not guaranteed to be valid UTF-8 (it depends on the
    # console code page); never let a decoding error mask the real result.
    print(pip_proc.stdout.decode("utf-8", errors="replace"), file=sys.stderr)
    print(pip_proc.stderr.decode("utf-8", errors="replace"), file=sys.stderr)
    if pip_proc.returncode != 0:
        print(f"`pip install {pkgs_str}` exited with {pip_proc.returncode}",
                file=sys.stderr)
        return False
    return True
+
class Whisper:
    """Speech-to-text wrapper around a transformers ASR pipeline."""

    def __init__(self,
            collector: AudioCollector,
            model: str = "distil-whisper/distil-large-v2"):
        """Load the speech recognition pipeline.

        Args:
            collector: Source of audio for `transcribe`. May be None if the
                `collector` attribute is assigned before `transcribe` is
                called without explicit frames.
            model: Model identifier passed to the pipeline. The default is
                the model this class has always loaded.
        """
        self.collector = collector

        # Imported here rather than at file level so the heavy dependencies
        # are only required once a Whisper is actually constructed.
        import torch
        from transformers import pipeline

        # Log the model that is really loaded (the message used to name a
        # different model than the one passed to the pipeline).
        print(f"Loading pipeline for {model}...")
        self.pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                torch_dtype=torch.float16,
                device="cuda",
                )
        print("Done.")

    def transcribe(self,
            frames: typing.Optional[bytes] = None) -> typing.List[Segment]:
        """Transcribe audio and return it as a single-element Segment list.

        Args:
            frames: Raw signed 16-bit samples. When None, the audio is taken
                from `self.collector.getAudio()`.

        Returns:
            A list with one Segment holding the full text. Its start/end
            timestamps and score fields are all set to 0.

        Raises:
            ValueError: If `frames` is None and no collector is set.
        """
        if frames is None:
            if self.collector is None:
                raise ValueError(
                        "transcribe() needs either frames or a collector")
            frames = self.collector.getAudio()
        # Convert from signed 16-bit int [-32768, 32767] to signed 32-bit float on
        # [-1, 1].
        audio = np.frombuffer(frames,
                dtype=np.int16).flatten().astype(np.float32) / 32768.0

        t0 = time.time()
        res = self.pipe(
                audio,
                chunk_length_s=30,
                batch_size=1)
        # Stop the clock right after inference so the reported latency does
        # not include building the result.
        t1 = time.time()

        if self.collector is not None:
            wall_ts = self.collector.begin()
        else:
            # Frames were supplied directly and there is no collector to ask;
            # fall back to the time at which transcription started.
            wall_ts = t0

        result = [Segment(res["text"],
            0,
            0,
            wall_ts,
            0,
            0,
            0)]

        print(f"Transcription latency (s): {t1 - t0}: {result[0].transcript}")
        return result
+
def getOutput() -> str:
sys.stdout.flush()
with open("output.log", "r") as f:
@@ -447,6 +576,37 @@ def stopApp():
print("Requesting app stop")
app_ctrl.run = False
def transcribeAudio(concatenated_path: str):
    """Transcribe every .wav file in the current directory to a .txt file.

    Installs the packages listed in whisper_requirements.txt, then writes one
    transcript per .wav file next to it (same name, .txt extension), one
    segment per line. The file named by `concatenated_path` is skipped.

    Args:
        concatenated_path: Path of the combined audio file; only its base
            name is compared against the files in the current directory.

    Returns:
        None. Returns early if the requirements cannot be installed or if
        there is nothing to transcribe.
    """
    # Step 1: Install Whisper requirements
    with open("whisper_requirements.txt", "r") as file:
        lines = [line.strip() for line in file.read().splitlines()]
    # Blank lines and comments are not requirements; do not hand them to pip.
    requirements = [line for line in lines
            if line and not line.startswith("#")]
    if not pipInstall(requirements):
        return

    # Step 2: Iterate over .wav files in the current working directory
    skip_name = os.path.basename(concatenated_path)
    wav_files = []
    for wav_file in sorted(os.listdir('.')):
        if not wav_file.endswith('.wav'):
            continue
        # os.listdir yields bare file names, so compare for equality; a
        # suffix match would also skip e.g. "recombined.wav", and would skip
        # every file when the path is empty.
        if wav_file == skip_name:
            print("Skipping concatenated file")
            continue
        wav_files.append(wav_file)
    if not wav_files:
        # Avoid loading the model when there is nothing to do.
        print("No .wav files to transcribe")
        return

    whisper = Whisper(None)
    for wav_file in wav_files:
        # Step 3: Transcription pipeline
        # TODO parameterize high fidelity framerate
        print(f"Transcribing {wav_file}")
        disk_stream = DiskStream(wav_file)
        collector = CompressingAudioCollector(AudioCollector(disk_stream))
        whisper.collector = collector

        # Transcribe the audio
        segments = whisper.transcribe()

        # Step 4: Save transcriptions
        # Swap only the extension; str.replace would also rewrite any other
        # ".wav" occurring inside the name.
        transcript_filename = os.path.splitext(wav_file)[0] + '.txt'
        # Transcripts may contain non-ASCII text, so do not depend on the
        # platform's default encoding.
        with open(transcript_filename, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(
                    segment.transcript + '\n' for segment in segments)
        print(f"Transcript generated at {transcript_filename}")
+
if __name__ == "__main__":
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
@@ -463,15 +623,19 @@ if __name__ == "__main__":
max_volume = gr.Number(label="Maximum volume", value=-0.8)
record_audio = gr.Button("Record audio")
stop_recording = gr.Button("Stop recording")
+ transcribe_audio = gr.Button("Transcribe audio")
concatenated_path = gr.Text(label="Combined audio filename", value="combined.wav")
min_length = gr.Number(label="Minimum length (seconds)", value=3.0)
concatenate_audio = gr.Button("Combine audio files")
dbg_output = gr.Text(label="Output")
- record_audio.click(recordMeDaddy, [mic_device, min_volume, max_volume],
+ record_audio.click(recordAudio, [mic_device, min_volume, max_volume],
dbg_output)
stop_recording.click(stopApp, [], dbg_output)
+
+ transcribe_audio.click(transcribeAudio, [concatenated_path], dbg_output)
+
concatenate_audio.click(concatenateWavFiles, [concatenated_path],
dbg_output)