#!/usr/bin/env python3

import argparse
import copy
from datetime import datetime
import os
import osc_ctrl
from functools import partial
# python3 -m pip install pyaudio
# License: MIT.
import pyaudio
import numpy as np
# python3 -m pip install playsound==1.2.2
# License: MIT.
from playsound import playsound
import steamvr
import string_matcher
import sys
import threading
import time
import wave
# python3 -m pip install git+https://github.com/openai/whisper.git
# python3 -m pip install torch -f https://download.pytorch.org/whl/torch_stable.html
# License: MIT.
import whisper

class AudioState:
    """Mutable state shared by the mic callback and the worker threads.

    Holds the capture constants, the PyAudio handles, the buffered audio,
    the current transcription text, and the flags and pacing values the
    threads use to coordinate.
    """

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    # This matches the framerate expected by whisper.
    RATE = 16000

    # The maximum length of audio, in seconds, that onAudioFramesAvailable()
    # keeps in `frames` before it starts dropping from the start.
    MAX_LENGTH_S = 10
    # The length, in seconds, that transcribe() pads or trims the audio to
    # before handing it to whisper.
    MAX_LENGTH_S_WHISPER = 30
    # The minimum length that recordAudio() will wait for before saving audio.
    MIN_LENGTH_S = 1

    # PyAudio object
    p = None

    # PyAudio stream object
    stream = None

    text = ""
    committed_text = ""
    # Class-level default kept so `AudioState.frames` still resolves;
    # __init__ replaces it with a per-instance list.
    frames = []

    # Locks access to `text`.
    transcribe_lock = threading.Lock()

    # Locks access to `frames`, and audio stored on disk.
    audio_lock = threading.Lock()

    # Used to tell the threads when to stop.
    run_app = True

    transcribe_sleep_duration_min_s = 0.05
    transcribe_sleep_duration_max_s = 1.50
    transcribe_no_change_count = 0
    transcribe_sleep_duration = transcribe_sleep_duration_min_s

    # Created once when the class is defined and shared by all instances.
    tx_state = osc_ctrl.OscTxState()

    # The transcription thread transcribes without holding locks, then
    # blocks on it. Thus we need some way to tell the transcription
    # thread to drop that transcription.
    drop_transcription = False

    # The language the user is speaking in. Default is English but user may set
    # this to whatever they want.
    language = whisper.tokenizer.TO_LANGUAGE_CODE["english"]

    audio_paused = False

    # Created once when the class is defined and shared by all instances.
    osc_client = osc_ctrl.getClient()

    def __init__(self):
        # The audio callback appends to `frames` in place, so a class-level
        # list would leak audio between instances. Give every instance its
        # own buffer and its own locks.
        self.frames = []
        self.transcribe_lock = threading.Lock()
        self.audio_lock = threading.Lock()

def dumpMicDevices():
    """Print the id and name of every input-capable device on host API 0.

    The printed id is the device's index within host API 0.
    """
    p = pyaudio.PyAudio()
    try:
        numdevices = p.get_host_api_info_by_index(0).get('deviceCount')
        for i in range(numdevices):
            # Look the device up once and reuse the result.
            info = p.get_device_info_by_host_api_device_index(0, i)
            if info.get('maxInputChannels') > 0:
                print("Input Device id ", i, " - ", info.get('name'))
    finally:
        # Every PyAudio instance has to be terminated to release its
        # PortAudio resources.
        p.terminate()

def onAudioFramesAvailable(
        audio_state,
        input_rate,
        frames,
        frame_count,
        time_info,
        status_flags):
    """Stream callback: downsample one buffer and append it to the state.

    `audio_state` and `input_rate` are bound with functools.partial in
    getMicStream(); the remaining arguments are supplied by the stream.

    Args:
        audio_state: AudioState whose `frames` list receives the audio.
        input_rate: sample rate the stream was opened with, in Hz.
        frames: raw bytes of the captured buffer.
        frame_count: number of sample frames contained in `frames`.
        time_info: unused.
        status_flags: unused.

    Returns:
        `(frames, pyaudio.paContinue)`, i.e. the untouched input buffer and
        the flag that keeps the stream running.
    """
    # Nothing to store while paused; an empty buffer would also make the
    # per-frame size below divide by zero.
    if audio_state.audio_paused or frame_count <= 0:
        return (frames, pyaudio.paContinue)

    # Reduce sample rate from mic rate to Whisper rate by dropping frames.
    frame_len = len(frames) // frame_count
    keep_every = float(input_rate) / audio_state.RATE
    kept = []
    next_frame = 0.0
    for i in range(frame_count):
        if i >= next_frame:
            kept.append(frames[i * frame_len:(i + 1) * frame_len])
            next_frame += keep_every

    # Join once instead of growing a bytes object inside the loop.
    audio_state.frames.append(b''.join(kept))

    # `frames` holds one entry per callback, so the cap is expressed in
    # callbacks. Assuming each callback delivers CHUNK input frames, this
    # many entries span MAX_LENGTH_S seconds.
    max_frames = int(input_rate * audio_state.MAX_LENGTH_S / audio_state.CHUNK)
    if len(audio_state.frames) > max_frames:
        audio_state.frames = audio_state.frames[-1 * max_frames :]

    return (frames, pyaudio.paContinue)

def getMicStream(which_mic):
    """Find the requested microphone, open it, and start capturing.

    Args:
        which_mic: "index" or "focusrite" to search host API 0 for a device
            whose name contains the matching substring; any other value is
            parsed as a numerical device id (as printed by
            dumpMicDevices()).

    Returns:
        An AudioState whose `p` and `stream` are set. The stream has been
        started and feeds onAudioFramesAvailable().

    Raises:
        ValueError: if `which_mic` is neither a known name nor an integer.
    """
    audio_state = AudioState()
    audio_state.p = pyaudio.PyAudio()
    pa = audio_state.p

    print("Finding mic {}...".format(which_mic))
    dumpMicDevices()

    # Substring to look for in the device name, per supported mic keyword.
    known_mics = {
        "index": "Digital Audio Interface",
        "focusrite": "Focusrite",
    }
    target_str = known_mics.get(which_mic)
    device_index = -1
    got_match = False
    if target_str is None:
        print("Mic {} requested, treating it as a numerical device ID".format(which_mic))
        device_index = int(which_mic)
        got_match = True

    # Poll until a matching input device is listed.
    while not got_match:
        numdevices = pa.get_host_api_info_by_index(0).get('deviceCount')
        for i in range(numdevices):
            candidate = pa.get_device_info_by_host_api_device_index(0, i)
            if candidate.get('maxInputChannels') <= 0:
                continue
            device_name = candidate.get('name')
            if target_str in device_name:
                print("Got match: {}".format(device_name))
                device_index = i
                got_match = True
                break
        if not got_match:
            print("No match, sleeping")
            time.sleep(3)

    info = pa.get_device_info_by_host_api_device_index(0, device_index)
    input_rate = int(info['defaultSampleRate'])
    print("input rate: {}".format(input_rate))

    # Bind audio_state to onAudioFramesAvailable
    callback = partial(onAudioFramesAvailable, audio_state, input_rate)

    # `device_index` is relative to host API 0, while open() expects the
    # global device index, which the device info reports under 'index'.
    audio_state.stream = pa.open(format=audio_state.FORMAT,
            channels=audio_state.CHANNELS, rate=input_rate,
            input=True, frames_per_buffer=audio_state.CHUNK,
            input_device_index=info.get('index', device_index),
            stream_callback=callback)

    audio_state.stream.start_stream()

    return audio_state

def resetAudioLocked(audio_state):
    """Discard buffered audio and text and restart the transcription pacing.

    Takes no locks itself; presumably the `Locked` suffix means the caller
    is expected to hold them, as resetAudio() does.
    """
    # Forget everything transcribed so far.
    audio_state.text = ""
    audio_state.committed_text = ""

    # Go back to the shortest delay between transcriptions.
    min_sleep_s = audio_state.transcribe_sleep_duration_min_s
    audio_state.transcribe_sleep_duration = min_sleep_s
    audio_state.transcribe_no_change_count = 0

    # Start from a fresh, empty audio buffer.
    audio_state.frames = []

def resetDisplayLocked(audio_state):
    """Call osc_ctrl.clear() with this state's OSC client and TX state.

    Takes no locks itself; presumably the `Locked` suffix means the caller
    is expected to hold them.
    """
    client = audio_state.osc_client
    tx_state = audio_state.tx_state
    osc_ctrl.clear(client, tx_state)

def resetAudio(audio_state):
    """Reset the buffered audio and text while holding both state locks.

    Acquires `transcribe_lock` and then `audio_lock`, the same order used
    elsewhere in this file, and releases them in reverse order.
    """
    # `with` guarantees both locks are released even if the reset raises.
    with audio_state.transcribe_lock, audio_state.audio_lock:
        resetAudioLocked(audio_state)

# Transcribe the audio currently buffered in memory.
def transcribe(audio_state, model, frames):
    """Decode the buffered audio with whisper and return the text.

    Args:
        audio_state: AudioState supplying RATE, MAX_LENGTH_S_WHISPER and
            language.
        model: a loaded whisper model.
        frames: list of bytes chunks holding signed 16-bit samples. If None,
            `audio_state.frames` is used.

    Returns:
        The decoded text, or None when there is no audio or the decode was
        rejected by one of the quality thresholds below.
    """
    if frames is None:
        frames = audio_state.frames

    # We should technically acquire a lock to protect frames, but this is
    # really slow and in practice it doesn't make the app crash, so who cares.
    # Joining the chunks copes with chunks of different lengths, which
    # packing them into a fixed-width array would zero-pad.
    pcm = b''.join(frames)
    if not pcm:
        # Nothing captured yet; don't spend a decode on pure padding.
        return None

    # Convert from signed 16-bit int [-32768, 32767] to 32-bit float on
    # [-1, 1).
    audio = np.frombuffer(pcm, np.int16).astype(np.float32) / 32768.0

    audio = whisper.pad_or_trim(audio, length = audio_state.RATE *
            audio_state.MAX_LENGTH_S_WHISPER)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Only one temperature is tried for now; add more values to the tuple to
    # retry a rejected decode at a higher temperature.
    for temp in (0.00,):
        options = whisper.DecodingOptions(language = audio_state.language,
                beam_size = 5, temperature = temp, without_timestamps = True)
        result = whisper.decode(model, mel, options)

        if result.avg_logprob < -1.0:
            print("avg logprob: {}".format(result.avg_logprob))
            continue

        if result.compression_ratio > 2.4:
            print("compression ratio: {}".format(result.compression_ratio))
            continue

        if result.no_speech_prob > 0.60:
            print("no speech prob: {}".format(result.no_speech_prob))
            continue

        return result.text

    # Every attempt was rejected.
    return None

def transcribeAudio(audio_state, model):
    """Worker loop: repeatedly transcribe the buffer and update the text.

    Runs until `audio_state.run_app` is cleared. The pause between
    transcriptions grows while the text is unchanged and snaps back to the
    minimum as soon as the text changes.

    Args:
        audio_state: shared AudioState.
        model: a loaded whisper model, passed through to transcribe().
    """
    last_transcribe_time = time.time()
    while audio_state.run_app:
        # Pace this out
        time.sleep(audio_state.transcribe_sleep_duration)

        # Increase sleep time. Code below will set sleep time back to minimum
        # if a change is detected.
        if audio_state.transcribe_no_change_count < 10:
            audio_state.transcribe_no_change_count += 1
        longer_sleep_dur = (audio_state.transcribe_sleep_duration +
                audio_state.transcribe_sleep_duration_min_s *
                (1.3 ** audio_state.transcribe_no_change_count))
        audio_state.transcribe_sleep_duration = min(
                audio_state.transcribe_sleep_duration_max_s,
                longer_sleep_dur)

        # Transcribe without holding any lock; this is the slow part.
        text = transcribe(audio_state, model, audio_state.frames)

        now = time.time()
        elapsed = now - last_transcribe_time
        last_transcribe_time = now

        # `transcribe_lock` guards `text`, so the drop check and the update
        # happen under it and cannot interleave with a locked reset.
        with audio_state.transcribe_lock:
            # Consume a pending drop request first, even when nothing was
            # transcribed. Otherwise the stale request would discard the
            # next valid transcription instead.
            if audio_state.drop_transcription:
                audio_state.drop_transcription = False
                print("drop transcription ({} seconds)".format(elapsed))
                continue

            if not text:
                print("no transcription, spin ({} seconds)".format(elapsed))
                continue

            old_text = audio_state.text
            audio_state.text = string_matcher.matchStrings(old_text,
                    text, window_size = 20)

            # Log after merging so the line shows the current text.
            print("Transcription ({} seconds): {}".format(
                elapsed,
                audio_state.text))

            if old_text != audio_state.text:
                # We think the user said something, so reset the amount of
                # time we sleep between transcriptions to the minimum.
                audio_state.transcribe_no_change_count = 0
                audio_state.transcribe_sleep_duration = \
                        audio_state.transcribe_sleep_duration_min_s

def sendAudio(audio_state):
    """Worker loop: keep passing the current text to osc_ctrl.

    Runs until `audio_state.run_app` is cleared. Each pass sends the
    committed text and the live text joined by a single space, then reports
    to osc_ctrl whether that call returned SEND_MSG_LAZY_SENT_NON_EMPTY.
    """
    while audio_state.run_app:
        message = " ".join((audio_state.committed_text, audio_state.text))
        send_result = osc_ctrl.sendMessageLazy(
                audio_state.osc_client,
                message,
                audio_state.tx_state)
        osc_ctrl.indicatePaging(
                audio_state.osc_client,
                send_result == osc_ctrl.SEND_MSG_LAZY_SENT_NON_EMPTY)

        # Pace this out
        time.sleep(0.01)

def readControllerInput(audio_state):
    """Worker loop: toggle between paused and recording on button presses.

    Polls steamvr for a button press every 50 ms until `audio_state.run_app`
    is cleared. Each rising edge flips the state, updates the speech
    indicator and plays a sound. Entering the recording state also resets
    the audio, text and display.
    """
    session = steamvr.SessionState()
    RECORD_STATE = 0
    PAUSE_STATE = 1
    state = PAUSE_STATE
    osc_ctrl.indicateSpeech(audio_state.osc_client, False)
    osc_ctrl.indicatePaging(audio_state.osc_client, False)
    while audio_state.run_app:
        time.sleep(0.05)

        event = steamvr.pollButtonPress(session)
        if event != steamvr.EVENT_RISING_EDGE:
            continue

        print("event get")
        if state == RECORD_STATE:
            state = PAUSE_STATE
            osc_ctrl.indicateSpeech(audio_state.osc_client, False)
            playsound(os.path.abspath("../Sounds/Noise_Off.wav"))

            audio_state.audio_paused = True
        elif state == PAUSE_STATE:
            state = RECORD_STATE
            osc_ctrl.indicateSpeech(audio_state.osc_client, True)
            playsound(os.path.abspath("../Sounds/Noise_On.wav"))

            # The *Locked helpers expect the caller to hold the locks. Take
            # them in the same order as the stdin reset in transcribeLoop()
            # so the two reset paths cannot interleave.
            with audio_state.transcribe_lock, audio_state.audio_lock:
                resetAudioLocked(audio_state)
                resetDisplayLocked(audio_state)
                audio_state.drop_transcription = True
                audio_state.audio_paused = False

# model should correspond to one of the Whisper models defined in
# whisper/__init__.py. Examples: tiny, base, small, medium.
def transcribeLoop(mic: str, language: str, model: str):
    """Run the app: open the mic, start the workers, and serve stdin.

    Every line read from stdin resets the audio, text and display. A line
    containing "exit" or "quit" (or end of input) ends the loop, after which
    the worker threads are joined and the audio stream is closed.

    Args:
        mic: microphone selector, passed to getMicStream().
        language: whisper language name, e.g. "english".
        model: whisper model name, e.g. "base".

    Raises:
        KeyError: if `language` is not a key of
            whisper.tokenizer.TO_LANGUAGE_CODE.
    """
    audio_state = getMicStream(mic)
    audio_state.language = whisper.tokenizer.TO_LANGUAGE_CODE[language]

    print("Safe to start talking")

    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    model_root = os.path.join(dname, "Models")

    print("Model {} will be saved to {}".format(model, model_root))
    # Use a separate name so `model` keeps meaning the model's name.
    whisper_model = whisper.load_model(model, download_root=model_root)

    workers = [
        threading.Thread(target = transcribeAudio,
                args = [audio_state, whisper_model]),
        threading.Thread(target = sendAudio, args = [audio_state]),
        threading.Thread(target = readControllerInput, args = [audio_state]),
    ]
    for thd in workers:
        thd.daemon = True
        thd.start()

    print("Press enter to start a new message.")
    for line in sys.stdin:
        # `with` releases both locks even if a reset step raises.
        with audio_state.transcribe_lock, audio_state.audio_lock:
            resetAudioLocked(audio_state)
            resetDisplayLocked(audio_state)
            audio_state.drop_transcription = True
            audio_state.audio_paused = False
        if "exit" in line or "quit" in line:
            break

    print("Joining threads")
    audio_state.run_app = False
    # Wait for every worker, including the sender, before tearing down the
    # audio objects they use.
    for thd in workers:
        thd.join()

    # Release the audio device and PortAudio.
    audio_state.stream.stop_stream()
    audio_state.stream.close()
    audio_state.p.terminate()


if __name__ == "__main__":
    # Set cwd to the directory holding the script, so relative paths
    # resolve against it.
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)

    parser = argparse.ArgumentParser()
    parser.add_argument("--mic", type=str, help="Which mic to use. Options: index, focusrite. Default: index")
    parser.add_argument("--language", type=str, help="Which language to use. Ex: english, japanese, chinese, french, german.")
    parser.add_argument("--model", type=str, help="Which AI model to use. Ex: tiny, base, small, medium")
    args = parser.parse_args()

    # Fall back to defaults for options that were omitted or left empty.
    if not args.mic:
        args.mic = "index"

    if not args.language:
        args.language = "english"

    # The default model name belongs in args.model. Writing it to
    # args.language left the model unset and the language invalid.
    if not args.model:
        args.model = "base"

    transcribeLoop(args.mic, args.language, args.model)