import os

import gradio as gr
import librosa
import pytubefix as pt
import spaces
import torch
from pyannote.audio import Pipeline
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def second_to_timecode(x: float) -> str:
    """Convert seconds (float) to an SRT timecode, HH:MM:SS,mmm."""
    hour, x = divmod(x, 3600)
    minute, x = divmod(x, 60)
    second, x = divmod(x, 1)
    millisecond = int(x * 1000.)
    return '%.2d:%.2d:%.2d,%.3d' % (hour, minute, second, millisecond)


def download_from_youtube(youtube_link: str) -> str:
    """Download the first audio-only stream of a YouTube video and return its local path."""
    yt = pt.YouTube(youtube_link)
    available_streams = yt.streams.filter(only_audio=True)
    print('available streams:')
    print(available_streams)
    stream = available_streams.first()  # , audio_codec='wav'
    stream.download(filename="audio.wav")
    return "audio.wav"


MODEL_NAME = 'Dorjzodovsuren/whisper-large-v2-mn'
# MODEL_NAME = 'Dorjzodovsuren/whisper-large-v3-turbo-mn-2'
lang = 'mn'

chunk_length_s = 9
vad_activation_min_duration = 9  # sec
device = 'cuda' if torch.cuda.is_available() else 'cpu'
SAMPLE_RATE = 16_000

######## LOAD MODELS FROM HUB ########
dia_model = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ['TOKEN'])
vad_model = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=os.environ['TOKEN'])
dia_model = dia_model.to(torch.device(device))
vad_model = vad_model.to(torch.device(device))

# Load the matching processor; the turbo fine-tune reuses the base openai/whisper-large-v3-turbo processor.
if MODEL_NAME == 'Dorjzodovsuren/whisper-large-v2-mn':
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
else:
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=chunk_length_s,
    device_map="auto",
)
asr_pipeline.model.config.forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

print("----------> Loaded models <-----------")

gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))


@spaces.GPU(duration=gpu_timeout)
def generator(file_upload, num_speakers, max_duration, history):
    history = history or ""
    path = file_upload

    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)
    print(waveform.shape, sampling_rate)
    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)

    # Speaker diarization over the whole clip.
    dia_result = dia_model({
        "waveform": waveform_tensor,
        "sample_rate": sampling_rate,
    }, num_speakers=num_speakers)

    counter = 1
    for speech_turn, track, speaker in dia_result.itertracks(yield_label=True):
        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
        _start = int(sampling_rate * speech_turn.start)
        _end = int(sampling_rate * speech_turn.end)
        data = waveform[_start:_end]

        if speech_turn.end - speech_turn.start > vad_activation_min_duration:
            # Long turns are split further with VAD before transcription.
            print(f'audio duration {speech_turn.end - speech_turn.start} sec ----> activating VAD')
            vad_output = vad_model({
                'waveform': waveform_tensor[:, _start:_end],
                'sample_rate': sampling_rate})
            for vad_turn in vad_output.get_timeline().support():
                vad_start = _start + int(sampling_rate * vad_turn.start)
                vad_end = _start + int(sampling_rate * vad_turn.end)
                prediction = asr_pipeline(waveform[vad_start:vad_end])['text']
                history += f"{counter}\n" + \
                           f"{second_to_timecode(speech_turn.start + vad_turn.start)} --> {second_to_timecode(speech_turn.start + vad_turn.end)}\n" + \
                           f"{prediction}\n\n"  # f">> {speaker}: {prediction}\n\n"

                yield history, history, None
                counter += 1
        else:
            prediction = asr_pipeline(data)['text']
            history += f"{counter}\n" + \
                       f"{second_to_timecode(speech_turn.start)} --> {second_to_timecode(speech_turn.end)}\n" + \
                       f"{prediction}\n\n"  # f">> {speaker}: {prediction}\n\n"
            counter += 1
            yield history, history, None

    # SRT format reference:
    # https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats%2Csubrip-srt-example%2Csubviewer-sbv-example
    file_name = 'transcript.srt'
    with open(file_name, 'w') as fp:
        fp.write(history)
    yield history, history, file_name


# demo = gr.Interface(
#     generator,
#     inputs=[
#         gr.Audio(type="filepath"),
#         gr.Number(value=1, label="Number of Speakers"),
#         gr.Number(value=120, label="Maximum Duration (Seconds)"),
#         'state',
#     ],
#     outputs=['text', 'state', 'file'],
#     title="Mongolian Whisper 🇲🇳",
#     description=(
#         "Transcribe microphone or uploaded audio with the Mongolian Whisper model."
#     )
# )

with gr.Blocks(title="Mongolian Whisper 🇲🇳") as demo:
    with gr.Column():  # everything in one vertical stack
        # ---- inputs ----
        audio_input = gr.Audio(type="filepath", label="Дуу")  # "Audio"

        with gr.Accordion("Нэмэлт тохиргоо", open=False):  # "Additional settings"
            speakers_input = gr.Number(value=1, label="Яриж буй нийт хүний тоо хэд вэ?")  # "How many speakers in total?"
            duration_input = gr.Slider(
                minimum=0, maximum=300, step=1, value=120,
                label="Дууны хамгийн урт хэмжээ (Seconds) хэд вэ?"  # "Maximum audio duration (seconds)?"
            )

        state_input = gr.State()  # hidden, but part of the workflow

        # ---- outputs ----
        text_output = gr.Textbox(label="Текст хөрвүүлгэ")  # "Transcription"
        state_output = gr.State()  # hidden, passed through
        file_output = gr.File(label="Үр дүнгийн файл")  # "Result file"

    # hook it all up
    audio_input.change(
        fn=generator,
        inputs=[audio_input, speakers_input, duration_input, state_input],
        outputs=[text_output, state_output, file_output],
    )

# define queue - required for generators
demo.queue()
demo.launch(debug=True)