import os
import json

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment
from pyannote.audio import Pipeline

# Hugging Face access token (required for the gated pyannote model)
AUTH_TOKEN = os.getenv("HF_TOKEN")

# Load the diarization pipeline onto GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0", use_auth_token=AUTH_TOKEN
).to(device)


def preprocess_audio(audio_path):
    """Convert audio to mono, 16 kHz WAV format suitable for pyannote."""
    try:
        # Load audio with pydub
        audio = AudioSegment.from_file(audio_path)
        # Convert to mono and resample to 16 kHz
        audio = audio.set_channels(1).set_frame_rate(16000)
        # Export to a temporary WAV file
        temp_wav = "temp_audio.wav"
        audio.export(temp_wav, format="wav")
        return temp_wav
    except Exception as e:
        raise ValueError(f"Error preprocessing audio: {str(e)}")


def diarize_audio(audio_path, num_speakers):
    """Perform speaker diarization and return formatted results."""
    try:
        # Validate inputs
        if audio_path is None or not os.path.exists(audio_path):
            raise ValueError("Audio file not found.")
        num_speakers = int(num_speakers)  # Gradio sliders may pass floats
        if num_speakers < 1:
            raise ValueError("Number of speakers must be a positive integer.")

        # Preprocess audio to mono 16 kHz WAV
        wav_path = preprocess_audio(audio_path)
        try:
            # Load audio as an in-memory tensor for pyannote
            waveform, sample_rate = torchaudio.load(wav_path)
            audio_dict = {"waveform": waveform, "sample_rate": sample_rate}

            # Run the pipeline with a fixed number of speakers
            diarization = pipeline(audio_dict, num_speakers=num_speakers)
        finally:
            # Clean up the temporary file even if diarization fails
            if os.path.exists(wav_path):
                os.remove(wav_path)

        # Format results as plain text and as a list of segment dicts
        results = []
        text_output = ""
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            result = {
                "start": round(turn.start, 3),
                "end": round(turn.end, 3),
                "speaker_id": speaker,
            }
            results.append(result)
            text_output += f"Speaker {speaker}: {result['start']}s - {result['end']}s\n"

        # Return text and JSON results
        json_output = json.dumps(results, indent=2)
        return text_output, json_output
    except Exception as e:
        return f"Error: {str(e)}", ""


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Speaker Diarization with Pyannote 3.0")
    gr.Markdown("Upload an audio file and specify the number of speakers to diarize the audio.")

    with gr.Row():
        audio_input = gr.Audio(label="Upload Audio File", type="filepath")
        num_speakers = gr.Slider(minimum=1, maximum=10, step=1, label="Number of Speakers", value=2)

    submit_btn = gr.Button("Diarize")

    with gr.Row():
        text_output = gr.Textbox(label="Diarization Results (Text)")
        json_output = gr.Textbox(label="Diarization Results (JSON)")

    submit_btn.click(
        fn=diarize_audio,
        inputs=[audio_input, num_speakers],
        outputs=[text_output, json_output],
    )

# Launch the Gradio app
demo.launch()
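
# Usage note (a sketch, not part of the original script): run it as a standalone
# app after exporting a Hugging Face token that has access to the gated
# pyannote/speaker-diarization-3.0 model, for example:
#   export HF_TOKEN=<your Hugging Face access token>
#   python app.py   # assumes this file is saved as app.py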