"""Image captioning with audio.

Loads a BLIP image-captioning model and a Coqui TTS voice, then serves a
Gradio app that captions an uploaded image and reads the caption aloud.
"""

import os
import subprocess
import sys

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline

# Select GPU when available; the transformers pipeline accepts a device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Image-to-text pipeline (BLIP large captioning model).
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Import Coqui TTS, installing it on the fly if missing.
try:
    from TTS.api import TTS
except ImportError:
    print("Installing TTS...")
    # Use the current interpreter's pip so the package lands in THIS
    # environment — a bare "pip" may belong to a different Python.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "TTS"])
    from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)


def ensure_output_dir():
    """Create (if needed) and return the directory for generated audio files."""
    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


def generate_audio(text):
    """Synthesize *text* to a WAV file and return the file path.

    Raises:
        gr.Error: if synthesis fails, so the UI shows the failure message.
    """
    try:
        output_path = os.path.join(ensure_output_dir(), "caption_audio.wav")
        tts.tts_to_file(text=text, file_path=output_path)
        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}")


def caption_my_image(image):
    """Caption *image* and speak the caption.

    Args:
        image: PIL image from the Gradio input (None if nothing uploaded).

    Returns:
        [audio_path, caption_text] matching the two Gradio outputs.

    Raises:
        gr.Error: on missing input, captioning failure, or TTS failure.
    """
    try:
        if image is None:
            raise gr.Error("Please upload an image")

        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]["generated_text"]
        print(f"Generated caption: {caption_text}")

        audio_path = generate_audio(caption_text)
        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing error — re-raise as-is instead of
        # double-wrapping it in another "Failed to process image" message.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}")


# Gradio UI: one image input, paired audio + text outputs.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[
        gr.Image(label="Upload Image", type="pil")
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="Image Captioning with Audio",
    description="""
    Upload an image and the application will:
    1. Generate a descriptive caption for the image
    2. Convert the caption to speech
    """,
    examples=[],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()