import torch
import gradio as gr
from PIL import Image
import numpy as np
import os

# Use a pipeline as a high-level helper
from transformers import pipeline

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Image captioning pipeline (BLIP large checkpoint), placed on the chosen device.
caption_image = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Text-to-speech pipeline; SpeechT5 was picked by the original author as the
# more stable TTS option.
narrator = pipeline(
    task="text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)

def ensure_output_dir():
    """Create the ``AudioCaptions`` folder in the user's home and return its path.

    The directory is created on demand; calling this again when it already
    exists is harmless.
    """
    home_dir = os.path.expanduser("~")
    captions_dir = os.path.join(home_dir, "AudioCaptions")
    # exist_ok=True makes repeated calls a no-op instead of an error.
    os.makedirs(captions_dir, exist_ok=True)
    return captions_dir

def generate_audio(text):
    """
    Synthesize speech for ``text`` and save it as a WAV file.

    The text is passed to the module-level ``narrator`` pipeline and the
    resulting waveform is written to ``caption_audio.wav`` inside the
    directory returned by ``ensure_output_dir()``. The file is overwritten
    on every call.

    Args:
        text: The text to narrate.

    Returns:
        The path of the written WAV file.

    Raises:
        gr.Error: If synthesis or writing the file fails; the original
            exception is chained as the cause.
    """
    # Local import so this function does not depend on a new file-level import.
    import wave

    try:
        # Generate the speech
        speech = narrator(text)

        # Flatten to a single mono channel and convert to 16-bit PCM.
        # NOTE(review): assumes the pipeline returns a float waveform in
        # [-1, 1] under "audio" plus its "sampling_rate" -- confirm against
        # the installed transformers version.
        samples = np.asarray(speech["audio"], dtype=np.float32).ravel()
        pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")

        # Create output directory and file path
        output_dir = ensure_output_dir()
        output_path = os.path.join(output_dir, "caption_audio.wav")

        # Write a real WAV container (header + frames); dumping the raw
        # sample buffer into a ".wav" file yields something players reject.
        with wave.open(output_path, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
            wav_file.setframerate(int(speech["sampling_rate"]))
            wav_file.writeframes(pcm.tobytes())

        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}") from e

def caption_my_image(image):
    """
    Generate a caption for an image and convert it to speech.

    Args:
        image: The image to caption, or ``None`` when nothing was uploaded.

    Returns:
        A two-item list ``[audio_path, caption_text]``: the path returned by
        ``generate_audio`` and the generated caption string.

    Raises:
        gr.Error: If no image was supplied, no caption was produced, audio
            generation failed, or any other error occurred while processing.
    """
    try:
        if image is None:
            raise gr.Error("Please upload an image")

        # Generate caption
        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        # Generate audio from caption
        audio_path = generate_audio(caption_text)

        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing message (raised above or by generate_audio);
        # re-raise untouched so it is not wrapped a second time as
        # "Failed to process image: ...".
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}") from e

# Build the web UI: one image input, wired through caption_my_image to an
# audio player and a caption text box.
demo = gr.Interface(
    fn=caption_my_image,
    title="Image Captioning with Audio",
    description="""
    Upload an image and the application will:
    1. Generate a descriptive caption for the image
    2. Convert the caption to speech
    """,
    # The image is handed to the callback as a PIL image.
    inputs=[gr.Image(label="Upload Image", type="pil")],
    # Output order matches the [audio_path, caption_text] list returned by fn.
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    # No bundled examples, so there is nothing to cache.
    examples=[],
    cache_examples=False,
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()