|
import torch |
|
import gradio as gr |
|
from PIL import Image |
|
import numpy as np |
|
import os |
|
|
|
|
|
from transformers import pipeline |
|
|
|
|
|
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Image -> caption model. Built once at import time and reused per request.
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Caption text -> speech model. Also built once at import time.
narrator = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)
|
|
|
def ensure_output_dir():
    """Create the ``AudioCaptions`` folder in the user's home directory.

    The folder is created if it is missing; an existing folder is left
    untouched.

    Returns:
        The path of the ``AudioCaptions`` directory as a string.
    """
    home = os.path.expanduser("~")
    target = os.path.join(home, "AudioCaptions")
    # exist_ok=True makes repeated calls harmless.
    os.makedirs(target, exist_ok=True)
    return target
|
|
|
def generate_audio(text):
    """Synthesize speech for ``text`` and save it as a 16-bit PCM WAV file.

    Args:
        text: The text to be spoken.

    Returns:
        Path of the written file, ``caption_audio.wav`` inside the directory
        returned by ``ensure_output_dir()``. The same file name is used on
        every call, so a previous result is overwritten.

    Raises:
        gr.Error: If speech synthesis or writing the file fails. The
            underlying exception is attached as the cause.
    """
    # Standard-library module, imported here so this function is self-contained.
    import wave

    try:
        # NOTE(review): SpeechT5 is normally invoked with speaker embeddings
        # (forward_params={"speaker_embeddings": ...}); none are supplied here.
        # Confirm this call succeeds with the installed transformers version.
        speech = narrator(text)

        output_path = os.path.join(ensure_output_dir(), "caption_audio.wav")

        # The pipeline result holds a float waveform plus its sampling rate.
        # Dumping the array's raw bytes would not produce a valid WAV file,
        # so convert to PCM and write a proper container instead.
        # Normalize the layout to (channels, samples); mono output may arrive
        # as (n,), (1, n) or (n, 1).
        samples = np.atleast_2d(np.squeeze(np.asarray(speech["audio"], dtype=np.float32)))
        channel_count = samples.shape[0]

        # Clip before scaling so out-of-range floats cannot wrap around when
        # cast to 16-bit integers. "<i2" is little-endian int16, as WAV expects.
        pcm16 = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")

        with wave.open(output_path, "wb") as wav_file:
            wav_file.setnchannels(channel_count)
            wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit audio
            wav_file.setframerate(int(speech["sampling_rate"]))
            # Transposing to (samples, channels) yields interleaved frames
            # when serialized in C order.
            wav_file.writeframes(pcm16.T.tobytes())

        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}") from e
|
|
|
def caption_my_image(image):
    """Caption an image and turn the caption into speech.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A two-item list ``[audio_path, caption_text]``: the path returned by
        ``generate_audio`` and the generated caption string.

    Raises:
        gr.Error: If no image was supplied, no caption was produced, audio
            generation failed, or any other step raised an exception.
    """
    # Validate the input before any model work and outside the try block,
    # so this message reaches the user exactly as written.
    if image is None:
        raise gr.Error("Please upload an image")

    try:
        # Passed positionally so the call does not depend on the name of the
        # pipeline's first parameter.
        captions = caption_image(image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        audio_path = generate_audio(caption_text)

        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing error (raised above or by generate_audio).
        # Re-raise as-is rather than wrapping it in a second
        # "Failed to process image: ..." message.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}") from e
|
|
|
|
|
# Web UI: one image input; outputs are the spoken caption and the caption text,
# in the same order as the list returned by caption_my_image.
demo = gr.Interface(
    title="Image Captioning with Audio",
    description="""
Upload an image and the application will:
1. Generate a descriptive caption for the image
2. Convert the caption to speech
""",
    fn=caption_my_image,
    inputs=[gr.Image(type="pil", label="Upload Image")],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    examples=[],
    cache_examples=False,
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()