Muhammad Anas Akhtar
Update app.py
83ed3dd verified
raw
history blame
2.79 kB
import torch
import gradio as gr
from PIL import Image
import numpy as np
import os
# Use a pipeline as a high-level helper
from transformers import pipeline
# Choose the compute device: the GPU when PyTorch reports CUDA, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Build both Hugging Face pipelines once at import time so every request
# reuses the already-loaded models.

# Image -> caption text.
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Caption text -> speech waveform.
narrator = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)
def ensure_output_dir():
    """Create ``~/AudioCaptions`` if it is missing and return its path.

    Returns:
        The absolute path of the ``AudioCaptions`` directory located in the
        current user's home directory.
    """
    home = os.path.expanduser("~")
    target = os.path.join(home, "AudioCaptions")
    # exist_ok=True makes repeated calls a no-op once the directory exists.
    os.makedirs(target, exist_ok=True)
    return target
def generate_audio(text):
    """
    Generate speech for ``text`` and save it as a real WAV file.

    The result of ``narrator(text)`` is read as a mapping with an ``"audio"``
    entry (the waveform samples) and a ``"sampling_rate"`` entry. The samples
    are converted to 16-bit PCM and written through the standard ``wave``
    module, so the file has a valid WAV header and the correct sample rate
    (dumping the raw sample buffer to disk does not produce a playable file).

    Args:
        text: The text to be spoken.

    Returns:
        Path of ``caption_audio.wav`` inside the directory returned by
        ``ensure_output_dir()``. The same file name is reused on every call.

    Raises:
        gr.Error: If speech synthesis or writing the file fails; the
            original exception is attached as the cause.
    """
    import wave  # local import: only this function needs the WAV writer

    try:
        # Generate the speech
        speech = narrator(text)

        # Flatten any leading batch/channel axis of size 1.
        # NOTE(review): assumes a mono float waveform in [-1.0, 1.0] -- confirm
        # against the TTS model in use.
        samples = np.asarray(speech["audio"], dtype=np.float32).reshape(-1)
        # Clip before scaling so out-of-range samples saturate instead of
        # wrapping around when cast to int16.
        pcm16 = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")

        # Create output directory and file path
        output_dir = ensure_output_dir()
        output_path = os.path.join(output_dir, "caption_audio.wav")

        # Save the audio with a proper WAV header (mono, 16-bit PCM)
        with wave.open(output_path, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # bytes per sample -> 16-bit
            wav_file.setframerate(int(speech["sampling_rate"]))
            wav_file.writeframes(pcm16.tobytes())

        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}") from e
def caption_my_image(image):
    """
    Generate a caption for ``image`` and convert that caption to speech.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded. It
            is passed to the captioning pipeline as ``images=image``.

    Returns:
        A two-item list ``[audio_path, caption_text]``: the path returned by
        ``generate_audio`` and the caption string.

    Raises:
        gr.Error: If no image was supplied, if the captioning pipeline
            returned nothing, if audio generation failed, or if any other
            unexpected error occurred. Errors that are already ``gr.Error``
            are re-raised unchanged so their message is not wrapped twice.
    """
    # Validate input up front; this is a user error, not a processing failure.
    if image is None:
        raise gr.Error("Please upload an image")

    try:
        # Generate caption
        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        # Generate audio from caption
        audio_path = generate_audio(caption_text)
        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing error (from above or from generate_audio):
        # pass it through as-is instead of re-wrapping its message.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}") from e
# Assemble the Gradio UI: one image input, with the spoken caption and its
# text as the two outputs (same order as caption_my_image's return list).
image_input = gr.Image(label="Upload Image", type="pil")
audio_output = gr.Audio(label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[image_input],
    outputs=[audio_output, caption_output],
    title="Image Captioning with Audio",
    description="""
Upload an image and the application will:
1. Generate a descriptive caption for the image
2. Convert the caption to speech
""",
    examples=[],
    cache_examples=False,
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()