Spaces:

ANASAKHTAR
/

Image_Captions_With_Audio

Running

Muhammad Anas Akhtar

Update app.py

83ed3dd verified 5 months ago

2.79 kB

	import torch
	import gradio as gr
	from PIL import Image
	import numpy as np
	import os

	# Use a pipeline as a high-level helper
	from transformers import pipeline

	# Choose the compute device: GPU when CUDA is usable, CPU otherwise.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Using device: {device}")

	# Image captioning pipeline (BLIP large checkpoint), placed on `device`.
	caption_image = pipeline(
	    task="image-to-text",
	    model="Salesforce/blip-image-captioning-large",
	    device=device,
	)

	# Text-to-speech pipeline; per the author's note this model was picked
	# because it proved more stable than the one used previously.
	narrator = pipeline(
	    task="text-to-speech",
	    model="microsoft/speecht5_tts",
	    device=device,
	)

	def ensure_output_dir():
	    """Create the ``AudioCaptions`` folder under ``~`` if needed.

	    Returns:
	        The path of the ``AudioCaptions`` directory, built from the
	        expanded ``~`` of the current user.
	    """
	    home = os.path.expanduser("~")
	    target = os.path.join(home, "AudioCaptions")
	    # exist_ok=True makes repeated calls harmless.
	    os.makedirs(target, exist_ok=True)
	    return target

	def _write_wav(path, samples, sampling_rate):
	    """Write a mono waveform to ``path`` as a 16-bit PCM WAV file."""
	    import wave  # local import so this block does not rely on a new file-level import

	    # Flatten so that (n,) and (1, n) arrays are both treated as one mono channel.
	    waveform = np.asarray(samples, dtype=np.float32).reshape(-1)
	    # Clip before scaling so out-of-range floats cannot wrap around in int16.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype("<i2")

	    with wave.open(path, "wb") as wav_file:
	        wav_file.setnchannels(1)
	        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit
	        wav_file.setframerate(int(sampling_rate))
	        wav_file.writeframes(pcm.tobytes())


	def generate_audio(text):
	    """Synthesize ``text`` to speech and save it as a WAV file.

	    The previous version dumped the raw sample buffer into a file named
	    ``.wav``; without a RIFF/WAVE header and sampling rate that file is not
	    playable. The samples are now encoded as a real 16-bit PCM WAV.

	    Args:
	        text: The sentence to speak.

	    Returns:
	        Path of the written file, ``caption_audio.wav`` inside the directory
	        returned by ``ensure_output_dir()``. The file is overwritten on
	        every call.

	    Raises:
	        gr.Error: If synthesis or writing the file fails; the original
	            exception is attached as the cause.
	    """
	    try:
	        # NOTE(review): depending on the installed transformers version,
	        # SpeechT5 may require speaker embeddings passed through
	        # `forward_params` -- confirm against the deployed environment.
	        speech = narrator(text)

	        output_path = os.path.join(ensure_output_dir(), "caption_audio.wav")

	        # Assumes the pipeline output holds a float waveform under "audio"
	        # and its rate under "sampling_rate" (transformers TTS pipeline
	        # contract) -- TODO confirm the amplitude range is [-1, 1].
	        _write_wav(output_path, speech["audio"], speech["sampling_rate"])

	        return output_path
	    except Exception as e:
	        print(f"Error generating audio: {str(e)}")
	        raise gr.Error(f"Failed to generate audio: {str(e)}") from e

	def caption_my_image(image):
	    """Caption an image and turn the caption into speech.

	    Args:
	        image: The uploaded image as delivered by the Gradio image input,
	            or ``None`` when nothing was uploaded.

	    Returns:
	        A two-item list ``[audio_path, caption_text]`` matching the order of
	        the interface outputs (audio first, then caption text).

	    Raises:
	        gr.Error: With a specific message for a missing image, an empty
	            captioning result, or an audio failure; any other exception is
	            wrapped as "Failed to process image: ...".
	    """
	    try:
	        if image is None:
	            raise gr.Error("Please upload an image")

	        captions = caption_image(images=image)
	        # An empty or missing result is falsy, so one truthiness test suffices.
	        if not captions:
	            raise gr.Error("Could not generate caption for this image")

	        caption_text = captions[0]['generated_text']
	        print(f"Generated caption: {caption_text}")

	        audio_path = generate_audio(caption_text)

	        return [audio_path, caption_text]
	    except gr.Error as e:
	        # Already a user-facing message: log it, but do not wrap it again
	        # (previously it surfaced as "Failed to process image: <message>").
	        print(f"Error in caption_my_image: {str(e)}")
	        raise
	    except Exception as e:
	        print(f"Error in caption_my_image: {str(e)}")
	        raise gr.Error(f"Failed to process image: {str(e)}") from e

	# Build the UI components first: one image input, then audio and text outputs.
	_image_input = gr.Image(label="Upload Image", type="pil")
	_audio_output = gr.Audio(label="Generated Audio")
	_caption_output = gr.Textbox(label="Generated Caption")

	# Wire the components to caption_my_image; no examples are shipped or cached.
	demo = gr.Interface(
	    fn=caption_my_image,
	    inputs=[_image_input],
	    outputs=[_audio_output, _caption_output],
	    title="Image Captioning with Audio",
	    description="""
	Upload an image and the application will:
	1. Generate a descriptive caption for the image
	2. Convert the caption to speech
	""",
	    examples=[],
	    cache_examples=False,
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()