"""Image captioning with audio.

Loads a BLIP image-captioning model and a Coqui TTS voice, then serves a
Gradio app that captions an uploaded image and reads the caption aloud.
"""

import os
import subprocess
import sys

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline

# Select GPU when available; the transformers pipeline accepts a device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Image-to-text pipeline (BLIP large captioning model).
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Import Coqui TTS, installing it on the fly if missing.
try:
    from TTS.api import TTS
except ImportError:
    print("Installing TTS...")
    # Use the current interpreter's pip so the package lands in THIS
    # environment — a bare "pip" may belong to a different Python.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "TTS"])
    from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)


def ensure_output_dir():
    """Create (if needed) and return the directory for generated audio files."""
    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


def generate_audio(text):
    """Synthesize *text* to a WAV file and return the file path.

    Raises:
        gr.Error: if synthesis fails, so the UI shows the failure message.
    """
    try:
        output_path = os.path.join(ensure_output_dir(), "caption_audio.wav")
        tts.tts_to_file(text=text, file_path=output_path)
        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}")


def caption_my_image(image):
    """Caption *image* and speak the caption.

    Args:
        image: PIL image from the Gradio input (None if nothing uploaded).

    Returns:
        [audio_path, caption_text] matching the two Gradio outputs.

    Raises:
        gr.Error: on missing input, captioning failure, or TTS failure.
    """
    try:
        if image is None:
            raise gr.Error("Please upload an image")

        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]["generated_text"]
        print(f"Generated caption: {caption_text}")

        audio_path = generate_audio(caption_text)
        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing error — re-raise as-is instead of
        # double-wrapping it in another "Failed to process image" message.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}")


# Gradio UI: one image input, paired audio + text outputs.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[
        gr.Image(label="Upload Image", type="pil")
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="Image Captioning with Audio",
    description="""
    Upload an image and the application will:
    1. Generate a descriptive caption for the image
    2. Convert the caption to speech
    """,
    examples=[],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()