"""Gradio app that captions an uploaded image and reads the caption aloud."""

import os
import wave

import gradio as gr
import numpy as np
import torch
from PIL import Image
# Use a pipeline as a high-level helper
from transformers import pipeline

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize the pipelines (models are loaded once, at import time)
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Using a different TTS model that's more stable
narrator = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)


def ensure_output_dir():
    """Create ``~/AudioCaptions`` if it is missing and return its path."""
    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


def _write_wav(path, samples, sampling_rate):
    """Write mono float samples to *path* as a 16-bit PCM WAV file.

    Args:
        path: Destination file path.
        samples: Array-like of float samples; singleton dimensions such as
            a leading ``(1, N)`` batch axis are dropped.
        sampling_rate: Sample rate in Hz to record in the WAV header.

    Raises:
        ValueError: If the samples are not mono after squeezing.
    """
    mono = np.atleast_1d(np.asarray(samples, dtype=np.float32).squeeze())
    if mono.ndim != 1:
        raise ValueError(f"Expected mono audio, got array of shape {mono.shape}")

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around when cast to int16.
    pcm = (np.clip(mono, -1.0, 1.0) * 32767.0).astype("<i2")

    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wav_file.setframerate(int(sampling_rate))
        wav_file.writeframes(pcm.tobytes())


def generate_audio(text):
    """Synthesize speech for *text* and save it as a WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Path of the written file, ``~/AudioCaptions/caption_audio.wav``.

    Raises:
        gr.Error: If synthesis or writing the file fails.
    """
    try:
        # The pipeline returns the waveform under "audio" and its sample
        # rate under "sampling_rate"; both are needed for a valid WAV file.
        speech = narrator(text)

        # NOTE: the file name is fixed, so each request overwrites the
        # previous caption audio.
        output_path = os.path.join(ensure_output_dir(), "caption_audio.wav")

        # Write a real WAV container rather than dumping the raw sample
        # buffer, which would produce a header-less, unplayable file.
        _write_wav(output_path, speech["audio"], speech["sampling_rate"])

        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}") from e


def caption_my_image(image):
    """Generate a caption for *image* and convert the caption to speech.

    Args:
        image: The uploaded image (the interface supplies a PIL image), or
            ``None`` when nothing was uploaded.

    Returns:
        A two-item list ``[audio_path, caption_text]`` matching the
        interface outputs (audio player, caption textbox).

    Raises:
        gr.Error: If no image was provided, no caption could be generated,
            or audio generation failed.
    """
    try:
        if image is None:
            raise gr.Error("Please upload an image")

        # Generate caption
        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        # Generate audio from caption
        audio_path = generate_audio(caption_text)

        return [audio_path, caption_text]
    except gr.Error:
        # Already a user-facing message; re-raise untouched so it is not
        # wrapped a second time by the generic handler below.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}") from e


# Create the Gradio interface
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[
        gr.Image(label="Upload Image", type="pil")
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Image Captioning with Audio",
    description="""
    Upload an image and the application will:
    1. Generate a descriptive caption for the image
    2. Convert the caption to speech
    """,
    examples=[],
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch()