Spaces:

ANASAKHTAR
/

Image_Captions_With_Audio

Running

App Files Files Community

Muhammad Anas Akhtar committed on Dec 7, 2024

Commit

83ed3dd

verified ·

1 Parent(s): 9ad8324

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -24

app.py CHANGED Viewed

@@ -1,39 +1,96 @@
 import torch
 import gradio as gr
 from PIL import Image
-import scipy.io.wavfile as wavfile
 # Use a pipeline as a high-level helper
 from transformers import pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
 caption_image = pipeline("image-to-text",
-                model="Salesforce/blip-image-captioning-large", device=device)
 narrator = pipeline("text-to-speech",
-                    model="kakao-enterprise/vits-ljs")
 def generate_audio(text):
-    # Generate the narrated text
-    narrated_text = narrator(text)
-    # Save the audio to a WAV file
-    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
-                  data=narrated_text["audio"][0])
-    # Return the path to the saved audio file
-    return "output.wav"
-def caption_my_image(pil_image):
-    semantics = caption_image(images=pil_image)[0]['generated_text']
-    return generate_audio(semantics)
-demo = gr.Interface(fn=caption_my_image,
-                    inputs=[gr.Image(label="Select Image",type="pil")],
-                    outputs=[gr.Audio(label="Image Caption")],
-                    title="@GenAILearniverse Project 8: Image Captioning",
-                    description="THIS APPLICATION WILL BE USED TO CAPTION THE IMAGE.")
-demo.launch()

 import torch
 import gradio as gr
 from PIL import Image
+import numpy as np
+import os
 # Use a pipeline as a high-level helper
 from transformers import pipeline
+# Select the GPU when torch reports one is available; otherwise fall back to CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Build both pipelines once at module load so every request reuses them.
+# Image captioning: returns a list of dicts carrying a 'generated_text' entry.
 caption_image = pipeline("image-to-text",
+                       model="Salesforce/blip-image-captioning-large",
+                       device=device)
+# Text-to-speech: SpeechT5 replaces the previously used kakao-enterprise/vits-ljs model.
+# NOTE(review): SpeechT5 usually expects speaker embeddings to be supplied —
+# confirm the narrator call works without them on the installed transformers version.
 narrator = pipeline("text-to-speech",
+                   model="microsoft/speecht5_tts",
+                   device=device)
+def ensure_output_dir():
+    """Ensure the output directory exists"""
+    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
+    os.makedirs(output_dir, exist_ok=True)
+    return output_dir
 def generate_audio(text):
+    """
+    Generate audio from text and save it
+    """
+    try:
+        # Generate the speech
+        speech = narrator(text)
+        # Create output directory and file path
+        output_dir = ensure_output_dir()
+        output_path = os.path.join(output_dir, "caption_audio.wav")
+        # Save the audio file
+        with open(output_path, "wb") as f:
+            f.write(speech["audio"])
+        return output_path
+    except Exception as e:
+        print(f"Error generating audio: {str(e)}")
+        raise gr.Error(f"Failed to generate audio: {str(e)}")
+def caption_my_image(image):
+    """
+    Generate caption for image and convert it to speech
+    """
+    try:
+        if image is None:
+            raise gr.Error("Please upload an image")
+        # Generate caption
+        captions = caption_image(images=image)
+        if not captions or len(captions) == 0:
+            raise gr.Error("Could not generate caption for this image")
+        caption_text = captions[0]['generated_text']
+        print(f"Generated caption: {caption_text}")
+        # Generate audio from caption
+        audio_path = generate_audio(caption_text)
+        return [audio_path, caption_text]
+    except Exception as e:
+        print(f"Error in caption_my_image: {str(e)}")
+        raise gr.Error(f"Failed to process image: {str(e)}")
+# Create the Gradio interface: one image in, narrated audio plus caption text out.
+demo = gr.Interface(
+    fn=caption_my_image,
+    inputs=[
+        # type="pil" asks Gradio to hand the upload to the callback as a PIL image
+        gr.Image(label="Upload Image", type="pil")
+    ],
+    outputs=[
+        # Order matches the [audio_path, caption_text] list returned by caption_my_image
+        gr.Audio(label="Generated Audio"),
+        gr.Textbox(label="Generated Caption")
+    ],
+    title="Image Captioning with Audio",
+    description="""
+    Upload an image and the application will:
+    1. Generate a descriptive caption for the image
+    2. Convert the caption to speech
+    """,
+    # No example images are bundled, and example caching is switched off.
+    examples=[],
+    cache_examples=False
+)
+# Launch the app only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch()