Muhammad Anas Akhtar committed on
Commit
83ed3dd
·
verified ·
1 Parent(s): 9ad8324

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -24
app.py CHANGED
@@ -1,39 +1,96 @@
1
  import torch
2
  import gradio as gr
3
  from PIL import Image
4
- import scipy.io.wavfile as wavfile
 
5
 
6
  # Use a pipeline as a high-level helper
7
  from transformers import pipeline
8
 
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
10
 
11
-
12
  caption_image = pipeline("image-to-text",
13
- model="Salesforce/blip-image-captioning-large", device=device)
 
14
 
 
15
  narrator = pipeline("text-to-speech",
16
- model="kakao-enterprise/vits-ljs")
 
17
 
 
 
 
 
 
18
 
19
  def generate_audio(text):
20
- # Generate the narrated text
21
- narrated_text = narrator(text)
22
-
23
- # Save the audio to a WAV file
24
- wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
25
- data=narrated_text["audio"][0])
26
- # Return the path to the saved audio file
27
- return "output.wav"
28
-
29
-
30
- def caption_my_image(pil_image):
31
- semantics = caption_image(images=pil_image)[0]['generated_text']
32
- return generate_audio(semantics)
33
-
34
- demo = gr.Interface(fn=caption_my_image,
35
- inputs=[gr.Image(label="Select Image",type="pil")],
36
- outputs=[gr.Audio(label="Image Caption")],
37
- title="@GenAILearniverse Project 8: Image Captioning",
38
- description="THIS APPLICATION WILL BE USED TO CAPTION THE IMAGE.")
39
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import gradio as gr
3
  from PIL import Image
4
+ import numpy as np
5
+ import os
6
 
7
  # Use a pipeline as a high-level helper
8
  from transformers import pipeline
9
 
10
# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

# Captioning pipeline: turns an image into a short text description.
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Text-to-speech pipeline used to narrate the caption.
# This checkpoint was chosen as a more stable alternative for TTS.
narrator = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)
23
 
24
def ensure_output_dir():
    """Make sure the ``AudioCaptions`` folder exists in the user's home directory.

    The folder is created when missing; an existing folder is left alone.

    Returns:
        The path of the folder, as a string.
    """
    home = os.path.expanduser("~")
    target = os.path.join(home, "AudioCaptions")
    # exist_ok=True makes repeated calls harmless.
    os.makedirs(target, exist_ok=True)
    return target
29
 
30
def generate_audio(text):
    """Synthesize speech for *text* and save it as a playable WAV file.

    The narrator pipeline result is read through its ``"audio"`` and
    ``"sampling_rate"`` entries. The samples are converted to 16-bit PCM and
    written with a proper WAV header; dumping the raw sample buffer into a
    file named ``.wav`` yields a file that audio players cannot open.

    Args:
        text: The sentence to narrate.

    Returns:
        Path of the written WAV file (``caption_audio.wav`` inside the
        directory returned by ``ensure_output_dir()``).

    Raises:
        gr.Error: If *text* is empty or speech generation/saving fails.
    """
    import wave  # stdlib; imported locally so the block is self-contained

    if not text or not str(text).strip():
        raise gr.Error("Failed to generate audio: no text to narrate")

    try:
        # NOTE(review): SpeechT5 checkpoints may require speaker embeddings
        # (passed via forward_params) -- confirm against the pipeline docs.
        speech = narrator(text)

        # Assumes mono float samples in [-1.0, 1.0]; a leading batch axis of
        # size 1 is flattened away -- TODO confirm for the chosen model.
        samples = np.asarray(speech["audio"], dtype=np.float32).reshape(-1)
        sampling_rate = int(speech["sampling_rate"])

        # Clip before scaling so out-of-range samples cannot wrap around
        # when cast to 16-bit integers.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")

        output_dir = ensure_output_dir()
        output_path = os.path.join(output_dir, "caption_audio.wav")

        with wave.open(output_path, "wb") as wav_file:
            wav_file.setnchannels(1)   # mono
            wav_file.setsampwidth(2)   # 2 bytes per sample = 16-bit PCM
            wav_file.setframerate(sampling_rate)
            wav_file.writeframes(pcm.tobytes())

        return output_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}") from e
50
+
51
def caption_my_image(image):
    """Caption an image and narrate the caption.

    Args:
        image: The uploaded image (the interface is configured to pass a PIL
            image), or ``None`` when nothing was uploaded.

    Returns:
        A two-item list ``[audio_path, caption_text]``: the path of the
        generated audio file and the caption string.

    Raises:
        gr.Error: If no image was supplied, no caption could be produced, or
            any step of the processing fails.
    """
    # Validate before the try block so the message reaches the user verbatim
    # instead of being re-wrapped by the generic handler below.
    if image is None:
        raise gr.Error("Please upload an image")

    try:
        # Generate caption
        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        # Generate audio from caption
        audio_path = generate_audio(caption_text)

        return [audio_path, caption_text]
    except gr.Error:
        # Already user-facing (raised here or by generate_audio); pass it
        # through unchanged rather than stacking "Failed to ..." prefixes.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}") from e
74
+
75
# Build the Gradio interface: one image in, narrated audio plus caption out.
image_input = gr.Image(label="Upload Image", type="pil")
audio_output = gr.Audio(label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[image_input],
    outputs=[audio_output, caption_output],
    title="Image Captioning with Audio",
    description="""
Upload an image and the application will:
1. Generate a descriptive caption for the image
2. Convert the caption to speech
""",
    examples=[],
    cache_examples=False,
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()