Spaces:

dipta-dataist
/

Image-to-Story-Generation

Running

App Files Files Community

Diptaraj Sen committed on Apr 22

Commit

d87e8d0

0 Parent(s):

First Commit

Browse files

Files changed (13) hide show

.gitignore +5 -0
.streamlit/config.toml +2 -0
app/__init__.py +0 -0
app/captioning.py +36 -0
app/logger.py +31 -0
app/storytelling.py +53 -0
app/tts.py +38 -0
run_pipeline.py +26 -0
streamlit_app.py +42 -0
tests/__init__.py +0 -0
tests/test_captioning.py +9 -0
tests/test_story.py +7 -0
tests/test_tts.py +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+venv/
+__pycache__/
+outputs/
+logs/
+*.pyc

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [server]
2	+ runOnSave = true

app/__init__.py ADDED Viewed

File without changes

app/captioning.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from app.logger import get_logger
+logger = get_logger(__name__)
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from PIL import Image
+import torch
+# Load processor and model (ViT)
+model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+# Move model to GPU if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE:--------> ",device)
+model.to(device)
+def generate_caption(image_path: str) -> str:
+    logger.info("Generating caption...")
+    try:
+        # Open and convert image to RGB
+        image = Image.open(image_path).convert('RGB')
+        # Preprocess image and prepare inputs
+        inputs = processor(images=image, return_tensors="pt").to(device)
+        # Generate caption (greedy decoding for now)
+        output = model.generate(**inputs, max_length=16, num_beams=1)
+        # Decode output to text
+        caption = tokenizer.decode(output[0], skip_special_tokens=True)
+        logger.info(f"Caption generated: {caption}")
+        return caption
+    except Exception as e:
+        logger.exception("Failed to generate caption")
+        raise

app/logger.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import logging
+import os
+def get_logger(name: str):
+    logs_dir = "logs"
+    os.makedirs(logs_dir, exist_ok=True)
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    # File handler
+    file_handler = logging.FileHandler(os.path.join(logs_dir, "pipeline.log"))
+    file_handler.setLevel(logging.DEBUG)
+    # Console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+    # Formatter
+    formatter = logging.Formatter(
+        "[%(asctime)s] [%(levelname)s] - %(name)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+    )
+    file_handler.setFormatter(formatter)
+    console_handler.setFormatter(formatter)
+    # Avoid duplicate handlers
+    if not logger.handlers:
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+    return logger

app/storytelling.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from app.logger import get_logger
+logger = get_logger(__name__)
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+model = "google/flan-t5-small"
+# Load tokenizer and model
+tokenizer =AutoTokenizer.from_pretrained(model)
+model = AutoModelForSeq2SeqLM.from_pretrained(model)
+model.eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+def generate_story(caption: str, max_length: int = 256) -> str:
+    logger.info("Generating story...")
+    try:
+        # Turn caption into a story prompt
+        prompt = f"""
+You are a creative storyteller who writes engaging short stories.
+- Length: The story should have around 200-300 words
+- Your job is to take the image caption and expand it into a vivid short story.
+- Start with an engaging hook, build a little conflict, and wrap up with a satisfying ending.
+- Use descriptive language and maintain a consistent tone.
+Caption: "{caption}"
+Write the story below:
+""".strip()
+        # Tokenize and run through model
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
+        outputs = model.generate(
+        **inputs,
+        max_length=max_length,
+        do_sample=True,
+        top_k=50,
+        top_p=0.95,
+        temperature=0.7,
+        num_return_sequences=1,
+        pad_token_id=tokenizer.pad_token_id,
+        early_stopping=True,
+        repetition_penalty=1.2,
+        length_penalty=1.0)
+        # Decode generated text
+        story = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return story.replace(prompt, "").strip()
+    except Exception as e:
+        logger.exception(f"Failed to generate story: {str(e)}")
+        raise

app/tts.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from app.logger import get_logger
+logger = get_logger(__name__)
+from gtts import gTTS
+from playsound import playsound
+import os
+import uuid
+def speak_story(story: str, lang: str = 'en') -> str:
+    """
+    Converts the story text to speech and saves it as an audio file.
+    Optionally plays the audio.
+    Returns the path to the audio file.
+    """
+    logger.info("Generating audio...")
+    try:
+        # Generate a unique filename
+        filename = f"story_{uuid.uuid4().hex}.mp3"
+        filepath = os.path.join("outputs", filename)
+        # Ensure outputs/ directory exists
+        os.makedirs("outputs", exist_ok=True)
+        # Generate TTS from text
+        tts = gTTS(text=story, lang=lang)
+        tts.save(filepath)
+        # Play the audio (optional)
+        try:
+            playsound(filepath)
+        except Exception as e:
+            logger.exception("Couldn't play audio: {e}")
+        return filepath
+    except Exception as e:
+        logger.exception("Failed to generate audio:{e}")
+        raise

run_pipeline.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+import argparse
+from app.captioning import generate_caption
+from app.storytelling import generate_story
+from app.tts import speak_story
+def main(file_name):
+    image_path = os.path.join(os.path.dirname(__file__), "assets",file_name)
+    print("🔍 Generating caption from image...")
+    caption = generate_caption(image_path)
+    print(f"\n🖼️  Caption: {caption}")
+    print("\n✍️  Generating story from caption...")
+    story = generate_story(caption)
+    print(f"\n📖 Story:\n{story}")
+    print("\n🔊 Converting story to speech...")
+    audio_path = speak_story(story)
+    print(f"\n✅ Audio saved at: {audio_path}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run image → caption → story → speech pipeline")
+    parser.add_argument("image_path", type=str, help="Path to the input image")
+    args = parser.parse_args()
+    main(args.image_path)

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import streamlit as st
+from app.captioning import generate_caption
+from app.storytelling import generate_story
+from app.tts import speak_story
+import tempfile
+from PIL import Image
+st.set_page_config(page_title="GenAI Storyteller", layout="centered")
+st.title("📸🧠 GenAI Storyteller")
+st.markdown("Upload an image, get a caption, a story, and hear it spoken aloud!")
+uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+if uploaded_file:
+    # Show uploaded image
+    image = Image.open(uploaded_file)
+    st.image(image, caption="Uploaded Image", use_container_width=True)
+    # Process the pipeline on button click
+    if st.button("Generate Story"):
+        with st.spinner("Generating caption..."):
+            # Save uploaded image to a temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
+                image.save(tmp.name)
+                caption = generate_caption(tmp.name)
+        st.success("Caption Generated!")
+        st.write(f"**Caption**: {caption}")
+        with st.spinner("Generating story..."):
+            story = generate_story(caption)
+        st.success("Story Generated!")
+        st.text_area("📖 Story", story, height=250)
+        with st.spinner("Generating audio..."):
+            audio_path = speak_story(story)
+        st.success("Done! Here's the story in audio:")
+        audio_file = open(audio_path, "rb")
+        st.audio(audio_file.read(), format="audio/mp3")

tests/__init__.py ADDED Viewed

File without changes

tests/test_captioning.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+from app.captioning import generate_caption
+# Build image path relative to this file
+file_name = "IMG_3736.jpg"
+image_path = os.path.join(os.path.dirname(__file__), file_name)
+caption = generate_caption(image_path)  # Put a real image path here
+print("Caption:", caption)

tests/test_story.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# test_story.py
+from app.storytelling import generate_story
+caption = "a group of people standing in a pool"
+story = generate_story(caption)
+print("Generated Story:\n", story)

tests/test_tts.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from app.tts import speak_story
+story = """Once upon a time in a quiet village, a curious cat named Whiskers loved to watch the birds from his favorite spot by the window..."""
+audio_path = speak_story(story)
+print("Audio saved to:", audio_path)