Abhinav Singh committed on
Commit
8464b63
·
1 Parent(s): 0ebbc88
Files changed (2)
  1. app.py +460 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,460 @@
+ import os
+ import torch
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+ from transformers import AutoTokenizer, AutoModel
+ from decord import VideoReader, cpu
+ import tempfile
+ import json
+ from typing import List, Tuple, Optional, Union
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Constants
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+ MODEL_PATH = "OpenGVLab/InternVL2_5-4B"
+
+ class InternVLChatBot:
+     def __init__(self):
+         self.model = None
+         self.tokenizer = None
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.generation_config = dict(max_new_tokens=1024, do_sample=True)
+         self.load_model()
+
+     def load_model(self):
+         """Load the InternVL model and tokenizer"""
+         try:
+             logger.info("Loading InternVL2.5-4B model...")
+             self.model = AutoModel.from_pretrained(
+                 MODEL_PATH,
+                 torch_dtype=torch.bfloat16,
+                 low_cpu_mem_usage=True,
+                 trust_remote_code=True,
+                 use_flash_attn=True if self.device == "cuda" else False,
+                 device_map="auto" if self.device == "cuda" else None
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 MODEL_PATH, trust_remote_code=True
+             )
+             logger.info("Model loaded successfully!")
+         except Exception as e:
+             logger.error(f"Error loading model: {str(e)}")
+             raise e
+
+     def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
+         """Find the closest aspect ratio from target ratios"""
+         best_ratio_diff = float('inf')
+         best_ratio = (1, 1)
+         area = width * height
+
+         for ratio in target_ratios:
+             target_aspect_ratio = ratio[0] / ratio[1]
+             ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+             if ratio_diff < best_ratio_diff:
+                 best_ratio_diff = ratio_diff
+                 best_ratio = ratio
+             elif ratio_diff == best_ratio_diff:
+                 if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                     best_ratio = ratio
+         return best_ratio
+
+     def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+         """Dynamically preprocess image based on aspect ratio"""
+         orig_width, orig_height = image.size
+         aspect_ratio = orig_width / orig_height
+
+         # Calculate target ratios
+         target_ratios = set(
+             (i, j) for n in range(min_num, max_num + 1)
+             for i in range(1, n + 1)
+             for j in range(1, n + 1)
+             if i * j <= max_num and i * j >= min_num
+         )
+         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+         # Find closest aspect ratio
+         target_aspect_ratio = self.find_closest_aspect_ratio(
+             aspect_ratio, target_ratios, orig_width, orig_height, image_size
+         )
+
+         # Calculate target dimensions
+         target_width = image_size * target_aspect_ratio[0]
+         target_height = image_size * target_aspect_ratio[1]
+         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+         # Resize and split image
+         resized_img = image.resize((target_width, target_height))
+         processed_images = []
+
+         for i in range(blocks):
+             box = (
+                 (i % (target_width // image_size)) * image_size,
+                 (i // (target_width // image_size)) * image_size,
+                 ((i % (target_width // image_size)) + 1) * image_size,
+                 ((i // (target_width // image_size)) + 1) * image_size
+             )
+             split_img = resized_img.crop(box)
+             processed_images.append(split_img)
+
+         if use_thumbnail and len(processed_images) != 1:
+             thumbnail_img = image.resize((image_size, image_size))
+             processed_images.append(thumbnail_img)
+
+         return processed_images
+
+     def build_transform(self, input_size):
+         """Build image transformation pipeline"""
+         transform = T.Compose([
+             T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+             T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+             T.ToTensor(),
+             T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+         ])
+         return transform
+
+     def load_image(self, image_path, input_size=448, max_num=12):
+         """Load and preprocess image"""
+         if isinstance(image_path, str):
+             image = Image.open(image_path).convert('RGB')
+         else:
+             image = image_path.convert('RGB')
+
+         transform = self.build_transform(input_size=input_size)
+         images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+         pixel_values = [transform(img) for img in images]
+         pixel_values = torch.stack(pixel_values)
+         return pixel_values
+
+     def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
+         """Get frame indices for video processing"""
+         if bound:
+             start, end = bound[0], bound[1]
+         else:
+             start, end = -100000, 100000
+
+         start_idx = max(first_idx, round(start * fps))
+         end_idx = min(round(end * fps), max_frame)
+         seg_size = float(end_idx - start_idx) / num_segments
+
+         frame_indices = np.array([
+             int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+             for idx in range(num_segments)
+         ])
+         return frame_indices
+
+     def load_video(self, video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+         """Load and preprocess video"""
+         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+         max_frame = len(vr) - 1
+         fps = float(vr.get_avg_fps())
+
+         pixel_values_list, num_patches_list = [], []
+         transform = self.build_transform(input_size=input_size)
+         frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
+
+         for frame_index in frame_indices:
+             img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
+             img = self.dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+             pixel_values = [transform(tile) for tile in img]
+             pixel_values = torch.stack(pixel_values)
+             num_patches_list.append(pixel_values.shape[0])
+             pixel_values_list.append(pixel_values)
+
+         pixel_values = torch.cat(pixel_values_list)
+         return pixel_values, num_patches_list
+
+     def chat(self, message, history, image=None, video=None):
+         """Main chat function"""
+         try:
+             pixel_values = None
+             num_patches_list = None
+
+             # Process image if provided
+             if image is not None:
+                 pixel_values = self.load_image(image, max_num=12)
+                 if self.device == "cuda":
+                     pixel_values = pixel_values.to(torch.bfloat16).cuda()
+                 message = f"<image>\n{message}"
+
+             # Process video if provided
+             elif video is not None:
+                 pixel_values, num_patches_list = self.load_video(video, num_segments=8, max_num=1)
+                 if self.device == "cuda":
+                     pixel_values = pixel_values.to(torch.bfloat16).cuda()
+                 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+                 message = f"{video_prefix}{message}"
+
+             # Convert history to the expected format
+             chat_history = []
+             if history:
+                 for item in history:
+                     if len(item) == 2:
+                         chat_history.append((item[0], item[1]))
+
+             # Generate response
+             if num_patches_list is not None:
+                 response, new_history = self.model.chat(
+                     self.tokenizer,
+                     pixel_values,
+                     message,
+                     self.generation_config,
+                     num_patches_list=num_patches_list,
+                     history=chat_history,
+                     return_history=True
+                 )
+             else:
+                 response, new_history = self.model.chat(
+                     self.tokenizer,
+                     pixel_values,
+                     message,
+                     self.generation_config,
+                     history=chat_history,
+                     return_history=True
+                 )
+
+             # Update history
+             history.append([message, response])
+
+             return "", history, None, None
+
+         except Exception as e:
+             logger.error(f"Error in chat: {str(e)}")
+             error_msg = f"Sorry, I encountered an error: {str(e)}"
+             history.append([message, error_msg])
+             return "", history, None, None
+
+ # Initialize the chatbot
+ chatbot = InternVLChatBot()
+
+ # Create Gradio interface
+ def create_interface():
+     """Create the Gradio interface"""
+
+     # Custom CSS for better styling
+     custom_css = """
+     .gradio-container {
+         font-family: 'Arial', sans-serif;
+     }
+     .chat-message {
+         padding: 10px;
+         margin: 5px 0;
+         border-radius: 10px;
+     }
+     .user-message {
+         background-color: #e3f2fd;
+         margin-left: 20px;
+     }
+     .bot-message {
+         background-color: #f5f5f5;
+         margin-right: 20px;
+     }
+     """
+
+     with gr.Blocks(css=custom_css, title="InternVL2.5-4B Chat") as interface:
+         gr.Markdown("""
+         # 🤖 InternVL2.5-4B Multimodal Chat
+
+         Welcome to the InternVL2.5-4B chat interface! This AI assistant can:
+         - 💬 Have conversations with text
+         - 🖼️ Analyze and describe images
+         - 🎥 Process and understand videos
+         - 📝 Extract text from images (OCR)
+         - 🎯 Answer questions about visual content
+
+         **Instructions:**
+         1. Type your message in the text box
+         2. Optionally upload an image or video
+         3. Click Send to get a response
+         4. Use "Clear" to reset the conversation
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=3):
+                 chatbot_interface = gr.Chatbot(
+                     label="Chat History",
+                     height=500,
+                     show_copy_button=True,
+                     avatar_images=["👤", "🤖"]
+                 )
+
+                 with gr.Row():
+                     msg = gr.Textbox(
+                         label="Your Message",
+                         placeholder="Type your message here... You can ask about images, videos, or just chat!",
+                         lines=2,
+                         scale=4
+                     )
+                     send_btn = gr.Button("Send 📤", scale=1, variant="primary")
+
+                 with gr.Row():
+                     clear_btn = gr.Button("Clear 🗑️", scale=1)
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📎 Upload Media")
+
+                 image_input = gr.Image(
+                     label="Upload Image",
+                     type="pil",
+                     height=200
+                 )
+
+                 video_input = gr.Video(
+                     label="Upload Video",
+                     height=200
+                 )
+
+                 gr.Markdown("""
+                 **Supported formats:**
+                 - Images: JPG, PNG, WEBP, GIF
+                 - Videos: MP4, AVI, MOV, WEBM
+
+                 **Tips:**
+                 - For images: Ask about content, extract text, or describe what you see
+                 - For videos: Ask for descriptions, analysis, or specific details
+                 - You can upload one media file at a time
+                 """)
+
+         # Example prompts
+         gr.Markdown("### 💡 Example Prompts")
+         with gr.Row():
+             example_btn1 = gr.Button("👋 Hello, introduce yourself")
+             example_btn2 = gr.Button("🖼️ Describe this image")
+             example_btn3 = gr.Button("📝 Extract text from image")
+             example_btn4 = gr.Button("🎥 Analyze this video")
+
+         # Event handlers
+         def submit_message(message, history, image, video):
+             if not message.strip():
+                 return "", history, image, video
+             return chatbot.chat(message, history, image, video)
+
+         def clear_chat():
+             return [], None, None
+
+         def set_example_prompt(prompt):
+             return prompt
+
+         # Wire up the interface
+         send_btn.click(
+             fn=submit_message,
+             inputs=[msg, chatbot_interface, image_input, video_input],
+             outputs=[msg, chatbot_interface, image_input, video_input]
+         )
+
+         msg.submit(
+             fn=submit_message,
+             inputs=[msg, chatbot_interface, image_input, video_input],
+             outputs=[msg, chatbot_interface, image_input, video_input]
+         )
+
+         clear_btn.click(
+             fn=clear_chat,
+             outputs=[chatbot_interface, image_input, video_input]
+         )
+
+         # Example button handlers
+         example_btn1.click(
+             fn=set_example_prompt,
+             inputs=[gr.State("Hello, who are you?")],
+             outputs=[msg]
+         )
+
+         example_btn2.click(
+             fn=set_example_prompt,
+             inputs=[gr.State("Please describe this image in detail.")],
+             outputs=[msg]
+         )
+
+         example_btn3.click(
+             fn=set_example_prompt,
+             inputs=[gr.State("Extract the exact text provided in the image.")],
+             outputs=[msg]
+         )
+
+         example_btn4.click(
+             fn=set_example_prompt,
+             inputs=[gr.State("Describe this video in detail.")],
+             outputs=[msg]
+         )
+
+         # Footer
+         gr.Markdown("""
+         ---
+         **About InternVL2.5-4B:** A powerful multimodal AI model developed by Shanghai AI Lab, Tsinghua University and partners.
+
+         **API Usage:** This interface supports API calls. The chat endpoint accepts JSON with `message`, `image`, and `video` fields.
+         """)
+
+     return interface
+
+ # API endpoint for external integrations
+ def api_chat(message: str, image: Optional[str] = None, video: Optional[str] = None, history: Optional[List] = None):
+     """
+     API endpoint for chat functionality
+
+     Args:
+         message: Text message
+         image: Base64 encoded image or image path
+         video: Video file path
+         history: Chat history as list of [user_msg, bot_msg] pairs
+
+     Returns:
+         Dictionary with response and updated history
+     """
+     try:
+         if history is None:
+             history = []
+
+         # Process image if provided (handle base64 or file path)
+         image_obj = None
+         if image:
+             try:
+                 if image.startswith('data:image'):
+                     # Handle base64 image
+                     import base64
+                     from io import BytesIO
+                     image_data = image.split(',')[1]
+                     image_bytes = base64.b64decode(image_data)
+                     image_obj = Image.open(BytesIO(image_bytes))
+                 else:
+                     # Handle file path
+                     image_obj = Image.open(image)
+             except Exception as e:
+                 logger.error(f"Error processing image: {str(e)}")
+
+         # Chat with the model
+         _, updated_history, _, _ = chatbot.chat(message, history, image_obj, video)
+
+         return {
+             "response": updated_history[-1][1] if updated_history else "",
+             "history": updated_history,
+             "status": "success"
+         }
+     except Exception as e:
+         logger.error(f"API Error: {str(e)}")
+         return {
+             "response": f"Error: {str(e)}",
+             "history": history,
+             "status": "error"
+         }
+
+ if __name__ == "__main__":
+     # Create and launch the interface
+     interface = create_interface()
+
+     # Queue requests; the old enable_queue launch argument was removed in Gradio 4.x
+     interface.queue()
+
+     # Launch with API access enabled
+     interface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True,
+         show_api=True
+     )
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch>=2.0.0
+ transformers>=4.37.0
+ gradio>=4.0.0
+ torchvision>=0.15.0
+ pillow>=9.0.0
+ numpy>=1.21.0
+ decord>=0.6.0
+ accelerate>=0.20.0
+ bitsandbytes>=0.41.0
+ flash-attn>=2.3.0
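
A minimal usage sketch for the `api_chat` helper added in `app.py` above, assuming the module is importable from the working directory and using a hypothetical local file `example.jpg` as the image (both are illustrative assumptions, not part of the commit):

import base64
from app import api_chat  # assumption: app.py is on the import path; importing it loads the model

# Build a data URI so api_chat takes its base64 branch (it checks startswith('data:image'))
with open("example.jpg", "rb") as f:  # hypothetical sample image
    data_uri = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode()

result = api_chat("What is shown in this picture?", image=data_uri)
print(result["status"])
print(result["response"])

Per its docstring, the same helper also accepts a plain file path for `image`, a `video` path, and an optional `history` list of [user_msg, bot_msg] pairs.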