Upload Callisto_OCR3_2B_Instruct.ipynb

Browse files

Files changed (1) hide show

Callisto-OCR3-2B-Instruct-Demo/Callisto_OCR3_2B_Instruct.ipynb +327 -0

Callisto-OCR3-2B-Instruct-Demo/Callisto_OCR3_2B_Instruct.ipynb ADDED Viewed

	@@ -0,0 +1,327 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "YQWbceFBfAzQ"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+        "%pip install -q gradio spaces transformers accelerate\n",
+        "%pip install -q numpy requests torch torchvision\n",
+        "%pip install -q qwen-vl-utils av ipython reportlab\n",
+        "%pip install -q fpdf python-docx pillow huggingface_hub"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#Demo\n",
+        "import gradio as gr\n",
+        "import spaces\n",
+        "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer\n",
+        "from qwen_vl_utils import process_vision_info\n",
+        "import torch\n",
+        "from PIL import Image\n",
+        "import os\n",
+        "import uuid\n",
+        "import io\n",
+        "from threading import Thread\n",
+        "from reportlab.lib.pagesizes import A4\n",
+        "from reportlab.lib.styles import getSampleStyleSheet\n",
+        "from reportlab.lib import colors\n",
+        "from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
+        "from reportlab.lib.units import inch\n",
+        "from reportlab.pdfbase import pdfmetrics\n",
+        "from reportlab.pdfbase.ttfonts import TTFont\n",
+        "import docx\n",
+        "from docx.enum.text import WD_ALIGN_PARAGRAPH\n",
+        "\n",
+        "# Selectable checkpoints: UI display name -> Hugging Face Hub repository id.\n",
+        "MODEL_OPTIONS = {\n",
+        "    \"Callisto-OCR3-2B-Instruct\": \"prithivMLmods/Callisto-OCR3-2B-Instruct\",\n",
+        "}\n",
+        "\n",
+        "# Eagerly load every checkpoint (float16, moved to CUDA, eval mode) together\n",
+        "# with its processor, keyed by the same display name as MODEL_OPTIONS.\n",
+        "models = {}\n",
+        "processors = {}\n",
+        "for name, model_id in MODEL_OPTIONS.items():\n",
+        "    print(f\"Loading {name}...\")\n",
+        "    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+        "        model_id, trust_remote_code=True, torch_dtype=torch.float16\n",
+        "    )\n",
+        "    models[name] = loaded_model.to(\"cuda\").eval()\n",
+        "    processors[name] = AutoProcessor.from_pretrained(\n",
+        "        model_id, trust_remote_code=True\n",
+        "    )\n",
+        "\n",
+        "# Pillow's extension registry; its keys are used to recognise image uploads by file name.\n",
+        "image_extensions = Image.registered_extensions()\n",
+        "\n",
+        "def identify_and_save_blob(blob_path):\n",
+        "    \"\"\"Identifies if the blob is an image and saves it.\"\"\"\n",
+        "    try:\n",
+        "        with open(blob_path, 'rb') as file:\n",
+        "            blob_content = file.read()\n",
+        "            try:\n",
+        "                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image\n",
+        "                extension = \".png\"  # Default to PNG for saving\n",
+        "                media_type = \"image\"\n",
+        "            except (IOError, SyntaxError):\n",
+        "                raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
+        "\n",
+        "            filename = f\"temp_{uuid.uuid4()}_media{extension}\"\n",
+        "            with open(filename, \"wb\") as f:\n",
+        "                f.write(blob_content)\n",
+        "\n",
+        "            return filename, media_type\n",
+        "\n",
+        "    except FileNotFoundError:\n",
+        "        raise ValueError(f\"The file {blob_path} was not found.\")\n",
+        "    except Exception as e:\n",
+        "        raise ValueError(f\"An error occurred while processing the file: {e}\")\n",
+        "\n",
+        "@spaces.GPU\n",
+        "def qwen_inference(model_name, media_input, text_input=None):\n",
+        "    \"\"\"Stream the selected model's answer about an uploaded image.\n",
+        "\n",
+        "    Args:\n",
+        "        model_name: Key into the module-level ``models`` and ``processors`` dicts.\n",
+        "        media_input: Path of the uploaded file.\n",
+        "        text_input: Question or instruction sent with the image; ``None`` is\n",
+        "            treated as an empty prompt.\n",
+        "\n",
+        "    Yields:\n",
+        "        The response accumulated so far, once per streamed chunk.\n",
+        "\n",
+        "    Raises:\n",
+        "        ValueError: If ``media_input`` is not a path or the file is not a\n",
+        "            valid image.\n",
+        "    \"\"\"\n",
+        "    model = models[model_name]\n",
+        "    processor = processors[model_name]\n",
+        "\n",
+        "    # Without this guard a missing upload (e.g. None) would leave media_path and\n",
+        "    # media_type unbound and fail further down with UnboundLocalError.\n",
+        "    if not isinstance(media_input, str):\n",
+        "        raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
+        "\n",
+        "    media_path = media_input\n",
+        "    temp_copy = None  # set only when identify_and_save_blob writes a copy\n",
+        "    known_extensions = tuple(image_extensions.keys())\n",
+        "    # The second test also accepts upper-case suffixes such as \"SCAN.PNG\".\n",
+        "    if media_path.endswith(known_extensions) or media_path.lower().endswith(known_extensions):\n",
+        "        media_type = \"image\"\n",
+        "    else:\n",
+        "        try:\n",
+        "            media_path, media_type = identify_and_save_blob(media_input)\n",
+        "        except Exception as e:\n",
+        "            raise ValueError(\"Unsupported media type. Please upload a valid image.\") from e\n",
+        "        temp_copy = media_path\n",
+        "\n",
+        "    prompt_text = text_input if text_input is not None else \"\"\n",
+        "\n",
+        "    try:\n",
+        "        messages = [\n",
+        "            {\n",
+        "                \"role\": \"user\",\n",
+        "                \"content\": [\n",
+        "                    {\n",
+        "                        \"type\": media_type,\n",
+        "                        media_type: media_path\n",
+        "                    },\n",
+        "                    {\"type\": \"text\", \"text\": prompt_text},\n",
+        "                ],\n",
+        "            }\n",
+        "        ]\n",
+        "\n",
+        "        text = processor.apply_chat_template(\n",
+        "            messages, tokenize=False, add_generation_prompt=True\n",
+        "        )\n",
+        "        image_inputs, _ = process_vision_info(messages)\n",
+        "        inputs = processor(\n",
+        "            text=[text],\n",
+        "            images=image_inputs,\n",
+        "            padding=True,\n",
+        "            return_tensors=\"pt\",\n",
+        "        ).to(\"cuda\")\n",
+        "\n",
+        "        streamer = TextIteratorStreamer(\n",
+        "            processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
+        "        )\n",
+        "        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)\n",
+        "\n",
+        "        # generate() runs in a worker thread so this generator can relay chunks\n",
+        "        # to the UI while decoding is still in progress.\n",
+        "        thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
+        "        thread.start()\n",
+        "\n",
+        "        buffer = \"\"\n",
+        "        for new_text in streamer:\n",
+        "            buffer += new_text\n",
+        "            # Remove <|im_end|> or similar tokens from the output\n",
+        "            buffer = buffer.replace(\"<|im_end|>\", \"\")\n",
+        "            yield buffer\n",
+        "\n",
+        "        # The streamer is exhausted at this point; wait for the worker to finish.\n",
+        "        thread.join()\n",
+        "    finally:\n",
+        "        # Remove the temporary copy; the original upload is left untouched.\n",
+        "        if temp_copy is not None and os.path.exists(temp_copy):\n",
+        "            os.remove(temp_copy)\n",
+        "\n",
+        "def format_plain_text(output_text):\n",
+        "    \"\"\"Formats the output text as plain text without LaTeX delimiters.\"\"\"\n",
+        "    # Remove LaTeX delimiters and convert to plain text\n",
+        "    plain_text = output_text.replace(\"\\\\(\", \"\").replace(\"\\\\)\", \"\").replace(\"\\\\[\", \"\").replace(\"\\\\]\", \"\")\n",
+        "    return plain_text\n",
+        "\n",
+        "def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Generates a document with the input image and plain text output.\"\"\"\n",
+        "    plain_text = format_plain_text(output_text)\n",
+        "    if file_format == \"pdf\":\n",
+        "        return generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
+        "    elif file_format == \"docx\":\n",
+        "        return generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
+        "\n",
+        "def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Write the image followed by the text to a new A4 PDF.\n",
+        "\n",
+        "    Args:\n",
+        "        media_path: Path of the image placed at the top of the document.\n",
+        "        plain_text: Text printed below the image.\n",
+        "        font_size: Font size in points (anything ``int()`` accepts).\n",
+        "        line_spacing: Multiplier applied to the font size to obtain the leading.\n",
+        "        alignment: One of \"Left\", \"Center\", \"Right\" or \"Justified\".\n",
+        "        image_size: One of \"Small\", \"Medium\" or \"Large\".\n",
+        "\n",
+        "    Returns:\n",
+        "        Name of the generated ``output_<uuid>.pdf`` file.\n",
+        "    \"\"\"\n",
+        "    # Standard library; imported locally so the function is self-contained.\n",
+        "    from xml.sax.saxutils import escape\n",
+        "\n",
+        "    filename = f\"output_{uuid.uuid4()}.pdf\"\n",
+        "    doc = SimpleDocTemplate(\n",
+        "        filename,\n",
+        "        pagesize=A4,\n",
+        "        rightMargin=inch,\n",
+        "        leftMargin=inch,\n",
+        "        topMargin=inch,\n",
+        "        bottomMargin=inch\n",
+        "    )\n",
+        "    styles = getSampleStyleSheet()\n",
+        "    body_style = styles[\"Normal\"]\n",
+        "    body_style.fontSize = int(font_size)\n",
+        "    body_style.leading = int(font_size) * line_spacing\n",
+        "    body_style.alignment = {\n",
+        "        \"Left\": 0,\n",
+        "        \"Center\": 1,\n",
+        "        \"Right\": 2,\n",
+        "        \"Justified\": 4\n",
+        "    }[alignment]\n",
+        "\n",
+        "    story = []\n",
+        "\n",
+        "    # Longest allowed side of the image, in points.\n",
+        "    image_sizes = {\n",
+        "        \"Small\": 200,\n",
+        "        \"Medium\": 400,\n",
+        "        \"Large\": 600\n",
+        "    }\n",
+        "    # A flowable wider than the page frame makes doc.build() fail, so cap the box\n",
+        "    # at the printable width. NOTE(review): the 12pt allowance assumes ReportLab's\n",
+        "    # default 6pt frame padding on each side - confirm.\n",
+        "    max_side = min(image_sizes[image_size], doc.width - 12)\n",
+        "    # \"proportional\" fits the image inside the box without distorting its aspect ratio.\n",
+        "    img = RLImage(media_path, width=max_side, height=max_side, kind=\"proportional\")\n",
+        "    story.append(img)\n",
+        "    story.append(Spacer(1, 12))\n",
+        "\n",
+        "    # Paragraph parses its text as XML-like markup: escape &, < and > so OCR output\n",
+        "    # cannot break the parser, and turn newlines into explicit line breaks.\n",
+        "    markup = escape(plain_text).replace(\"\\n\", \"<br/>\")\n",
+        "    story.append(Paragraph(markup, body_style))\n",
+        "\n",
+        "    doc.build(story)\n",
+        "    return filename\n",
+        "\n",
+        "def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Write the image followed by the text to a new DOCX file and return its name.\"\"\"\n",
+        "    # Picture width for each size label.\n",
+        "    picture_widths = {\n",
+        "        \"Small\": docx.shared.Inches(2),\n",
+        "        \"Medium\": docx.shared.Inches(4),\n",
+        "        \"Large\": docx.shared.Inches(6)\n",
+        "    }\n",
+        "    # UI alignment label -> python-docx paragraph alignment constant.\n",
+        "    alignment_by_label = {\n",
+        "        \"Left\": WD_ALIGN_PARAGRAPH.LEFT,\n",
+        "        \"Center\": WD_ALIGN_PARAGRAPH.CENTER,\n",
+        "        \"Right\": WD_ALIGN_PARAGRAPH.RIGHT,\n",
+        "        \"Justified\": WD_ALIGN_PARAGRAPH.JUSTIFY\n",
+        "    }\n",
+        "\n",
+        "    document = docx.Document()\n",
+        "\n",
+        "    # Image first, then an empty paragraph as a visual gap.\n",
+        "    document.add_picture(media_path, width=picture_widths[image_size])\n",
+        "    document.add_paragraph()\n",
+        "\n",
+        "    # One paragraph holding the whole text in a single run.\n",
+        "    body = document.add_paragraph()\n",
+        "    body_format = body.paragraph_format\n",
+        "    body_format.line_spacing = line_spacing\n",
+        "    body_format.alignment = alignment_by_label[alignment]\n",
+        "    text_run = body.add_run(plain_text)\n",
+        "    text_run.font.size = docx.shared.Pt(int(font_size))\n",
+        "\n",
+        "    filename = f\"output_{uuid.uuid4()}.docx\"\n",
+        "    document.save(filename)\n",
+        "    return filename\n",
+        "\n",
+        "# CSS for output styling\n",
+        "css = \"\"\"\n",
+        "  #output {\n",
+        "    height: 500px;\n",
+        "    overflow: auto;\n",
+        "    border: 1px solid #ccc;\n",
+        "  }\n",
+        ".submit-btn {\n",
+        "    background-color: #cf3434  !important;\n",
+        "    color: white !important;\n",
+        "}\n",
+        ".submit-btn:hover {\n",
+        "    background-color: #ff2323 !important;\n",
+        "}\n",
+        ".download-btn {\n",
+        "    background-color: #35a6d6 !important;\n",
+        "    color: white !important;\n",
+        "}\n",
+        ".download-btn:hover {\n",
+        "    background-color: #22bcff !important;\n",
+        "}\n",
+        "\"\"\"\n",
+        "\n",
+        "# Gradio UI: one tab with the inference controls on top and document export below.\n",
+        "with gr.Blocks(css=css) as demo:\n",
+        "    gr.Markdown(\"# Qwen2VL Models: Vision and Language Processing\")\n",
+        "\n",
+        "    with gr.Tab(label=\"Image Input\"):\n",
+        "\n",
+        "        # Left column: model choice, upload and prompt. Right column: raw and cleaned output.\n",
+        "        with gr.Row():\n",
+        "            with gr.Column():\n",
+        "                model_choice = gr.Dropdown(\n",
+        "                    choices=list(MODEL_OPTIONS.keys()),\n",
+        "                    value=\"Callisto-OCR3-2B-Instruct\",\n",
+        "                    label=\"Model Selection\",\n",
+        "                )\n",
+        "                input_media = gr.File(label=\"Upload Image\", type=\"filepath\")\n",
+        "                text_input = gr.Textbox(\n",
+        "                    label=\"Question\",\n",
+        "                    placeholder=\"Ask a question about the image...\",\n",
+        "                )\n",
+        "                submit_btn = gr.Button(value=\"Submit\", elem_classes=\"submit-btn\")\n",
+        "\n",
+        "            with gr.Column():\n",
+        "                output_text = gr.Textbox(label=\"Output Text\", lines=10)\n",
+        "                plain_text_output = gr.Textbox(label=\"Standardized Plain Text\", lines=10)\n",
+        "\n",
+        "        # Stream the model answer first, then derive the delimiter-free version from it.\n",
+        "        inference_event = submit_btn.click(\n",
+        "            fn=qwen_inference,\n",
+        "            inputs=[model_choice, input_media, text_input],\n",
+        "            outputs=[output_text],\n",
+        "        )\n",
+        "        inference_event.then(\n",
+        "            fn=lambda output_text: format_plain_text(output_text),\n",
+        "            inputs=[output_text],\n",
+        "            outputs=[plain_text_output],\n",
+        "        )\n",
+        "\n",
+        "        # Formatting options for the exported document.\n",
+        "        with gr.Row():\n",
+        "            with gr.Column():\n",
+        "                line_spacing = gr.Dropdown(\n",
+        "                    label=\"Line Spacing\",\n",
+        "                    choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],\n",
+        "                    value=1.5,\n",
+        "                )\n",
+        "                font_size = gr.Dropdown(\n",
+        "                    label=\"Font Size\",\n",
+        "                    choices=[str(points) for points in range(8, 25, 2)],\n",
+        "                    value=\"18\",\n",
+        "                )\n",
+        "                alignment = gr.Dropdown(\n",
+        "                    label=\"Text Alignment\",\n",
+        "                    choices=[\"Left\", \"Center\", \"Right\", \"Justified\"],\n",
+        "                    value=\"Justified\",\n",
+        "                )\n",
+        "                image_size = gr.Dropdown(\n",
+        "                    label=\"Image Size\",\n",
+        "                    choices=[\"Small\", \"Medium\", \"Large\"],\n",
+        "                    value=\"Small\",\n",
+        "                )\n",
+        "                file_format = gr.Radio(choices=[\"pdf\", \"docx\"], value=\"pdf\", label=\"File Format\")\n",
+        "                get_document_btn = gr.Button(value=\"Get Document\", elem_classes=\"download-btn\")\n",
+        "\n",
+        "        # Created here (inside the tab, after the settings row) so it renders in the same place.\n",
+        "        download_file = gr.File(label=\"Download Document\")\n",
+        "        get_document_btn.click(\n",
+        "            fn=generate_document,\n",
+        "            inputs=[input_media, output_text, file_format, font_size, line_spacing, alignment, image_size],\n",
+        "            outputs=download_file,\n",
+        "        )\n",
+        "\n",
+        "demo.launch(debug=True)"
+      ],
+      "metadata": {
+        "id": "gwUb7Nb9fEPU"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}