File size: 2,863 Bytes
72bc02d
 
 
 
 
3d9d048
 
72bc02d
 
 
 
 
acaccf4
 
 
 
 
 
 
2b49fb1
acaccf4
 
 
 
 
 
72bc02d
9f684be
acaccf4
 
72bc02d
 
 
3d9d048
 
 
 
 
 
72bc02d
acaccf4
 
 
 
 
72bc02d
 
9f684be
 
 
 
 
 
 
72bc02d
9f684be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1eba56e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import subprocess
import sys

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
# Install flash-attn at start-up using the interpreter that runs this app.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is handed to the installer through the
# environment (presumably it makes the flash-attn build skip compiling its
# CUDA kernels -- confirm against the flash-attn setup script).
# The variable is merged into a copy of the current environment instead of
# replacing it, so PATH, HOME and any proxy/index settings stay visible to pip.
# The return code is deliberately not checked: a failed install does not abort
# start-up, matching the previous best-effort behaviour.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

# Default model identifier; it is offered in the UI dropdown and is the default
# `model_id` of `run_example`.
model_name = "microsoft/Phi-3.5-vision-instruct"

# Chat markers that `run_example` concatenates around the user's question to
# build the prompt: user turn opener, end-of-turn marker, assistant turn opener.
user_prompt = "<|user|>\n"
prompt_suffix = "<|end|>\n"
assistant_prompt = "<|assistant|>\n"

# Already-loaded (model, processor) pairs keyed by model id, so repeated
# requests in the same process reuse them instead of rebuilding the model.
_MODEL_CACHE = {}


def get_model_and_processor(model_id):
    """Return the ``(model, processor)`` pair for *model_id*, loading on first use.

    The first call for a given id builds the model in bfloat16, moves it to
    the GPU, switches it to eval mode and builds the matching processor.
    Later calls with the same id return the same cached objects.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the model
            and the processor.

    Returns:
        A ``(model, processor)`` tuple.
    """
    cached = _MODEL_CACHE.get(model_id)
    if cached is None:
        # NOTE: trust_remote_code=True lets transformers execute Python code
        # shipped with the model repository; only use ids you trust.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).cuda().eval()
        processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )
        cached = _MODEL_CACHE[model_id] = (model, processor)
    return cached

# NOTE(review): the decorator presumably requests a GPU for the duration of the
# call; the meaning of `memory=30` is not visible here -- confirm against the
# `spaces` package documentation.
@spaces.GPU(memory=30)
def run_example(image, text_input=None, model_id=model_name):
    """Generate the model's answer to a question about an image.

    Args:
        image: Image data accepted by ``PIL.Image.fromarray`` (the value
            delivered by the ``gr.Image`` input). Must not be ``None``.
        text_input: The question to ask. ``None`` is treated as an empty
            question instead of being formatted into the prompt as "None".
        model_id: Identifier handed to ``get_model_and_processor``.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens removed.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable message
            instead of an internal conversion error.
    """
    if image is None:
        raise gr.Error("Please provide an input image.")
    question = "" if text_input is None else text_input

    model, processor = get_model_and_processor(model_id)
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; keep only the newly generated
    # tokens by slicing off the prompt length.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response

# Stylesheet handed to `gr.Blocks`: gives the element with id "output" a fixed
# height, a scrollbar on overflow and a thin grey border.
css = (
    "\n"
    "  #output {\n"
    "    height: 500px;\n"
    "    overflow: auto;\n"
    "    border: 1px solid #ccc;\n"
    "  }\n"
)

# Build the Gradio interface: one tab holding a row with two columns (inputs
# and output), a set of clickable examples, and the submit wiring.
demo = gr.Blocks(css=css)

with demo:
    gr.Markdown("## Phi-3.5 Vision Instruct Demo with Example Inputs")

    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(
                    choices=[model_name],
                    label="Model",
                    value=model_name,
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                # elem_id ties this component to the `#output` rule in `css`;
                # without it the stylesheet matches no element.
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        # (image file, question) pairs used to pre-fill the inputs.
        # NOTE(review): the image files are assumed to ship with the app --
        # confirm they exist next to this script.
        examples = [
            ["image1.jpeg", "What does this painting tell us explain in detail?"],
            ["image2.jpg", "What does this painting tell us explain in detail?"],
            ["image3.jpg", "Describe the scene in this picture."],
        ]

        gr.Examples(
            examples=examples,
            inputs=[input_img, text_input],
            examples_per_page=3,
        )

        # Input order must match run_example(image, text_input, model_id).
        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

# Turn on request queuing, then start the server bound to all interfaces.
demo.queue().launch(server_name="0.0.0.0")