import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess

# Install flash-attn at startup; skipping the CUDA build avoids compiling the
# kernels from source on the Space's hardware.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Phi-3.5-vision chat template pieces
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
model_name = "microsoft/Phi-3.5-vision-instruct"


# Lazy-load the model and processor at runtime
def get_model_and_processor(model_id):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    ).cuda().eval()
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    return model, processor


# Modified the spaces.GPU decorator usage
@spaces.GPU(memory=30)  # Specify the GPU memory requirement in GB
def run_example(image, text_input=None, model_id=model_name):
    model, processor = get_model_and_processor(model_id)

    # Build a single-turn prompt: user turn with one image placeholder,
    # followed by the assistant turn for the model to complete.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Drop the prompt tokens so only the newly generated response is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


# Rest of your code remains the same until demo.launch()

# Modified launch parameters
demo.launch(share=True, server_name="0.0.0.0")
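
# NOTE: `demo` is never defined in this snippet -- the Gradio UI is elided by the
# "Rest of your code remains the same" comment above. As a rough sketch only
# (layout and component names such as input_img / submit_btn are assumptions,
# not from the original), the elided section wiring run_example to the UI could
# look like the following, placed *before* the demo.launch() call:
#
#     with gr.Blocks() as demo:
#         gr.Markdown("## Phi-3.5-vision-instruct demo")
#         with gr.Row():
#             input_img = gr.Image(label="Input image")
#             with gr.Column():
#                 text_input = gr.Textbox(label="Prompt")
#                 submit_btn = gr.Button("Submit")
#                 output_text = gr.Textbox(label="Response")
#         submit_btn.click(run_example, inputs=[input_img, text_input], outputs=output_text)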