import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess

# Install flash-attn at startup; skipping the CUDA build avoids compiling the
# kernels from source on the Space's hardware.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Phi-3.5-vision chat template pieces
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
model_name = "microsoft/Phi-3.5-vision-instruct"


# Lazy-load the model and processor at runtime
def get_model_and_processor(model_id):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    ).cuda().eval()
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    return model, processor


# Modified the spaces.GPU decorator usage
@spaces.GPU(memory=30)  # Specify the GPU memory requirement in GB
def run_example(image, text_input=None, model_id=model_name):
    model, processor = get_model_and_processor(model_id)

    # Build a single-turn prompt: user turn with one image placeholder,
    # followed by the assistant turn for the model to complete.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Drop the prompt tokens so only the newly generated response is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


# Rest of your code remains the same until demo.launch()

# Modified launch parameters
demo.launch(share=True, server_name="0.0.0.0")
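
# NOTE: `demo` is never defined in this snippet -- the Gradio UI is elided by the
# "Rest of your code remains the same" comment above. As a rough sketch only
# (layout and component names such as input_img / submit_btn are assumptions,
# not from the original), the elided section wiring run_example to the UI could
# look like the following, placed *before* the demo.launch() call:
#
#     with gr.Blocks() as demo:
#         gr.Markdown("## Phi-3.5-vision-instruct demo")
#         with gr.Row():
#             input_img = gr.Image(label="Input image")
#             with gr.Column():
#                 text_input = gr.Textbox(label="Prompt")
#                 submit_btn = gr.Button("Submit")
#                 output_text = gr.Textbox(label="Response")
#         submit_btn.click(run_example, inputs=[input_img, text_input], outputs=output_text)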