import os
from typing import Any, Dict, List

import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class EndpointHandler:
    def __init__(self, path=""):
        login(token=os.environ["HF_ACCESS_TOKEN"])

        # Load the tokenizer from the repository path. Llama tokenizers often ship
        # without a pad token, which tokenization with padding=True requires.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # 4-bit NF4 quantization so the 11B base model fits in limited GPU memory.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_storage=torch.bfloat16,
        )

        # Load the quantized base model, then attach the fine-tuned PEFT adapter
        # stored alongside this handler.
        base_model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-11B-Vision-Instruct",
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        self.model = PeftModel.from_pretrained(base_model, path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Debug: log the raw request payload and its type.
        print(data)
        print(type(data))

        if isinstance(data, dict) and "inputs" in data:
            inputs = data["inputs"]
        else:
            return [{"error": "Invalid input format. Expecting {'inputs': 'your text'}"}]

        inputs = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=700,
        ).to(self.device)

        with torch.no_grad():
            # Note: temperature only takes effect when do_sample=True; with the
            # default do_sample=False this is effectively greedy decoding.
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=700,
                temperature=0.01,
            )

        # Postprocess the prediction into the expected response format.
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]
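

# --- Optional local smoke test (not part of the Inference Endpoints contract) ---
# A minimal sketch for exercising the handler outside of a deployed endpoint,
# assuming HF_ACCESS_TOKEN is set, a GPU is available, and the given path contains
# the PEFT adapter plus tokenizer files. The payload shape mirrors what the
# endpoint receives; the adapter path "." is an assumption, not a fixed value.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    sample_request = {"inputs": "Explain what a LoRA adapter is in one sentence."}
    print(handler(sample_request))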