LLaMA-TOMMI-1.0 / handler.py
import os
from typing import Any, Dict, List

import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class EndpointHandler:
    def __init__(self, path=""):
        # Authenticate so the gated base model can be downloaded.
        login(token=os.environ["HF_ACCESS_TOKEN"])

        # Load the tokenizer from the repository path.
        self.tokenizer = AutoTokenizer.from_pretrained(path)

        # 4-bit NF4 quantization keeps the 11B base model within endpoint memory.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_storage=torch.bfloat16,
        )

        # Load the quantized base model, then attach the PEFT adapter stored in this repo.
        base_model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-11B-Vision-Instruct",
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        self.model = PeftModel.from_pretrained(base_model, path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Log the incoming payload for debugging.
        print(data)
        print(type(data))

        if isinstance(data, dict) and "inputs" in data:
            inputs = data["inputs"]
        else:
            # Return the error inside a list so it matches the declared return type.
            return [{"error": "Invalid input format. Expecting {'inputs': 'your text'}"}]

        # Tokenize the prompt and move it to the model's device.
        inputs = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=700,
        ).to(self.device)
        with torch.no_grad():
            # Near-greedy decoding: note that temperature is ignored by
            # transformers unless do_sample=True is also passed.
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=700,
                temperature=0.01,
            )

        # Postprocess the prediction.
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]
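

# --- Optional local smoke test (not part of the original handler; the Inference
# Endpoints runtime only imports EndpointHandler). This is a minimal sketch that
# assumes HF_ACCESS_TOKEN is set in the environment, a GPU with enough memory for
# the 4-bit base model is available, and the PEFT adapter weights live in the
# current directory (path=".").
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    result = handler({"inputs": "What does this model do?"})
    print(result)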