|
--- |
|
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct |
|
library_name: peft |
|
license: apache-2.0 |
|
language: |
|
- en |
|
pipeline_tag: question-answering |
|
--- |
|
|
|
# Model Details |
|
|
|
This model is a fine-tuned version of the base model [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct), trained with LoRA on the [train_qa_wo_students.csv](https://drive.google.com/file/d/1uv-kVP0z3E8u9-u8PWAKA9tkr3ENHeZv/view?usp=sharing) dataset, which combines materials from the finite element method (FEM) courses of [Prof. Krishna Garikipati](https://viterbi.usc.edu/directory/faculty/Garikipati/Krishna).
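
The exact schema of `train_qa_wo_students.csv` is not documented here, but a QA-style CSV of this kind can be loaded with the `datasets` library roughly as sketched below; the `question` and `answer` column names are placeholders, not confirmed field names.

```python
# Minimal sketch: load a QA CSV with the `datasets` library.
# NOTE: "question" and "answer" are assumed column names; adjust to the real schema.
from datasets import load_dataset

dataset = load_dataset("csv", data_files={"train": "train_qa_wo_students.csv"})["train"]

def to_text(example):
    # Concatenate one QA pair into a single training string (illustrative only).
    return {"text": f"Question: {example['question']}\nAnswer: {example['answer']}"}

dataset = dataset.map(to_text)
print(dataset[0]["text"][:200])
```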
|
|
|
Compared with [TOMMI-0.35](https://huggingface.co/my-ai-university/TOMMI-0.35/), TOMMI-1.0 uses the optimal hyperparameters (tuned without student-asked QA pairs) and an increased maximum token length of 700, up from 500.
|
|
|
## **Hyperparameters** |
|
|
|
* learning_rate: 5e-5 |
|
* gradient_accumulation_steps: 2 |
|
* epochs: 5

* r (LoRA rank): 45
|
* lora_alpha: 65 |
|
* lora_dropout: 0.05 |
|
* target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] |
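
For reference, these values map onto a `peft` `LoraConfig` and `transformers` `TrainingArguments` roughly as sketched below. This is an illustrative reconstruction, not the project's actual training script; the output path and the `bf16` flag are assumptions.

```python
# Illustrative reconstruction of the configuration from the hyperparameters above.
from peft import LoraConfig
from transformers import TrainingArguments

lora_config = LoraConfig(
    r=45,                        # LoRA rank
    lora_alpha=65,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="tommi-1.0-lora",  # placeholder path
    learning_rate=5e-5,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    bf16=True,                    # assumption; the inference example below uses bfloat16
)

max_seq_length = 700  # token length used for TOMMI-1.0 (up from 500 in TOMMI-0.35)
```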
|
|
|
## **Usage** |
|
|
|
**To run the following code on Expanse, request at least two V100 GPUs.**
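
A GPU request on Expanse might look like the following Slurm command; the account name is a placeholder, and the partition, memory, and time limits should be adjusted to your own allocation.

```bash
# Illustrative interactive request for two V100s on Expanse (values are placeholders).
srun --partition=gpu-shared --account=abc123 \
     --nodes=1 --ntasks-per-node=1 --cpus-per-task=10 \
     --gpus=2 --mem=96G --time=01:00:00 \
     --pty /bin/bash -i
```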
|
|
|
### **Env Setup** |
|
|
|
```bash |
|
#!/bin/bash |
|
# python 3.10 + cuda 11.8.0 |
|
|
|
|
|
export MKL_NUM_THREADS=1 |
|
export NUMEXPR_NUM_THREADS=1 |
|
export OPENBLAS_NUM_THREADS=1 |
|
export OMP_NUM_THREADS=1 |
|
|
|
conda clean -a -y # conda for traditional and reliable setup |
|
mamba clean -a -y # mamba for smart and efficient setup |
|
pip install --upgrade pip |
|
|
|
# cuda, gcc/g++, torch |
|
conda install cuda -c nvidia/label/cuda-11.8.0 -y |
|
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118 |
|
pip install torchao==0.7.0 --index-url https://download.pytorch.org/whl/cu118 |
|
|
|
# deepspeed |
|
mamba install gcc gxx -c conda-forge -y # ensure GCC/G++ > 9.0 for DeepSpeed's ninja JIT compilation
|
pip install deepspeed==0.15.4 |
|
|
|
# bitsandbytes |
|
pip install setuptools |
|
mamba install bitsandbytes=0.45.0 -c conda-forge --no-deps -y |
|
pip install psutil |
|
# add the following to your .bashrc or running scripts |
|
#export BNB_CUDA_VERSION=118 |
|
#export CUDA_HOME=$CONDA_PREFIX |
|
#export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" |
|
|
|
# trl, accelerate, peft |
|
pip install trl |
|
pip install accelerate peft optuna optuna_integration datasets |
|
|
|
# other dependencies |
|
pip install scikit-learn pexpect |
|
pip install wandb plotly # takes a while |
|
``` |
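
After installation, a quick sanity check along the following lines (a suggestion, not part of the original setup script) confirms that PyTorch sees the GPUs and that bitsandbytes and DeepSpeed import cleanly.

```bash
# Optional post-install check
python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"
python -c "import bitsandbytes; print(bitsandbytes.__version__)"
python -c "import deepspeed; print(deepspeed.__version__)"
ds_report  # DeepSpeed environment report (CUDA/compiler compatibility)
```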
|
|
|
### **Example Code** |
|
|
|
```python |
|
from peft import PeftModel |
|
import time |
|
import torch |
|
from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM |
|
|
|
|
|
class Conversation: |
|
def __init__(self, |
|
model, |
|
tokenizer, |
|
device, |
|
system=""): |
|
self.model = model |
|
self.tokenizer = tokenizer |
|
self.device = device |
|
self.message = [] |
|
if system: |
|
self.message.append({"role": "system", "content": system}) |
|
|
|
def get_prompt(self): |
|
prompt = '<|begin_of_text|>' |
|
# Include the system message if it exists |
|
for msg in self.message: |
|
role = msg['role'] |
|
content = msg['content'] |
|
prompt += f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>" |
|
# Append the assistant's role header to prompt for the next response |
|
prompt += "<|start_header_id|>assistant<|end_header_id|>" |
|
return prompt |
|
|
|
def generate(self, |
|
user_input, |
|
temp=0.7, |
|
max_new_tokens=1024, |
|
top_k=50, |
|
top_p=0.95): |
|
|
|
# Add the user's input to the conversation history |
|
self.message.append({"role": "user", "content": user_input}) |
|
|
|
# Generate the prompt |
|
prompt = self.get_prompt() |
|
|
|
# Tokenize the prompt |
|
inputs = self.tokenizer(prompt, |
|
return_tensors="pt", |
|
truncation=True, |
|
max_length=2048).to(self.device) |
|
# inputs = {k: v.to(device) for k, v in inputs.items()} |
|
if self.tokenizer.eos_token_id is None: |
|
self.tokenizer.eos_token_id = self.tokenizer.convert_tokens_to_ids('</s>') |
|
if self.tokenizer.pad_token_id is None: |
|
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id |
|
|
|
print(f"EOS Token ID: {self.tokenizer.eos_token_id}") |
|
print(f"PAD Token ID: {self.tokenizer.pad_token_id}") |
|
# Generate the response |
|
with torch.no_grad(): |
|
outputs = self.model.generate( |
|
**inputs, |
|
max_new_tokens=max_new_tokens, |
|
do_sample=True, |
|
temperature=temp, |
|
top_k=top_k, |
|
top_p=top_p, |
|
pad_token_id=self.tokenizer.eos_token_id, |
|
# eos_token_id=self.tokenizer.convert_tokens_to_ids('<|eot_id|>'), |
|
eos_token_id=self.tokenizer.eos_token_id, |
|
) |
|
|
|
# Decode the generated tokens |
|
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False) |
|
|
|
# Extract the assistant's response |
|
assistant_response = self.extract_assistant_response(prompt, generated_text) |
|
|
|
# Append the assistant's response to the conversation |
|
self.message.append({'role': 'assistant', 'content': assistant_response}) |
|
|
|
return assistant_response |
|
|
|
def extract_assistant_response(self, prompt, generated_text): |
|
        # The generated text echoes the prompt that was submitted; this method

        # extracts only the newly generated assistant response, without special tokens.
|
|
|
# Remove the prompt from the generated text |
|
response_text = generated_text[len(prompt):] |
|
|
|
# Split at the end-of-turn token |
|
if '<|eot_id|>' in response_text: |
|
assistant_response = response_text.split('<|eot_id|>')[0] |
|
else: |
|
assistant_response = response_text |
|
|
|
# Remove special token at the end and leading or trailing whitespaces |
|
assistant_response = assistant_response.replace('<|end_header_id|>', '') |
|
assistant_response = assistant_response.strip() |
|
|
|
return assistant_response |
|
|
|
|
|
if __name__ == "__main__": |
|
base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" |
|
    peft_model_name = "my-ai-university/TOMMI-1.0"
|
|
|
tokenizer = PreTrainedTokenizerFast.from_pretrained( |
|
        base_model_name,
|
return_tensors="pt") |
|
tokenizer.pad_token = "<|reserved_special_token_5|>" |
|
|
|
base_model = AutoModelForCausalLM.from_pretrained( |
|
base_model_name, |
|
torch_dtype=torch.bfloat16, |
|
device_map="auto") |
|
model = PeftModel.from_pretrained(base_model, peft_model_name) |
|
model = model.merge_and_unload() # Optional: Merge adapter with base model for faster inference |
|
|
|
# Initialize the conversation object |
|
system_message = 'You are an expert professor who replies in a helpful way.' |
|
conv = Conversation( |
|
model, |
|
tokenizer, |
|
model.device, |
|
system_message) |
|
|
|
# Run the conversation loop |
|
print("Starting conversation ...") |
|
input_text = "" |
|
while input_text.lower() != "exit": |
|
input_text = input("Enter your prompt (type 'exit' to quit): ") |
|
|
|
start_time = time.time() |
|
response = conv.generate(input_text) |
|
end_time = time.time() |
|
|
|
print(response) |
|
print(f"Response time: {end_time - start_time:.2f} seconds") |
|
|
|
# Save the conversation to a file |
|
with open("./conversation.txt", "w") as f: |
|
f.write(str(conv.message)) |
|
``` |
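
As an aside, if the tokenizer checkpoint ships a chat template, the manual prompt assembly in `get_prompt` can usually be replaced with `apply_chat_template`. A minimal sketch under that assumption, reusing the `conv`, `tokenizer`, and `model` objects from the example above:

```python
# Build the prompt from the running message list via the tokenizer's chat template,
# assuming the checkpoint provides one (otherwise keep the manual get_prompt above).
input_ids = tokenizer.apply_chat_template(
    conv.message,                # list of {"role", "content"} dicts
    add_generation_prompt=True,  # append the assistant header for the next turn
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens.
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```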