---
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
library_name: peft
license: apache-2.0
language:
- en
pipeline_tag: question-answering
---
# Model Details
This model is a fine-tuned version of the base model [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct), trained with LoRA on the [train_qa_wo_students.csv](https://drive.google.com/file/d/1uv-kVP0z3E8u9-u8PWAKA9tkr3ENHeZv/view?usp=sharing) dataset, which combines materials from the finite element method (FEM) courses of [Prof. Krishna Garikipati](https://viterbi.usc.edu/directory/faculty/Garikipati/Krishna).
Compared with [TOMMI-0.35](https://huggingface.co/my-ai-university/TOMMI-0.35/), TOMMI-1.0 uses the optimal hyperparameters (tuned without student-asked QA pairs) and an increased maximum token length of 700, up from 500.
## **Paper**
* [arXiv](https://arxiv.org/abs/2504.08846)
## **Project page**
* [AI University](https://my-ai-university.com)
## **Github**
* [Github repo](https://github.com/my-ai-university/finite-element-method)
## **Hyperparameters**
* learning_rate: 5e-5
* gradient_accumulation_steps: 2
* epochs: 5
* r (lora rank): 45
* lora_alpha: 65
* lora_dropout: 0.05
* target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
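For reference, these settings map directly onto a `peft.LoraConfig`. The sketch below is illustrative rather than the exact training script; the `task_type` value is an assumption based on standard causal-LM fine-tuning with PEFT.
```python
# Illustrative only: the hyperparameters above expressed as a peft.LoraConfig.
from peft import LoraConfig

lora_config = LoraConfig(
    r=45,                   # LoRA rank
    lora_alpha=65,          # LoRA scaling factor
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",  # assumption: standard causal-LM fine-tuning
)
```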
## **Usage**
**On Expanse, request at least two V100 GPUs to run the code below.**
### **Env Setup**
```bash
#!/bin/bash
# python 3.10 + cuda 11.8.0
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export OMP_NUM_THREADS=1
conda clean -a -y # conda for traditional and reliable setup
mamba clean -a -y # mamba for smart and efficient setup
pip install --upgrade pip
# cuda, gcc/g++, torch
conda install cuda -c nvidia/label/cuda-11.8.0 -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
pip install torchao==0.7.0 --index-url https://download.pytorch.org/whl/cu118
# deepspeed
mamba install gcc gxx -c conda-forge -y # ensure > 9.0 for ninja JIT
pip install deepspeed==0.15.4
# bitsandbytes
pip install setuptools
mamba install bitsandbytes=0.45.0 -c conda-forge --no-deps -y
pip install psutil
# add the following to your .bashrc or running scripts
#export BNB_CUDA_VERSION=118
#export CUDA_HOME=$CONDA_PREFIX
#export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
# trl, accelerate, peft
pip install trl
pip install accelerate peft optuna optuna_integration datasets
# other dependencies
pip install scikit-learn pexpect
pip install wandb plotly # takes a while
```
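After installation, an optional check like the one below (assuming the environment above is activated) confirms that PyTorch sees the GPUs and that the key packages import cleanly:
```python
# Optional sanity check for the environment set up above.
import torch
import bitsandbytes, deepspeed, peft, trl, transformers  # noqa: F401  (import check only)

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
```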
### **Example Code**
```python
from peft import PeftModel
import time
import torch
from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM


class Conversation:
    def __init__(self,
                 model,
                 tokenizer,
                 device,
                 system=""):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.message = []
        if system:
            self.message.append({"role": "system", "content": system})

    def get_prompt(self):
        prompt = '<|begin_of_text|>'
        # Include the system message if it exists
        for msg in self.message:
            role = msg['role']
            content = msg['content']
            prompt += f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>"
        # Append the assistant's role header to prompt for the next response
        prompt += "<|start_header_id|>assistant<|end_header_id|>"
        return prompt

    def generate(self,
                 user_input,
                 temp=0.7,
                 max_new_tokens=1024,
                 top_k=50,
                 top_p=0.95):
        # Add the user's input to the conversation history
        self.message.append({"role": "user", "content": user_input})

        # Generate the prompt
        prompt = self.get_prompt()

        # Tokenize the prompt
        inputs = self.tokenizer(prompt,
                                return_tensors="pt",
                                truncation=True,
                                max_length=2048).to(self.device)
        # inputs = {k: v.to(device) for k, v in inputs.items()}

        if self.tokenizer.eos_token_id is None:
            self.tokenizer.eos_token_id = self.tokenizer.convert_tokens_to_ids('</s>')
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        print(f"EOS Token ID: {self.tokenizer.eos_token_id}")
        print(f"PAD Token ID: {self.tokenizer.pad_token_id}")

        # Generate the response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temp,
                top_k=top_k,
                top_p=top_p,
                pad_token_id=self.tokenizer.eos_token_id,
                # eos_token_id=self.tokenizer.convert_tokens_to_ids('<|eot_id|>'),
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode the generated tokens
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract the assistant's response
        assistant_response = self.extract_assistant_response(prompt, generated_text)

        # Append the assistant's response to the conversation
        self.message.append({'role': 'assistant', 'content': assistant_response})
        return assistant_response

    def extract_assistant_response(self, prompt, generated_text):
        # Llama keeps generating after the submitted prompt; this function extracts
        # only the newly generated output, with no special tokens.
        # Remove the prompt from the generated text
        response_text = generated_text[len(prompt):]

        # Split at the end-of-turn token
        if '<|eot_id|>' in response_text:
            assistant_response = response_text.split('<|eot_id|>')[0]
        else:
            assistant_response = response_text

        # Remove the header token and leading/trailing whitespace
        assistant_response = assistant_response.replace('<|end_header_id|>', '')
        assistant_response = assistant_response.strip()
        return assistant_response


if __name__ == "__main__":
    base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    peft_model_name = "my-ai-university/TOMMI-0.3"

    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        base_model_name,
        return_tensors="pt")
    tokenizer.pad_token = "<|reserved_special_token_5|>"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto")
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()  # Optional: merge the adapter into the base model for faster inference

    # Initialize the conversation object
    system_message = 'You are an expert professor who replies in a helpful way.'
    conv = Conversation(
        model,
        tokenizer,
        model.device,
        system_message)

    # Run the conversation loop
    print("Starting conversation ...")
    while True:
        input_text = input("Enter your prompt (type 'exit' to quit): ")
        if input_text.lower() == "exit":
            break
        start_time = time.time()
        response = conv.generate(input_text)
        end_time = time.time()
        print(response)
        print(f"Response time: {end_time - start_time:.2f} seconds")

    # Save the conversation to a file
    with open("./conversation.txt", "w") as f:
        f.write(str(conv.message))
```