---
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
library_name: peft
license: apache-2.0
language:
- en
pipeline_tag: question-answering
---

# Model Details

This model is a fine-tuned version of the base model [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct), trained with LoRA on the [train_qa_wo_students.csv](https://drive.google.com/file/d/1uv-kVP0z3E8u9-u8PWAKA9tkr3ENHeZv/view?usp=sharing) dataset, which combines materials from the FEM courses of [Prof. Krishna Garikipati](https://viterbi.usc.edu/directory/faculty/Garikipati/Krishna). Compared with [TOMMI-0.35](https://huggingface.co/my-ai-university/TOMMI-0.35/), TOMMI-1.0 uses the optimal hyperparameters (without student-asked QA pairs) and an increased token length of 700 (up from 500).

## **Paper**

* [arXiv](https://arxiv.org/abs/2504.08846)

## **Project page**

* [AI University](https://my-ai-university.com)

## **Github**

* [Github repo](https://github.com/my-ai-university/finite-element-method)

## **Hyperparameters**

* learning_rate: 5e-5
* gradient_accumulation_steps: 2
* epoch: 5
* r (lora rank): 45
* lora_alpha: 65
* lora_dropout: 0.05
* target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

## **Usage**

**For Expanse usage, request at least two V100 GPUs to run the following code.**

### **Env Setup**

```bash
#!/bin/bash
# python 3.10 + cuda 11.8.0

export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export OMP_NUM_THREADS=1

conda clean -a -y  # conda for traditional and reliable setup
mamba clean -a -y  # mamba for smart and efficient setup
pip install --upgrade pip

# cuda, gcc/g++, torch
conda install cuda -c nvidia/label/cuda-11.8.0 -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
pip install torchao==0.7.0 --index-url https://download.pytorch.org/whl/cu118

# deepspeed
mamba install gcc gxx -c conda-forge -y  # ensure gcc/g++ > 9.0 for ninja JIT
pip install deepspeed==0.15.4

# bitsandbytes
pip install setuptools
mamba install bitsandbytes=0.45.0 -c conda-forge --no-deps -y
pip install psutil
# add the following to your .bashrc or running scripts
#export BNB_CUDA_VERSION=118
#export CUDA_HOME=$CONDA_PREFIX
#export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"

# trl, accelerate, peft
pip install trl
pip install accelerate peft optuna optuna_integration datasets

# other dependencies
pip install scikit-learn pexpect
pip install wandb plotly  # takes a while
```

### **Example Code**

```python
import time

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast


class Conversation:
    def __init__(self, model, tokenizer, device, system=""):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.message = []
        if system:
            self.message.append({"role": "system", "content": system})

    def get_prompt(self):
        # Rebuild the Llama 3 style prompt from the conversation history,
        # including the system message if one was provided
        prompt = '<|begin_of_text|>'
        for msg in self.message:
            role = msg['role']
            content = msg['content']
            prompt += f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>"
        # Append the assistant's role header to prompt for the next response
        prompt += "<|start_header_id|>assistant<|end_header_id|>"
        return prompt

    def generate(self, user_input, temp=0.7, max_new_tokens=1024, top_k=50, top_p=0.95):
        # Add the user's input to the conversation history
        self.message.append({"role": "user", "content": user_input})

        # Generate the prompt
        prompt = self.get_prompt()
        # Tokenize the prompt
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)

        # Fall back to the '<|eot_id|>' end-of-turn token if no EOS/pad token is set
        if self.tokenizer.eos_token_id is None:
            self.tokenizer.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|eot_id|>')
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        print(f"EOS Token ID: {self.tokenizer.eos_token_id}")
        print(f"PAD Token ID: {self.tokenizer.pad_token_id}")

        # Generate the response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temp,
                top_k=top_k,
                top_p=top_p,
                pad_token_id=self.tokenizer.eos_token_id,
                # eos_token_id=self.tokenizer.convert_tokens_to_ids('<|eot_id|>'),
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode the generated tokens
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract the assistant's response and append it to the conversation history
        assistant_response = self.extract_assistant_response(prompt, generated_text)
        self.message.append({'role': 'assistant', 'content': assistant_response})
        return assistant_response

    def extract_assistant_response(self, prompt, generated_text):
        # Llama keeps generating past the submitted prompt; return only the newly
        # generated output with the special tokens stripped.
        # Remove the prompt from the generated text
        response_text = generated_text[len(prompt):]

        # Split at the end-of-turn token
        if '<|eot_id|>' in response_text:
            assistant_response = response_text.split('<|eot_id|>')[0]
        else:
            assistant_response = response_text

        # Remove the trailing header token and leading/trailing whitespace
        assistant_response = assistant_response.replace('<|end_header_id|>', '')
        assistant_response = assistant_response.strip()
        return assistant_response


if __name__ == "__main__":
    base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    peft_model_name = "my-ai-university/TOMMI-1.0"

    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        base_model_name,
        return_tensors="pt")
    tokenizer.pad_token = "<|reserved_special_token_5|>"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto")
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()  # Optional: merge the adapter into the base model for faster inference

    # Initialize the conversation object
    system_message = 'You are an expert professor who replies in a helpful way.'
    conv = Conversation(
        model,
        tokenizer,
        model.device,
        system_message)

    # Run the conversation loop
    print("Starting conversation ...")
    while True:
        input_text = input("Enter your prompt (type 'exit' to quit): ")
        if input_text.lower() == "exit":
            break

        start_time = time.time()
        response = conv.generate(input_text)
        end_time = time.time()

        print(response)
        print(f"Response time: {end_time - start_time:.2f} seconds")

    # Save the conversation to a file
    with open("./conversation.txt", "w") as f:
        f.write(str(conv.message))
```
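
If you plan to reuse the merged model across sessions, you can optionally save it once and point `from_pretrained` at the saved directory in later runs (the output path below is illustrative):

```python
# Optional: persist the merged model so later runs can skip the adapter merge.
# The output directory is illustrative.
model.save_pretrained("./tommi-merged")
tokenizer.save_pretrained("./tommi-merged")
```

For reference, the values in the **Hyperparameters** section above correspond roughly to the following `peft` LoRA configuration. This is a minimal sketch rather than the exact training setup (which lives in the GitHub repo), and `task_type` is an assumption:

```python
from peft import LoraConfig

# Sketch of the LoRA setup implied by the Hyperparameters section; the actual
# training script is in the GitHub repo. task_type is assumed, not confirmed.
lora_config = LoraConfig(
    r=45,
    lora_alpha=65,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
```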