---
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
library_name: peft
license: apache-2.0
language:
- en
pipeline_tag: question-answering
---

# Model Details

This model is a fine-tuned version of the base model [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct), trained with LoRA on the [train_qa_wo_students.csv](https://drive.google.com/file/d/1uv-kVP0z3E8u9-u8PWAKA9tkr3ENHeZv/view?usp=sharing) dataset, which combines materials from the FEM courses of [Prof. Krishna Garikipati](https://viterbi.usc.edu/directory/faculty/Garikipati/Krishna). Compared with [TOMMI-0.35](https://huggingface.co/my-ai-university/TOMMI-0.35/), TOMMI-1.0 uses the optimal hyperparameters (without student-asked QA pairs) and an increased token length of 700 (up from 500).

## **Paper**

* [arXiv](https://arxiv.org/abs/2504.08846)

## **Project page**

* [AI University](https://my-ai-university.com)

## **Github**

* [Github repo](https://github.com/my-ai-university/finite-element-method)

## **Hyperparameters**

* learning_rate: 5e-5
* gradient_accumulation_steps: 2
* epoch: 5
* r (lora rank): 45
* lora_alpha: 65
* lora_dropout: 0.05
* target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

## **Usage**

**For Expanse usage, request at least two V100 GPUs to run the following code.**

### **Env Setup**

```bash
#!/bin/bash
# python 3.10 + cuda 11.8.0

export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export OMP_NUM_THREADS=1

conda clean -a -y  # conda for traditional and reliable setup
mamba clean -a -y  # mamba for smart and efficient setup
pip install --upgrade pip

# cuda, gcc/g++, torch
conda install cuda -c nvidia/label/cuda-11.8.0 -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
pip install torchao==0.7.0 --index-url https://download.pytorch.org/whl/cu118

# deepspeed
mamba install gcc gxx -c conda-forge -y  # ensure gcc/g++ > 9.0 for ninja JIT
pip install deepspeed==0.15.4

# bitsandbytes
pip install setuptools
mamba install bitsandbytes=0.45.0 -c conda-forge --no-deps -y
pip install psutil
# add the following to your .bashrc or running scripts
#export BNB_CUDA_VERSION=118
#export CUDA_HOME=$CONDA_PREFIX
#export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"

# trl, accelerate, peft
pip install trl
pip install accelerate peft optuna optuna_integration datasets

# other dependencies
pip install scikit-learn pexpect
pip install wandb plotly  # takes a while
```

### **Example Code**

```python
import time

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast


class Conversation:
    def __init__(self, model, tokenizer, device, system=""):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.message = []
        if system:
            self.message.append({"role": "system", "content": system})

    def get_prompt(self):
        # Rebuild the Llama 3 style prompt from the conversation history,
        # including the system message if one was provided
        prompt = '<|begin_of_text|>'
        for msg in self.message:
            role = msg['role']
            content = msg['content']
            prompt += f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>"
        # Append the assistant's role header to prompt for the next response
        prompt += "<|start_header_id|>assistant<|end_header_id|>"
        return prompt

    def generate(self, user_input, temp=0.7, max_new_tokens=1024, top_k=50, top_p=0.95):
        # Add the user's input to the conversation history
        self.message.append({"role": "user", "content": user_input})

        # Generate the prompt
        prompt = self.get_prompt()
        # Tokenize the prompt
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)

        # Fall back to the '<|eot_id|>' end-of-turn token if no EOS/pad token is set
        if self.tokenizer.eos_token_id is None:
            self.tokenizer.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|eot_id|>')
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        print(f"EOS Token ID: {self.tokenizer.eos_token_id}")
        print(f"PAD Token ID: {self.tokenizer.pad_token_id}")

        # Generate the response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temp,
                top_k=top_k,
                top_p=top_p,
                pad_token_id=self.tokenizer.eos_token_id,
                # eos_token_id=self.tokenizer.convert_tokens_to_ids('<|eot_id|>'),
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode the generated tokens
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract the assistant's response and append it to the conversation history
        assistant_response = self.extract_assistant_response(prompt, generated_text)
        self.message.append({'role': 'assistant', 'content': assistant_response})
        return assistant_response

    def extract_assistant_response(self, prompt, generated_text):
        # Llama keeps generating past the submitted prompt; return only the newly
        # generated output with the special tokens stripped.
        # Remove the prompt from the generated text
        response_text = generated_text[len(prompt):]

        # Split at the end-of-turn token
        if '<|eot_id|>' in response_text:
            assistant_response = response_text.split('<|eot_id|>')[0]
        else:
            assistant_response = response_text

        # Remove the trailing header token and leading/trailing whitespace
        assistant_response = assistant_response.replace('<|end_header_id|>', '')
        assistant_response = assistant_response.strip()
        return assistant_response


if __name__ == "__main__":
    base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    peft_model_name = "my-ai-university/TOMMI-1.0"

    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        base_model_name,
        return_tensors="pt")
    tokenizer.pad_token = "<|reserved_special_token_5|>"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto")
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()  # Optional: merge the adapter into the base model for faster inference

    # Initialize the conversation object
    system_message = 'You are an expert professor who replies in a helpful way.'
    conv = Conversation(
        model,
        tokenizer,
        model.device,
        system_message)

    # Run the conversation loop
    print("Starting conversation ...")
    while True:
        input_text = input("Enter your prompt (type 'exit' to quit): ")
        if input_text.lower() == "exit":
            break

        start_time = time.time()
        response = conv.generate(input_text)
        end_time = time.time()

        print(response)
        print(f"Response time: {end_time - start_time:.2f} seconds")

    # Save the conversation to a file
    with open("./conversation.txt", "w") as f:
        f.write(str(conv.message))
```
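
If you plan to reuse the merged model across sessions, you can optionally save it once and point `from_pretrained` at the saved directory in later runs (the output path below is illustrative):

```python
# Optional: persist the merged model so later runs can skip the adapter merge.
# The output directory is illustrative.
model.save_pretrained("./tommi-merged")
tokenizer.save_pretrained("./tommi-merged")
```

For reference, the values in the **Hyperparameters** section above correspond roughly to the following `peft` LoRA configuration. This is a minimal sketch rather than the exact training setup (which lives in the GitHub repo), and `task_type` is an assumption:

```python
from peft import LoraConfig

# Sketch of the LoRA setup implied by the Hyperparameters section; the actual
# training script is in the GitHub repo. task_type is assumed, not confirmed.
lora_config = LoraConfig(
    r=45,
    lora_alpha=65,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
```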