---

## Training procedure

This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library; specifically, a Low-Rank Adaptation (LoRA) adapter was trained on top of the base model
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co/AmelieSchreiber/esm2_t6_8M_finetuned_cafa5). An illustrative sketch of such a LoRA setup follows the training metrics below.

```
Epoch 3/3
Training Loss: 0.0152
Micro-Average ROC AUC: 0.8894
```
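
The exact LoRA hyperparameters for this adapter are not recorded in this card. As a minimal sketch of how such an adapter can be attached with `peft` (the rank, alpha, dropout, and target modules below are illustrative assumptions, not the values used for this model):

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

# Illustrative configuration only; the actual hyperparameters are undocumented.
base = AutoModelForSequenceClassification.from_pretrained(
    "AmelieSchreiber/esm2_t6_8M_finetuned_cafa5"
)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                     # assumed rank
    lora_alpha=16,           # assumed scaling factor
    lora_dropout=0.1,        # assumed dropout
    target_modules=["query", "key", "value"],  # ESM-2 attention projections
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()
```
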
### Framework versions

- PEFT 0.4.0

## Using the Model

The script below rebuilds the label list from the CAFA-5 training annotations, loads the LoRA adapter from the Hub, and predicts GO term names for sequences from the test FASTA file.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from Bio import SeqIO

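# The paths below assume the CAFA-5 competition data layout, with training
# files under data/Train/ and the test FASTA under data/Test/.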
# Step 1: Data Preprocessing
fasta_file = "data/Train/train_sequences.fasta"
tsv_file = "data/Train/train_terms.tsv"

fasta_data = {}
tsv_data = {}

for record in SeqIO.parse(fasta_file, "fasta"):
    fasta_data[record.id] = str(record.seq)

with open(tsv_file, 'r') as f:
    for line in f:
        parts = line.strip().split("\t")
        tsv_data[parts[0]] = parts[1:]

unique_terms = list(set(term for terms in tsv_data.values() for term in terms))
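# Note: model outputs are decoded through unique_terms, so this list must
# reproduce the ordering used at training time (set() iteration order is not
# stable across Python runs).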

def parse_fasta(file_path):
    """
    Parses a FASTA file and returns a list of sequences.
    """
    with open(file_path, 'r') as f:
        content = f.readlines()

    sequences = []
    current_sequence = ""

    for line in content:
        if line.startswith(">"):
            if current_sequence:
                sequences.append(current_sequence)
            current_sequence = ""
        else:
            current_sequence += line.strip()

    if current_sequence:
        sequences.append(current_sequence)

    return sequences

# Parse the provided FASTA file
fasta_file_path = "data/Test/testsuperset.fasta"
protein_sequences = parse_fasta(fasta_file_path)
# protein_sequences[:3]  # display the first 3 sequences for verification
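# (Equivalent shortcut with Biopython, which is already imported:
# protein_sequences = [str(r.seq) for r in SeqIO.parse(fasta_file_path, "fasta")])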

# Parse the go-basic.obo ontology file to map GO term IDs to human-readable names
def parse_obo_file(file_path):
    with open(file_path, 'r') as f:
        data = f.read().split("[Term]")

    terms = []
    for entry in data[1:]:
        lines = entry.strip().split("\n")
        term = {}
        for line in lines:
            if line.startswith("id:"):
                term["id"] = line.split("id:")[1].strip()
            elif line.startswith("name:"):
                term["name"] = line.split("name:")[1].strip()
            elif line.startswith("namespace:"):
                term["namespace"] = line.split("namespace:")[1].strip()
            elif line.startswith("def:"):
                term["definition"] = line.split("def:")[1].split('"')[1]
        terms.append(term)
    return terms

obo_file_path = "data/Train/go-basic.obo"  # adjust if your copy lives elsewhere
parsed_terms = parse_obo_file(obo_file_path)
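# parsed_terms is a list of dicts with "id", "name", "namespace", and
# "definition" keys, e.g. {"id": "GO:0008150", "name": "biological_process", ...}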

# Load the tokenizer, the base model, and the LoRA adapter from the Hub
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# First load the underlying base model, then apply the PEFT (LoRA) weights on top
base_model = AutoModelForSequenceClassification.from_pretrained(model_id)
model = PeftModel.from_pretrained(base_model, model_id)

# Predict GO term names for a single sequence
def predict_protein_function(sequence, model, tokenizer, go_terms):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
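        # 0.05 is a deliberately permissive threshold; raise it to trade recall for precision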
        predicted_indices = torch.where(predictions > 0.05)[1].tolist()

    functions = []
    for idx in predicted_indices:
        term_id = unique_terms[idx]  # index into the unique_terms list built above
        for term in go_terms:
            if term["id"] == term_id:
                functions.append(term["name"])
                break

    return functions

# Predict protein function for the sequences in the test FASTA file
protein_functions = {}
for seq in protein_sequences[:20]:  # only the first 20 sequences, for demonstration
    predicted_functions = predict_protein_function(seq, model, tokenizer, parsed_terms)
    protein_functions[seq[:20] + "..."] = predicted_functions  # first 20 residues as the key

protein_functions
```
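
`protein_functions` maps the first 20 residues of each sequence to its predicted GO term names. A quick, illustrative way to inspect a few entries:

```python
for prefix, names in list(protein_functions.items())[:3]:
    print(f"{prefix}: {names}")
```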