---

## Training procedure

This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library; specifically, a Low-Rank Adaptation (LoRA) adapter was trained on top of the base model
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co/AmelieSchreiber/esm2_t6_8M_finetuned_cafa5). An illustrative sketch of such a LoRA setup follows the training metrics below.

```
Epoch 3/3
Training Loss: 0.0152
Micro-Average ROC AUC: 0.8894
```
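
The exact LoRA hyperparameters for this adapter are not recorded in this card. As a minimal sketch of how such an adapter can be attached with `peft` (the rank, alpha, dropout, and target modules below are illustrative assumptions, not the values used for this model):

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

# Illustrative configuration only; the actual hyperparameters are undocumented.
base = AutoModelForSequenceClassification.from_pretrained(
    "AmelieSchreiber/esm2_t6_8M_finetuned_cafa5"
)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                     # assumed rank
    lora_alpha=16,           # assumed scaling factor
    lora_dropout=0.1,        # assumed dropout
    target_modules=["query", "key", "value"],  # ESM-2 attention projections
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()
```
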
### Framework versions

- PEFT 0.4.0

## Using the Model

The script below rebuilds the label list from the CAFA-5 training annotations, loads the LoRA adapter from the Hub, and predicts GO term names for sequences from the test FASTA file.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from Bio import SeqIO

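# The paths below assume the CAFA-5 competition data layout, with training
# files under data/Train/ and the test FASTA under data/Test/.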
# Step 1: Data Preprocessing
fasta_file = "data/Train/train_sequences.fasta"
tsv_file = "data/Train/train_terms.tsv"

fasta_data = {}
tsv_data = {}

for record in SeqIO.parse(fasta_file, "fasta"):
    fasta_data[record.id] = str(record.seq)

with open(tsv_file, 'r') as f:
    for line in f:
        parts = line.strip().split("\t")
        tsv_data[parts[0]] = parts[1:]

unique_terms = list(set(term for terms in tsv_data.values() for term in terms))
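# Note: model outputs are decoded through unique_terms, so this list must
# reproduce the ordering used at training time (set() iteration order is not
# stable across Python runs).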

def parse_fasta(file_path):
    """
    Parses a FASTA file and returns a list of sequences.
    """
    with open(file_path, 'r') as f:
        content = f.readlines()

    sequences = []
    current_sequence = ""

    for line in content:
        if line.startswith(">"):
            if current_sequence:
                sequences.append(current_sequence)
            current_sequence = ""
        else:
            current_sequence += line.strip()

    if current_sequence:
        sequences.append(current_sequence)

    return sequences

# Parse the provided FASTA file
fasta_file_path = "data/Test/testsuperset.fasta"
protein_sequences = parse_fasta(fasta_file_path)
# protein_sequences[:3]  # display the first 3 sequences for verification
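# (Equivalent shortcut with Biopython, which is already imported:
# protein_sequences = [str(r.seq) for r in SeqIO.parse(fasta_file_path, "fasta")])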

# Parse the go-basic.obo ontology file to map GO term IDs to human-readable names
def parse_obo_file(file_path):
    with open(file_path, 'r') as f:
        data = f.read().split("[Term]")

    terms = []
    for entry in data[1:]:
        lines = entry.strip().split("\n")
        term = {}
        for line in lines:
            if line.startswith("id:"):
                term["id"] = line.split("id:")[1].strip()
            elif line.startswith("name:"):
                term["name"] = line.split("name:")[1].strip()
            elif line.startswith("namespace:"):
                term["namespace"] = line.split("namespace:")[1].strip()
            elif line.startswith("def:"):
                term["definition"] = line.split("def:")[1].split('"')[1]
        terms.append(term)
    return terms

obo_file_path = "data/Train/go-basic.obo"  # adjust if your copy lives elsewhere
parsed_terms = parse_obo_file(obo_file_path)
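# parsed_terms is a list of dicts with "id", "name", "namespace", and
# "definition" keys, e.g. {"id": "GO:0008150", "name": "biological_process", ...}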

# Load the tokenizer, the base model, and the LoRA adapter from the Hub
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# First load the underlying base model, then apply the PEFT (LoRA) weights on top
base_model = AutoModelForSequenceClassification.from_pretrained(model_id)
model = PeftModel.from_pretrained(base_model, model_id)

# Predict GO term names for a single sequence
def predict_protein_function(sequence, model, tokenizer, go_terms):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
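        # 0.05 is a deliberately permissive threshold; raise it to trade recall for precision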
        predicted_indices = torch.where(predictions > 0.05)[1].tolist()

    functions = []
    for idx in predicted_indices:
        term_id = unique_terms[idx]  # index into the unique_terms list built above
        for term in go_terms:
            if term["id"] == term_id:
                functions.append(term["name"])
                break

    return functions

# Predict protein function for the sequences in the test FASTA file
protein_functions = {}
for seq in protein_sequences[:20]:  # only the first 20 sequences, for demonstration
    predicted_functions = predict_protein_function(seq, model, tokenizer, parsed_terms)
    protein_functions[seq[:20] + "..."] = predicted_functions  # first 20 residues as the key

protein_functions
```
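
`protein_functions` maps the first 20 residues of each sequence to its predicted GO term names. A quick, illustrative way to inspect a few entries:

```python
for prefix, names in list(protein_functions.items())[:3]:
    print(f"{prefix}: {names}")
```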