Spaces: Build error

Upload 16 files

- Dockerfile +320 -0
- Samples/SMILES_GENERATED.txt +95 -0
- Samples/generated_smiles.txt +93 -0
- app.py +1084 -0
- modelsBioembed/.gitattributes +8 -0
- modelsBioembed/README.md +141 -0
- modelsBioembed/config.json +16 -0
- modelsBioembed/special_tokens_map.json +1 -0
- modelsBioembed/tokenizer_config.json +1 -0
- modelsBioembed/vocab.txt +30 -0
- models_folder/atom_types.txt +67 -0
- models_folder/configs.json +1 -0
- modelstrc.py +1331 -0
- requirements.txt +17 -0
- runtime.txt +1 -0
- templates/index.html +93 -0
Dockerfile
ADDED
@@ -0,0 +1,320 @@
FROM python:3.8

# Install required system dependencies
RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*

# Set the working directory inside the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Create necessary directories (but don't download models here!)
RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# Copy the entire project to the container
COPY . .

# Expose the port for Flask
EXPOSE 7860

# Run the app with Gunicorn
CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git wget && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Download model files from Dropbox using wget
# RUN wget -O /app/modelsBioembed/pytorch_model.bin "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1" && \
#     wget -O /app/modelsBioembed/config.json "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1" && \
#     wget -O /app/modelsBioembed/special_tokens_map.json "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1" && \
#     wget -O /app/modelsBioembed/tokenizer_config.json "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1" && \
#     wget -O /app/modelsBioembed/vocab.txt "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1"

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git wget && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Download model files from Dropbox using wget
# RUN wget -O /app/modelsBioembed/pytorch_model.bin "https://www.dropbox.com/s/example/pytorch_model.bin?dl=1" && \
#     wget -O /app/modelsBioembed/config.json "https://www.dropbox.com/s/example/config.json?dl=1" && \
#     wget -O /app/modelsBioembed/tokenizer_config.json "https://www.dropbox.com/s/example/tokenizer_config.json?dl=1" && \
#     wget -O /app/modelsBioembed/vocab.txt "https://www.dropbox.com/s/example/vocab.txt?dl=1" && \
#     wget -O /app/modelsBioembed/special_tokens_map.json "https://www.dropbox.com/s/example/special_tokens_map.json?dl=1"

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Install gdown for Google Drive downloads
# RUN pip install --no-cache-dir gdown

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Download model files from Google Drive using gdown
# RUN gdown --id 1aDirthtWAu-oyVjcWZ6linrddN-dmLMI -O /app/modelsBioembed/pytorch_model.bin && \
#     gdown --id 1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3 -O /app/modelsBioembed/config.json && \
#     gdown --id 1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf -O /app/modelsBioembed/tokenizer_config.json && \
#     gdown --id 1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ -O /app/modelsBioembed/vocab.txt && \
#     gdown --id 1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P -O /app/modelsBioembed/special_tokens_map.json

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
# #Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Install gdown for Google Drive downloads
# RUN pip install --no-cache-dir gdown

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Download model files from Google Drive using alternative methods
# RUN curl -L -o /app/modelsBioembed/pytorch_model.bin "https://drive.google.com/uc?export=download&id=11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb" && \
#     curl -L -o /app/modelsBioembed/config.json "https://drive.google.com/uc?export=download&id=1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E" && \
#     curl -L -o /app/modelsBioembed/tokenizer_config.json "https://drive.google.com/uc?export=download&id=1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc" && \
#     curl -L -o /app/modelsBioembed/vocab.txt "https://drive.google.com/uc?export=download&id=1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w" && \
#     curl -L -o /app/modelsBioembed/special_tokens_map.json "https://drive.google.com/uc?export=download&id=1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl"

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git curl wget && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Function to download file from Google Drive using wget
# RUN wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb' -O /app/modelsBioembed/pytorch_model.bin && \
#     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E' -O /app/modelsBioembed/config.json && \
#     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc' -O /app/modelsBioembed/tokenizer_config.json && \
#     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w' -O /app/modelsBioembed/vocab.txt && \
#     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl' -O /app/modelsBioembed/special_tokens_map.json

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Install gdown for Google Drive downloads
# RUN pip install --no-cache-dir gdown

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Download model files using gdown with file IDs
# RUN gdown --id 1aDirthtWAu-oyVjcWZ6linrddN-dmLMI -O /app/modelsBioembed/pytorch_model.bin && \
#     gdown --id 1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3 -O /app/modelsBioembed/config.json && \
#     gdown --id 1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf -O /app/modelsBioembed/tokenizer_config.json && \
#     gdown --id 1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ -O /app/modelsBioembed/vocab.txt && \
#     gdown --id 1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P -O /app/modelsBioembed/special_tokens_map.json

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]

# # Use Python 3.8 as the base image
# FROM python:3.8

# # Install required system dependencies
# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*

# # Set the working directory inside the container
# WORKDIR /app

# # Copy the requirements file into the container
# COPY requirements.txt .

# # Install dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Create necessary directories inside the container
# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples

# # Set the OAuth 2.0 Access Token (Replace with your actual token)
# ENV ACCESS_TOKEN="ya29.a0AeXRPp6PRilVeuzghPDbZQE7DxYHHWv4TARoaumWuo4gX9bIcEtMzp3PGi1Ak36YIbvKk32V7Cwb6bEjGfJuOWp0ZDW5rdog1c5uf9PJH7b-zgPxIeGa0kmZhGlk79gc7WfoSAl-GUopljJfOmKsyVn628CGB10RRHBtMQiHaCgYKARQSARESFQHGX2MiRd-59J4_XHWekXXqwK-jsw0175"

# # Define Google Drive File IDs
# ENV FILE_ID1="1aDirthtWAu-oyVjcWZ6linrddN-dmLMI"
# ENV FILE_ID2="1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3"
# ENV FILE_ID3="1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf"
# ENV FILE_ID4="1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ"
# ENV FILE_ID5="1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P"

# # Download model files using curl with OAuth token
# RUN curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID1?alt=media" -o /app/modelsBioembed/pytorch_model.bin && \
#     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID2?alt=media" -o /app/modelsBioembed/config.json && \
#     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID3?alt=media" -o /app/modelsBioembed/tokenizer_config.json && \
#     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID4?alt=media" -o /app/modelsBioembed/vocab.txt && \
#     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID5?alt=media" -o /app/modelsBioembed/special_tokens_map.json

# # Copy the entire project to the container
# COPY . .

# # Expose the port for Flask
# EXPOSE 8000

# # Run the app with Gunicorn
# CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
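A note on the Dockerfile above: the active version creates the model directories but deliberately defers the model download to application startup (app.py below fetches the Dropbox files on first boot), while the commented-out variants record earlier attempts to bake the files in at build time via wget, gdown, curl, and an OAuth-authenticated Drive API call. One side effect of runtime downloading is a slow first worker boot, which Gunicorn's default 30-second timeout can kill. A minimal gunicorn.conf.py sketch (hypothetical, not part of this commit) that accounts for that:

# gunicorn.conf.py -- hypothetical sketch, not part of this commit.
# Gives the worker enough time to download and load the ProtTrans-BERT-BFD
# weights at import time instead of being killed at the default 30 s.
bind = "0.0.0.0:7860"  # same port the Dockerfile EXPOSEs
workers = 1            # the embedder is large; one worker avoids duplicate copies in RAM
timeout = 600          # seconds; first boot includes a multi-gigabyte download
preload_app = True     # import (and therefore download/load) once in the master process

It would be wired in by changing the CMD to ["gunicorn", "-c", "gunicorn.conf.py", "app:app"].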
Samples/SMILES_GENERATED.txt
ADDED
@@ -0,0 +1,95 @@
CC(C)C1C(=O)NC2CCN(CCC3CCC(CN)CC3)C21
CCOC(=O)C1CCCCCC12CCCCN2
O=C(O)CCNC(=O)C1NCCCC1C(=O)NO
CC(C)CC(NC(=O)C(Cc1c[nH]cn1)NC(=O)c1cccc(C(F)(F)F)c1)C(=O)O
CC(=O)OC1CC2CC(CC(c3ccc(O)c(O)c3)=C2O)OC(=O)c2ccc1c(O)c2
Cc1cc2cc(C)n(O)c(=O)c2cc1C
NCCc1ccc(S(=O)(=O)N2CCN=C2c2ccc(-c3ccccc3)cc2)cc1
NC1=NC(c2c(Cl)cc(Cl)cc2Cl)CC(=O)N1
CC=NOC(=O)c1ccc(NS(=O)(=O)N=C2c3ccc(C)cc3CCC2(C)C)cc1
CC(=O)CCCC(=O)OCC(CNc1cccc(Cl)c1)S(=O)(=O)O
CN(CCF)c1ccc(C(=O)NC(CN2C(=O)C3CCCCC3(C)C2=O)c2ccccc2)cc1
Cc1ccc(CNC(=O)C2CC(O)CC(C)C2CN2CCc3ccccc32)cc1
CC(=O)C(C(C)=NO)=C(O)c1ccccc1Sc1ccccc1CO
CNCCCCNC(=O)c1c(-c2ccccc2Cl)n[nH]c1-c1ccccc1Cl
CCOC(=O)c1c(CO)ccc(O)c1C
CCOc1ccc(F)c(CC(=O)NC2Cc3ccccc3C2)c1
Cc1ncc(CNC(=O)C2(c3cccc(F)c3)CCN(C)CC2)c(NCc2ccc(C#N)cc2)n1
CCn1cc(C=NNC(=O)c2ccc(Cl)cc2C(F)(F)F)n2ccc(=O)nc12
NC(=O)C1CCCN1c1cc(-c2nc(N)c3cc(C(F)(F)F)cnc3n2)cc(C(F)(F)F)c1
CC(C)Oc1ccn2c(N)nc(N3CCN(CC(N)=O)CC3)nc12
CC1(C)Cc2cc(c(C(=O)Nc3ccc(Cl)c(Cl)c3)nc2-c2cccnc2)N1
CCOC(=O)C1(CCCCCC=C(C)C)CCN(C(=O)CCC(=O)O)CC1
CCN1C(=O)C2CCCCC2C(C#N)=C1Nc1ccncc1
CCn1c(C(=O)N2CCN(C(=O)c3ccc(C(F)(F)F)cc3)CC2)c(O)c2ccccc21
O=C(N1CCNCC1)n1cc2c(=O)[nH]c(=S)[nH]n2c1=O
CC(C)C(O)CC(O)C(=O)c1nn2c(=O)cc(C(F)(F)F)nc2s1
Nc1c2c(nn1-c1ncc(C(F)(F)F)cn1)Cc1ccccc1C2=O
Nc1cc(-c2cnc(N)nc2-c2ccc(F)cc2)ncn1
CC1CCN(C(=O)C2NC(=O)C(c3cccs3)C2(C)C)CC1
Nc1cc(OCC(=O)c2ccc(Cl)c(Cl)c2)ncn1
CCOc1ccc2c(c1)CC(CCN1CCN(C(=O)c3ccn(Cc4cc(O)ccc4O)n3)CC1)C2=O
Cc1ncc([N+](=O)[O-])n1CCN1CCCN(C(=O)c2ccc3nccnc3c2)CC1
Cn1c(C(=O)NC2CCC(C(N)=O)CC2)nc2cc(N=C(N)c3ccc(Cl)cc3Cl)ncc21
Nc1ccc(NC(=O)c2cnc(N)nc2N)cn1
CCN1CCN(C(=O)c2sc(S(=O)(=O)NCc3cccnc3N3CCNCC3)cc2C)CC1
CCOCC(=O)N(c1c(O)nc(=O)[nH]c1C)C1CCCCC1
CCC(=Nc1[nH]nc(-c2cccc(OC)c2)c1-c1ccncc1)N(C)C
Cc1nn(-c2cccc(NC(=O)c3ccc(F)c([N+](=O)[O-])c3)c2)c2c1CCN(C(C)(C)C(=O)N(C)C)C2=O
CCC1CC=CC(=O)N2CC(C3CCCCC3)N(C(=O)c3ccc(C(F)(F)F)cc3)CC2C(=O)OC1
CCC(CCN1CCN(C(=O)c2ccc(C#N)nc2)CC1)N(C)C
CC(=O)c1ccc(S(=O)(=O)n2ccc(-c3c(C)nc(-c4ccc([N+](=O)[O-])cc4)oc3=O)n2)cc1
Cc1cc(O)ccc1C(O)C1C(=O)NC1C(=O)NC1(C(C)C)CC2CN(Cc3ccncc3)CCN2C1=O
O=C(NC1CCS(=O)(=O)C1)c1cnc(Nc2ccc(Oc3ccccc3)cc2)cn1
CC(C)CCC1N(C(=NC2CCS(=O)(=O)C2)SCc2cccnc2)C(=O)CC1(C)C
CC(=O)OC(Cc1ccc(C(C)=O)cc1)C(C)N(O)C=O
CCCC(=O)N1CCCC(N)C1
CCN(CC)CCN=c1c(O)c(O)c1=Nc1ccc(C(C)(O)C(F)(F)F)cc1
O=C(NO)c1cc2c3n(nc2oc1=O)CC(CCO)CN3CC1CCCCC1
CC1=NCC(C)(C)c2cnc(S(C)(=O)=O)nc21
Cc1cc(C)nc(C(=O)N2CC3c4ccc(OCC#CC5(C(N)=O)CCCC5)cc4N(C)CC2C3C(=O)O)c1
Cc1sc2nc(CN(CC3CCCCC3)C(=O)N3c4ccccc4C(=O)C3F)nn2c1C
Cc1cccc(CN2CCN(CC(O)Cc3c(-c4ccc(F)cn4)nc4ccccn34)CC2)c1
Cc1ccc2oc(=O)cc(CNC(=O)CNC(=O)Oc3ccc([N+](=O)[O-])cc3)c2c1
NC([PH](=O)O)S(=O)(=O)O
O=C(CCN1CCNC1=O)Nc1cccs1
Cc1nc(CS(=O)(=O)c2ccc(C(F)(F)C(F)(F)F)nc2)c2oc(CC3CCN(C)CC3)nc2c1C(=O)NC(C)C(=O)O
N#Cc1c(N2CCN(C(=O)c3ccccc3)CC2)nn2c(N)c3c(nc12)CCCC3
Nc1ncnc2c(-c3ccc(OC(F)(F)F)cc3)c(O)c(C(F)(F)F)nc12
Cc1ccn(-c2cc3c(c(OC4CCOCC4)c2)Cc2c(N)ncnc2N3)n1
CC(N=C=S)(c1cccnc1)c1cc(F)ccc1F
Cn1cnc2cc(-c3ccc(CCO)nc3)nc(-n3cnnc3)c2c1=O
CCOC(=O)C(CCCC1CCCC(O)C1)N1C(=O)C2C[SH]1C(c1ccc(Cl)c(Cl)c1)=N2
CC(C)C1C(=O)NC(C2Cc3ccccc3C2)C(=O)NC1C(=O)O
Cc1c([N+](=O)[O-])c(=O)oc2cc(NC(=O)Nc3ccc(C#N)cc3)ccc12
CCC(C)n1cnc2c(Sc3ccc(C(F)(F)F)cc3[N+](=O)[O-])nc(N)nc21
Nc1nnc(Sc2ccccc2Cl)s1
Cc1cc(O)c(C=O)cc1C(=O)NCCCCCCCN
Cc1cc(C)n(Cc2cc(C(=O)NC3CCCC3C(=O)NCCCF)ccc2Cl)n1
CC(C)CCCC(C)CS(=O)(=O)CCCC(N)Cc1cnn(C2=COCOC2)n1
CCOC(=O)C1CCCN(CC)C1c1cc2c(=O)[nH]cc(CC)c2cc1O
Nc1nn2c(=O)cc(CSc3nc4ccc(C(=O)O)cc4[nH]3)nc2s1
CC(=O)Oc1ccc(C#CC=C(C)C(=O)NC(CN=C(N)N)C(C)C)cc1
CC(C)=NC(=O)c1c(C)n2nc(C(F)(F)F)sc2[n+]1[O-]
Nc1cc(Cl)ccc1C(=O)NCCNCC(O)CCO
Cc1noc(-c2ccccc2C(=O)N2CCN3CCC2C(C)(C)C3=O)n1
CC(=O)OC(c1cccc(C(F)(F)F)c1)C(O)CC1CCN(C)C1CS(=O)(=O)c1ccccc1
CC(=O)N1CCN(Cc2c(-c3ccccc3)nn(C)c2Cl)CC1
Nc1ncnc(Oc2cccc(Cc3ccccc3O)c2)c1C(F)(F)F
Cc1cc(NCCC(=O)Nc2ccc3c(c2)S(=O)(=O)N(C)C3=O)n2nc(C)c(-c3ccccc3)c2n1
Nc1ncnc2ccc(F)c(-c3csc(C(N)(CO)C(F)(F)F)n3)c12
CCCCCCCC=C(C)C(=O)NC1CNC(=O)C1
CC(C)C(C)C(=O)NC1CCN(c2ncnc(NC(C)C3CCCCC3)c2F)C1
CCc1nnc(CNc2nc(C)cc(N(C)C)n2)o1
CCOC(=O)Nc1ccc(OCCC(=O)O)nc1
Cc1cc(-c2nnc(C(=O)NC3CCCCCCC3)o2)ccn1
Cc1nc(CCCC(=O)C2CC(N)C(=O)NC2C(=O)N2C(C#N)CCC2C#N)cs1
Cn1cnc2c(Nc3ccn(C4CCC5CC(CO)C(=O)NC54)c(=O)n3)nc(-c3c(N)cc(Cl)cc3Cl)nc21
CCc1[nH]ncc1C(=O)N1CCC(C)CC1C(=O)NCc1ccc(N(C)C)cc1
CC(C)=CCOC1Cc2c(c(O)nc3ccccc23)CO1
CCCNCCCOc1cccc(-c2cccc(OC)c2)c1
CC(C)CC(C=O)NC(=O)C(CC1CCCC1)NC(=O)NC1CCCCCC1
Nc1cc2n(c(=O)n1)C1CCCCCC(C1)N2
NC(=O)c1cccc(-c2cnc3ccc(Nc4ccncn4)nn23)c1
CC(=O)OC1CCC(OC(C)=O)C(CNC(=S)Nc2ccc(F)c(Cl)c2)C1
CC(=O)N1CCC2(C)Oc3ccc4c(=O)c(C(=O)O)c(-n5ccnn5)oc4c3C(c3ccccn3)C2C1
Samples/generated_smiles.txt
ADDED
@@ -0,0 +1,93 @@
CC(=O)OC1OCCCC1Cl
O=C1NC(c2ccc(C(F)(F)F)cc2)S(=O)(=O)C1CCCN1CCC2C(Cc3ccccc3C23NC(=S)NC3=O)CC1CO
Nc1nc(-c2ccccc2)nc2ccc(C(F)(F)F)cc12
CCNCC(O)(c1ccc2c(c1)NCCC2)C(C)C
NC(=O)C1CCC(CCO)C2Cc3cccc([N+](=O)[O-])c3-c3ccccc3CC(=O)N12
CC(C)CC1OC2CC3CN(C(=O)C24CC=C1CN4)C(c1ccccc1[N+](=O)[O-])C(=O)N3
CCn1cc(C(=O)O)c(=O)c2c3ccccc3n(CCl)c21
Cc1[nH]c2c(NC(C)c3ccc(C(=O)O)cc3)ncnc2c1C(=O)O
Cc1cc(C=NOC(C)CNCCCC2CCC(n3cnc4c(N)ncnc43)O2)c2[nH]c(C)cc2n1
Cc1[nH]n(-c2cc(C(=O)NC(C)C(=O)N(Cc3ccncc3)C3CCCC3F)ccn2)c(=O)c1[N+](=O)[O-]
CN(Cc1cc2ccoc(=O)c2n1-c1cccc(O)c1)c1ccccc1
CC(C)C(c1ccc(Cl)cc1O)S(=O)(=O)N(C#N)CC(=O)NO
NC(=O)c1ccc(N2CCCC(c3cccc(F)c3)C2)nc1C(Cl)(Cl)Cl
CC(C)C1C(=O)C(O)(c2ccc([N+](=O)[O-])cc2)CC1NC(=O)C1CC(=O)N1
CCCCNCC1CCOC(c2ccc(F)cc2)O1
Cc1cc(NC(=O)C2COC(OCCCC(C)C(O)(c3ccccc3)c3ccccc3)C2)ncn1
Cc1ccc(C(=O)Nc2ccnc(Cn3nc(C)c4c3CCN(Cc3ccccc3)C4)n2)cc1
CC(=O)C1CCC(C)N1C(=O)OC1CN(C2(c3cccnn3)CCCCC2)CCC1CC(=O)O
CCC(=O)N1CC2CCCN(CC(=O)NCC3CCN(c4ncncc4F)CC3)C2C1
CC1(c2ccc(O)cc2)CC(=O)N(C=C(Cl)Cl)C(c2cccnc2)N1
O=C(Nc1ccc([N+](=O)[O-])cc1C(=O)NN=Cc1cccs1)c1ccc(Cl)s1
Cc1cc(C)n(-c2nc(N)cc(-c3cc(C(=O)NC4CCNCC4)cc(C(F)(F)F)c3)n2)n1
CC(=O)OCC1CN(Cc2cccc([N+](=O)[O-])c2)C(CN2CCCC2)C1O
CC(=O)Nc1c(N)n(CC2CCCO2)c(=O)[nH]c1=O
Cc1cc(O)nc(N2CCCC(N3CCN(C(=O)c4ccc(F)cc4)CC3)C2)n1
CC(C)CC(N)C(O)C#N
NC(=O)C1CCC(CN2CCCCC2)N1C(=O)c1cc(-c2ccnc(N)n2)cc(C(F)(F)F)c1F
CC(C)(N)CCCNc1cc(C(C)(C)C)ncn1
Nc1nc(N)c2c(-c3ccccc3)cc(S(=O)(=O)NC(=O)CC3CCCC3)cc2n1
O=C(NCc1cnn(-c2ccc(F)cc2)c1)c1cccnc1C(F)(F)F
Cc1nc(-c2ccccn2)c(CSC(=NCc2cccnc2)N2CC(C)(C)CC2=O)s1
NC(=O)C1CCC2CN(C(=O)OCc3ccc(F)cc3)CC(CO)C21
CCC(C)CC=CC(CCO)OC(=O)c1cc(S(N)(=O)=O)cnc1C(F)(F)F
CCn1cc(C)c2nc(-c3cnn(C)c3NCc3cccnc3OC)cnc21
N#Cc1cc(F)c(N2CCC(CN)C2)cc1NS(=O)(=O)c1cc(Cl)cc(Cl)c1
CCCCNc1c(C(=O)O)cc2nc(C#N)c(-c3ccccc3)nn12
CC#CCCCn1c(=O)c(NC(=O)c2cccc(F)c2)cn(CC(=O)NCC2CCCO2)c1=O
CCn1ncc(S(=O)(=O)c2ccc(C(=O)O)cc2)c1C(=O)Nc1cc(C)cc(C)c1
NC1CCCCC1C(=O)NCC(=O)NC1CCCCCCC1
Cc1cc(N2CCOCC2)nc(Nc2ccc(-c3cnc4ccn(C)c4c3)cc2)n1
CCOC(=O)Nc1nccnc1N1CCC(C(N)=O)CC1
Cc1c2nc(NC3CCC(CC(N)C(=O)O)CC3)nc(Oc3ccc(C(=O)NCC4CCCCC4)cc3)c2nn1C
CCC(C(=O)NC1CCCc2cccnc21)N(CCC1CCCCC1)S(=O)(=O)c1cccs1
CCCCN1C(=O)N=C(Nc2cccc(OCc3ccncc3)c2)C1CC1CCCC1
Nc1nccn2c(-c3cccnc3)c(CNc3ccc(F)cc3F)cc12
CC1CCC(NC(=O)c2ccc(CN(c3ccc(C(=O)O)cc3)S(=O)(=O)c3ccc(F)cc3C(F)(F)F)cc2)CC1
CC=Cc1ccc(S(=O)(=O)N2CCCC2C(=O)N2CC3(O)CCCC3C2=O)cc1
Cc1ccc(C(=O)N2Cc3nc(-c4ccc(F)cc4)ncc3C2=NN(C(=O)C2CCCC(C(N)=O)C2)C2CCNC2)cc1C
CCCCCCNCCc1cc(CC)cc(=O)o1
CC(=O)NC1CCC(CCCCOCCNC(=O)C2CCCN2C(=O)OC(C)(C)C)CC1
CCCNc1ccc(Cl)cc1C(=O)NC(C)CC(C)C
CCC(C(O)C1C(O)=C(C)OC(=S)N1CCC(O)C(C)(C)C)S(=O)(=O)CC
CCCCNC(=O)OC1CC2CCC(c3nnc4ccccn34)CCN2C1O
CC(C)N1CC(C(=O)Nc2ccc(N3CCOCC3)c([N+](=O)[O-])c2)C2CS(=O)(=O)CC21
CCCCCCCN1CC2COC1C(O)(Cn1c(-c3cccs3)nc3c(N)ncnc31)C2
Nc1nc(Sc2ccc(C(F)(F)F)cc2)nc2cccc(-c3ccc(C(=O)O)nc3)c12
O=c1oc2ccccc2n1CCS(=O)(=O)N1CCCCCC1
NCCNc1ncnc(N)c1C(=O)NC1CCCC1
N=C(N)OCCOCc1c(N)ccc2c1CC(O)CC2
CN1CCN(C(=O)C2CCN(C3CC(C)(C)c4ccc(C(=O)O)c3c4)CC2)CC1
CC(=O)N(C1=NC(=O)C(CC2CCC2OC(N)=O)N1CC(C)(C)c1ccccc1)C1CCCC1
CC(=O)NCCNCCCCCC(=O)c1ccc(-c2cc(Cl)cc(Cl)c2)o1
CCNS(=O)(=O)c1cccc(COc2ccnnc2)c1
O=C(NN=CCc1ccco1)c1cnccn1
Cc1cccc(N(C)CCC(=O)C2C(=O)OC3CC(C(=O)O)NCC3C2C(F)(F)F)c1C
Cc1cc(Sc2ccncc2)c([N+](=O)[O-])c(C)n1
Cc1c(Cl)ccc2[nH]c(=O)c(-n3c([N+](=O)[O-])cnc3C)nc12
Cc1ccc(C#N)cc1C1N=C(c2ccc(S(N)(=O)=O)cc2)CC(=O)N1C
CCOC(=O)C1CCC(Cn2c(C(C)(C)C)ccc(C#N)c2=N)C1
Cc1nn2cnc(-c3ccc(C(=O)NCc4ccc(C#N)cc4)s3)c(N3CCC(N(C)C)C3)c2c1C(C)(C)C
Cc1[nH]nc(-c2cc(C3C4CC(CCN4)N3C)nc(N3CCC4(CCCCC4)C3)n2)c1Cl
CCN(C)c1ccc(C(=O)OCC(=O)NCC2CCCC2)cc1
CC(C)n1cc(S(=O)(=O)NCC(=O)N2CCCC(c3ccccc3)(c3ccccc3)CC2)cn1
S=C1NCSS1
CC1CCC(C(=O)NC(=S)N2CCN(c3ccc([N+](=O)[O-])cc3Cl)CC2)C1
CC(=NO)C(=O)Nc1ccc(Cl)c(C(=O)O)c1
NCCCCCCC(=O)NC(C(=O)O)S(=O)(=O)O
CCN1C(=O)C2CC(NC(=O)C(NC(=O)OCc3cccc([N+](=O)[O-])c3)C(C)P(=O)(O)O)CCC21
CC(=O)C1(C)CC(c2ccc(N3CCOCC3)cc2C(F)(F)F)=NO1
CCCCCCC(N)=O
Nc1nc(Cl)ccc1C(c1ccc(-c2ccccn2)nc1N)N1CCCCC1
CCC1N=C(S)NC1c1cc(F)ccc1F
CC(=O)c1cn(C)c2ccc(S(=O)(=O)NCc3ccc(C(=O)O)cc3)cc12
CCCCCC(=CC#N)[N+](=O)[O-]
CN1CCCC1CCNCc1ccc(F)c(Cl)c1
Cc1ccc(S(=O)(=O)c2nnc(-n3cc(C)sc3=O)s2)cc1
NCC1COC(CN2CC(c3cccc([N+](=O)[O-])c3)OC2=O)C1CCC1CCCCC1
CC(C)NC(CC(=O)Nc1cnn(C)c(=O)c1Cl)C(=O)NCc1ccc(C(F)(F)F)c(F)c1
CC(C)CC1C(=O)N2CCCC2C(=O)N1C1(C)CCCO1
O=C(O)c1ccc(Cn2cccc2-c2cc(NC(=O)c3cccs3)ncn2)cc1
CCn1ncc(CC(=O)NCc2ccccc2)c1CCC(=O)NCc1ccc(F)c(F)c1
O=C1CSC(C(=O)NCCC2CN3CCC2(c2cccc(C(F)(F)F)c2)CC3)N1
Nc1nc(O)nc2c1CN(CCCN1C(=O)CCC1=O)CC2
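Both Samples files hold one generated SMILES string per line, as written by generate_smiles in app.py below. A small illustrative check (not part of this commit) that re-parses each line with RDKit, the same library the app already uses; Chem.MolFromSmiles returns None for anything that fails parsing or sanitization:

# check_samples.py -- illustrative sketch, not part of this commit.
from rdkit import Chem

def count_valid(path):
    # Count lines that RDKit accepts as valid, sanitizable SMILES.
    valid = total = 0
    with open(path) as fh:
        for line in fh:
            smi = line.strip()
            if not smi:
                continue
            total += 1
            if Chem.MolFromSmiles(smi) is not None:  # None means parse/sanitize failure
                valid += 1
    return valid, total

for path in ("Samples/SMILES_GENERATED.txt", "Samples/generated_smiles.txt"):
    valid, total = count_valid(path)
    print(f"{path}: {valid}/{total} lines parse as valid SMILES")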
app.py
ADDED
@@ -0,0 +1,1084 @@
import os
import time
import requests
import numpy as np
from flask import Flask, render_template, request, send_file
from rdkit import Chem
from transformers import AutoModelForMaskedLM, AutoTokenizer
from bio_embeddings.embed import ProtTransBertBFDEmbedder
from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list

# 🚀 Define Directories for Railway
bio_model_dir = "/app/modelsBioembed"  # Persistent model storage
cvn_model_dir = "/app/models_folder"
UPLOAD_FOLDER = "/app/Samples"

os.makedirs(bio_model_dir, exist_ok=True)
os.makedirs(cvn_model_dir, exist_ok=True)
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# ✅ Environment Variables for Temp Directory
os.environ["TMPDIR"] = bio_model_dir
os.environ["TEMP"] = bio_model_dir
os.environ["TMP"] = bio_model_dir

# 🔗 Dropbox Links for Model Files
DROPBOX_LINKS = {
    "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
    "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
    "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
    "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
    "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
}

# 📥 Function to Download Model Files
def download_model_files():
    for filename, url in DROPBOX_LINKS.items():
        file_path = os.path.join(bio_model_dir, filename)
        if not os.path.exists(file_path):  # Avoid re-downloading
            print(f"Downloading {filename}...")
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        f.write(chunk)
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {filename}")

# def download_model_files():
#     for filename, url in DROPBOX_LINKS.items():
#         file_path = os.path.join(bio_model_dir, filename)

#         print(f"Downloading {filename} (forcing overwrite)...")
#         response = requests.get(url, stream=True)
#         if response.status_code == 200:
#             with open(file_path, "wb") as f:
#                 for chunk in response.iter_content(chunk_size=1024):
#                     f.write(chunk)
#             print(f"Downloaded: {filename}")
#         else:
#             print(f"Failed to download {filename}")

# 📥 Download models before starting
download_model_files()

# # ✅ Load ProtTrans-BERT-BFD Model
# print("Loading ProtTrans-BERT-BFD model...")
# model = AutoModelForMaskedLM.from_pretrained(bio_model_dir)
# tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)

# ✅ Load Bio-Embedding Model
try:
    print("Loading ProtTrans-BERT-BFD model...")
    embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
except Exception as e:
    print(f"Error loading ProtTrans-BERT-BFD model: {e}")
    embedder = None

# 🧬 Generate Bio-Embeddings
def generate_bio_embeddings(sequence):
    if embedder is None:
        return None
    try:
        embedding_protein = embedder.embed(sequence)
        embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
        return np.array(embedding_per_protein).reshape(1, -1)
    except Exception as e:
        print(f"Embedding Error: {e}")
        return None

# 🔬 Generate SMILES from Protein Sequence
def generate_smiles(sequence, n_samples=100):
    start_time = time.time()

    protein_embedding = generate_bio_embeddings(sequence)
    if protein_embedding is None:
        return None, "Embedding generation failed!"

    model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
    samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
    valid_samples = [sample for sample in samples if sample is not None]

    smiles_list = [
        Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
    ]

    if not smiles_list:
        return None, "No valid SMILES generated!"

    filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
    with open(filename, "w") as file:
        file.write("\n".join(smiles_list))

    elapsed_time = time.time() - start_time
    return filename, elapsed_time

# 🌐 Flask Web App
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        sequence = request.form["sequence"].strip()
        if not sequence:
            return render_template("index.html", message="Please enter a valid sequence.")

        file_path, result = generate_smiles(sequence)
        if file_path is None:
            return render_template("index.html", message=f"Error: {result}")

        return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

    return render_template("index.html")

@app.route("/download")
def download_file():
    file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
    return send_file(file_path, as_attachment=True)

# 🚀 Run the Flask App on Railway
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)

# import os
# import time
# import requests
# import numpy as np
# import subprocess
# from flask import Flask, render_template, request, send_file
# from rdkit import Chem
# from transformers import AutoModel
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list

# # DROPBOX LINKS FOR MODEL FILES
# DROPBOX_LINKS = {
#     "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
#     "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
#     "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
#     "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
#     "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
# }

# # LOCAL DIRECTORIES
# bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")
# cvn_model_dir = os.path.join(os.getcwd(), "models_folder")
# UPLOAD_FOLDER = "Samples"

# os.makedirs(bio_model_dir, exist_ok=True)
# os.makedirs(cvn_model_dir, exist_ok=True)
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# os.environ["TMPDIR"] = bio_model_dir
# os.environ["TEMP"] = bio_model_dir
# os.environ["TMP"] = bio_model_dir

# # FUNCTION TO DOWNLOAD FILES FROM DROPBOX
# for file_name, url in DROPBOX_LINKS.items():
#     file_path = os.path.join(bio_model_dir, file_name)
#     if not os.path.exists(file_path):
#         print(f"Downloading {file_name} from Dropbox...")
#         subprocess.run(["wget", "-O", file_path, url], check=True)
#         print(f"{file_name} downloaded!")

# # BIO-EMBEDDING MODEL LOADING
# try:
#     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
# except Exception as e:
#     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
#     embedder = None

# def generate_bio_embeddings(sequence):
#     if embedder is None:
#         return None
#     try:
#         embedding_protein = embedder.embed(sequence)
#         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
#         return np.array(embedding_per_protein).reshape(1, -1)
#     except Exception as e:
#         print(f"Embedding Error: {e}")
#         return None

# def generate_smiles(sequence, n_samples=100):
#     start_time = time.time()
#     protein_embedding = generate_bio_embeddings(sequence)
#     if protein_embedding is None:
#         return None, "Embedding generation failed!"

#     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
#     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
#     valid_samples = [sample for sample in samples if sample is not None]

#     smiles_list = [
#         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
#     ]

#     if not smiles_list:
#         return None, "No valid SMILES generated!"

#     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     with open(filename, "w") as file:
#         file.write("\n".join(smiles_list))

#     elapsed_time = time.time() - start_time
#     return filename, elapsed_time

# app = Flask(__name__)

# @app.route("/", methods=["GET", "POST"])
# def index():
#     if request.method == "POST":
#         sequence = request.form["sequence"].strip()
#         if not sequence:
#             return render_template("index.html", message="Please enter a valid sequence.")

#         file_path, result = generate_smiles(sequence)
#         if file_path is None:
#             return render_template("index.html", message=f"Error: {result}")

#         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

#     return render_template("index.html")

# @app.route("/download")
# def download_file():
#     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     return send_file(file_path, as_attachment=True)

# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=8000, debug=True)

# import os
# import time
# import numpy as np
# from flask import Flask, render_template, request, send_file
# from rdkit import Chem
# from transformers import AutoModel
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list

# # # DIRECTORIES
# # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
# # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
# #bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
# bio_model_dir = "/app/modelsBioembed"
# cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")

# os.makedirs(bio_model_dir, exist_ok=True)
# os.makedirs(cvn_model_dir, exist_ok=True)

# os.environ["TMPDIR"] = bio_model_dir
# os.environ["TEMP"] = bio_model_dir
# os.environ["TMP"] = bio_model_dir

# UPLOAD_FOLDER = "Samples"
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# app = Flask(__name__)

# # model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
# # if not os.path.exists(model_path):
# #     print("Downloading ProtTrans-BERT-BFD model...")
# #     AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)

# # BIO-EMBEDDING MODEL LOADING
# try:
#     print("Loading Model")
#     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
# except Exception as e:
#     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
#     embedder = None

# def generate_bio_embeddings(sequence):
#     """Generate bio-embeddings for a given protein sequence."""
#     if embedder is None:
#         return None
#     try:
#         embedding_protein = embedder.embed(sequence)
#         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
#         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
#     except Exception as e:
#         print(f"Embedding Error: {e}")
#         return None

# def generate_smiles(sequence, n_samples=100):
#     """Generate SMILES from a protein sequence."""
#     start_time = time.time()

#     protein_embedding = generate_bio_embeddings(sequence)
#     if protein_embedding is None:
#         return None, "Embedding generation failed!"

#     # TRAINED CVanilla_RNN_Builder MODEL LOADING
#     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)

#     # MOLECULAR GRAPH GENERATION
#     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
#     valid_samples = [sample for sample in samples if sample is not None]

#     # CONVERSION TO SMILES
#     smiles_list = [
#         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
#     ]

#     if not smiles_list:
#         return None, "No valid SMILES generated!"

#     # SAVING TO FILE
#     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     with open(filename, "w") as file:
#         file.write("\n".join(smiles_list))

#     elapsed_time = time.time() - start_time
#     return filename, elapsed_time

# @app.route("/", methods=["GET", "POST"])
# def index():
#     if request.method == "POST":
#         sequence = request.form["sequence"].strip()
#         if not sequence:
#             return render_template("index.html", message="Please enter a valid sequence.")

#         file_path, result = generate_smiles(sequence)
#         if file_path is None:
#             return render_template("index.html", message=f"Error: {result}")

#         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

#     return render_template("index.html")

# @app.route("/download")
# def download_file():
#     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     return send_file(file_path, as_attachment=True)

# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=8000)
#MAIN

# import os
# import time
# import requests
# import numpy as np
# from flask import Flask, render_template, request, send_file
# from rdkit import Chem
# from transformers import AutoModel
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list

# # HUGGING FACE MODEL REPO (Replace with your actual Hugging Face username)
# MODEL_BASE_URL = "https://huggingface.co/Bhanushray/protein-smiles-model/tree/main"

# # REQUIRED MODEL FILES
# MODEL_FILES = [
#     "pytorch_model.bin",
#     "config.json",
#     "tokenizer_config.json",
#     "vocab.txt",
#     "special_tokens_map.json"
# ]

# # DIRECTORIES
# bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
# cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")

# # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
# # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder

# os.makedirs(bio_model_dir, exist_ok=True)
# os.makedirs(cvn_model_dir, exist_ok=True)

# os.environ["TMPDIR"] = bio_model_dir
# os.environ["TEMP"] = bio_model_dir
# os.environ["TMP"] = bio_model_dir

# UPLOAD_FOLDER = "Samples"
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# app = Flask(__name__)

# # DOWNLOAD MODEL FILES IF MISSING
# for file_name in MODEL_FILES:
#     file_path = os.path.join(bio_model_dir, file_name)

#     if not os.path.exists(file_path):
#         print(f"Downloading {file_name} ...")
#         response = requests.get(MODEL_BASE_URL + file_name, stream=True)
#         with open(file_path, "wb") as f:
#             for chunk in response.iter_content(chunk_size=1024):
#                 f.write(chunk)
#         print(f"{file_name} downloaded!")

# # BIO-EMBEDDING MODEL LOADING
# try:
#     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
# except Exception as e:
#     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
#     embedder = None

# def generate_bio_embeddings(sequence):
#     """Generate bio-embeddings for a given protein sequence."""
#     if embedder is None:
#         return None
#     try:
#         embedding_protein = embedder.embed(sequence)
#         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
#         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
#     except Exception as e:
#         print(f"Embedding Error: {e}")
#         return None

# def generate_smiles(sequence, n_samples=100):
#     """Generate SMILES from a protein sequence."""
#     start_time = time.time()

#     protein_embedding = generate_bio_embeddings(sequence)
#     if protein_embedding is None:
#         return None, "Embedding generation failed!"

#     # LOAD TRAINED CVanilla_RNN_Builder MODEL
#     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)

#     # MOLECULAR GRAPH GENERATION
#     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
#     valid_samples = [sample for sample in samples if sample is not None]

#     # CONVERT TO SMILES
#     smiles_list = [
#         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
#     ]

#     if not smiles_list:
#         return None, "No valid SMILES generated!"

#     # SAVE TO FILE
#     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     with open(filename, "w") as file:
#         file.write("\n".join(smiles_list))

#     elapsed_time = time.time() - start_time
#     return filename, elapsed_time

# @app.route("/", methods=["GET", "POST"])
# def index():
#     if request.method == "POST":
#         sequence = request.form["sequence"].strip()
#         if not sequence:
#             return render_template("index.html", message="Please enter a valid sequence.")

#         file_path, result = generate_smiles(sequence)
#         if file_path is None:
#             return render_template("index.html", message=f"Error: {result}")

#         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

#     return render_template("index.html")

# @app.route("/download")
# def download_file():
#     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     return send_file(file_path, as_attachment=True)

# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=8000, debug=True)

# import os
# import time
# import numpy as np
# from flask import Flask, render_template, request, send_file
# from rdkit import Chem
# from transformers import AutoModel
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list

# # DIRECTORIES
# bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
# cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder

# os.makedirs(bio_model_dir, exist_ok=True)
# os.makedirs(cvn_model_dir, exist_ok=True)

# os.environ["TMPDIR"] = bio_model_dir
# os.environ["TEMP"] = bio_model_dir
# os.environ["TMP"] = bio_model_dir

# UPLOAD_FOLDER = "Samples"
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# app = Flask(__name__)

# model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
# if not os.path.exists(model_path):
#     print("Downloading ProtTrans-BERT-BFD model...")
#     AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)

# # BIO-EMBEDDING MODEL LOADING
# try:
#     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
# except Exception as e:
#     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
#     embedder = None

# def generate_bio_embeddings(sequence):
#     """Generate bio-embeddings for a given protein sequence."""
#     if embedder is None:
#         return None
#     try:
#         embedding_protein = embedder.embed(sequence)
#         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
#         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
#     except Exception as e:
#         print(f"Embedding Error: {e}")
#         return None

# def generate_smiles(sequence, n_samples=100):
#     """Generate SMILES from a protein sequence."""
#     start_time = time.time()

#     protein_embedding = generate_bio_embeddings(sequence)
#     if protein_embedding is None:
#         return None, "Embedding generation failed!"

#     # TRAINED CVanilla_RNN_Builder MODEL LOADING
#     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)

#     # MOLECULAR GRAPH GENERATION
#     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
#     valid_samples = [sample for sample in samples if sample is not None]

#     # CONVERSION TO SMILES
#     smiles_list = [
#         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
#     ]

#     if not smiles_list:
#         return None, "No valid SMILES generated!"

#     # SAVING TO FILE
#     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     with open(filename, "w") as file:
#         file.write("\n".join(smiles_list))

#     elapsed_time = time.time() - start_time
#     return filename, elapsed_time

# @app.route("/", methods=["GET", "POST"])
# def index():
#     if request.method == "POST":
#         sequence = request.form["sequence"].strip()
#         if not sequence:
#             return render_template("index.html", message="Please enter a valid sequence.")

#         file_path, result = generate_smiles(sequence)
#         if file_path is None:
#             return render_template("index.html", message=f"Error: {result}")

#         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

#     return render_template("index.html")

# @app.route("/download")
# def download_file():
#     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
#     return send_file(file_path, as_attachment=True)

# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=8000,debug=True)

# import os
# import time
# import numpy as np
# from flask import Flask, render_template, request, send_file
# from rdkit import Chem
# from transformers import AutoModel
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
# from huggingface_hub import hf_hub_download  # Import for direct file download

# # Define directories for different models
# bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
# cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder

# # Ensure directories exist
# os.makedirs(bio_model_dir, exist_ok=True)
# os.makedirs(cvn_model_dir, exist_ok=True)

# UPLOAD_FOLDER = "Samples"
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# app = Flask(__name__)

# # Download only the required pytorch_model.bin file
# model_filename = "pytorch_model.bin"
# model_path = os.path.join(bio_model_dir, model_filename)
# if not os.path.exists(model_path):
#     print("Downloading pytorch_model.bin from Hugging Face...")
#     hf_hub_download(repo_id="Rostlab/prot_bert_bfd", filename=model_filename, local_dir=bio_model_dir)

# # Load bio-embedding model once
# embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)

# def generate_bio_embeddings(sequence):
#     """Generate bio-embeddings for a given protein sequence."""
#     try:
#         embedding_protein = embedder.embed(sequence)
#         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+
# return np.array(embedding_per_protein).reshape(1, -1)
|
658 |
+
# except Exception as e:
|
659 |
+
# print(f"Embedding Error: {e}")
|
660 |
+
# return None
|
661 |
+
|
662 |
+
# def generate_smiles(sequence, n_samples=100):
|
663 |
+
# """Generate SMILES from a protein sequence."""
|
664 |
+
# start_time = time.time()
|
665 |
+
|
666 |
+
# protein_embedding = generate_bio_embeddings(sequence)
|
667 |
+
# if protein_embedding is None:
|
668 |
+
# return None, "Embedding generation failed!"
|
669 |
+
|
670 |
+
# model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
|
671 |
+
# samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
|
672 |
+
# valid_samples = [sample for sample in samples if sample is not None]
|
673 |
+
|
674 |
+
# smiles_list = [
|
675 |
+
# Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
|
676 |
+
# ]
|
677 |
+
|
678 |
+
# if not smiles_list:
|
679 |
+
# return None, "No valid SMILES generated!"
|
680 |
+
|
681 |
+
# filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
|
682 |
+
# with open(filename, "w") as file:
|
683 |
+
# file.write("\n".join(smiles_list))
|
684 |
+
|
685 |
+
# elapsed_time = time.time() - start_time
|
686 |
+
# return filename, elapsed_time
|
687 |
+
|
688 |
+
# @app.route("/", methods=["GET", "POST"])
|
689 |
+
# def index():
|
690 |
+
# if request.method == "POST":
|
691 |
+
# sequence = request.form["sequence"].strip()
|
692 |
+
# if not sequence:
|
693 |
+
# return render_template("index.html", message="Please enter a valid sequence.")
|
694 |
+
|
695 |
+
# file_path, result = generate_smiles(sequence)
|
696 |
+
# if file_path is None:
|
697 |
+
# return render_template("index.html", message=f"Error: {result}")
|
698 |
+
|
699 |
+
# return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
|
700 |
+
|
701 |
+
# return render_template("index.html")
|
702 |
+
|
703 |
+
# @app.route("/download")
|
704 |
+
# def download_file():
|
705 |
+
# file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
|
706 |
+
# return send_file(file_path, as_attachment=True)
|
707 |
+
|
708 |
+
# if __name__ == "__main__":
|
709 |
+
# app.run(host="0.0.0.0", port=8000, debug=True)
|
710 |
+
|
711 |
+
|
712 |
+
|
713 |
+
|
714 |
+
|
715 |
+
# import os
|
716 |
+
# import time
|
717 |
+
# import requests
|
718 |
+
# import numpy as np
|
719 |
+
# import gdown # NEW: For Google Drive downloads
|
720 |
+
# from flask import Flask, render_template, request, send_file
|
721 |
+
# from rdkit import Chem
|
722 |
+
# from transformers import AutoModel
|
723 |
+
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
|
724 |
+
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
|
725 |
+
|
726 |
+
# # REPLACE WITH YOUR GOOGLE DRIVE FILE IDs
|
727 |
+
# GDRIVE_FILE_IDS = {
|
728 |
+
# "pytorch_model.bin": "11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb", # Replace with actual ID
|
729 |
+
# "config.json": "1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E",
|
730 |
+
# "tokenizer_config.json": "1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc",
|
731 |
+
# "vocab.txt": "1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w",
|
732 |
+
# "special_tokens_map.json": "1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl"
|
733 |
+
# }
|
734 |
+
|
735 |
+
# # LOCAL DIRECTORIES
|
736 |
+
# bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
|
737 |
+
# cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
|
738 |
+
|
739 |
+
# os.makedirs(bio_model_dir, exist_ok=True)
|
740 |
+
# os.makedirs(cvn_model_dir, exist_ok=True)
|
741 |
+
|
742 |
+
# os.environ["TMPDIR"] = bio_model_dir
|
743 |
+
# os.environ["TEMP"] = bio_model_dir
|
744 |
+
# os.environ["TMP"] = bio_model_dir
|
745 |
+
|
746 |
+
# UPLOAD_FOLDER = "Samples"
|
747 |
+
# os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
748 |
+
|
749 |
+
# app = Flask(__name__)
|
750 |
+
|
751 |
+
# # DOWNLOAD MODEL FILES IF MISSING
|
752 |
+
# for file_name, file_id in GDRIVE_FILE_IDS.items():
|
753 |
+
# file_path = os.path.join(bio_model_dir, file_name)
|
754 |
+
|
755 |
+
# if not os.path.exists(file_path):
|
756 |
+
# print(f"Downloading {file_name} from Google Drive...")
|
757 |
+
# gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
|
758 |
+
# print(f"{file_name} downloaded!")
|
759 |
+
|
760 |
+
# # BIO-EMBEDDING MODEL LOADING
|
761 |
+
# try:
|
762 |
+
# embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
|
763 |
+
# except Exception as e:
|
764 |
+
# print(f"Error loading ProtTrans-BERT-BFD model: {e}")
|
765 |
+
# embedder = None
|
766 |
+
|
767 |
+
# def generate_bio_embeddings(sequence):
|
768 |
+
# """Generate bio-embeddings for a given protein sequence."""
|
769 |
+
# if embedder is None:
|
770 |
+
# return None
|
771 |
+
# try:
|
772 |
+
# embedding_protein = embedder.embed(sequence)
|
773 |
+
# embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
|
774 |
+
# return np.array(embedding_per_protein).reshape(1, -1) # Reshape for model compatibility
|
775 |
+
# except Exception as e:
|
776 |
+
# print(f"Embedding Error: {e}")
|
777 |
+
# return None
|
778 |
+
|
779 |
+
# def generate_smiles(sequence, n_samples=100):
|
780 |
+
# """Generate SMILES from a protein sequence."""
|
781 |
+
# start_time = time.time()
|
782 |
+
|
783 |
+
# protein_embedding = generate_bio_embeddings(sequence)
|
784 |
+
# if protein_embedding is None:
|
785 |
+
# return None, "Embedding generation failed!"
|
786 |
+
|
787 |
+
# # LOAD TRAINED CVanilla_RNN_Builder MODEL
|
788 |
+
# model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
|
789 |
+
|
790 |
+
# # MOLECULAR GRAPH GENERATION
|
791 |
+
# samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
|
792 |
+
# valid_samples = [sample for sample in samples if sample is not None]
|
793 |
+
|
794 |
+
# # CONVERT TO SMILES
|
795 |
+
# smiles_list = [
|
796 |
+
# Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
|
797 |
+
# ]
|
798 |
+
|
799 |
+
# if not smiles_list:
|
800 |
+
# return None, "No valid SMILES generated!"
|
801 |
+
|
802 |
+
# # SAVE TO FILE
|
803 |
+
# filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
|
804 |
+
# with open(filename, "w") as file:
|
805 |
+
# file.write("\n".join(smiles_list))
|
806 |
+
|
807 |
+
# elapsed_time = time.time() - start_time
|
808 |
+
# return filename, elapsed_time
|
809 |
+
|
810 |
+
# @app.route("/", methods=["GET", "POST"])
|
811 |
+
# def index():
|
812 |
+
# if request.method == "POST":
|
813 |
+
# sequence = request.form["sequence"].strip()
|
814 |
+
# if not sequence:
|
815 |
+
# return render_template("index.html", message="Please enter a valid sequence.")
|
816 |
+
|
817 |
+
# file_path, result = generate_smiles(sequence)
|
818 |
+
# if file_path is None:
|
819 |
+
# return render_template("index.html", message=f"Error: {result}")
|
820 |
+
|
821 |
+
# return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
|
822 |
+
|
823 |
+
# return render_template("index.html")
|
824 |
+
|
825 |
+
# @app.route("/download")
|
826 |
+
# def download_file():
|
827 |
+
# file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
|
828 |
+
# return send_file(file_path, as_attachment=True)
|
829 |
+
|
830 |
+
# if __name__ == "__main__":
|
831 |
+
# app.run(host="0.0.0.0", port=8000, debug=True)
|
832 |
+
|
833 |
+
|
834 |
+
|
835 |
+
# import os
|
836 |
+
# import time
|
837 |
+
# import gdown
|
838 |
+
# import numpy as np
|
839 |
+
# from flask import Flask, render_template, request, send_file
|
840 |
+
# from rdkit import Chem
|
841 |
+
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
|
842 |
+
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
|
843 |
+
|
844 |
+
# # DIRECTORIES
|
845 |
+
# bio_model_dir = "/app/modelsBioembed"
|
846 |
+
# cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
|
847 |
+
# upload_folder = "Samples"
|
848 |
+
|
849 |
+
# # Create directories if they don't exist
|
850 |
+
# os.makedirs(bio_model_dir, exist_ok=True)
|
851 |
+
# os.makedirs(cvn_model_dir, exist_ok=True)
|
852 |
+
# os.makedirs(upload_folder, exist_ok=True)
|
853 |
+
|
854 |
+
# # Google Drive file IDs for the model files
|
855 |
+
# MODEL_FILES = {
|
856 |
+
# "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
|
857 |
+
# "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
|
858 |
+
# "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
|
859 |
+
# "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
|
860 |
+
# "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
|
861 |
+
# }
|
862 |
+
|
863 |
+
# # Function to download missing files from Google Drive
|
864 |
+
# def download_model_files():
|
865 |
+
# for filename, file_id in MODEL_FILES.items():
|
866 |
+
# file_path = os.path.join(bio_model_dir, filename)
|
867 |
+
# if not os.path.exists(file_path):
|
868 |
+
# print(f"Downloading {filename} from Google Drive...")
|
869 |
+
# gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
|
870 |
+
|
871 |
+
# # Download required model files
|
872 |
+
# download_model_files()
|
873 |
+
# print("All model files are ready!")
|
874 |
+
|
875 |
+
# # Load the ProtTrans-BERT-BFD Model
|
876 |
+
# try:
|
877 |
+
# embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
|
878 |
+
# print("ProtTrans-BERT-BFD model loaded successfully!")
|
879 |
+
# except Exception as e:
|
880 |
+
# print(f"Error loading model: {e}")
|
881 |
+
# embedder = None
|
882 |
+
|
883 |
+
# # Function to generate protein embeddings
|
884 |
+
# def generate_bio_embeddings(sequence):
|
885 |
+
# if embedder is None:
|
886 |
+
# return None
|
887 |
+
# try:
|
888 |
+
# embedding_protein = embedder.embed(sequence)
|
889 |
+
# embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
|
890 |
+
# return np.array(embedding_per_protein).reshape(1, -1)
|
891 |
+
# except Exception as e:
|
892 |
+
# print(f"Embedding Error: {e}")
|
893 |
+
# return None
|
894 |
+
|
895 |
+
# # Function to generate SMILES from a protein sequence
|
896 |
+
# def generate_smiles(sequence, n_samples=100):
|
897 |
+
# start_time = time.time()
|
898 |
+
|
899 |
+
# protein_embedding = generate_bio_embeddings(sequence)
|
900 |
+
# if protein_embedding is None:
|
901 |
+
# return None, "Embedding generation failed!"
|
902 |
+
|
903 |
+
# # Load the trained CVanilla_RNN_Builder model
|
904 |
+
# model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
|
905 |
+
|
906 |
+
# # Generate molecular graphs
|
907 |
+
# samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
|
908 |
+
# valid_samples = [sample for sample in samples if sample is not None]
|
909 |
+
|
910 |
+
# # Convert to SMILES format
|
911 |
+
# smiles_list = [
|
912 |
+
# Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
|
913 |
+
# ]
|
914 |
+
|
915 |
+
# if not smiles_list:
|
916 |
+
# return None, "No valid SMILES generated!"
|
917 |
+
|
918 |
+
# # Save SMILES to a file
|
919 |
+
# filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
|
920 |
+
# with open(filename, "w") as file:
|
921 |
+
# file.write("\n".join(smiles_list))
|
922 |
+
|
923 |
+
# elapsed_time = time.time() - start_time
|
924 |
+
# return filename, elapsed_time
|
925 |
+
|
926 |
+
# # Initialize Flask App
|
927 |
+
# app = Flask(__name__)
|
928 |
+
|
929 |
+
# @app.route("/", methods=["GET", "POST"])
|
930 |
+
# def index():
|
931 |
+
# if request.method == "POST":
|
932 |
+
# sequence = request.form["sequence"].strip()
|
933 |
+
# if not sequence:
|
934 |
+
# return render_template("index.html", message="Please enter a valid sequence.")
|
935 |
+
|
936 |
+
# file_path, result = generate_smiles(sequence)
|
937 |
+
# if file_path is None:
|
938 |
+
# return render_template("index.html", message=f"Error: {result}")
|
939 |
+
|
940 |
+
# return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
|
941 |
+
|
942 |
+
# return render_template("index.html")
|
943 |
+
|
944 |
+
# @app.route("/download")
|
945 |
+
# def download_file():
|
946 |
+
# file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
|
947 |
+
# return send_file(file_path, as_attachment=True)
|
948 |
+
|
949 |
+
# if __name__ == "__main__":
|
950 |
+
# app.run(host="0.0.0.0", port=8000)
|
951 |
+
|
952 |
+
|
953 |
+
|
954 |
+
# import os
|
955 |
+
# import time
|
956 |
+
# import requests
# import numpy as np  # needed by generate_bio_embeddings below
|
957 |
+
# from flask import Flask, render_template, request, send_file
|
958 |
+
# from rdkit import Chem
|
959 |
+
# from bio_embeddings.embed import ProtTransBertBFDEmbedder
|
960 |
+
# from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
|
961 |
+
|
962 |
+
# # DIRECTORIES
|
963 |
+
# bio_model_dir = "/app/modelsBioembed"
|
964 |
+
# cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
|
965 |
+
# upload_folder = "Samples"
|
966 |
+
|
967 |
+
# # Create directories if they don't exist
|
968 |
+
# os.makedirs(bio_model_dir, exist_ok=True)
|
969 |
+
# os.makedirs(cvn_model_dir, exist_ok=True)
|
970 |
+
# os.makedirs(upload_folder, exist_ok=True)
|
971 |
+
|
972 |
+
# # Google Drive file IDs for the model files
|
973 |
+
# MODEL_FILES = {
|
974 |
+
# "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
|
975 |
+
# "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
|
976 |
+
# "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
|
977 |
+
# "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
|
978 |
+
# "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
|
979 |
+
# }
|
980 |
+
|
981 |
+
# # Function to download a file from Google Drive
|
982 |
+
# def download_file_from_google_drive(file_id, destination):
|
983 |
+
# URL = f"https://drive.google.com/uc?export=download&id={file_id}"
|
984 |
+
# session = requests.Session()
|
985 |
+
# response = session.get(URL, stream=True)
|
986 |
+
|
987 |
+
# # Check if the request was successful
|
988 |
+
# if response.status_code == 200:
|
989 |
+
# with open(destination, "wb") as f:
|
990 |
+
# for chunk in response.iter_content(chunk_size=128):
|
991 |
+
# f.write(chunk)
|
992 |
+
# print(f"Downloaded {destination}")
|
993 |
+
# else:
|
994 |
+
# print(f"Failed to download {destination}")
|
995 |
+
|
996 |
+
# # Function to download missing files from Google Drive
|
997 |
+
# def download_model_files():
|
998 |
+
# for filename, file_id in MODEL_FILES.items():
|
999 |
+
# file_path = os.path.join(bio_model_dir, filename)
|
1000 |
+
# if not os.path.exists(file_path):
|
1001 |
+
# print(f"Downloading {filename} from Google Drive...")
|
1002 |
+
# download_file_from_google_drive(file_id, file_path)
|
1003 |
+
|
1004 |
+
# # Download required model files
|
1005 |
+
# download_model_files()
|
1006 |
+
# print("All model files are ready!")
|
1007 |
+
|
1008 |
+
# # Load the ProtTrans-BERT-BFD Model
|
1009 |
+
# try:
|
1010 |
+
# embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
|
1011 |
+
# print("ProtTrans-BERT-BFD model loaded successfully!")
|
1012 |
+
# except Exception as e:
|
1013 |
+
# print(f"Error loading model: {e}")
|
1014 |
+
# embedder = None
|
1015 |
+
|
1016 |
+
# # Function to generate protein embeddings
|
1017 |
+
# def generate_bio_embeddings(sequence):
|
1018 |
+
# if embedder is None:
|
1019 |
+
# return None
|
1020 |
+
# try:
|
1021 |
+
# embedding_protein = embedder.embed(sequence)
|
1022 |
+
# embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
|
1023 |
+
# return np.array(embedding_per_protein).reshape(1, -1)
|
1024 |
+
# except Exception as e:
|
1025 |
+
# print(f"Embedding Error: {e}")
|
1026 |
+
# return None
|
1027 |
+
|
1028 |
+
# # Function to generate SMILES from a protein sequence
|
1029 |
+
# def generate_smiles(sequence, n_samples=100):
|
1030 |
+
# start_time = time.time()
|
1031 |
+
|
1032 |
+
# protein_embedding = generate_bio_embeddings(sequence)
|
1033 |
+
# if protein_embedding is None:
|
1034 |
+
# return None, "Embedding generation failed!"
|
1035 |
+
|
1036 |
+
# # Load the trained CVanilla_RNN_Builder model
|
1037 |
+
# model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
|
1038 |
+
|
1039 |
+
# # Generate molecular graphs
|
1040 |
+
# samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
|
1041 |
+
# valid_samples = [sample for sample in samples if sample is not None]
|
1042 |
+
|
1043 |
+
# # Convert to SMILES format
|
1044 |
+
# smiles_list = [
|
1045 |
+
# Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
|
1046 |
+
# ]
|
1047 |
+
|
1048 |
+
# if not smiles_list:
|
1049 |
+
# return None, "No valid SMILES generated!"
|
1050 |
+
|
1051 |
+
# # Save SMILES to a file
|
1052 |
+
# filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
|
1053 |
+
# with open(filename, "w") as file:
|
1054 |
+
# file.write("\n".join(smiles_list))
|
1055 |
+
|
1056 |
+
# elapsed_time = time.time() - start_time
|
1057 |
+
# return filename, elapsed_time
|
1058 |
+
|
1059 |
+
# # Initialize Flask App
|
1060 |
+
# app = Flask(__name__)
|
1061 |
+
|
1062 |
+
# @app.route("/", methods=["GET", "POST"])
|
1063 |
+
# def index():
|
1064 |
+
# if request.method == "POST":
|
1065 |
+
# sequence = request.form["sequence"].strip()
|
1066 |
+
# if not sequence:
|
1067 |
+
# return render_template("index.html", message="Please enter a valid sequence.")
|
1068 |
+
|
1069 |
+
# file_path, result = generate_smiles(sequence)
|
1070 |
+
# if file_path is None:
|
1071 |
+
# return render_template("index.html", message=f"Error: {result}")
|
1072 |
+
|
1073 |
+
# return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
|
1074 |
+
|
1075 |
+
# return render_template("index.html")
|
1076 |
+
|
1077 |
+
# @app.route("/download")
|
1078 |
+
# def download_file():
|
1079 |
+
# file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
|
1080 |
+
# return send_file(file_path, as_attachment=True)
|
1081 |
+
|
1082 |
+
# if __name__ == "__main__":
|
1083 |
+
# app.run(host="0.0.0.0", port=8000)
|
1084 |
+
|
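A note on the requests-based downloader retained above: for files larger than roughly 100 MB (pytorch_model.bin included), Google Drive answers the first GET with an HTML virus-scan interstitial rather than the file itself, and the `status_code == 200` check will happily write that HTML to disk. gdown handles this handshake automatically, which is likely why the other retained versions switched to it. A minimal sketch of the confirmation-token handling; `download_large_gdrive_file` is a hypothetical helper, not part of app.py:

```python
import requests

def download_large_gdrive_file(file_id, destination):
    # Hypothetical helper: follow Google Drive's "can't scan for viruses"
    # confirmation step that large files trigger.
    url = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(url, params={"id": file_id}, stream=True)
    # For large files Drive sets a download_warning cookie instead of
    # returning the payload; re-request with its value as `confirm`.
    token = next((v for k, v in response.cookies.items()
                  if k.startswith("download_warning")), None)
    if token:
        response = session.get(url, params={"id": file_id, "confirm": token},
                               stream=True)
    response.raise_for_status()
    with open(destination, "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
            if chunk:
                f.write(chunk)
```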
modelsBioembed/.gitattributes
ADDED
@@ -0,0 +1,8 @@
1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
modelsBioembed/README.md
ADDED
@@ -0,0 +1,141 @@
1 |
+
---
|
2 |
+
language: protein
|
3 |
+
tags:
|
4 |
+
- protein language model
|
5 |
+
datasets:
|
6 |
+
- BFD
|
7 |
+
---
|
8 |
+
|
9 |
+
# ProtBert-BFD model
|
10 |
+
|
11 |
+
Pretrained model on protein sequences using a masked language modeling (MLM) objective. It was introduced in
|
12 |
+
[this paper](https://doi.org/10.1101/2020.07.12.199554) and first released in
|
13 |
+
[this repository](https://github.com/agemagician/ProtTrans). This model is trained on uppercase amino acids: it only works with capital letter amino acids.
|
14 |
+
|
15 |
+
|
16 |
+
## Model description
|
17 |
+
|
18 |
+
ProtBert-BFD is based on the Bert model, which was pretrained on a large corpus of protein sequences in a self-supervised fashion.
|
19 |
+
This means it was pretrained on the raw protein sequences only, with no humans labelling them in any way (which is why it can use lots of
|
20 |
+
publicly available data) with an automatic process to generate inputs and labels from those protein sequences.
|
21 |
+
|
22 |
+
One important difference between our Bert model and the original Bert version is the way sequences are handled: each sequence is treated as a separate document.
|
23 |
+
This means that next-sentence prediction is not used, as each sequence is a complete document on its own.
|
24 |
+
The masking follows the original Bert training, randomly masking 15% of the amino acids in the input.
|
25 |
+
|
26 |
+
In the end, the features extracted from this model revealed that the LM embeddings from unlabeled data (only protein sequences) captured important biophysical properties governing protein
|
27 |
+
shape.
|
28 |
+
This implied learning some of the grammar of the language of life realized in protein sequences.
|
29 |
+
|
30 |
+
## Intended uses & limitations
|
31 |
+
|
32 |
+
The model can be used for protein feature extraction or fine-tuned on downstream tasks.
|
33 |
+
We have noticed that on some tasks you can gain more accuracy by fine-tuning the model rather than using it as a feature extractor.
|
34 |
+
|
35 |
+
### How to use
|
36 |
+
|
37 |
+
You can use this model directly with a pipeline for masked language modeling:
|
38 |
+
|
39 |
+
```python
|
40 |
+
>>> from transformers import BertForMaskedLM, BertTokenizer, pipeline
|
41 |
+
>>> tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
|
42 |
+
>>> model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert_bfd")
|
43 |
+
>>> unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)
|
44 |
+
>>> unmasker('D L I P T S S K L V V [MASK] D T S L Q V K K A F F A L V T')
|
45 |
+
|
46 |
+
[{'score': 0.1165614128112793,
|
47 |
+
'sequence': '[CLS] D L I P T S S K L V V L D T S L Q V K K A F F A L V T [SEP]',
|
48 |
+
'token': 5,
|
49 |
+
'token_str': 'L'},
|
50 |
+
{'score': 0.08976086974143982,
|
51 |
+
'sequence': '[CLS] D L I P T S S K L V V V D T S L Q V K K A F F A L V T [SEP]',
|
52 |
+
'token': 8,
|
53 |
+
'token_str': 'V'},
|
54 |
+
{'score': 0.08864385634660721,
|
55 |
+
'sequence': '[CLS] D L I P T S S K L V V S D T S L Q V K K A F F A L V T [SEP]',
|
56 |
+
'token': 10,
|
57 |
+
'token_str': 'S'},
|
58 |
+
{'score': 0.06227643042802811,
|
59 |
+
'sequence': '[CLS] D L I P T S S K L V V A D T S L Q V K K A F F A L V T [SEP]',
|
60 |
+
'token': 6,
|
61 |
+
'token_str': 'A'},
|
62 |
+
{'score': 0.06194969266653061,
|
63 |
+
'sequence': '[CLS] D L I P T S S K L V V T D T S L Q V K K A F F A L V T [SEP]',
|
64 |
+
'token': 15,
|
65 |
+
'token_str': 'T'}]
|
66 |
+
```
|
67 |
+
|
68 |
+
Here is how to use this model to get the features of a given protein sequence in PyTorch:
|
69 |
+
|
70 |
+
```python
|
71 |
+
from transformers import BertModel, BertTokenizer
|
72 |
+
import re
|
73 |
+
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
|
74 |
+
model = BertModel.from_pretrained("Rostlab/prot_bert_bfd")
|
75 |
+
sequence_Example = "A E T C Z A O"
|
76 |
+
sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
|
77 |
+
encoded_input = tokenizer(sequence_Example, return_tensors='pt')
|
78 |
+
output = model(**encoded_input)
|
79 |
+
```
|
80 |
+
|
81 |
+
## Training data
|
82 |
+
|
83 |
+
The ProtBert-BFD model was pretrained on [BFD](https://bfd.mmseqs.com/), a dataset consisting of 2.1 billion protein sequences.
|
84 |
+
|
85 |
+
## Training procedure
|
86 |
+
|
87 |
+
### Preprocessing
|
88 |
+
|
89 |
+
The protein sequences are uppercased and tokenized using a single space and a vocabulary size of 21.
|
90 |
+
The inputs of the model are then of the form:
|
91 |
+
|
92 |
+
```
|
93 |
+
[CLS] Protein Sequence A [SEP] Protein Sequence B [SEP]
|
94 |
+
```
|
95 |
+
|
96 |
+
Furthermore, each protein sequence was treated as a separate document.
|
97 |
+
The preprocessing step was performed twice, once for a combined length (2 sequences) of less than 512 amino acids, and another time using a combined length (2 sequences) of less than 2048 amino acids.
|
98 |
+
|
99 |
+
The details of the masking procedure for each sequence followed the original Bert model, as follows:
|
100 |
+
- 15% of the amino acids are masked.
|
101 |
+
- In 80% of the cases, the masked amino acids are replaced by `[MASK]`.
|
102 |
+
- In 10% of the cases, the masked amino acids are replaced by a random amino acid different from the one they replace.
|
103 |
+
- In the 10% remaining cases, the masked amino acids are left as is.
|
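A minimal illustrative sketch of the 80/10/10 rule above (not the original training code):

```python
import random

AMINO_ACIDS = list("LAGVESIKRDTPNQFYMHCW")  # the 20 standard residues

def mask_tokens(tokens, mask_rate=0.15):
    out = list(tokens)
    for i, tok in enumerate(tokens):
        if random.random() < mask_rate:
            r = random.random()
            if r < 0.8:      # 80%: replace with [MASK]
                out[i] = "[MASK]"
            elif r < 0.9:    # 10%: replace with a different random amino acid
                out[i] = random.choice([a for a in AMINO_ACIDS if a != tok])
            # remaining 10%: leave the amino acid unchanged
    return out
```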
104 |
+
|
105 |
+
### Pretraining
|
106 |
+
|
107 |
+
The model was trained on a single TPU Pod V3-1024 for one million steps in total:
|
108 |
+
800k steps using sequence length 512 (batch size 32k), and 200k steps using sequence length 2048 (batch size 6k).
|
109 |
+
The optimizer used is Lamb with a learning rate of 0.002, a weight decay of 0.01, learning rate warmup for 140k steps and linear decay of the learning rate after.
|
110 |
+
|
111 |
+
## Evaluation results
|
112 |
+
|
113 |
+
When fine-tuned on downstream tasks, this model achieves the following results:
|
114 |
+
|
115 |
+
Test results:
|
116 |
+
|
117 |
+
| Task/Dataset | secondary structure (3-states) | secondary structure (8-states) | Localization | Membrane |
|
118 |
+
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
119 |
+
| CASP12 | 76 | 65 | | |
|
120 |
+
| TS115 | 84 | 73 | | |
|
121 |
+
| CB513 | 83 | 70 | | |
|
122 |
+
| DeepLoc | | | 78 | 91 |
|
123 |
+
|
124 |
+
### BibTeX entry and citation info
|
125 |
+
|
126 |
+
```bibtex
|
127 |
+
@article {Elnaggar2020.07.12.199554,
|
128 |
+
author = {Elnaggar, Ahmed and Heinzinger, Michael and Dallago, Christian and Rehawi, Ghalia and Wang, Yu and Jones, Llion and Gibbs, Tom and Feher, Tamas and Angerer, Christoph and Steinegger, Martin and BHOWMIK, DEBSINDHU and Rost, Burkhard},
|
129 |
+
title = {ProtTrans: Towards Cracking the Language of Life{\textquoteright}s Code Through Self-Supervised Deep Learning and High Performance Computing},
|
130 |
+
elocation-id = {2020.07.12.199554},
|
131 |
+
year = {2020},
|
132 |
+
doi = {10.1101/2020.07.12.199554},
|
133 |
+
publisher = {Cold Spring Harbor Laboratory},
|
134 |
+
abstract = {Computational biology and bioinformatics provide vast data gold-mines from protein sequences, ideal for Language Models (LMs) taken from Natural Language Processing (NLP). These LMs reach for new prediction frontiers at low inference costs. Here, we trained two auto-regressive language models (Transformer-XL, XLNet) and two auto-encoder models (Bert, Albert) on data from UniRef and BFD containing up to 393 billion amino acids (words) from 2.1 billion protein sequences (22- and 112 times the entire English Wikipedia). The LMs were trained on the Summit supercomputer at Oak Ridge National Laboratory (ORNL), using 936 nodes (total 5616 GPUs) and one TPU Pod (V3-512 or V3-1024). We validated the advantage of up-scaling LMs to larger models supported by bigger data by predicting secondary structure (3-states: Q3=76-84, 8 states: Q8=65-73), sub-cellular localization for 10 cellular compartments (Q10=74) and whether a protein is membrane-bound or water-soluble (Q2=89). Dimensionality reduction revealed that the LM-embeddings from unlabeled data (only protein sequences) captured important biophysical properties governing protein shape. This implied learning some of the grammar of the language of life realized in protein sequences. The successful up-scaling of protein LMs through HPC to larger data sets slightly reduced the gap between models trained on evolutionary information and LMs. Availability ProtTrans: \<a href="https://github.com/agemagician/ProtTrans"\>https://github.com/agemagician/ProtTrans\</a\>Competing Interest StatementThe authors have declared no competing interest.},
|
135 |
+
URL = {https://www.biorxiv.org/content/early/2020/07/21/2020.07.12.199554},
|
136 |
+
eprint = {https://www.biorxiv.org/content/early/2020/07/21/2020.07.12.199554.full.pdf},
|
137 |
+
journal = {bioRxiv}
|
138 |
+
}
|
139 |
+
```
|
140 |
+
|
141 |
+
> Created by [Ahmed Elnaggar/@Elnaggar_AI](https://twitter.com/Elnaggar_AI) | [LinkedIn](https://www.linkedin.com/in/prof-ahmed-elnaggar/)
|
modelsBioembed/config.json
ADDED
@@ -0,0 +1,16 @@
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"BertForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.0,
|
6 |
+
"hidden_act": "gelu",
|
7 |
+
"hidden_dropout_prob": 0.0,
|
8 |
+
"hidden_size": 1024,
|
9 |
+
"initializer_range": 0.02,
|
10 |
+
"intermediate_size": 4096,
|
11 |
+
"max_position_embeddings": 40000,
|
12 |
+
"num_attention_heads": 16,
|
13 |
+
"num_hidden_layers": 30,
|
14 |
+
"type_vocab_size": 2,
|
15 |
+
"vocab_size": 30
|
16 |
+
}
|
modelsBioembed/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
modelsBioembed/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"do_lower_case": false, "special_tokens_map_file": null, "full_tokenizer_file": null}
|
modelsBioembed/vocab.txt
ADDED
@@ -0,0 +1,30 @@
1 |
+
[PAD]
|
2 |
+
[UNK]
|
3 |
+
[CLS]
|
4 |
+
[SEP]
|
5 |
+
[MASK]
|
6 |
+
L
|
7 |
+
A
|
8 |
+
G
|
9 |
+
V
|
10 |
+
E
|
11 |
+
S
|
12 |
+
I
|
13 |
+
K
|
14 |
+
R
|
15 |
+
D
|
16 |
+
T
|
17 |
+
P
|
18 |
+
N
|
19 |
+
Q
|
20 |
+
F
|
21 |
+
Y
|
22 |
+
M
|
23 |
+
H
|
24 |
+
C
|
25 |
+
W
|
26 |
+
X
|
27 |
+
U
|
28 |
+
B
|
29 |
+
Z
|
30 |
+
O
|
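The 30 entries above are the 5 special tokens plus 25 residue symbols (the 20 standard amino acids and X, U, B, Z, O), matching `"vocab_size": 30` in modelsBioembed/config.json. A quick sanity check, assuming the files sit in `modelsBioembed/` as in this Space:

```python
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("modelsBioembed", do_lower_case=False)
print(len(tok.vocab))           # 30
print(tok.tokenize("M K T A"))  # ['M', 'K', 'T', 'A']
```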
models_folder/atom_types.txt
ADDED
@@ -0,0 +1,67 @@
1 |
+
P,0,0
|
2 |
+
F,0,0
|
3 |
+
O,0,1
|
4 |
+
Sn,0,0
|
5 |
+
K,0,0
|
6 |
+
S,0,2
|
7 |
+
Fe,-2,0
|
8 |
+
Cl,3,0
|
9 |
+
Zn,2,0
|
10 |
+
S,-1,0
|
11 |
+
O,-1,0
|
12 |
+
N,1,1
|
13 |
+
Cl,1,0
|
14 |
+
C,-1,0
|
15 |
+
N,1,2
|
16 |
+
Na,0,0
|
17 |
+
V,0,0
|
18 |
+
C,0,1
|
19 |
+
N,1,0
|
20 |
+
N,-1,1
|
21 |
+
Li,0,0
|
22 |
+
N,1,3
|
23 |
+
I,1,0
|
24 |
+
B,-1,0
|
25 |
+
N,0,0
|
26 |
+
Re,0,0
|
27 |
+
O,1,0
|
28 |
+
S,1,1
|
29 |
+
N,0,1
|
30 |
+
B,0,0
|
31 |
+
N,-2,0
|
32 |
+
N,0,3
|
33 |
+
F,0,1
|
34 |
+
O,0,0
|
35 |
+
S,0,0
|
36 |
+
O,0,2
|
37 |
+
Br,0,1
|
38 |
+
Ca,2,0
|
39 |
+
Se,0,0
|
40 |
+
Ru,0,0
|
41 |
+
P,0,1
|
42 |
+
B,-1,3
|
43 |
+
Fe,0,0
|
44 |
+
S,-1,1
|
45 |
+
C,1,0
|
46 |
+
C,0,0
|
47 |
+
Si,0,0
|
48 |
+
O,1,1
|
49 |
+
I,0,0
|
50 |
+
Hg,0,0
|
51 |
+
C,0,3
|
52 |
+
C,-1,1
|
53 |
+
Br,0,0
|
54 |
+
H,0,0
|
55 |
+
Na,1,0
|
56 |
+
I,0,1
|
57 |
+
Sb,0,0
|
58 |
+
Te,0,0
|
59 |
+
C,0,2
|
60 |
+
S,0,1
|
61 |
+
P,0,2
|
62 |
+
P,-1,0
|
63 |
+
Cl,0,0
|
64 |
+
P,1,0
|
65 |
+
C,-1,2
|
66 |
+
S,1,0
|
67 |
+
N,-1,0
|
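Each line above encodes one atom type as `symbol,formal_charge,num_explicit_Hs`; `MoleculeSpec` in modelstrc.py (below) indexes into this list when decoding generated graphs back into molecules. A minimal sketch of how one entry becomes an RDKit atom, mirroring `MoleculeSpec.index_to_atom`:

```python
from rdkit import Chem

symbol, charge, num_hs = "N", 1, 2  # the line "N,1,2"
atom = Chem.Atom(symbol)
atom.SetFormalCharge(charge)        # formal charge +1
atom.SetNumExplicitHs(num_hs)       # two explicit hydrogens
```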
models_folder/configs.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"N_C": 1024, "F_e": 16, "F_h": [32, 64, 128, 128, 256, 256], "F_skip": 256, "F_c": [512], "Fh_policy": 128, "activation": "relu", "rename": false, "N_rnn": 3}
|
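The field meanings here are inferred from how modelstrc.py (below) consumes them, so treat this reading as an assumption: `N_C` is the dimension of the conditioning code `c` (1024, matching the per-protein ProtTrans-BERT-BFD embedding), `F_e` the atom-embedding size, `F_h` the hidden widths of the stacked graph-convolution layers, `F_skip` and `F_c` the skip/dense layer sizes, `Fh_policy` the policy-head width, and `N_rnn` the number of stacked RNN layers.

```python
# A minimal consistency check under that assumption:
import json

with open("models_folder/configs.json") as f:
    cfg = json.load(f)
assert cfg["N_C"] == 1024  # must match the protein embedding passed via c=...
```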
modelstrc.py
ADDED
@@ -0,0 +1,1331 @@
1 |
+
import rdkit
|
2 |
+
from rdkit import Chem
|
3 |
+
from rdkit.Chem import Draw
|
4 |
+
from rdkit import DataStructs
|
5 |
+
from rdkit.Chem import AllChem
|
6 |
+
from rdkit.Chem import rdmolfiles
|
7 |
+
from rdkit.Chem.Draw import IPythonConsole
|
8 |
+
from molvs import standardize_smiles
|
9 |
+
import os
|
10 |
+
import gc
|
11 |
+
import sys
|
12 |
+
import time
|
13 |
+
import json
|
14 |
+
import math
|
15 |
+
import random
|
16 |
+
import argparse
|
17 |
+
import itertools
|
18 |
+
import numpy as np
|
19 |
+
import mxnet as mx
|
20 |
+
import pandas as pd
|
21 |
+
import networkx as nx
|
22 |
+
from scipy import sparse
|
23 |
+
from mxnet.gluon import nn
|
24 |
+
from collections import Counter
|
25 |
+
from mxnet.autograd import Function
|
26 |
+
from mxnet.gluon.data import Dataset
|
27 |
+
from mxnet import gluon, autograd, nd
|
28 |
+
from mxnet.gluon.data import DataLoader
|
29 |
+
from abc import ABCMeta, abstractmethod
|
30 |
+
from mxnet.gluon.data.sampler import Sampler
|
31 |
+
|
32 |
+
class MoleculeSpec(object):
|
33 |
+
|
34 |
+
def __init__(self, file_name='models_folder/atom_types.txt'):
|
35 |
+
self.atom_types = []
|
36 |
+
self.atom_symbols = []
|
37 |
+
with open(file_name) as f:
|
38 |
+
for line in f:
|
39 |
+
atom_type_i = line.strip('\n').split(',')
|
40 |
+
self.atom_types.append((atom_type_i[0], int(atom_type_i[1]), int(atom_type_i[2])))
|
41 |
+
if atom_type_i[0] not in self.atom_symbols:
|
42 |
+
self.atom_symbols.append(atom_type_i[0])
|
43 |
+
self.bond_orders = [Chem.BondType.AROMATIC,
|
44 |
+
Chem.BondType.SINGLE,
|
45 |
+
Chem.BondType.DOUBLE,
|
46 |
+
Chem.BondType.TRIPLE]
|
47 |
+
self.max_iter = 120
|
48 |
+
|
49 |
+
def get_atom_type(self, atom):
|
50 |
+
atom_symbol = atom.GetSymbol()
|
51 |
+
atom_charge = atom.GetFormalCharge()
|
52 |
+
atom_hs = atom.GetNumExplicitHs()
|
53 |
+
return self.atom_types.index((atom_symbol, atom_charge, atom_hs))
|
54 |
+
|
55 |
+
def get_bond_type(self, bond):
|
56 |
+
return self.bond_orders.index(bond.GetBondType())
|
57 |
+
|
58 |
+
def index_to_atom(self, idx):
|
59 |
+
atom_symbol, atom_charge, atom_hs = self.atom_types[idx]
|
60 |
+
a = Chem.Atom(atom_symbol)
|
61 |
+
a.SetFormalCharge(atom_charge)
|
62 |
+
a.SetNumExplicitHs(atom_hs)
|
63 |
+
return a
|
64 |
+
|
65 |
+
def index_to_bond(self, mol, begin_id, end_id, idx):
|
66 |
+
mol.AddBond(begin_id, end_id, self.bond_orders[idx])
|
67 |
+
|
68 |
+
@property
|
69 |
+
def num_atom_types(self):
|
70 |
+
return len(self.atom_types)
|
71 |
+
|
72 |
+
@property
|
73 |
+
def num_bond_types(self):
|
74 |
+
return len(self.bond_orders)
|
75 |
+
|
76 |
+
_mol_spec = None
|
77 |
+
|
78 |
+
def get_mol_spec():
|
79 |
+
global _mol_spec
|
80 |
+
if _mol_spec is None:
|
81 |
+
_mol_spec = MoleculeSpec()
|
82 |
+
return _mol_spec
|
83 |
+
|
84 |
+
|
85 |
+
def get_graph_from_smiles(smiles):
|
86 |
+
mol = Chem.MolFromSmiles(smiles)
|
87 |
+
|
88 |
+
# build graph
|
89 |
+
atom_types, atom_ranks, bonds, bond_types = [], [], [], []
|
90 |
+
for a, r in zip(mol.GetAtoms(), Chem.CanonicalRankAtoms(mol)):
|
91 |
+
atom_types.append(get_mol_spec().get_atom_type(a))
|
92 |
+
atom_ranks.append(r)
|
93 |
+
for b in mol.GetBonds():
|
94 |
+
idx_1, idx_2, bt = b.GetBeginAtomIdx(), b.GetEndAtomIdx(), get_mol_spec().get_bond_type(b)
|
95 |
+
bonds.append([idx_1, idx_2])
|
96 |
+
bond_types.append(bt)
|
97 |
+
|
98 |
+
# build nx graph
|
99 |
+
graph = nx.Graph()
|
100 |
+
graph.add_nodes_from(range(len(atom_types)))
|
101 |
+
graph.add_edges_from(bonds)
|
102 |
+
|
103 |
+
return graph, atom_types, atom_ranks, bonds, bond_types
|
104 |
+
|
105 |
+
|
106 |
+
def get_graph_from_smiles_list(smiles_list):
|
107 |
+
graph_list = []
|
108 |
+
for smiles in smiles_list:
|
109 |
+
mol = Chem.MolFromSmiles(smiles)
|
110 |
+
|
111 |
+
# build graph
|
112 |
+
atom_types, bonds, bond_types = [], [], []
|
113 |
+
for a in mol.GetAtoms():
|
114 |
+
atom_types.append(get_mol_spec().get_atom_type(a))
|
115 |
+
for b in mol.GetBonds():
|
116 |
+
idx_1, idx_2, bt = b.GetBeginAtomIdx(), b.GetEndAtomIdx(), get_mol_spec().get_bond_type(b)
|
117 |
+
bonds.append([idx_1, idx_2])
|
118 |
+
bond_types.append(bt)
|
119 |
+
|
120 |
+
X_0 = np.array(atom_types, dtype=np.int64)
|
121 |
+
A_0 = np.concatenate([np.array(bonds, dtype=np.int64),
|
122 |
+
np.array(bond_types, dtype=np.int64)[:, np.newaxis]],
|
123 |
+
axis=1)
|
124 |
+
graph_list.append([X_0, A_0])
|
125 |
+
return graph_list
|
126 |
+
|
127 |
+
|
128 |
+
def traverse_graph(graph, atom_ranks, current_node=None, step_ids=None, p=0.9, log_p=0.0):
|
129 |
+
if current_node is None:
|
130 |
+
next_nodes = range(len(atom_ranks))
|
131 |
+
step_ids = [-1, ] * len(next_nodes)
|
132 |
+
next_node_ranks = atom_ranks
|
133 |
+
else:
|
134 |
+
next_nodes = graph.neighbors(current_node) # get neighbor nodes
|
135 |
+
next_nodes = [n for n in next_nodes if step_ids[n] < 0] # filter visited nodes
|
136 |
+
next_node_ranks = [atom_ranks[n] for n in next_nodes] # get ranks for neighbors
|
137 |
+
next_nodes = [n for n, r in sorted(zip(next_nodes, next_node_ranks), key=lambda _x:_x[1])] # sort by rank
|
138 |
+
|
139 |
+
# iterate through neighbors
|
140 |
+
while len(next_nodes) > 0:
|
141 |
+
if len(next_nodes) == 1:
|
142 |
+
next_node = next_nodes[0]
|
143 |
+
elif random.random() >= (1 - p):
|
144 |
+
next_node = next_nodes[0]
|
145 |
+
log_p += np.log(p)
|
146 |
+
else:
|
147 |
+
next_node = next_nodes[random.randint(1, len(next_nodes) - 1)]
|
148 |
+
log_p += np.log((1.0 - p) / (len(next_nodes) - 1))
|
149 |
+
step_ids[next_node] = max(step_ids) + 1
|
150 |
+
_, log_p = traverse_graph(graph, atom_ranks, next_node, step_ids, p, log_p)
|
151 |
+
next_nodes = [n for n in next_nodes if step_ids[n] < 0] # filter visited nodes
|
152 |
+
|
153 |
+
return step_ids, log_p
|
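# Annotation (not part of the original file): traverse_graph samples one
# depth-first decoding order over the molecular graph. Candidate nodes are
# kept in canonical-rank order; the first candidate is taken with
# probability p (default 0.9), otherwise a random other candidate is used,
# and log_p accumulates the log-probability of the sampled route.
# Hypothetical usage:
#   graph, _, atom_ranks, _, _ = get_graph_from_smiles('CCO')
#   step_ids, log_p = traverse_graph(graph, atom_ranks, p=0.9)
#   # step_ids[i] is atom i's position in the sampled generation order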
154 |
+
|
155 |
+
|
156 |
+
def single_reorder(X_0, A_0, step_ids):
|
157 |
+
X_0, A_0 = np.copy(X_0), np.copy(A_0)
|
158 |
+
|
159 |
+
step_ids = np.array(step_ids, dtype=np.int64)
|
160 |
+
|
161 |
+
# sort by step_ids
|
162 |
+
sorted_ids = np.argsort(step_ids)
|
163 |
+
X_0 = X_0[sorted_ids]
|
164 |
+
A_0[:, 0], A_0[:, 1] = step_ids[A_0[:, 0]], step_ids[A_0[:, 1]]
|
165 |
+
max_b, min_b = np.amax(A_0[:, :2], axis=1), np.amin(A_0[:, :2], axis=1)
|
166 |
+
A_0 = A_0[np.lexsort([-min_b, max_b]), :]
|
167 |
+
|
168 |
+
# separate append and connect
|
169 |
+
max_b, min_b = np.amax(A_0[:, :2], axis=1), np.amin(A_0[:, :2], axis=1)
|
170 |
+
is_append = np.concatenate([np.array([True]), max_b[1:] > max_b[:-1]])
|
171 |
+
A_0 = np.concatenate([np.where(is_append[:, np.newaxis],
|
172 |
+
np.stack([min_b, max_b], axis=1),
|
173 |
+
np.stack([max_b, min_b], axis=1)),
|
174 |
+
A_0[:, -1:]], axis=1)
|
175 |
+
|
176 |
+
return X_0, A_0
|
177 |
+
|
178 |
+
|
179 |
+
def single_expand(X_0, A_0):
|
180 |
+
X_0, A_0 = np.copy(X_0), np.copy(A_0)
|
181 |
+
|
182 |
+
# expand X
|
183 |
+
is_append_iter = np.less(A_0[:, 0], A_0[:, 1]).astype(np.int64)
|
184 |
+
NX = np.cumsum(np.pad(is_append_iter, [[1, 0]], mode='constant', constant_values=1))
|
185 |
+
shift = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')[:-1])
|
186 |
+
X_index = np.arange(NX.sum(), dtype=np.int64) - np.repeat(shift, NX)
|
187 |
+
X = X_0[X_index]
|
188 |
+
|
189 |
+
# expand A
|
190 |
+
_, A_index = np.tril_indices(A_0.shape[0])
|
191 |
+
A = A_0[A_index, :]
|
192 |
+
NA = np.arange(A_0.shape[0] + 1)
|
193 |
+
|
194 |
+
# get action
|
195 |
+
# action_type, atom_type, bond_type, append_pos, connect_pos
|
196 |
+
action_type = 1 - is_append_iter
|
197 |
+
atom_type = np.where(action_type == 0, X_0[A_0[:, 1]], 0)
|
198 |
+
bond_type = A_0[:, 2]
|
199 |
+
append_pos = np.where(action_type == 0, A_0[:, 0], 0)
|
200 |
+
connect_pos = np.where(action_type == 1, A_0[:, 1], 0)
|
201 |
+
actions = np.stack([action_type, atom_type, bond_type, append_pos, connect_pos],
|
202 |
+
axis=1)
|
203 |
+
last_action = [[2, 0, 0, 0, 0]]
|
204 |
+
actions = np.append(actions, last_action, axis=0)
|
205 |
+
|
206 |
+
action_0 = np.array([X_0[0]], dtype=np.int64)
|
207 |
+
|
208 |
+
# }}}
|
209 |
+
|
210 |
+
# {{{ Get mask
|
211 |
+
last_atom_index = shift + NX - 1
|
212 |
+
last_atom_mask = np.zeros_like(X)
|
213 |
+
last_atom_mask[last_atom_index] = np.where(
|
214 |
+
np.pad(is_append_iter, [[1, 0]], mode='constant', constant_values=1) == 1,
|
215 |
+
np.ones_like(last_atom_index),
|
216 |
+
np.ones_like(last_atom_index) * 2)
|
217 |
+
# }}}
|
218 |
+
|
219 |
+
return action_0, X, NX, A, NA, actions, last_atom_mask
|
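# Annotation (not part of the original file): single_expand unrolls one
# ordered graph into stepwise training targets. At step t the partial graph
# built so far is paired with the next action row
# [action_type, atom_type, bond_type, append_pos, connect_pos], where
# action_type 0 appends a new atom, 1 connects two existing atoms, and the
# final row [2, 0, 0, 0, 0] marks termination.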
220 |
+
|
221 |
+
|
222 |
+
def get_d(A, X):
|
223 |
+
_to_sparse = lambda _A, _X: sparse.coo_matrix((np.ones([_A.shape[0] * 2], dtype=np.int64),
|
224 |
+
(np.concatenate([_A[:, 0], _A[:, 1]], axis=0),
|
225 |
+
np.concatenate([_A[:, 1], _A[:, 0]], axis=0))),
|
226 |
+
shape=[_X.shape[0], ] * 2)
|
227 |
+
A_sparse = _to_sparse(A, X)
|
228 |
+
|
229 |
+
d2 = A_sparse * A_sparse
|
230 |
+
d3 = d2 * A_sparse
|
231 |
+
|
232 |
+
# get D_2
|
233 |
+
D_2 = np.stack(d2.nonzero(), axis=1)
|
234 |
+
D_2 = D_2[D_2[:, 0] < D_2[:, 1], :]
|
235 |
+
|
236 |
+
# get D_3
|
237 |
+
D_3 = np.stack(d3.nonzero(), axis=1)
|
238 |
+
D_3 = D_3[D_3[:, 0] < D_3[:, 1], :]
|
239 |
+
|
240 |
+
# remove D_1 elements from D_3
|
241 |
+
D_3_sparse = _to_sparse(D_3, X)
|
242 |
+
D_3_sparse = D_3_sparse - D_3_sparse.multiply(A_sparse)
|
243 |
+
D_3 = np.stack(D_3_sparse.nonzero(), axis=1)
|
244 |
+
D_3 = D_3[D_3[:, 0] < D_3[:, 1], :]
|
245 |
+
|
246 |
+
return D_2, D_3
|
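# Annotation (not part of the original file): get_d derives second- and
# third-order neighbour pairs by taking powers of the sparse adjacency
# matrix (A^2 and A^3), keeping each unordered pair once; directly bonded
# pairs are subtracted out of D_3. These extra edge sets widen the
# receptive field of the graph convolutions below.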
247 |
+
|
248 |
+
|
249 |
+
def merge_single_0(X_0, A_0, NX_0, NA_0):
|
250 |
+
# shift_ids
|
251 |
+
cumsum = np.cumsum(np.pad(NX_0, [[1, 0]], mode='constant')[:-1])
|
252 |
+
A_0[:, :2] += np.stack([np.repeat(cumsum, NA_0), ] * 2, axis=1)
|
253 |
+
|
254 |
+
# get D
|
255 |
+
D_0_2, D_0_3 = get_d(A_0, X_0)
|
256 |
+
|
257 |
+
# split A
|
258 |
+
A_split = []
|
259 |
+
for i in range(get_mol_spec().num_bond_types):
|
260 |
+
A_i = A_0[A_0[:, 2] == i, :2]
|
261 |
+
A_split.append(A_i)
|
262 |
+
A_split.extend([D_0_2, D_0_3])
|
263 |
+
A_0 = A_split
|
264 |
+
|
265 |
+
# NX_rep
|
266 |
+
NX_rep_0 = np.repeat(np.arange(NX_0.shape[0]), NX_0)
|
267 |
+
|
268 |
+
return X_0, A_0, NX_0, NX_rep_0
|
269 |
+
|
270 |
+
|
271 |
+
def merge_single(X, A,
|
272 |
+
NX, NA,
|
273 |
+
mol_ids, rep_ids, iw_ids,
|
274 |
+
action_0, actions,
|
275 |
+
last_append_mask,
|
276 |
+
log_p):
|
277 |
+
X, A, NX, NX_rep = merge_single_0(X, A, NX, NA)
|
278 |
+
cumsum = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')[:-1])
|
279 |
+
actions[:, -2] += cumsum * (actions[:, 0] == 0)
|
280 |
+
actions[:, -1] += cumsum * (actions[:, 0] == 1)
|
281 |
+
mol_ids_rep = np.repeat(mol_ids, NX)
|
282 |
+
rep_ids_rep = np.repeat(rep_ids, NX)
|
283 |
+
|
284 |
+
return X, A,\
|
285 |
+
mol_ids_rep, rep_ids_rep, iw_ids,\
|
286 |
+
last_append_mask,\
|
287 |
+
NX, NX_rep,\
|
288 |
+
action_0, actions, \
|
289 |
+
log_p
|
290 |
+
|
291 |
+
def process_single(smiles, k, p):
|
292 |
+
graph, atom_types, atom_ranks, bonds, bond_types = get_graph_from_smiles(smiles)
|
293 |
+
|
294 |
+
# original
|
295 |
+
X_0 = np.array(atom_types, dtype=np.int64)
|
296 |
+
A_0 = np.concatenate([np.array(bonds, dtype=np.int64),
|
297 |
+
np.array(bond_types, dtype=np.int64)[:, np.newaxis]],
|
298 |
+
axis=1)
|
299 |
+
|
300 |
+
X, A = [], []
|
301 |
+
NX, NA = [], []
|
302 |
+
mol_ids, rep_ids, iw_ids = [], [], []
|
303 |
+
action_0, actions = [], []
|
304 |
+
last_append_mask = []
|
305 |
+
log_p = []
|
306 |
+
|
307 |
+
# random sampling decoding route
|
308 |
+
for i in range(k):
|
309 |
+
step_ids_i, log_p_i = traverse_graph(graph, atom_ranks, p=p)
|
310 |
+
X_i, A_i = single_reorder(X_0, A_0, step_ids_i)
|
311 |
+
action_0_i, X_i, NX_i, A_i, NA_i, actions_i, last_atom_mask_i = single_expand(X_i, A_i)
|
312 |
+
|
313 |
+
# appends
|
314 |
+
X.append(X_i)
|
315 |
+
A.append(A_i)
|
316 |
+
NX.append(NX_i)
|
317 |
+
NA.append(NA_i)
|
318 |
+
action_0.append(action_0_i)
|
319 |
+
actions.append(actions_i)
|
320 |
+
last_append_mask.append(last_atom_mask_i)
|
321 |
+
|
322 |
+
mol_ids.append(np.zeros_like(NX_i, dtype=np.int64))
|
323 |
+
rep_ids.append(np.ones_like(NX_i, dtype=np.int64) * i)
|
324 |
+
iw_ids.append(np.ones_like(NX_i, dtype=np.int64) * i)
|
325 |
+
|
326 |
+
log_p.append(log_p_i)
|
327 |
+
|
328 |
+
# concatenate
|
329 |
+
X = np.concatenate(X, axis=0)
|
330 |
+
A = np.concatenate(A, axis=0)
|
331 |
+
NX = np.concatenate(NX, axis=0)
|
332 |
+
NA = np.concatenate(NA, axis=0)
|
333 |
+
action_0 = np.concatenate(action_0, axis=0)
|
334 |
+
actions = np.concatenate(actions, axis=0)
|
335 |
+
last_append_mask = np.concatenate(last_append_mask, axis=0)
|
336 |
+
mol_ids = np.concatenate(mol_ids, axis=0)
|
337 |
+
rep_ids = np.concatenate(rep_ids, axis=0)
|
338 |
+
iw_ids = np.concatenate(iw_ids, axis=0)
|
339 |
+
log_p = np.array(log_p, dtype=np.float32)
|
340 |
+
|
341 |
+
return X, A, NX, NA, mol_ids, rep_ids, iw_ids, action_0, actions, last_append_mask, log_p
|
342 |
+
|
343 |
+
|
344 |
+
# noinspection PyArgumentList
|
345 |
+
def get_mol_from_graph(X, A, sanitize=True):
|
346 |
+
try:
|
347 |
+
mol = Chem.RWMol(Chem.Mol())
|
348 |
+
|
349 |
+
X, A = X.tolist(), A.tolist()
|
350 |
+
for i, atom_type in enumerate(X):
|
351 |
+
mol.AddAtom(get_mol_spec().index_to_atom(atom_type))
|
352 |
+
|
353 |
+
for atom_id1, atom_id2, bond_type in A:
|
354 |
+
get_mol_spec().index_to_bond(mol, atom_id1, atom_id2, bond_type)
|
355 |
+
except Exception:
|
356 |
+
return None
|
357 |
+
|
358 |
+
if sanitize:
|
359 |
+
try:
|
360 |
+
mol = mol.GetMol()
|
361 |
+
Chem.SanitizeMol(mol)
|
362 |
+
return mol
|
363 |
+
except Exception:
|
364 |
+
return None
|
365 |
+
else:
|
366 |
+
return mol
|
367 |
+
|
368 |
+
def get_mol_from_graph_list(graph_list, sanitize=True):
|
369 |
+
mol_list = [get_mol_from_graph(X, A, sanitize) for X, A in graph_list]
|
370 |
+
return mol_list
|
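# Hypothetical usage (mirrors generate_smiles in app.py):
#   mols = get_mol_from_graph_list(valid_samples, sanitize=True)
#   smiles_list = [Chem.MolToSmiles(m) for m in mols if m is not None]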
371 |
+
|
372 |
+
class GraphConvFn(Function):
|
373 |
+
|
374 |
+
def __init__(self, A):
|
375 |
+
self.A = A # type: nd.sparse.CSRNDArray
|
376 |
+
self.A_T = self.A # assume symmetric
|
377 |
+
super(GraphConvFn, self).__init__()
|
378 |
+
|
379 |
+
def forward(self, X):
|
380 |
+
if self.A is not None:
|
381 |
+
if len(X.shape) > 2:
|
382 |
+
X_resized = X.reshape((X.shape[0], -1))
|
383 |
+
output = nd.sparse.dot(self.A, X_resized)
|
384 |
+
output = output.reshape([-1, ] + [X.shape[i] for i in range(1, len(X.shape))])
|
385 |
+
else:
|
386 |
+
output = nd.sparse.dot(self.A, X)
|
387 |
+
return output
|
388 |
+
else:
|
389 |
+
return nd.zeros_like(X)
|
390 |
+
|
391 |
+
def backward(self, grad_output):
|
392 |
+
|
393 |
+
if self.A is not None:
|
394 |
+
if len(grad_output.shape) > 2:
|
395 |
+
grad_output_resized = grad_output.reshape((grad_output.shape[0], -1))
|
396 |
+
grad_input = nd.sparse.dot(self.A_T, grad_output_resized)
|
397 |
+
grad_input = grad_input.reshape([-1] + [grad_output.shape[i]
|
398 |
+
for i in range(1, len(grad_output.shape))])
|
399 |
+
else:
|
400 |
+
grad_input = nd.sparse.dot(self.A_T, grad_output)
|
401 |
+
return grad_input
|
402 |
+
else:
|
403 |
+
return nd.zeros_like(grad_output)
|
404 |
+
|
405 |
+
|
406 |
+
class EfficientGraphConvFn(Function):
|
407 |
+
"""Save memory by re-computation"""
|
408 |
+
|
409 |
+
def __init__(self, A_list):
|
410 |
+
self.A_list = A_list
|
411 |
+
super(EfficientGraphConvFn, self).__init__()
|
412 |
+
|
413 |
+
def forward(self, X, W):
|
414 |
+
X_list = [X]
|
415 |
+
for A in self.A_list:
|
416 |
+
if A is not None:
|
417 |
+
X_list.append(nd.sparse.dot(A, X))
|
418 |
+
else:
|
419 |
+
X_list.append(nd.zeros_like(X))
|
420 |
+
X_out = nd.concat(*X_list, dim=1)
|
421 |
+
self.save_for_backward(X, W)
|
422 |
+
|
423 |
+
return nd.dot(X_out, W)
|
424 |
+
|
425 |
+
def backward(self, grad_output):
|
426 |
+
X, W = self.saved_tensors
|
427 |
+
|
428 |
+
# recompute X_out
|
429 |
+
X_list = [X, ]
|
430 |
+
for A in self.A_list:
|
431 |
+
if A is not None:
|
432 |
+
X_list.append(nd.sparse.dot(A, X))
|
433 |
+
else:
|
434 |
+
X_list.append(nd.zeros_like(X))
|
435 |
+
X_out = nd.concat(*X_list, dim=1)
|
436 |
+
|
437 |
+
grad_W = nd.dot(X_out.T, grad_output)
|
438 |
+
|
439 |
+
grad_X_out = nd.dot(grad_output, W.T)
|
440 |
+
grad_X_out_list = nd.split(grad_X_out, num_outputs=len(self.A_list) + 1)
|
441 |
+
|
442 |
+
|
443 |
+
grad_X = [grad_X_out_list[0], ]
|
444 |
+
for A, grad_X_out in zip(self.A_list, grad_X_out_list[1:]):
|
445 |
+
if A is not None:
|
446 |
+
grad_X.append(nd.sparse.dot(A, grad_X_out))
|
447 |
+
else:
|
448 |
+
grad_X.append(nd.zeros_like(grad_X_out))
|
449 |
+
|
450 |
+
grad_X = sum(grad_X)
|
451 |
+
|
452 |
+
return grad_X, grad_W
|
453 |
+
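EfficientGraphConvFn computes Y = concat(X, A_1 X, ..., A_D X) . W but, like gradient checkpointing, discards the concatenated activations and rebuilds them in backward, trading compute for memory. A minimal shape sketch, assuming a single support (D = 1):

# hypothetical: 3 nodes, F_in = 4, F_out = 5, D = 1 support
A = nd.sparse.csr_matrix(([1.0, 1.0], ([0, 1], [1, 0])), shape=(3, 3))
X = nd.random.normal(shape=(3, 4))
W = nd.random.normal(shape=(4 * 2, 5))     # F_in * (D + 1) input rows
Y = EfficientGraphConvFn([A])(X, W)        # Y has shape (3, 5)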
|
454 |
+
|
455 |
+
class SegmentSumFn(GraphConvFn):
|
456 |
+
|
457 |
+
def __init__(self, idx, num_seg):
|
458 |
+
# build A
|
459 |
+
# construct coo
|
460 |
+
data = nd.ones(idx.shape[0], ctx=idx.context, dtype='int64')
|
461 |
+
row, col = idx, nd.arange(idx.shape[0], ctx=idx.context, dtype='int64')
|
462 |
+
shape = (num_seg, int(idx.shape[0]))
|
463 |
+
sparse = nd.sparse.csr_matrix((data, (row, col)), shape=shape,
|
464 |
+
ctx=idx.context, dtype='float32')
|
465 |
+
super(SegmentSumFn, self).__init__(sparse)
|
466 |
+
|
467 |
+
sparse = nd.sparse.csr_matrix((data, (col, row)), shape=(shape[1], shape[0]),
|
468 |
+
ctx=idx.context, dtype='float32')
|
469 |
+
self.A_T = sparse
|
470 |
+
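SegmentSumFn realizes a segment sum (as in TensorFlow's segment_sum) through a sparse indicator matrix, with the explicitly built transpose providing the gather in backward. A sketch, assuming idx maps each row of X to its segment:

idx = nd.array([0, 0, 1], dtype='int64')
X = nd.array([[1, 2], [3, 4], [5, 6]])
out = SegmentSumFn(idx, num_seg=2)(X)   # [[4, 6], [5, 6]]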
|
471 |
+
|
472 |
+
def squeeze(input, axis):
|
473 |
+
assert input.shape[axis] == 1
|
474 |
+
|
475 |
+
new_shape = list(input.shape)
|
476 |
+
del new_shape[axis]
|
477 |
+
|
478 |
+
return input.reshape(new_shape)
|
479 |
+
|
480 |
+
|
481 |
+
def unsqueeze(input, axis):
|
482 |
+
return nd.expand_dims(input, axis=axis)
|
483 |
+
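These two helpers mirror PyTorch's squeeze/unsqueeze semantics on mx.nd arrays:

x = nd.ones((3, 1, 4))
squeeze(x, axis=1).shape     # (3, 4)
unsqueeze(x, axis=0).shape   # (1, 3, 1, 4)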
|
484 |
+
|
485 |
+
def logsumexp(inputs, axis=None, keepdims=False):
|
486 |
+
"""Numerically stable logsumexp.
|
487 |
+
Args:
|
488 |
+
inputs: An NDArray of any shape.
|
489 |
+
axis: An integer.
|
490 |
+
keepdims: A boolean.
|
491 |
+
Returns:
|
492 |
+
Equivalent of log(sum(exp(inputs), axis=axis, keepdims=keepdims)).
|
493 |
+
Adapted from: https://github.com/pytorch/pytorch/issues/2591
|
494 |
+
"""
|
495 |
+
# For a 1-D array x (any array along a single dimension),
|
496 |
+
# log sum exp(x) = s + log sum exp(x - s)
|
497 |
+
# with s = max(x) being a common choice.
|
498 |
+
if axis is None:
|
499 |
+
inputs = inputs.reshape([-1])
|
500 |
+
axis = 0
|
501 |
+
s = nd.max(inputs, axis=axis, keepdims=True)
|
502 |
+
outputs = s + (inputs - s).exp().sum(axis=axis, keepdims=True).log()
|
503 |
+
if not keepdims:
|
504 |
+
outputs = nd.sum(outputs, axis=axis, keepdims=False)
|
505 |
+
return outputs
|
506 |
+
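A quick numerical check of the max-shift identity used above; the naive form overflows where this one does not:

x = nd.array([1000.0, 1000.0])
# nd.log(nd.exp(x).sum()) would overflow to inf
logsumexp(x)   # ~1000.6931 == 1000 + log(2)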
|
507 |
+
|
508 |
+
def get_activation(name):
|
509 |
+
activation_dict = {
|
510 |
+
'relu':nd.relu,
|
511 |
+
'tanh':nd.tanh
|
512 |
+
}
|
513 |
+
return activation_dict[name]
|
514 |
+
|
515 |
+
|
516 |
+
|
517 |
+
class Linear_BN(nn.Sequential):
|
518 |
+
def __init__(self, F_in, F_out):
|
519 |
+
super(Linear_BN, self).__init__()
|
520 |
+
self.add(nn.Dense(F_out, in_units=F_in, use_bias=False))
|
521 |
+
self.add(BatchNorm(in_channels=F_out))
|
522 |
+
|
523 |
+
|
524 |
+
class GraphConv(nn.Block):
|
525 |
+
|
526 |
+
def __init__(self, Fin, Fout, D):
|
527 |
+
super(GraphConv, self).__init__()
|
528 |
+
|
529 |
+
# model settings
|
530 |
+
self.Fin = Fin
|
531 |
+
self.Fout = Fout
|
532 |
+
self.D = D
|
533 |
+
|
534 |
+
# model parameters
|
535 |
+
self.W = self.params.get('w', shape=(self.Fin * (self.D + 1), self.Fout),
|
536 |
+
init=None, allow_deferred_init=False)
|
537 |
+
|
538 |
+
def forward(self, X, A_list):
|
539 |
+
try:
|
540 |
+
assert len(A_list) == self.D
|
541 |
+
except AssertionError as e:
|
542 |
+
print(self.D, len(A_list))
|
543 |
+
raise e
|
544 |
+
return EfficientGraphConvFn(A_list)(X, self.W.data(X.context))
|
545 |
+
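Wiring a single layer looks roughly like this (a sketch; the supports are assumed to be the per-bond-type adjacencies plus the distance matrices built elsewhere in this file):

layer = GraphConv(Fin=16, Fout=32, D=6)
layer.initialize(mx.init.Xavier())
# forward takes X of shape (N, 16) and a list of D sparse (N, N) supports;
# entries may be None for supports with no edges (handled as zeros)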
|
546 |
+
|
547 |
+
class Policy(nn.Block):
|
548 |
+
|
549 |
+
def __init__(self, F_in, F_h, N_A, N_B, k=1):
|
550 |
+
super(Policy, self).__init__()
|
551 |
+
self.F_in = F_in # number of input features for each atom
|
552 |
+
self.F_h = F_h # number of context variables
|
553 |
+
self.N_A = N_A # number of atom types
|
554 |
+
self.N_B = N_B # number of bond types
|
555 |
+
self.k = k # number of softmax used in the mixture
|
556 |
+
|
557 |
+
|
558 |
+
with self.name_scope():
|
559 |
+
self.linear_h = Linear_BN(F_in * 2, self.F_h * k)
|
560 |
+
self.linear_h_t = Linear_BN(F_in, self.F_h * k)
|
561 |
+
|
562 |
+
self.linear_x = nn.Dense(self.N_B + self.N_B*self.N_A, in_units=self.F_h)
|
563 |
+
self.linear_x_t = nn.Dense(1, in_units=self.F_h)
|
564 |
+
|
565 |
+
if self.k > 1:
|
566 |
+
self.linear_pi = nn.Dense(self.k, in_units=self.F_in)
|
567 |
+
else:
|
568 |
+
self.linear_pi = None
|
569 |
+
|
570 |
+
def forward(self, X, NX, NX_rep, X_end=None):
|
571 |
+
# segment mean for X
|
572 |
+
if X_end is None:
|
573 |
+
X_end = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')
|
574 |
+
X = nd.concat(X, X_end[NX_rep, :], dim=1)
|
575 |
+
|
576 |
+
X_h = nd.relu(self.linear_h(X)).reshape([-1, self.F_h])
|
577 |
+
X_h_end = nd.relu(self.linear_h_t(X_end)).reshape([-1, self.F_h])
|
578 |
+
|
579 |
+
X_x = nd.exp(self.linear_x(X_h)).reshape([-1, self.k, self.N_B + self.N_B*self.N_A])
|
580 |
+
X_x_end = nd.exp(self.linear_x_t(X_h_end)).reshape([-1, self.k, 1])
|
581 |
+
|
582 |
+
X_sum = nd.sum(SegmentSumFn(NX_rep, NX.shape[0])(X_x), -1, keepdims=True) + X_x_end
|
583 |
+
X_sum_gathered = X_sum[NX_rep, :, :]
|
584 |
+
|
585 |
+
X_softmax = X_x / X_sum_gathered
|
586 |
+
X_softmax_end = X_x_end / X_sum
|
587 |
+
|
588 |
+
if self.k > 1:
|
589 |
+
pi = unsqueeze(nd.softmax(self.linear_pi(X_end), axis=1), -1)
|
590 |
+
pi_gathered = pi[NX_rep, :, :]
|
591 |
+
|
592 |
+
X_softmax = nd.sum(X_softmax * pi_gathered, axis=1)
|
593 |
+
X_softmax_end = nd.sum(X_softmax_end * pi, axis=1)
|
594 |
+
else:
|
595 |
+
X_softmax = squeeze(X_softmax, 1)
|
596 |
+
X_softmax_end = squeeze(X_softmax_end, 1)
|
597 |
+
|
598 |
+
# generate output probabilities
|
599 |
+
connect, append = X_softmax[:, :self.N_B], X_softmax[:, self.N_B:]
|
600 |
+
append = append.reshape([-1, self.N_A, self.N_B])
|
601 |
+
end = squeeze(X_softmax_end, -1)
|
602 |
+
|
603 |
+
return append, connect, end
|
604 |
+
|
605 |
+
|
606 |
+
class BatchNorm(nn.Block):
|
607 |
+
|
608 |
+
def __init__(self, in_channels, momentum=0.9, eps=1e-5):
|
609 |
+
super(BatchNorm, self).__init__()
|
610 |
+
self.F = in_channels
|
611 |
+
|
612 |
+
self.bn_weight = self.params.get('bn_weight', shape=(self.F,), init=mx.init.One(),
|
613 |
+
allow_deferred_init=False)
|
614 |
+
self.bn_bias = self.params.get('bn_bias', shape=(self.F,), init=mx.init.Zero(),
|
615 |
+
allow_deferred_init=False)
|
616 |
+
|
617 |
+
self.running_mean = self.params.get('running_mean', grad_req='null',
|
618 |
+
shape=(self.F,),
|
619 |
+
init=mx.init.Zero(),
|
620 |
+
allow_deferred_init=False,
|
621 |
+
differentiable=False)
|
622 |
+
self.running_var = self.params.get('running_var', grad_req='null',
|
623 |
+
shape=(self.F,),
|
624 |
+
init=mx.init.One(),
|
625 |
+
allow_deferred_init=False,
|
626 |
+
differentiable=False)
|
627 |
+
self.momentum = momentum
|
628 |
+
self.eps = eps
|
629 |
+
|
630 |
+
def forward(self, x):
|
631 |
+
if autograd.is_training():
|
632 |
+
return nd.BatchNorm(x,
|
633 |
+
gamma=self.bn_weight.data(x.context),
|
634 |
+
beta=self.bn_bias.data(x.context),
|
635 |
+
moving_mean=self.running_mean.data(x.context),
|
636 |
+
moving_var=self.running_var.data(x.context),
|
637 |
+
eps=self.eps, momentum=self.momentum,
|
638 |
+
use_global_stats=False)
|
639 |
+
else:
|
640 |
+
return nd.BatchNorm(x,
|
641 |
+
gamma=self.bn_weight.data(x.context),
|
642 |
+
beta=self.bn_bias.data(x.context),
|
643 |
+
moving_mean=self.running_mean.data(x.context),
|
644 |
+
moving_var=self.running_var.data(x.context),
|
645 |
+
eps=self.eps, momentum=self.momentum,
|
646 |
+
use_global_stats=True)
|
647 |
+
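This custom block exists so that train/eval behaviour follows the autograd mode rather than a Gluon flag; a sketch, assuming a CPU context:

bn = BatchNorm(in_channels=8)
bn.initialize()
x = nd.random.normal(shape=(4, 8))
with autograd.record():
    y_train = bn(x)    # normalizes with batch statistics
y_eval = bn(x)         # normalizes with the stored running statistics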
|
648 |
+
class MoleculeGenerator(nn.Block, metaclass=ABCMeta):
|
649 |
+
|
650 |
+
|
651 |
+
|
652 |
+
def __init__(self, N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
|
653 |
+
*args, **kwargs):
|
654 |
+
super(MoleculeGenerator, self).__init__()
|
655 |
+
self.N_A = N_A
|
656 |
+
self.N_B = N_B
|
657 |
+
self.D = D
|
658 |
+
self.F_e = F_e
|
659 |
+
self.F_skip = F_skip
|
660 |
+
self.F_c = list(F_c) if isinstance(F_c, tuple) else F_c
|
661 |
+
self.Fh_policy = Fh_policy
|
662 |
+
self.activation = get_activation(activation)
|
663 |
+
|
664 |
+
with self.name_scope():
|
665 |
+
# embeddings
|
666 |
+
self.embedding_atom = nn.Embedding(self.N_A, self.F_e)
|
667 |
+
self.embedding_mask = nn.Embedding(3, self.F_e)
|
668 |
+
|
669 |
+
# graph conv
|
670 |
+
self._build_graph_conv(*args, **kwargs)
|
671 |
+
|
672 |
+
# fully connected
|
673 |
+
self.dense = nn.Sequential()
|
674 |
+
for i, (f_in, f_out) in enumerate(zip([self.F_skip, ] + self.F_c[:-1], self.F_c)):
|
675 |
+
self.dense.add(Linear_BN(f_in, f_out))
|
676 |
+
|
677 |
+
# policy
|
678 |
+
self.policy_0 = self.params.get('policy_0', shape=[self.N_A, ],
|
679 |
+
init=mx.init.Zero(),
|
680 |
+
allow_deferred_init=False)
|
681 |
+
self.policy_h = Policy(self.F_c[-1], self.Fh_policy, self.N_A, self.N_B)
|
682 |
+
|
683 |
+
self.mode = 'loss'
|
684 |
+
|
685 |
+
@abstractmethod
|
686 |
+
def _build_graph_conv(self, *args, **kwargs):
|
687 |
+
raise NotImplementedError
|
688 |
+
|
689 |
+
@abstractmethod
|
690 |
+
def _graph_conv_forward(self, X, A):
|
691 |
+
raise NotImplementedError
|
692 |
+
|
693 |
+
def _policy_0(self, ctx):
|
694 |
+
policy_0 = nd.exp(self.policy_0.data(ctx))
|
695 |
+
policy_0 = policy_0/policy_0.sum()
|
696 |
+
return policy_0
|
697 |
+
|
698 |
+
def _policy(self, X, A, NX, NX_rep, last_append_mask):
|
699 |
+
# get initial embedding
|
700 |
+
X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
|
701 |
+
|
702 |
+
# convolution
|
703 |
+
X = self._graph_conv_forward(X, A)
|
704 |
+
|
705 |
+
# linear
|
706 |
+
X = self.dense(X)
|
707 |
+
|
708 |
+
# policy
|
709 |
+
append, connect, end = self.policy_h(X, NX, NX_rep)
|
710 |
+
|
711 |
+
return append, connect, end
|
712 |
+
|
713 |
+
def _likelihood(self, init, append, connect, end,
|
714 |
+
action_0, actions, iw_ids, log_p_sigma,
|
715 |
+
batch_size, iw_size):
|
716 |
+
|
717 |
+
# decompose action:
|
718 |
+
action_type, node_type, edge_type, append_pos, connect_pos = \
|
719 |
+
actions[:, 0], actions[:, 1], actions[:, 2], actions[:, 3], actions[:, 4]
|
720 |
+
_log_mask = lambda _x, _mask: _mask * nd.log(_x + 1e-10) + (1 - _mask) * nd.zeros_like(_x)
|
721 |
+
|
722 |
+
# init
|
723 |
+
init = init.reshape([batch_size * iw_size, self.N_A])
|
724 |
+
index = nd.stack(nd.arange(action_0.shape[0], ctx=action_0.context, dtype='int64'), action_0, axis=0)
|
725 |
+
loss_init = nd.log(nd.gather_nd(init, index) + 1e-10)
|
726 |
+
|
727 |
+
# end
|
728 |
+
loss_end = _log_mask(end, nd.cast(action_type == 2, 'float32'))
|
729 |
+
|
730 |
+
# append
|
731 |
+
index = nd.stack(append_pos, node_type, edge_type, axis=0)
|
732 |
+
loss_append = _log_mask(nd.gather_nd(append, index), nd.cast(action_type == 0, 'float32'))
|
733 |
+
|
734 |
+
# connect
|
735 |
+
index = nd.stack(connect_pos, edge_type, axis=0)
|
736 |
+
loss_connect = _log_mask(nd.gather_nd(connect, index), nd.cast(action_type == 1, 'float32'))
|
737 |
+
|
738 |
+
# sum up results
|
739 |
+
log_p_x = loss_end + loss_append + loss_connect
|
740 |
+
log_p_x = squeeze(SegmentSumFn(iw_ids, batch_size*iw_size)(unsqueeze(log_p_x, -1)), -1)
|
741 |
+
log_p_x = log_p_x + loss_init
|
742 |
+
|
743 |
+
# reshape
|
744 |
+
log_p_x = log_p_x.reshape([batch_size, iw_size])
|
745 |
+
log_p_sigma = log_p_sigma.reshape([batch_size, iw_size])
|
746 |
+
l = log_p_x - log_p_sigma
|
747 |
+
l = logsumexp(l, axis=1) - math.log(float(iw_size))
|
748 |
+
return l
|
749 |
+
|
750 |
+
def forward(self, *input):
|
751 |
+
if self.mode=='loss' or self.mode=='likelihood':
|
752 |
+
X, A, iw_ids, last_append_mask, \
|
753 |
+
NX, NX_rep, action_0, actions, log_p, \
|
754 |
+
batch_size, iw_size = input
|
755 |
+
|
756 |
+
init = self._policy_0(X.context).tile([batch_size * iw_size, 1])
|
757 |
+
append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask)
|
758 |
+
l = self._likelihood(init, append, connect, end, action_0, actions, iw_ids, log_p, batch_size, iw_size)
|
759 |
+
if self.mode=='likelihood':
|
760 |
+
return l
|
761 |
+
else:
|
762 |
+
return -l.mean()
|
763 |
+
elif self.mode == 'decode_0':
|
764 |
+
return self._policy_0(input[0])
|
765 |
+
elif self.mode == 'decode_step':
|
766 |
+
X, A, NX, NX_rep, last_append_mask = input
|
767 |
+
return self._policy(X, A, NX, NX_rep, last_append_mask)
|
768 |
+
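_likelihood above is an importance-weighted estimate: with K = iw_size decoding routes per molecule, log p(x) is approximated by logsumexp_k(log p(x, route_k) - log q(route_k)) - log K. A tiny numeric sketch of that last step, assuming K = 2:

log_p_x   = nd.array([[-3.0, -2.5]])   # joint log-prob of x under 2 routes
log_sigma = nd.array([[-1.0, -1.2]])   # log-prob of each route under q
l = logsumexp(log_p_x - log_sigma, axis=1) - math.log(2.0)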
|
769 |
+
|
770 |
+
class MoleculeGenerator_RNN(MoleculeGenerator):
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
def __init__(self, N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
|
775 |
+
N_rnn, *args, **kwargs):
|
776 |
+
super(MoleculeGenerator_RNN, self).__init__(N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
|
777 |
+
*args, **kwargs)
|
778 |
+
self.N_rnn = N_rnn
|
779 |
+
|
780 |
+
with self.name_scope():
|
781 |
+
self.rnn = gluon.rnn.GRU(hidden_size=self.F_c[-1],
|
782 |
+
num_layers=self.N_rnn,
|
783 |
+
layout='NTC', input_size=self.F_c[-1] * 2)
|
784 |
+
|
785 |
+
def _rnn_train(self, X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum):
|
786 |
+
X_avg = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')
|
787 |
+
X_curr = nd.take(X, indices=NX_cum-1)
|
788 |
+
X = nd.concat(X_avg, X_curr, dim=1)
|
789 |
+
|
790 |
+
# rnn
|
791 |
+
X = nd.take(X, indices=graph_to_rnn) # batch_size, iw_size, length, num_features
|
792 |
+
batch_size, iw_size, length, num_features = X.shape
|
793 |
+
X = X.reshape([batch_size*iw_size, length, num_features])
|
794 |
+
X = self.rnn(X)
|
795 |
+
|
796 |
+
X = X.reshape([batch_size, iw_size, length, -1])
|
797 |
+
X = nd.gather_nd(X, indices=rnn_to_graph)
|
798 |
+
|
799 |
+
return X
|
800 |
+
|
801 |
+
def _rnn_test(self, X, NX, NX_rep, NX_cum, h):
|
802 |
+
# note: one partition for one molecule
|
803 |
+
X_avg = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')
|
804 |
+
X_curr = nd.take(X, indices=NX_cum - 1)
|
805 |
+
X = nd.concat(X_avg, X_curr, dim=1) # size: [NX, F_in * 2]
|
806 |
+
|
807 |
+
# rnn
|
808 |
+
X = unsqueeze(X, axis=1)
|
809 |
+
X, h = self.rnn(X, h)
|
810 |
+
|
811 |
+
X = squeeze(X, axis=1)
|
812 |
+
return X, h
|
813 |
+
|
814 |
+
def _policy(self, X, A, NX, NX_rep, last_append_mask, graph_to_rnn, rnn_to_graph, NX_cum):
|
815 |
+
# get initial embedding
|
816 |
+
X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
|
817 |
+
|
818 |
+
# convolution
|
819 |
+
X = self._graph_conv_forward(X, A)
|
820 |
+
|
821 |
+
# linear
|
822 |
+
X = self.dense(X)
|
823 |
+
|
824 |
+
# rnn
|
825 |
+
X_mol = self._rnn_train(X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum)
|
826 |
+
|
827 |
+
# policy
|
828 |
+
append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
|
829 |
+
|
830 |
+
return append, connect, end
|
831 |
+
|
832 |
+
def _decode_step(self, X, A, NX, NX_rep, last_append_mask, NX_cum, h):
|
833 |
+
# get initial embedding
|
834 |
+
X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
|
835 |
+
|
836 |
+
# convolution
|
837 |
+
X = self._graph_conv_forward(X, A)
|
838 |
+
|
839 |
+
# linear
|
840 |
+
X = self.dense(X)
|
841 |
+
|
842 |
+
# rnn
|
843 |
+
X_mol, h = self._rnn_test(X, NX, NX_rep, NX_cum, h)
|
844 |
+
|
845 |
+
# policy
|
846 |
+
append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
|
847 |
+
|
848 |
+
return append, connect, end, h
|
849 |
+
|
850 |
+
def forward(self, *input):
|
851 |
+
if self.mode=='loss' or self.mode=='likelihood':
|
852 |
+
X, A, iw_ids, last_append_mask, \
|
853 |
+
NX, NX_rep, action_0, actions, log_p, \
|
854 |
+
batch_size, iw_size, \
|
855 |
+
graph_to_rnn, rnn_to_graph, NX_cum = input
|
856 |
+
|
857 |
+
init = self._policy_0(X.context).tile([batch_size * iw_size, 1])
|
858 |
+
append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask, graph_to_rnn, rnn_to_graph, NX_cum)
|
859 |
+
l = self._likelihood(init, append, connect, end, action_0, actions, iw_ids, log_p, batch_size, iw_size)
|
860 |
+
if self.mode=='likelihood':
|
861 |
+
return l
|
862 |
+
else:
|
863 |
+
return -l.mean()
|
864 |
+
elif self.mode == 'decode_0':
|
865 |
+
return self._policy_0(input[0])
|
866 |
+
elif self.mode == 'decode_step':
|
867 |
+
X, A, NX, NX_rep, last_append_mask, NX_cum, h = input
|
868 |
+
return self._decode_step(X, A, NX, NX_rep, last_append_mask, NX_cum, h)
|
869 |
+
else:
|
870 |
+
raise ValueError
|
871 |
+
|
872 |
+
class _TwoLayerDense(nn.Block):
|
873 |
+
|
874 |
+
def __init__(self, input_size, hidden_size, output_size):
|
875 |
+
super(_TwoLayerDense, self).__init__()
|
876 |
+
|
877 |
+
self.hidden_size = hidden_size
|
878 |
+
self.output_size = output_size
|
879 |
+
self.input_size = input_size
|
880 |
+
|
881 |
+
with self.name_scope():
|
882 |
+
# config 1
|
883 |
+
self.input = nn.Dense(self.hidden_size, use_bias=False, in_units=self.input_size)
|
884 |
+
self.bn_input = BatchNorm(in_channels=hidden_size)
|
885 |
+
self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.hidden_size)
|
886 |
+
|
887 |
+
# config 2
|
888 |
+
#self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.input_size)
|
889 |
+
|
890 |
+
# config 3
|
891 |
+
#self.input1 = nn.Dense(self.hidden_size, use_bias=False, in_units=self.input_size)
|
892 |
+
#self.bn_input1 = BatchNorm(in_channels=self.hidden_size)
|
893 |
+
#self.input2 = nn.Dense(self.hidden_size, use_bias=False, in_units=self.hidden_size)
|
894 |
+
#self.bn_input2 = BatchNorm(in_channels=self.hidden_size)
|
895 |
+
#self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.hidden_size)
|
896 |
+
|
897 |
+
# config 4
|
898 |
+
#self.bn_input = BatchNorm(in_channels=self.input_size)
|
899 |
+
#self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.input_size)
|
900 |
+
|
901 |
+
# config 5
|
902 |
+
#self.bn_input = BatchNorm(in_channels=1024)
|
903 |
+
#self.output = nn.Dense(self.output_size, use_bias=True, in_units=1024)
|
904 |
+
|
905 |
+
|
906 |
+
def forward(self, c):
|
907 |
+
# config 1
|
908 |
+
return nd.softmax(self.output(nd.relu(self.bn_input(self.input(c)))), axis=-1)
|
909 |
+
|
910 |
+
# config 2
|
911 |
+
#return nd.softmax(self.output(c), axis=-1)
|
912 |
+
|
913 |
+
# config 3
|
914 |
+
#return nd.softmax(self.output(nd.relu(self.bn_input2(self.input2(nd.relu(self.bn_input1(self.input1(c))))))), axis=-1)
|
915 |
+
|
916 |
+
# config 4
|
917 |
+
#return nd.softmax(self.output(nd.relu(self.bn_input(c))), axis=-1)
|
918 |
+
|
919 |
+
# config 5
|
920 |
+
#return nd.softmax(self.output(c), axis=-1)
|
921 |
+
|
922 |
+
|
923 |
+
class CMoleculeGenerator_RNN(MoleculeGenerator_RNN):
|
924 |
+
|
925 |
+
|
926 |
+
def __init__(self, N_A, N_B, N_C, D,
|
927 |
+
F_e, F_skip, F_c, Fh_policy,
|
928 |
+
activation, N_rnn,
|
929 |
+
*args, **kwargs):
|
930 |
+
self.N_C = N_C # number of conditional variables
|
931 |
+
super(CMoleculeGenerator_RNN, self).__init__(N_A, N_B, D,
|
932 |
+
F_e, F_skip, F_c, Fh_policy,
|
933 |
+
activation, N_rnn,
|
934 |
+
*args, **kwargs)
|
935 |
+
with self.name_scope():
|
936 |
+
self.dense_policy_0 = _TwoLayerDense(self.N_C, self.N_A * 3, self.N_A)
|
937 |
+
|
938 |
+
@abstractmethod
|
939 |
+
def _graph_conv_forward(self, X, A, c, ids):
|
940 |
+
raise NotImplementedError
|
941 |
+
|
942 |
+
def _policy_0(self, c):
|
943 |
+
return self.dense_policy_0(c) + 0.0 * self.policy_0.data(c.context)  # the zero term keeps the unused policy_0 parameter in the autograd graph
|
944 |
+
|
945 |
+
def _policy(self, X, A, NX, NX_rep, last_append_mask,
|
946 |
+
graph_to_rnn, rnn_to_graph, NX_cum,
|
947 |
+
c, ids):
|
948 |
+
# get initial embedding
|
949 |
+
X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
|
950 |
+
|
951 |
+
# convolution
|
952 |
+
X = self._graph_conv_forward(X, A, c, ids)
|
953 |
+
|
954 |
+
# linear
|
955 |
+
X = self.dense(X)
|
956 |
+
|
957 |
+
# rnn
|
958 |
+
X_mol = self._rnn_train(X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum)
|
959 |
+
|
960 |
+
# policy
|
961 |
+
append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
|
962 |
+
|
963 |
+
return append, connect, end
|
964 |
+
|
965 |
+
def _decode_step(self, X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids):
|
966 |
+
# get initial embedding
|
967 |
+
X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
|
968 |
+
|
969 |
+
# convolution
|
970 |
+
X = self._graph_conv_forward(X, A, c, ids)
|
971 |
+
|
972 |
+
# linear
|
973 |
+
X = self.dense(X)
|
974 |
+
|
975 |
+
# rnn
|
976 |
+
X_mol, h = self._rnn_test(X, NX, NX_rep, NX_cum, h)
|
977 |
+
|
978 |
+
# policy
|
979 |
+
append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
|
980 |
+
|
981 |
+
return append, connect, end, h
|
982 |
+
|
983 |
+
|
984 |
+
def forward(self, *input):
|
985 |
+
if self.mode=='loss' or self.mode=='likelihood':
|
986 |
+
X, A, iw_ids, last_append_mask, \
|
987 |
+
NX, NX_rep, action_0, actions, log_p, \
|
988 |
+
batch_size, iw_size, \
|
989 |
+
graph_to_rnn, rnn_to_graph, NX_cum, \
|
990 |
+
c, ids = input
|
991 |
+
|
992 |
+
init = nd.tile(unsqueeze(self._policy_0(c), axis=1), [1, iw_size, 1])
|
993 |
+
append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask,
|
994 |
+
graph_to_rnn, rnn_to_graph, NX_cum,
|
995 |
+
c, ids)
|
996 |
+
l = self._likelihood(init, append, connect, end,
|
997 |
+
action_0, actions, iw_ids, log_p,
|
998 |
+
batch_size, iw_size)
|
999 |
+
if self.mode=='likelihood':
|
1000 |
+
return l
|
1001 |
+
else:
|
1002 |
+
return -l.mean()
|
1003 |
+
elif self.mode == 'decode_0':
|
1004 |
+
return self._policy_0(*input)
|
1005 |
+
elif self.mode == 'decode_step':
|
1006 |
+
X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids = input
|
1007 |
+
return self._decode_step(X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids)
|
1008 |
+
else:
|
1009 |
+
raise ValueError
|
1010 |
+
|
1011 |
+
class CVanillaMolGen_RNN(CMoleculeGenerator_RNN):
|
1012 |
+
|
1013 |
+
def __init__(self, N_A, N_B, N_C, D,
|
1014 |
+
F_e, F_h, F_skip, F_c, Fh_policy,
|
1015 |
+
activation, N_rnn, rename=False):
|
1016 |
+
self.rename = rename
|
1017 |
+
super(CVanillaMolGen_RNN, self).__init__(N_A, N_B, N_C, D,
|
1018 |
+
F_e, F_skip, F_c, Fh_policy,
|
1019 |
+
activation, N_rnn,
|
1020 |
+
F_h)
|
1021 |
+
|
1022 |
+
def _build_graph_conv(self, F_h):
|
1023 |
+
self.F_h = list(F_h) if isinstance(F_h, tuple) else F_h
|
1024 |
+
self.conv, self.bn = [], []
|
1025 |
+
for i, (f_in, f_out) in enumerate(zip([self.F_e] + self.F_h[:-1], self.F_h)):
|
1026 |
+
conv = GraphConv(f_in, f_out, self.N_B + self.D)
|
1027 |
+
self.conv.append(conv)
|
1028 |
+
self.register_child(conv)
|
1029 |
+
|
1030 |
+
if i != 0:
|
1031 |
+
bn = BatchNorm(in_channels=f_in)
|
1032 |
+
self.register_child(bn)
|
1033 |
+
else:
|
1034 |
+
bn = None
|
1035 |
+
self.bn.append(bn)
|
1036 |
+
|
1037 |
+
self.bn_skip = BatchNorm(in_channels=sum(self.F_h))
|
1038 |
+
self.linear_skip = Linear_BN(sum(self.F_h), self.F_skip)
|
1039 |
+
|
1040 |
+
# projectors for conditional variable (protein embedding)
|
1041 |
+
self.linear_c = []
|
1042 |
+
for i, f_out in enumerate(self.F_h):
|
1043 |
+
if self.rename:
|
1044 |
+
linear_c = nn.Dense(f_out, use_bias=False, in_units=self.N_C, prefix='cond_{}'.format(i))
|
1045 |
+
else:
|
1046 |
+
linear_c = nn.Dense(f_out, use_bias=False, in_units=self.N_C)
|
1047 |
+
self.register_child(linear_c)
|
1048 |
+
self.linear_c.append(linear_c)
|
1049 |
+
|
1050 |
+
def _graph_conv_forward(self, X, A, c, ids):
|
1051 |
+
X_out = [X]
|
1052 |
+
for conv, bn, linear_c in zip(self.conv, self.bn, self.linear_c):
|
1053 |
+
X = X_out[-1]
|
1054 |
+
if bn is not None:
|
1055 |
+
X_out.append(conv(self.activation(bn(X)), A) + linear_c(c)[ids, :])
|
1056 |
+
else:
|
1057 |
+
X_out.append(conv(X, A) + linear_c(c)[ids, :])
|
1058 |
+
X_out = nd.concat(*X_out[1:], dim=1)
|
1059 |
+
return self.activation(self.linear_skip(self.activation(self.bn_skip(X_out))))
|
1060 |
+
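Each layer injects the protein condition by projecting c to the layer width and broadcasting it to atoms via ids; a shape sketch with assumed sizes (2 molecules, 5 atoms, N_C = 8):

c = nd.random.normal(shape=(2, 8))               # one embedding per molecule
ids = nd.array([0, 0, 0, 1, 1], dtype='int64')   # atom -> molecule index
proj = nn.Dense(16, use_bias=False, in_units=8)
proj.initialize()
cond = proj(c)[ids, :]   # (5, 16), added onto the convolution output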
|
1061 |
+
def _decode_step(X, A, NX, NA, last_action, finished,
|
1062 |
+
get_init, get_action,
|
1063 |
+
random=False, n_node_types=get_mol_spec().num_atom_types,
|
1064 |
+
n_edge_types=get_mol_spec().num_bond_types):
|
1065 |
+
if X is None:
|
1066 |
+
init = get_init()
|
1067 |
+
|
1068 |
+
if random:
|
1069 |
+
X = []
|
1070 |
+
for i in range(init.shape[0]):
|
1071 |
+
# initial probabilities (for the first atom)
|
1072 |
+
p = init[i, :]
|
1073 |
+
# Random sampling using init probability distribution
|
1074 |
+
selected_atom = np.random.choice(np.arange(init.shape[1]), 1, p=p)[0]
|
1075 |
+
X.append(selected_atom)
|
1076 |
+
X = np.array(X, dtype=np.int64)
|
1077 |
+
else:
|
1078 |
+
X = np.argmax(init, axis=1)
|
1079 |
+
A = np.zeros((0, 3), dtype=np.int64)
|
1080 |
+
NX = last_action = np.ones([X.shape[0]], dtype=np.int64)
|
1081 |
+
NA = np.zeros([X.shape[0]], dtype=np.int64)
|
1082 |
+
finished = np.array([False, ] * X.shape[0], dtype=bool)
|
1083 |
+
|
1084 |
+
return X, A, NX, NA, last_action, finished
|
1085 |
+
else:
|
1086 |
+
X_u = X[np.repeat(np.logical_not(finished), NX)]
|
1087 |
+
A_u = A[np.repeat(np.logical_not(finished), NA), :]
|
1088 |
+
NX_u = NX[np.logical_not(finished)]
|
1089 |
+
NA_u = NA[np.logical_not(finished)]
|
1090 |
+
last_action_u = last_action[np.logical_not(finished)]
|
1091 |
+
|
1092 |
+
# conv
|
1093 |
+
mol_ids_rep = NX_rep = np.repeat(np.arange(NX_u.shape[0]), NX_u)
|
1094 |
+
rep_ids_rep = np.zeros_like(mol_ids_rep)
|
1095 |
+
|
1096 |
+
if A.shape[0] == 0:
|
1097 |
+
D_2 = D_3 = np.zeros((0, 2), dtype=np.int64)
|
1098 |
+
A_u = [np.zeros((0, 2), dtype=np.int64) for _ in range(get_mol_spec().num_bond_types)]
|
1099 |
+
A_u += [D_2, D_3]
|
1100 |
+
else:
|
1101 |
+
cumsum = np.cumsum(np.pad(NX_u, [[1, 0]], mode='constant')[:-1])
|
1102 |
+
shift = np.repeat(cumsum, NA_u)
|
1103 |
+
A_u[:, :2] += np.stack([shift, ] * 2, axis=1)
|
1104 |
+
D_2, D_3 = get_d(A_u, X_u)
|
1105 |
+
A_u = [A_u[A_u[:, 2] == _i, :2] for _i in range(n_edge_types)]
|
1106 |
+
A_u += [D_2, D_3]
|
1107 |
+
|
1108 |
+
mask = np.zeros([X_u.shape[0]], dtype=np.int64)
|
1109 |
+
last_append_index = np.cumsum(NX_u) - 1
|
1110 |
+
mask[last_append_index] = np.where(last_action_u == 1,
|
1111 |
+
np.ones_like(last_append_index, dtype=np.int64),
|
1112 |
+
np.ones_like(last_append_index, dtype=np.int64) * 2)
|
1113 |
+
|
1114 |
+
decode_input = [X_u, A_u, NX_u, NX_rep, mask, mol_ids_rep, rep_ids_rep]
|
1115 |
+
append, connect, end = get_action(decode_input)
|
1116 |
+
|
1117 |
+
if A.shape[0] == 0:
|
1118 |
+
max_index = np.argmax(np.reshape(append, [-1, n_node_types * n_edge_types]), axis=1)
|
1119 |
+
atom_type, bond_type = np.unravel_index(max_index, [n_node_types, n_edge_types])
|
1120 |
+
X = np.reshape(np.stack([X, atom_type], axis=1), [-1])
|
1121 |
+
NX = np.array([2, ] * len(finished), dtype=np.int64)
|
1122 |
+
A = np.stack([np.zeros([len(finished), ], dtype=np.int64),
|
1123 |
+
np.ones([len(finished), ], dtype=np.int64),
|
1124 |
+
bond_type], axis=1)
|
1125 |
+
NA = np.ones([len(finished), ], dtype=np.int64)
|
1126 |
+
last_action = np.ones_like(NX, dtype=np.int64)
|
1127 |
+
|
1128 |
+
else:
|
1129 |
+
# process for each molecule
|
1130 |
+
append, connect = np.split(append, np.cumsum(NX_u)), np.split(connect, np.cumsum(NX_u))
|
1131 |
+
end = end.tolist()
|
1132 |
+
|
1133 |
+
unfinished_ids = np.where(np.logical_not(finished))[0].tolist()
|
1134 |
+
cumsum = np.cumsum(NX)
|
1135 |
+
cumsum_a = np.cumsum(NA)
|
1136 |
+
|
1137 |
+
X_insert = []
|
1138 |
+
X_insert_ids = []
|
1139 |
+
A_insert = []
|
1140 |
+
A_insert_ids = []
|
1141 |
+
finished_ids = []
|
1142 |
+
|
1143 |
+
for i, (unfinished_id, append_i, connect_i, end_i) \
|
1144 |
+
in enumerate(zip(unfinished_ids, append, connect, end)):
|
1145 |
+
if random:
|
1146 |
+
def _rand_id(*_x):
|
1147 |
+
_x_reshaped = [np.reshape(_xi, [-1]) for _xi in _x]
|
1148 |
+
_x_length = np.array([_x_reshape_i.shape[0] for _x_reshape_i in _x_reshaped],
|
1149 |
+
dtype=np.int64)
|
1150 |
+
_begin = np.cumsum(np.pad(_x_length, [[1, 0]], mode='constant')[:-1])
|
1151 |
+
_end = np.cumsum(_x_length) - 1
|
1152 |
+
_p = np.concatenate(_x_reshaped)
|
1153 |
+
_p = _p / np.sum(_p)
|
1154 |
+
# Count NaN values
|
1155 |
+
num_nan = np.isnan(_p).sum()
|
1156 |
+
if num_nan > 0:
|
1157 |
+
print(f'Number of NaN values in _p: {num_nan}')
|
1158 |
+
_rand_index = np.random.choice(np.arange(len(_p)), 1)[0]
|
1159 |
+
|
1160 |
+
else:
|
1161 |
+
_rand_index = np.random.choice(np.arange(_p.shape[0]), 1, p=_p)[0]
|
1162 |
+
|
1163 |
+
|
1164 |
+
_p_step = _p[_rand_index]
|
1165 |
+
_x_index = np.where(np.logical_and(_begin <= _rand_index, _end >= _rand_index))[0][0]
|
1166 |
+
_rand_index = _rand_index - _begin[_x_index]
|
1167 |
+
_rand_index = np.unravel_index(_rand_index, _x[_x_index].shape)
|
1168 |
+
return _x_index, _rand_index, _p_step
|
1169 |
+
|
1170 |
+
action_type, action_index, p_step = _rand_id(append_i, connect_i, np.array([end_i]))
|
1171 |
+
else:
|
1172 |
+
_argmax = lambda _x: np.unravel_index(np.argmax(_x), _x.shape)
|
1173 |
+
append_id, append_val = _argmax(append_i), np.max(append_i)
|
1174 |
+
connect_id, connect_val = _argmax(connect_i), np.max(connect_i)
|
1175 |
+
end_val = end_i
|
1176 |
+
if end_val >= append_val and end_val >= connect_val:
|
1177 |
+
action_type = 2
|
1178 |
+
action_index = None
|
1179 |
+
elif append_val >= connect_val and append_val >= end_val:
|
1180 |
+
action_type = 0
|
1181 |
+
action_index = append_id
|
1182 |
+
else:
|
1183 |
+
action_type = 1
|
1184 |
+
action_index = connect_id
|
1185 |
+
if action_type == 2:
|
1186 |
+
# finish growth
|
1187 |
+
finished_ids.append(unfinished_id)
|
1188 |
+
elif action_type == 0:
|
1189 |
+
# append action
|
1190 |
+
append_pos, atom_type, bond_type = action_index
|
1191 |
+
X_insert.append(atom_type)
|
1192 |
+
X_insert_ids.append(unfinished_id)
|
1193 |
+
A_insert.append([append_pos, NX[unfinished_id], bond_type])
|
1194 |
+
A_insert_ids.append(unfinished_id)
|
1195 |
+
else:
|
1196 |
+
# connect
|
1197 |
+
connect_ps, bond_type = action_index
|
1198 |
+
A_insert.append([NX[unfinished_id] - 1, connect_ps, bond_type])
|
1199 |
+
A_insert_ids.append(unfinished_id)
|
1200 |
+
if len(A_insert_ids) > 0:
|
1201 |
+
A = np.insert(A, cumsum_a[A_insert_ids], A_insert, axis=0)
|
1202 |
+
NA[A_insert_ids] += 1
|
1203 |
+
last_action[A_insert_ids] = 0
|
1204 |
+
if len(X_insert_ids) > 0:
|
1205 |
+
X = np.insert(X, cumsum[X_insert_ids], X_insert, axis=0)
|
1206 |
+
NX[X_insert_ids] += 1
|
1207 |
+
last_action[X_insert_ids] = 1
|
1208 |
+
if len(finished_ids) > 0:
|
1209 |
+
finished[finished_ids] = True
|
1210 |
+
# print finished
|
1211 |
+
|
1212 |
+
return X, A, NX, NA, last_action, finished
|
1213 |
+
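_decode_step advances every unfinished molecule by one action (append atom, connect, or stop); driven in a loop it becomes the whole decoder. A greedy sketch, assuming get_init and get_action callbacks like those defined in CVanilla_RNN_Builder.sample below:

state = _decode_step(None, None, None, None, None, [False],
                     get_init=get_init, get_action=None, random=False)
while not np.all(state[-1]):   # state[-1] is the finished mask
    state = _decode_step(*state, get_init=None, get_action=get_action,
                         random=False)
X, A = state[0], state[1]      # atom types and typed edge list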
|
1214 |
+
class Builder(metaclass=ABCMeta):
|
1215 |
+
|
1216 |
+
def __init__(self, model_loc, gpu_id=None):
|
1217 |
+
with open(os.path.join(model_loc, 'configs.json')) as f:
|
1218 |
+
configs = json.load(f)
|
1219 |
+
|
1220 |
+
self.mdl = self.__class__._get_model(configs)
|
1221 |
+
|
1222 |
+
self.ctx = mx.gpu(gpu_id) if gpu_id is not None else mx.cpu()
|
1223 |
+
self.mdl.load_parameters(os.path.join(model_loc, 'ckpt.params'), ctx=self.ctx, allow_missing=True)
|
1224 |
+
|
1225 |
+
@staticmethod
|
1226 |
+
def _get_model(configs):
|
1227 |
+
raise NotImplementedError
|
1228 |
+
|
1229 |
+
@abstractmethod
|
1230 |
+
def sample(self, num_samples, *args, **kwargs):
|
1231 |
+
raise NotImplementedError
|
1232 |
+
|
1233 |
+
|
1234 |
+
class CVanilla_RNN_Builder(Builder):
|
1235 |
+
|
1236 |
+
@staticmethod
|
1237 |
+
def _get_model(configs):
|
1238 |
+
return CVanillaMolGen_RNN(get_mol_spec().num_atom_types, get_mol_spec().num_bond_types, D=2, **configs)
|
1239 |
+
|
1240 |
+
|
1241 |
+
def sample(self, num_samples, c, output_type='mol', sanitize=True, random=True):
|
1242 |
+
if len(c.shape) == 1:
|
1243 |
+
c = np.stack([c, ]*num_samples, axis=0)
|
1244 |
+
|
1245 |
+
with autograd.predict_mode():
|
1246 |
+
# step one
|
1247 |
+
finished = [False, ] * num_samples
|
1248 |
+
|
1249 |
+
def get_init():
|
1250 |
+
self.mdl.mode = 'decode_0'
|
1251 |
+
_c = nd.array(c, dtype='float32', ctx=self.ctx)
|
1252 |
+
init = self.mdl(_c).asnumpy()
|
1253 |
+
return init
|
1254 |
+
|
1255 |
+
outputs = _decode_step(X=None, A=None, NX=None, NA=None, last_action=None, finished=finished,
|
1256 |
+
get_init=get_init, get_action=None,
|
1257 |
+
n_node_types=self.mdl.N_A, n_edge_types=self.mdl.N_B,
|
1258 |
+
random=random)
|
1259 |
+
|
1260 |
+
# defensive check: abort if the first decoding step failed
|
1261 |
+
if outputs is None:
|
1262 |
+
return None
|
1263 |
+
X, A, NX, NA, last_action, finished = outputs
|
1264 |
+
|
1265 |
+
count = 1
|
1266 |
+
h = np.zeros([self.mdl.N_rnn, num_samples, self.mdl.F_c[-1]], dtype=np.float32)
|
1267 |
+
while not np.all(finished) and count < 100:
|
1268 |
+
def get_action(inputs):
|
1269 |
+
self.mdl.mode = 'decode_step'
|
1270 |
+
_h = nd.array(h[:, np.logical_not(finished), :], ctx=self.ctx, dtype='float32')
|
1271 |
+
_c = nd.array(c[np.logical_not(finished), :], ctx=self.ctx, dtype='float32')
|
1272 |
+
_X, _A_sparse, _NX, _NX_rep, _mask, _NX_cum = self.to_nd(inputs)
|
1273 |
+
_append, _connect, _end, _h = self.mdl(_X, _A_sparse, _NX, _NX_rep, _mask, _NX_cum, _h, _c, _NX_rep)
|
1274 |
+
h[:, np.logical_not(finished), :] = _h[0].asnumpy()
|
1275 |
+
return _append.asnumpy(), _connect.asnumpy(), _end.asnumpy()
|
1276 |
+
|
1277 |
+
outputs = _decode_step(X, A, NX, NA, last_action, finished,
|
1278 |
+
get_init=None, get_action=get_action,
|
1279 |
+
n_node_types=self.mdl.N_A, n_edge_types=self.mdl.N_B,
|
1280 |
+
random=random)
|
1281 |
+
X, A, NX, NA, last_action, finished = outputs
|
1282 |
+
|
1283 |
+
count += 1
|
1284 |
+
|
1285 |
+
graph_list = []
|
1286 |
+
|
1287 |
+
cumsum_X_ = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')).tolist()
|
1288 |
+
cumsum_A_ = np.cumsum(np.pad(NA, [[1, 0]], mode='constant')).tolist()
|
1289 |
+
|
1290 |
+
for cumsum_A_pre, cumsum_A_post, \
|
1291 |
+
cumsum_X_pre, cumsum_X_post in zip(cumsum_A_[:-1], cumsum_A_[1:],
|
1292 |
+
cumsum_X_[:-1], cumsum_X_[1:]):
|
1293 |
+
graph_list.append([X[cumsum_X_pre:cumsum_X_post], A[cumsum_A_pre:cumsum_A_post, :]])
|
1294 |
+
|
1295 |
+
if output_type=='graph':
|
1296 |
+
return graph_list
|
1297 |
+
elif output_type == 'mol':
|
1298 |
+
return get_mol_from_graph_list(graph_list, sanitize)
|
1299 |
+
elif output_type == 'smiles':
|
1300 |
+
mol_list = get_mol_from_graph_list(graph_list, sanitize=True)
|
1301 |
+
smiles_list = [Chem.MolToSmiles(m) if m is not None else None for m in mol_list]
|
1302 |
+
return smiles_list
|
1303 |
+
else:
|
1304 |
+
raise ValueError('Unrecognized output type')
|
1305 |
+
|
1306 |
+
def to_nd(self, inputs):
|
1307 |
+
X, A, NX, NX_rep, mask = inputs[:-2]
|
1308 |
+
NX_cum = np.cumsum(NX)
|
1309 |
+
|
1310 |
+
# convert to ndarray
|
1311 |
+
_to_ndarray = lambda _x: nd.array(_x, self.ctx, 'int64')
|
1312 |
+
X, NX, NX_rep, mask, NX_cum = \
|
1313 |
+
_to_ndarray(X), _to_ndarray(NX), _to_ndarray(NX_rep), _to_ndarray(mask), _to_ndarray(NX_cum)
|
1314 |
+
A_sparse = []
|
1315 |
+
for _A_i in A:
|
1316 |
+
if _A_i.shape[0] == 0:
|
1317 |
+
A_sparse.append(None)
|
1318 |
+
else:
|
1319 |
+
# transpose may not be supported on GPU, so add the reversed edges explicitly
|
1320 |
+
_A_i = np.concatenate([_A_i, _A_i[:, [1, 0]]], axis=0)
|
1321 |
+
|
1322 |
+
# construct csr matrix ...
|
1323 |
+
_data = np.ones((_A_i.shape[0],), dtype=np.float32)
|
1324 |
+
_row, _col = _A_i[:, 0], _A_i[:, 1]
|
1325 |
+
_A_sparse_i = nd.sparse.csr_matrix((_data, (_row, _col)),
|
1326 |
+
shape=tuple([int(X.shape[0]), ] * 2),
|
1327 |
+
ctx=self.ctx, dtype='float32')
|
1328 |
+
|
1329 |
+
# append to list
|
1330 |
+
A_sparse.append(_A_sparse_i)
|
1331 |
+
return X, A_sparse, NX, NX_rep, mask, NX_cum
|
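End to end, the builder is used roughly as follows (a sketch; the checkpoint directory and the dimensionality of the protein embedding c are assumptions based on the surrounding app code):

builder = CVanilla_RNN_Builder('models_folder', gpu_id=None)
c = np.random.rand(1024).astype(np.float32)   # placeholder protein embedding
smiles = builder.sample(10, c, output_type='smiles', random=True)
smiles = [s for s in smiles if s is not None]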
requirements.txt
ADDED
@@ -0,0 +1,17 @@
1 |
+
Flask
|
2 |
+
gunicorn
|
3 |
+
rdkit
|
4 |
+
transformers
|
5 |
+
bio-embeddings
|
6 |
+
torch
|
7 |
+
huggingface_hub
|
8 |
+
molvs
|
9 |
+
numpy==1.23.5
|
10 |
+
mxnet==1.8
|
11 |
+
networkx
|
12 |
+
scipy
|
13 |
+
pandas
|
14 |
+
ipython
|
15 |
+
accelerate>=0.26.0
|
16 |
+
gdown==4.6.0
|
17 |
+
requests
|
runtime.txt
ADDED
@@ -0,0 +1 @@
1 |
+
python-3.8.3
|
templates/index.html
ADDED
@@ -0,0 +1,93 @@
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Protein to SMILES Generator</title>
|
7 |
+
<style>
|
8 |
+
body {
|
9 |
+
font-family: 'Arial', sans-serif;
|
10 |
+
background: linear-gradient(to right, #4facfe, #00f2fe);
|
11 |
+
display: flex;
|
12 |
+
justify-content: center;
|
13 |
+
align-items: center;
|
14 |
+
height: 100vh;
|
15 |
+
margin: 0;
|
16 |
+
}
|
17 |
+
.container {
|
18 |
+
background: white;
|
19 |
+
padding: 40px;
|
20 |
+
border-radius: 15px;
|
21 |
+
box-shadow: 0px 6px 15px rgba(0, 0, 0, 0.3);
|
22 |
+
width: 90%;
|
23 |
+
max-width: 1200px;
|
24 |
+
text-align: center;
|
25 |
+
}
|
26 |
+
h2 {
|
27 |
+
font-size: 32px;
|
28 |
+
color: #333;
|
29 |
+
margin-bottom: 20px;
|
30 |
+
}
|
31 |
+
textarea {
|
32 |
+
width: 100%;
|
33 |
+
height: 300px;
|
34 |
+
border: 2px solid #ddd;
|
35 |
+
border-radius: 10px;
|
36 |
+
padding: 15px;
|
37 |
+
resize: none;
|
38 |
+
font-size: 18px;
|
39 |
+
}
|
40 |
+
button {
|
41 |
+
margin-top: 20px;
|
42 |
+
padding: 14px 24px;
|
43 |
+
font-size: 18px;
|
44 |
+
background-color: #007bff;
|
45 |
+
color: white;
|
46 |
+
border: none;
|
47 |
+
border-radius: 8px;
|
48 |
+
cursor: pointer;
|
49 |
+
transition: background 0.3s ease;
|
50 |
+
}
|
51 |
+
button:hover {
|
52 |
+
background-color: #0056b3;
|
53 |
+
}
|
54 |
+
.message {
|
55 |
+
margin-top: 20px;
|
56 |
+
font-size: 18px;
|
57 |
+
color: #28a745;
|
58 |
+
}
|
59 |
+
.download-btn {
|
60 |
+
display: inline-block;
|
61 |
+
margin-top: 20px;
|
62 |
+
padding: 14px 24px;
|
63 |
+
font-size: 18px;
|
64 |
+
background-color: #28a745;
|
65 |
+
color: white;
|
66 |
+
text-decoration: none;
|
67 |
+
border-radius: 8px;
|
68 |
+
transition: background 0.3s ease;
|
69 |
+
}
|
70 |
+
.download-btn:hover {
|
71 |
+
background-color: #218838;
|
72 |
+
}
|
73 |
+
</style>
|
74 |
+
</head>
|
75 |
+
<body>
|
76 |
+
<div class="container">
|
77 |
+
<h2>Protein to SMILES Generator</h2>
|
78 |
+
<form method="POST">
|
79 |
+
<textarea name="sequence" placeholder="Enter a raw protein sequence (no FASTA header)..."></textarea><br>
|
80 |
+
<button type="submit">Generate SMILES</button>
|
81 |
+
</form>
|
82 |
+
|
83 |
+
{% if message %}
|
84 |
+
<p class="message">{{ message }}</p>
|
85 |
+
{% endif %}
|
86 |
+
|
87 |
+
{% if file_path %}
|
88 |
+
<p>Time taken: {{ time_taken }} seconds</p>
|
89 |
+
<a href="{{ url_for('download_file') }}" class="download-btn">Download SMILES</a>
|
90 |
+
{% endif %}
|
91 |
+
</div>
|
92 |
+
</body>
|
93 |
+
</html>
|