Commit f829b9d (verified), parent b8eb471, by Bhanushray

Upload 16 files
Dockerfile ADDED
@@ -0,0 +1,320 @@
+ FROM python:3.8
+
+ # Install required system dependencies
+ RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Copy the requirements file into the container
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Create necessary directories (but don't download models here!)
+ RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # Copy the entire project to the container
+ COPY . .
+
+ # Expose the port for Flask
+ EXPOSE 7860
+
+ # Run the app with Gunicorn
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git wget && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Download model files from Dropbox using wget
+ # RUN wget -O /app/modelsBioembed/pytorch_model.bin "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1" && \
+ #     wget -O /app/modelsBioembed/config.json "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1" && \
+ #     wget -O /app/modelsBioembed/special_tokens_map.json "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1" && \
+ #     wget -O /app/modelsBioembed/tokenizer_config.json "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1" && \
+ #     wget -O /app/modelsBioembed/vocab.txt "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1"
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
+
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git wget && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Download model files from Dropbox using wget
+ # RUN wget -O /app/modelsBioembed/pytorch_model.bin "https://www.dropbox.com/s/example/pytorch_model.bin?dl=1" && \
+ #     wget -O /app/modelsBioembed/config.json "https://www.dropbox.com/s/example/config.json?dl=1" && \
+ #     wget -O /app/modelsBioembed/tokenizer_config.json "https://www.dropbox.com/s/example/tokenizer_config.json?dl=1" && \
+ #     wget -O /app/modelsBioembed/vocab.txt "https://www.dropbox.com/s/example/vocab.txt?dl=1" && \
+ #     wget -O /app/modelsBioembed/special_tokens_map.json "https://www.dropbox.com/s/example/special_tokens_map.json?dl=1"
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Install gdown for Google Drive downloads
+ # RUN pip install --no-cache-dir gdown
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Download model files from Google Drive using gdown
+ # RUN gdown --id 1aDirthtWAu-oyVjcWZ6linrddN-dmLMI -O /app/modelsBioembed/pytorch_model.bin && \
+ #     gdown --id 1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3 -O /app/modelsBioembed/config.json && \
+ #     gdown --id 1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf -O /app/modelsBioembed/tokenizer_config.json && \
+ #     gdown --id 1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ -O /app/modelsBioembed/vocab.txt && \
+ #     gdown --id 1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P -O /app/modelsBioembed/special_tokens_map.json
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+ # #Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Install gdown for Google Drive downloads
+ # RUN pip install --no-cache-dir gdown
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Download model files from Google Drive using alternative methods
+ # RUN curl -L -o /app/modelsBioembed/pytorch_model.bin "https://drive.google.com/uc?export=download&id=11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb" && \
+ #     curl -L -o /app/modelsBioembed/config.json "https://drive.google.com/uc?export=download&id=1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E" && \
+ #     curl -L -o /app/modelsBioembed/tokenizer_config.json "https://drive.google.com/uc?export=download&id=1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc" && \
+ #     curl -L -o /app/modelsBioembed/vocab.txt "https://drive.google.com/uc?export=download&id=1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w" && \
+ #     curl -L -o /app/modelsBioembed/special_tokens_map.json "https://drive.google.com/uc?export=download&id=1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl"
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
+
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git curl wget && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Function to download file from Google Drive using wget
+ # RUN wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb' -O /app/modelsBioembed/pytorch_model.bin && \
+ #     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E' -O /app/modelsBioembed/config.json && \
+ #     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc' -O /app/modelsBioembed/tokenizer_config.json && \
+ #     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w' -O /app/modelsBioembed/vocab.txt && \
+ #     wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl' -O /app/modelsBioembed/special_tokens_map.json
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Install gdown for Google Drive downloads
+ # RUN pip install --no-cache-dir gdown
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Download model files using gdown with file IDs
+ # RUN gdown --id 1aDirthtWAu-oyVjcWZ6linrddN-dmLMI -O /app/modelsBioembed/pytorch_model.bin && \
+ #     gdown --id 1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3 -O /app/modelsBioembed/config.json && \
+ #     gdown --id 1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf -O /app/modelsBioembed/tokenizer_config.json && \
+ #     gdown --id 1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ -O /app/modelsBioembed/vocab.txt && \
+ #     gdown --id 1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P -O /app/modelsBioembed/special_tokens_map.json
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # # Use Python 3.8 as the base image
+ # FROM python:3.8
+
+ # # Install required system dependencies
+ # RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
+
+ # # Set the working directory inside the container
+ # WORKDIR /app
+
+ # # Copy the requirements file into the container
+ # COPY requirements.txt .
+
+ # # Install dependencies
+ # RUN pip install --no-cache-dir -r requirements.txt
+
+ # # Create necessary directories inside the container
+ # RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
+
+ # # Set the OAuth 2.0 Access Token (Replace with your actual token)
+ # ENV ACCESS_TOKEN="ya29.a0AeXRPp6PRilVeuzghPDbZQE7DxYHHWv4TARoaumWuo4gX9bIcEtMzp3PGi1Ak36YIbvKk32V7Cwb6bEjGfJuOWp0ZDW5rdog1c5uf9PJH7b-zgPxIeGa0kmZhGlk79gc7WfoSAl-GUopljJfOmKsyVn628CGB10RRHBtMQiHaCgYKARQSARESFQHGX2MiRd-59J4_XHWekXXqwK-jsw0175"
+
+ # # Define Google Drive File IDs
+ # ENV FILE_ID1="1aDirthtWAu-oyVjcWZ6linrddN-dmLMI"
+ # ENV FILE_ID2="1bwk1fSwqQE5mN9AhsOBlQkvFjHCGQtJ3"
+ # ENV FILE_ID3="1ne-xJcySd8PcGTA4SdpTA6F869xsPiTf"
+ # ENV FILE_ID4="1tWjWsoeyPvTdW5sYZMSWpvISlN7tDoZ"
+ # ENV FILE_ID5="1M8Qg9fSQ2A7CZpVFMCrZMIwam2j6Cc6P"
+
+ # # Download model files using curl with OAuth token
+ # RUN curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID1?alt=media" -o /app/modelsBioembed/pytorch_model.bin && \
+ #     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID2?alt=media" -o /app/modelsBioembed/config.json && \
+ #     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID3?alt=media" -o /app/modelsBioembed/tokenizer_config.json && \
+ #     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID4?alt=media" -o /app/modelsBioembed/vocab.txt && \
+ #     curl -H "Authorization: Bearer $ACCESS_TOKEN" "https://www.googleapis.com/drive/v3/files/$FILE_ID5?alt=media" -o /app/modelsBioembed/special_tokens_map.json
+
+ # # Copy the entire project to the container
+ # COPY . .
+
+ # # Expose the port for Flask
+ # EXPOSE 8000
+
+ # # Run the app with Gunicorn
+ # CMD ["gunicorn", "-b", "0.0.0.0:8000", "app:app"]
+
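The live recipe at the top of this Dockerfile deliberately defers the model download to app.py at startup (hence the "don't download models here!" comment), which keeps the image small, and it binds port 7860, the port Hugging Face Spaces expects by default; the commented-out variants above instead baked the model files into the image at build time and served on port 8000. A hedged smoke test for a locally running container, assuming the image was built and started with the port published (e.g. docker run -p 7860:7860 <image>):

import urllib.request

# Expect HTTP 200 from the Flask index route once Gunicorn is up.
# The first start can be slow: app.py fetches the model files from Dropbox first.
with urllib.request.urlopen("http://localhost:7860/", timeout=30) as resp:
    print(resp.status)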
Samples/SMILES_GENERATED.txt ADDED
@@ -0,0 +1,95 @@
+ CC(C)C1C(=O)NC2CCN(CCC3CCC(CN)CC3)C21
+ CCOC(=O)C1CCCCCC12CCCCN2
+ O=C(O)CCNC(=O)C1NCCCC1C(=O)NO
+ CC(C)CC(NC(=O)C(Cc1c[nH]cn1)NC(=O)c1cccc(C(F)(F)F)c1)C(=O)O
+ CC(=O)OC1CC2CC(CC(c3ccc(O)c(O)c3)=C2O)OC(=O)c2ccc1c(O)c2
+ Cc1cc2cc(C)n(O)c(=O)c2cc1C
+ NCCc1ccc(S(=O)(=O)N2CCN=C2c2ccc(-c3ccccc3)cc2)cc1
+ NC1=NC(c2c(Cl)cc(Cl)cc2Cl)CC(=O)N1
+ CC=NOC(=O)c1ccc(NS(=O)(=O)N=C2c3ccc(C)cc3CCC2(C)C)cc1
+ CC(=O)CCCC(=O)OCC(CNc1cccc(Cl)c1)S(=O)(=O)O
+ CN(CCF)c1ccc(C(=O)NC(CN2C(=O)C3CCCCC3(C)C2=O)c2ccccc2)cc1
+ Cc1ccc(CNC(=O)C2CC(O)CC(C)C2CN2CCc3ccccc32)cc1
+ CC(=O)C(C(C)=NO)=C(O)c1ccccc1Sc1ccccc1CO
+ CNCCCCNC(=O)c1c(-c2ccccc2Cl)n[nH]c1-c1ccccc1Cl
+ CCOC(=O)c1c(CO)ccc(O)c1C
+ CCOc1ccc(F)c(CC(=O)NC2Cc3ccccc3C2)c1
+ Cc1ncc(CNC(=O)C2(c3cccc(F)c3)CCN(C)CC2)c(NCc2ccc(C#N)cc2)n1
+ CCn1cc(C=NNC(=O)c2ccc(Cl)cc2C(F)(F)F)n2ccc(=O)nc12
+ NC(=O)C1CCCN1c1cc(-c2nc(N)c3cc(C(F)(F)F)cnc3n2)cc(C(F)(F)F)c1
+ CC(C)Oc1ccn2c(N)nc(N3CCN(CC(N)=O)CC3)nc12
+ CC1(C)Cc2cc(c(C(=O)Nc3ccc(Cl)c(Cl)c3)nc2-c2cccnc2)N1
+ CCOC(=O)C1(CCCCCC=C(C)C)CCN(C(=O)CCC(=O)O)CC1
+ CCN1C(=O)C2CCCCC2C(C#N)=C1Nc1ccncc1
+ CCn1c(C(=O)N2CCN(C(=O)c3ccc(C(F)(F)F)cc3)CC2)c(O)c2ccccc21
+ O=C(N1CCNCC1)n1cc2c(=O)[nH]c(=S)[nH]n2c1=O
+ CC(C)C(O)CC(O)C(=O)c1nn2c(=O)cc(C(F)(F)F)nc2s1
+ Nc1c2c(nn1-c1ncc(C(F)(F)F)cn1)Cc1ccccc1C2=O
+ Nc1cc(-c2cnc(N)nc2-c2ccc(F)cc2)ncn1
+ CC1CCN(C(=O)C2NC(=O)C(c3cccs3)C2(C)C)CC1
+ Nc1cc(OCC(=O)c2ccc(Cl)c(Cl)c2)ncn1
+ CCOc1ccc2c(c1)CC(CCN1CCN(C(=O)c3ccn(Cc4cc(O)ccc4O)n3)CC1)C2=O
+ Cc1ncc([N+](=O)[O-])n1CCN1CCCN(C(=O)c2ccc3nccnc3c2)CC1
+ Cn1c(C(=O)NC2CCC(C(N)=O)CC2)nc2cc(N=C(N)c3ccc(Cl)cc3Cl)ncc21
+ Nc1ccc(NC(=O)c2cnc(N)nc2N)cn1
+ CCN1CCN(C(=O)c2sc(S(=O)(=O)NCc3cccnc3N3CCNCC3)cc2C)CC1
+ CCOCC(=O)N(c1c(O)nc(=O)[nH]c1C)C1CCCCC1
+ CCC(=Nc1[nH]nc(-c2cccc(OC)c2)c1-c1ccncc1)N(C)C
+ Cc1nn(-c2cccc(NC(=O)c3ccc(F)c([N+](=O)[O-])c3)c2)c2c1CCN(C(C)(C)C(=O)N(C)C)C2=O
+ CCC1CC=CC(=O)N2CC(C3CCCCC3)N(C(=O)c3ccc(C(F)(F)F)cc3)CC2C(=O)OC1
+ CCC(CCN1CCN(C(=O)c2ccc(C#N)nc2)CC1)N(C)C
+ CC(=O)c1ccc(S(=O)(=O)n2ccc(-c3c(C)nc(-c4ccc([N+](=O)[O-])cc4)oc3=O)n2)cc1
+ Cc1cc(O)ccc1C(O)C1C(=O)NC1C(=O)NC1(C(C)C)CC2CN(Cc3ccncc3)CCN2C1=O
+ O=C(NC1CCS(=O)(=O)C1)c1cnc(Nc2ccc(Oc3ccccc3)cc2)cn1
+ CC(C)CCC1N(C(=NC2CCS(=O)(=O)C2)SCc2cccnc2)C(=O)CC1(C)C
+ CC(=O)OC(Cc1ccc(C(C)=O)cc1)C(C)N(O)C=O
+ CCCC(=O)N1CCCC(N)C1
+ CCN(CC)CCN=c1c(O)c(O)c1=Nc1ccc(C(C)(O)C(F)(F)F)cc1
+ O=C(NO)c1cc2c3n(nc2oc1=O)CC(CCO)CN3CC1CCCCC1
+ CC1=NCC(C)(C)c2cnc(S(C)(=O)=O)nc21
+ Cc1cc(C)nc(C(=O)N2CC3c4ccc(OCC#CC5(C(N)=O)CCCC5)cc4N(C)CC2C3C(=O)O)c1
+ Cc1sc2nc(CN(CC3CCCCC3)C(=O)N3c4ccccc4C(=O)C3F)nn2c1C
+ Cc1cccc(CN2CCN(CC(O)Cc3c(-c4ccc(F)cn4)nc4ccccn34)CC2)c1
+ Cc1ccc2oc(=O)cc(CNC(=O)CNC(=O)Oc3ccc([N+](=O)[O-])cc3)c2c1
+ NC([PH](=O)O)S(=O)(=O)O
+ O=C(CCN1CCNC1=O)Nc1cccs1
+ Cc1nc(CS(=O)(=O)c2ccc(C(F)(F)C(F)(F)F)nc2)c2oc(CC3CCN(C)CC3)nc2c1C(=O)NC(C)C(=O)O
+ N#Cc1c(N2CCN(C(=O)c3ccccc3)CC2)nn2c(N)c3c(nc12)CCCC3
+ Nc1ncnc2c(-c3ccc(OC(F)(F)F)cc3)c(O)c(C(F)(F)F)nc12
+ Cc1ccn(-c2cc3c(c(OC4CCOCC4)c2)Cc2c(N)ncnc2N3)n1
+ CC(N=C=S)(c1cccnc1)c1cc(F)ccc1F
+ Cn1cnc2cc(-c3ccc(CCO)nc3)nc(-n3cnnc3)c2c1=O
+ CCOC(=O)C(CCCC1CCCC(O)C1)N1C(=O)C2C[SH]1C(c1ccc(Cl)c(Cl)c1)=N2
+ CC(C)C1C(=O)NC(C2Cc3ccccc3C2)C(=O)NC1C(=O)O
+ Cc1c([N+](=O)[O-])c(=O)oc2cc(NC(=O)Nc3ccc(C#N)cc3)ccc12
+ CCC(C)n1cnc2c(Sc3ccc(C(F)(F)F)cc3[N+](=O)[O-])nc(N)nc21
+ Nc1nnc(Sc2ccccc2Cl)s1
+ Cc1cc(O)c(C=O)cc1C(=O)NCCCCCCCN
+ Cc1cc(C)n(Cc2cc(C(=O)NC3CCCC3C(=O)NCCCF)ccc2Cl)n1
+ CC(C)CCCC(C)CS(=O)(=O)CCCC(N)Cc1cnn(C2=COCOC2)n1
+ CCOC(=O)C1CCCN(CC)C1c1cc2c(=O)[nH]cc(CC)c2cc1O
+ Nc1nn2c(=O)cc(CSc3nc4ccc(C(=O)O)cc4[nH]3)nc2s1
+ CC(=O)Oc1ccc(C#CC=C(C)C(=O)NC(CN=C(N)N)C(C)C)cc1
+ CC(C)=NC(=O)c1c(C)n2nc(C(F)(F)F)sc2[n+]1[O-]
+ Nc1cc(Cl)ccc1C(=O)NCCNCC(O)CCO
+ Cc1noc(-c2ccccc2C(=O)N2CCN3CCC2C(C)(C)C3=O)n1
+ CC(=O)OC(c1cccc(C(F)(F)F)c1)C(O)CC1CCN(C)C1CS(=O)(=O)c1ccccc1
+ CC(=O)N1CCN(Cc2c(-c3ccccc3)nn(C)c2Cl)CC1
+ Nc1ncnc(Oc2cccc(Cc3ccccc3O)c2)c1C(F)(F)F
+ Cc1cc(NCCC(=O)Nc2ccc3c(c2)S(=O)(=O)N(C)C3=O)n2nc(C)c(-c3ccccc3)c2n1
+ Nc1ncnc2ccc(F)c(-c3csc(C(N)(CO)C(F)(F)F)n3)c12
+ CCCCCCCC=C(C)C(=O)NC1CNC(=O)C1
+ CC(C)C(C)C(=O)NC1CCN(c2ncnc(NC(C)C3CCCCC3)c2F)C1
+ CCc1nnc(CNc2nc(C)cc(N(C)C)n2)o1
+ CCOC(=O)Nc1ccc(OCCC(=O)O)nc1
+ Cc1cc(-c2nnc(C(=O)NC3CCCCCCC3)o2)ccn1
+ Cc1nc(CCCC(=O)C2CC(N)C(=O)NC2C(=O)N2C(C#N)CCC2C#N)cs1
+ Cn1cnc2c(Nc3ccn(C4CCC5CC(CO)C(=O)NC54)c(=O)n3)nc(-c3c(N)cc(Cl)cc3Cl)nc21
+ CCc1[nH]ncc1C(=O)N1CCC(C)CC1C(=O)NCc1ccc(N(C)C)cc1
+ CC(C)=CCOC1Cc2c(c(O)nc3ccccc23)CO1
+ CCCNCCCOc1cccc(-c2cccc(OC)c2)c1
+ CC(C)CC(C=O)NC(=O)C(CC1CCCC1)NC(=O)NC1CCCCCC1
+ Nc1cc2n(c(=O)n1)C1CCCCCC(C1)N2
+ NC(=O)c1cccc(-c2cnc3ccc(Nc4ccncn4)nn23)c1
+ CC(=O)OC1CCC(OC(C)=O)C(CNC(=S)Nc2ccc(F)c(Cl)c2)C1
+ CC(=O)N1CCC2(C)Oc3ccc4c(=O)c(C(=O)O)c(-n5ccnn5)oc4c3C(c3ccccn3)C2C1
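This file (and generated_smiles.txt below) is plain newline-separated SMILES, so a quick validity pass mirrors what app.py already does with RDKit. A minimal sketch, assuming RDKit is installed per requirements.txt and the script is run from the repo root:

from rdkit import Chem

with open("Samples/SMILES_GENERATED.txt") as fh:
    smiles = [line.strip() for line in fh if line.strip()]
# Chem.MolFromSmiles returns None for strings that do not parse.
mols = [Chem.MolFromSmiles(s) for s in smiles]
print(f"{sum(m is not None for m in mols)}/{len(smiles)} entries parse as valid molecules")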
Samples/generated_smiles.txt ADDED
@@ -0,0 +1,93 @@
+ CC(=O)OC1OCCCC1Cl
+ O=C1NC(c2ccc(C(F)(F)F)cc2)S(=O)(=O)C1CCCN1CCC2C(Cc3ccccc3C23NC(=S)NC3=O)CC1CO
+ Nc1nc(-c2ccccc2)nc2ccc(C(F)(F)F)cc12
+ CCNCC(O)(c1ccc2c(c1)NCCC2)C(C)C
+ NC(=O)C1CCC(CCO)C2Cc3cccc([N+](=O)[O-])c3-c3ccccc3CC(=O)N12
+ CC(C)CC1OC2CC3CN(C(=O)C24CC=C1CN4)C(c1ccccc1[N+](=O)[O-])C(=O)N3
+ CCn1cc(C(=O)O)c(=O)c2c3ccccc3n(CCl)c21
+ Cc1[nH]c2c(NC(C)c3ccc(C(=O)O)cc3)ncnc2c1C(=O)O
+ Cc1cc(C=NOC(C)CNCCCC2CCC(n3cnc4c(N)ncnc43)O2)c2[nH]c(C)cc2n1
+ Cc1[nH]n(-c2cc(C(=O)NC(C)C(=O)N(Cc3ccncc3)C3CCCC3F)ccn2)c(=O)c1[N+](=O)[O-]
+ CN(Cc1cc2ccoc(=O)c2n1-c1cccc(O)c1)c1ccccc1
+ CC(C)C(c1ccc(Cl)cc1O)S(=O)(=O)N(C#N)CC(=O)NO
+ NC(=O)c1ccc(N2CCCC(c3cccc(F)c3)C2)nc1C(Cl)(Cl)Cl
+ CC(C)C1C(=O)C(O)(c2ccc([N+](=O)[O-])cc2)CC1NC(=O)C1CC(=O)N1
+ CCCCNCC1CCOC(c2ccc(F)cc2)O1
+ Cc1cc(NC(=O)C2COC(OCCCC(C)C(O)(c3ccccc3)c3ccccc3)C2)ncn1
+ Cc1ccc(C(=O)Nc2ccnc(Cn3nc(C)c4c3CCN(Cc3ccccc3)C4)n2)cc1
+ CC(=O)C1CCC(C)N1C(=O)OC1CN(C2(c3cccnn3)CCCCC2)CCC1CC(=O)O
+ CCC(=O)N1CC2CCCN(CC(=O)NCC3CCN(c4ncncc4F)CC3)C2C1
+ CC1(c2ccc(O)cc2)CC(=O)N(C=C(Cl)Cl)C(c2cccnc2)N1
+ O=C(Nc1ccc([N+](=O)[O-])cc1C(=O)NN=Cc1cccs1)c1ccc(Cl)s1
+ Cc1cc(C)n(-c2nc(N)cc(-c3cc(C(=O)NC4CCNCC4)cc(C(F)(F)F)c3)n2)n1
+ CC(=O)OCC1CN(Cc2cccc([N+](=O)[O-])c2)C(CN2CCCC2)C1O
+ CC(=O)Nc1c(N)n(CC2CCCO2)c(=O)[nH]c1=O
+ Cc1cc(O)nc(N2CCCC(N3CCN(C(=O)c4ccc(F)cc4)CC3)C2)n1
+ CC(C)CC(N)C(O)C#N
+ NC(=O)C1CCC(CN2CCCCC2)N1C(=O)c1cc(-c2ccnc(N)n2)cc(C(F)(F)F)c1F
+ CC(C)(N)CCCNc1cc(C(C)(C)C)ncn1
+ Nc1nc(N)c2c(-c3ccccc3)cc(S(=O)(=O)NC(=O)CC3CCCC3)cc2n1
+ O=C(NCc1cnn(-c2ccc(F)cc2)c1)c1cccnc1C(F)(F)F
+ Cc1nc(-c2ccccn2)c(CSC(=NCc2cccnc2)N2CC(C)(C)CC2=O)s1
+ NC(=O)C1CCC2CN(C(=O)OCc3ccc(F)cc3)CC(CO)C21
+ CCC(C)CC=CC(CCO)OC(=O)c1cc(S(N)(=O)=O)cnc1C(F)(F)F
+ CCn1cc(C)c2nc(-c3cnn(C)c3NCc3cccnc3OC)cnc21
+ N#Cc1cc(F)c(N2CCC(CN)C2)cc1NS(=O)(=O)c1cc(Cl)cc(Cl)c1
+ CCCCNc1c(C(=O)O)cc2nc(C#N)c(-c3ccccc3)nn12
+ CC#CCCCn1c(=O)c(NC(=O)c2cccc(F)c2)cn(CC(=O)NCC2CCCO2)c1=O
+ CCn1ncc(S(=O)(=O)c2ccc(C(=O)O)cc2)c1C(=O)Nc1cc(C)cc(C)c1
+ NC1CCCCC1C(=O)NCC(=O)NC1CCCCCCC1
+ Cc1cc(N2CCOCC2)nc(Nc2ccc(-c3cnc4ccn(C)c4c3)cc2)n1
+ CCOC(=O)Nc1nccnc1N1CCC(C(N)=O)CC1
+ Cc1c2nc(NC3CCC(CC(N)C(=O)O)CC3)nc(Oc3ccc(C(=O)NCC4CCCCC4)cc3)c2nn1C
+ CCC(C(=O)NC1CCCc2cccnc21)N(CCC1CCCCC1)S(=O)(=O)c1cccs1
+ CCCCN1C(=O)N=C(Nc2cccc(OCc3ccncc3)c2)C1CC1CCCC1
+ Nc1nccn2c(-c3cccnc3)c(CNc3ccc(F)cc3F)cc12
+ CC1CCC(NC(=O)c2ccc(CN(c3ccc(C(=O)O)cc3)S(=O)(=O)c3ccc(F)cc3C(F)(F)F)cc2)CC1
+ CC=Cc1ccc(S(=O)(=O)N2CCCC2C(=O)N2CC3(O)CCCC3C2=O)cc1
+ Cc1ccc(C(=O)N2Cc3nc(-c4ccc(F)cc4)ncc3C2=NN(C(=O)C2CCCC(C(N)=O)C2)C2CCNC2)cc1C
+ CCCCCCNCCc1cc(CC)cc(=O)o1
+ CC(=O)NC1CCC(CCCCOCCNC(=O)C2CCCN2C(=O)OC(C)(C)C)CC1
+ CCCNc1ccc(Cl)cc1C(=O)NC(C)CC(C)C
+ CCC(C(O)C1C(O)=C(C)OC(=S)N1CCC(O)C(C)(C)C)S(=O)(=O)CC
+ CCCCNC(=O)OC1CC2CCC(c3nnc4ccccn34)CCN2C1O
+ CC(C)N1CC(C(=O)Nc2ccc(N3CCOCC3)c([N+](=O)[O-])c2)C2CS(=O)(=O)CC21
+ CCCCCCCN1CC2COC1C(O)(Cn1c(-c3cccs3)nc3c(N)ncnc31)C2
+ Nc1nc(Sc2ccc(C(F)(F)F)cc2)nc2cccc(-c3ccc(C(=O)O)nc3)c12
+ O=c1oc2ccccc2n1CCS(=O)(=O)N1CCCCCC1
+ NCCNc1ncnc(N)c1C(=O)NC1CCCC1
+ N=C(N)OCCOCc1c(N)ccc2c1CC(O)CC2
+ CN1CCN(C(=O)C2CCN(C3CC(C)(C)c4ccc(C(=O)O)c3c4)CC2)CC1
+ CC(=O)N(C1=NC(=O)C(CC2CCC2OC(N)=O)N1CC(C)(C)c1ccccc1)C1CCCC1
+ CC(=O)NCCNCCCCCC(=O)c1ccc(-c2cc(Cl)cc(Cl)c2)o1
+ CCNS(=O)(=O)c1cccc(COc2ccnnc2)c1
+ O=C(NN=CCc1ccco1)c1cnccn1
+ Cc1cccc(N(C)CCC(=O)C2C(=O)OC3CC(C(=O)O)NCC3C2C(F)(F)F)c1C
+ Cc1cc(Sc2ccncc2)c([N+](=O)[O-])c(C)n1
+ Cc1c(Cl)ccc2[nH]c(=O)c(-n3c([N+](=O)[O-])cnc3C)nc12
+ Cc1ccc(C#N)cc1C1N=C(c2ccc(S(N)(=O)=O)cc2)CC(=O)N1C
+ CCOC(=O)C1CCC(Cn2c(C(C)(C)C)ccc(C#N)c2=N)C1
+ Cc1nn2cnc(-c3ccc(C(=O)NCc4ccc(C#N)cc4)s3)c(N3CCC(N(C)C)C3)c2c1C(C)(C)C
+ Cc1[nH]nc(-c2cc(C3C4CC(CCN4)N3C)nc(N3CCC4(CCCCC4)C3)n2)c1Cl
+ CCN(C)c1ccc(C(=O)OCC(=O)NCC2CCCC2)cc1
+ CC(C)n1cc(S(=O)(=O)NCC(=O)N2CCCC(c3ccccc3)(c3ccccc3)CC2)cn1
+ S=C1NCSS1
+ CC1CCC(C(=O)NC(=S)N2CCN(c3ccc([N+](=O)[O-])cc3Cl)CC2)C1
+ CC(=NO)C(=O)Nc1ccc(Cl)c(C(=O)O)c1
+ NCCCCCCC(=O)NC(C(=O)O)S(=O)(=O)O
+ CCN1C(=O)C2CC(NC(=O)C(NC(=O)OCc3cccc([N+](=O)[O-])c3)C(C)P(=O)(O)O)CCC21
+ CC(=O)C1(C)CC(c2ccc(N3CCOCC3)cc2C(F)(F)F)=NO1
+ CCCCCCC(N)=O
+ Nc1nc(Cl)ccc1C(c1ccc(-c2ccccn2)nc1N)N1CCCCC1
+ CCC1N=C(S)NC1c1cc(F)ccc1F
+ CC(=O)c1cn(C)c2ccc(S(=O)(=O)NCc3ccc(C(=O)O)cc3)cc12
+ CCCCCC(=CC#N)[N+](=O)[O-]
+ CN1CCCC1CCNCc1ccc(F)c(Cl)c1
+ Cc1ccc(S(=O)(=O)c2nnc(-n3cc(C)sc3=O)s2)cc1
+ NCC1COC(CN2CC(c3cccc([N+](=O)[O-])c3)OC2=O)C1CCC1CCCCC1
+ CC(C)NC(CC(=O)Nc1cnn(C)c(=O)c1Cl)C(=O)NCc1ccc(C(F)(F)F)c(F)c1
+ CC(C)CC1C(=O)N2CCCC2C(=O)N1C1(C)CCCO1
+ O=C(O)c1ccc(Cn2cccc2-c2cc(NC(=O)c3cccs3)ncn2)cc1
+ CCn1ncc(CC(=O)NCc2ccccc2)c1CCC(=O)NCc1ccc(F)c(F)c1
+ O=C1CSC(C(=O)NCCC2CN3CCC2(c2cccc(C(F)(F)F)c2)CC3)N1
+ Nc1nc(O)nc2c1CN(CCCN1C(=O)CCC1=O)CC2
app.py ADDED
@@ -0,0 +1,1084 @@
+ import os
+ import time
+ import requests
+ import numpy as np
+ from flask import Flask, render_template, request, send_file
+ from rdkit import Chem
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # 🚀 Define Directories for Railway
+ bio_model_dir = "/app/modelsBioembed"  # Persistent model storage
+ cvn_model_dir = "/app/models_folder"
+ UPLOAD_FOLDER = "/app/Samples"
+
+ os.makedirs(bio_model_dir, exist_ok=True)
+ os.makedirs(cvn_model_dir, exist_ok=True)
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # ✅ Environment Variables for Temp Directory
+ os.environ["TMPDIR"] = bio_model_dir
+ os.environ["TEMP"] = bio_model_dir
+ os.environ["TMP"] = bio_model_dir
+
+ # 🔗 Dropbox Links for Model Files
+ DROPBOX_LINKS = {
+     "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
+     "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
+     "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
+     "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
+     "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
+ }
+
+ # 📥 Function to Download Model Files
+ def download_model_files():
+     for filename, url in DROPBOX_LINKS.items():
+         file_path = os.path.join(bio_model_dir, filename)
+         if not os.path.exists(file_path):  # Avoid re-downloading
+             print(f"Downloading {filename}...")
+             response = requests.get(url, stream=True)
+             if response.status_code == 200:
+                 with open(file_path, "wb") as f:
+                     for chunk in response.iter_content(chunk_size=1024):
+                         f.write(chunk)
+                 print(f"Downloaded: {filename}")
+             else:
+                 print(f"Failed to download {filename}")
+ # def download_model_files():
+ #     for filename, url in DROPBOX_LINKS.items():
+ #         file_path = os.path.join(bio_model_dir, filename)
+
+ #         print(f"Downloading {filename} (forcing overwrite)...")
+ #         response = requests.get(url, stream=True)
+ #         if response.status_code == 200:
+ #             with open(file_path, "wb") as f:
+ #                 for chunk in response.iter_content(chunk_size=1024):
+ #                     f.write(chunk)
+ #             print(f"Downloaded: {filename}")
+ #         else:
+ #             print(f"Failed to download {filename}")
+
+ # 📥 Download models before starting
+ download_model_files()
+
+ # # ✅ Load ProtTrans-BERT-BFD Model
+ # print("Loading ProtTrans-BERT-BFD model...")
+ # model = AutoModelForMaskedLM.from_pretrained(bio_model_dir)
+ # tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
+
+ # ✅ Load Bio-Embedding Model
+ try:
+     print("Loading ProtTrans-BERT-BFD model...")
+     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ except Exception as e:
+     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+     embedder = None
+
+ # 🧬 Generate Bio-Embeddings
+ def generate_bio_embeddings(sequence):
+     if embedder is None:
+         return None
+     try:
+         embedding_protein = embedder.embed(sequence)
+         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+         return np.array(embedding_per_protein).reshape(1, -1)
+     except Exception as e:
+         print(f"Embedding Error: {e}")
+         return None
+
+ # 🔬 Generate SMILES from Protein Sequence
+ def generate_smiles(sequence, n_samples=100):
+     start_time = time.time()
+
+     protein_embedding = generate_bio_embeddings(sequence)
+     if protein_embedding is None:
+         return None, "Embedding generation failed!"
+
+     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+     valid_samples = [sample for sample in samples if sample is not None]
+
+     smiles_list = [
+         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+     ]
+
+     if not smiles_list:
+         return None, "No valid SMILES generated!"
+
+     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+     with open(filename, "w") as file:
+         file.write("\n".join(smiles_list))
+
+     elapsed_time = time.time() - start_time
+     return filename, elapsed_time
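For orientation, a hedged usage sketch of generate_smiles (the test sequence is an arbitrary placeholder; it assumes the embedder above and the weights in models_folder loaded correctly):

# On success the second element is elapsed seconds; on failure it is an error string.
path, result = generate_smiles("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", n_samples=5)
if path is None:
    print(f"Generation failed: {result}")
else:
    print(f"Wrote {path} in {result:.1f}s")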
+
+ # 🌐 Flask Web App
+ app = Flask(__name__)
+
+ @app.route("/", methods=["GET", "POST"])
+ def index():
+     if request.method == "POST":
+         sequence = request.form["sequence"].strip()
+         if not sequence:
+             return render_template("index.html", message="Please enter a valid sequence.")
+
+         file_path, result = generate_smiles(sequence)
+         if file_path is None:
+             return render_template("index.html", message=f"Error: {result}")
+
+         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+     return render_template("index.html")
+
+ @app.route("/download")
+ def download_file():
+     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+     return send_file(file_path, as_attachment=True)
+
+ # 🚀 Run the Flask App on Railway
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860)
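And a hedged client-side check of the two routes above, assuming the app is reachable on localhost:7860 (whether started with python app.py or via the Gunicorn CMD in the Dockerfile; the sequence is again a placeholder):

import requests

BASE = "http://localhost:7860"  # assumed local deployment
# index() reads the "sequence" form field on POST; generation can take a while.
resp = requests.post(f"{BASE}/", data={"sequence": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"}, timeout=600)
print(resp.status_code)
# /download streams back Samples/SMILES_GENERATED.txt.
print(requests.get(f"{BASE}/download", timeout=60).text.splitlines()[:3])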
+
+
+
+
+
+
+
+
+
+
+
+ # import os
+ # import time
+ # import requests
+ # import numpy as np
+ # import subprocess
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # DROPBOX LINKS FOR MODEL FILES
+ # DROPBOX_LINKS = {
+ #     "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
+ #     "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
+ #     "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
+ #     "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
+ #     "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
+ # }
+
+ # # LOCAL DIRECTORIES
+ # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")
+ # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")
+ # UPLOAD_FOLDER = "Samples"
+
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # os.environ["TMPDIR"] = bio_model_dir
+ # os.environ["TEMP"] = bio_model_dir
+ # os.environ["TMP"] = bio_model_dir
+
+ # # FUNCTION TO DOWNLOAD FILES FROM DROPBOX
+ # for file_name, url in DROPBOX_LINKS.items():
+ #     file_path = os.path.join(bio_model_dir, file_name)
+ #     if not os.path.exists(file_path):
+ #         print(f"Downloading {file_name} from Dropbox...")
+ #         subprocess.run(["wget", "-O", file_path, url], check=True)
+ #         print(f"{file_name} downloaded!")
+
+ # # BIO-EMBEDDING MODEL LOADING
+ # try:
+ #     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ # except Exception as e:
+ #     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+ #     embedder = None
+
+ # def generate_bio_embeddings(sequence):
+ #     if embedder is None:
+ #         return None
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     start_time = time.time()
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # app = Flask(__name__)
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000, debug=True)
+
+
+
+ # import os
+ # import time
+ # import numpy as np
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # # DIRECTORIES
+ # # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
+ # # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
+ # #bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
+ # bio_model_dir = "/app/modelsBioembed"
+ # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
+
+
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+
+ # os.environ["TMPDIR"] = bio_model_dir
+ # os.environ["TEMP"] = bio_model_dir
+ # os.environ["TMP"] = bio_model_dir
+
+ # UPLOAD_FOLDER = "Samples"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # app = Flask(__name__)
+
+ # # model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
+ # # if not os.path.exists(model_path):
+ # #     print("Downloading ProtTrans-BERT-BFD model...")
+ # #     AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)
+
+
+ # # BIO-EMBEDDING MODEL LOADING
+ # try:
+ #     print("Loading Model")
+ #     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ # except Exception as e:
+ #     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+ #     embedder = None
+
+ # def generate_bio_embeddings(sequence):
+ #     """Generate bio-embeddings for a given protein sequence."""
+ #     if embedder is None:
+ #         return None
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     """Generate SMILES from a protein sequence."""
+ #     start_time = time.time()
+
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     # TRAINED CVanilla_RNN_Builder MODEL LOADING
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+
+ #     # MOLECULAR GRAPH GENERATION
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     # CONVERSION TO SMILES
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     # SAVING TO FILE
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000)
+ #MAIN
+
+
+
+
+ # import os
+ # import time
+ # import requests
+ # import numpy as np
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # HUGGING FACE MODEL REPO (Replace with your actual Hugging Face username)
+ # MODEL_BASE_URL = "https://huggingface.co/Bhanushray/protein-smiles-model/tree/main"
+
+ # # REQUIRED MODEL FILES
+ # MODEL_FILES = [
+ #     "pytorch_model.bin",
+ #     "config.json",
+ #     "tokenizer_config.json",
+ #     "vocab.txt",
+ #     "special_tokens_map.json"
+ # ]
+
+ # # DIRECTORIES
+ # bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
+ # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
+
+ # # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
+ # # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
+
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+
+ # os.environ["TMPDIR"] = bio_model_dir
+ # os.environ["TEMP"] = bio_model_dir
+ # os.environ["TMP"] = bio_model_dir
+
+ # UPLOAD_FOLDER = "Samples"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # app = Flask(__name__)
+
+ # # DOWNLOAD MODEL FILES IF MISSING
+ # for file_name in MODEL_FILES:
+ #     file_path = os.path.join(bio_model_dir, file_name)
+
+ #     if not os.path.exists(file_path):
+ #         print(f"Downloading {file_name} ...")
+ #         response = requests.get(MODEL_BASE_URL + file_name, stream=True)
+ #         with open(file_path, "wb") as f:
+ #             for chunk in response.iter_content(chunk_size=1024):
+ #                 f.write(chunk)
+ #         print(f"{file_name} downloaded!")
+
+ # # BIO-EMBEDDING MODEL LOADING
+ # try:
+ #     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ # except Exception as e:
+ #     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+ #     embedder = None
+
+ # def generate_bio_embeddings(sequence):
+ #     """Generate bio-embeddings for a given protein sequence."""
+ #     if embedder is None:
+ #         return None
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     """Generate SMILES from a protein sequence."""
+ #     start_time = time.time()
+
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     # LOAD TRAINED CVanilla_RNN_Builder MODEL
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+
+ #     # MOLECULAR GRAPH GENERATION
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     # CONVERT TO SMILES
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     # SAVE TO FILE
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000, debug=True)
+
+
+ # import os
+ # import time
+ # import numpy as np
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # DIRECTORIES
+ # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
+ # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
+
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+
+ # os.environ["TMPDIR"] = bio_model_dir
+ # os.environ["TEMP"] = bio_model_dir
+ # os.environ["TMP"] = bio_model_dir
+
+ # UPLOAD_FOLDER = "Samples"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # app = Flask(__name__)
+
+ # model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
+ # if not os.path.exists(model_path):
+ #     print("Downloading ProtTrans-BERT-BFD model...")
+ #     AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)
+
+
+ # # BIO-EMBEDDING MODEL LOADING
+ # try:
+ #     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ # except Exception as e:
+ #     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+ #     embedder = None
+
+ # def generate_bio_embeddings(sequence):
+ #     """Generate bio-embeddings for a given protein sequence."""
+ #     if embedder is None:
+ #         return None
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     """Generate SMILES from a protein sequence."""
+ #     start_time = time.time()
+
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     # TRAINED CVanilla_RNN_Builder MODEL LOADING
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+
+ #     # MOLECULAR GRAPH GENERATION
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     # CONVERSION TO SMILES
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     # SAVING TO FILE
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000,debug=True)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # import os
+ # import time
+ # import numpy as np
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+ # from huggingface_hub import hf_hub_download  # Import for direct file download
+
+ # # Define directories for different models
+ # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
+ # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
+
+ # # Ensure directories exist
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+
+ # UPLOAD_FOLDER = "Samples"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # app = Flask(__name__)
+
+ # # Download only the required pytorch_model.bin file
+ # model_filename = "pytorch_model.bin"
+ # model_path = os.path.join(bio_model_dir, model_filename)
+ # if not os.path.exists(model_path):
+ #     print("Downloading pytorch_model.bin from Hugging Face...")
+ #     hf_hub_download(repo_id="Rostlab/prot_bert_bfd", filename=model_filename, local_dir=bio_model_dir)
+
+ # # Load bio-embedding model once
+ # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+
+ # def generate_bio_embeddings(sequence):
+ #     """Generate bio-embeddings for a given protein sequence."""
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     """Generate SMILES from a protein sequence."""
+ #     start_time = time.time()
+
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000, debug=True)
+
+
+
+
+
+ # import os
+ # import time
+ # import requests
+ # import numpy as np
+ # import gdown  # NEW: For Google Drive downloads
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from transformers import AutoModel
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # REPLACE WITH YOUR GOOGLE DRIVE FILE IDs
+ # GDRIVE_FILE_IDS = {
+ #     "pytorch_model.bin": "11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb",  # Replace with actual ID
+ #     "config.json": "1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E",
+ #     "tokenizer_config.json": "1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc",
+ #     "vocab.txt": "1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w",
+ #     "special_tokens_map.json": "1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl"
+ # }
+
+ # # LOCAL DIRECTORIES
+ # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")  # For bio-embeddings
+ # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")  # For CVanilla_RNN_Builder
+
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+
+ # os.environ["TMPDIR"] = bio_model_dir
+ # os.environ["TEMP"] = bio_model_dir
+ # os.environ["TMP"] = bio_model_dir
+
+ # UPLOAD_FOLDER = "Samples"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # app = Flask(__name__)
+
+ # # DOWNLOAD MODEL FILES IF MISSING
+ # for file_name, file_id in GDRIVE_FILE_IDS.items():
+ #     file_path = os.path.join(bio_model_dir, file_name)
+
+ #     if not os.path.exists(file_path):
+ #         print(f"Downloading {file_name} from Google Drive...")
+ #         gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
+ #         print(f"{file_name} downloaded!")
+
+ # # BIO-EMBEDDING MODEL LOADING
+ # try:
+ #     embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+ # except Exception as e:
+ #     print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+ #     embedder = None
+
+ # def generate_bio_embeddings(sequence):
+ #     """Generate bio-embeddings for a given protein sequence."""
+ #     if embedder is None:
+ #         return None
+ #     try:
+ #         embedding_protein = embedder.embed(sequence)
+ #         embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+ #         return np.array(embedding_per_protein).reshape(1, -1)  # Reshape for model compatibility
+ #     except Exception as e:
+ #         print(f"Embedding Error: {e}")
+ #         return None
+
+ # def generate_smiles(sequence, n_samples=100):
+ #     """Generate SMILES from a protein sequence."""
+ #     start_time = time.time()
+
+ #     protein_embedding = generate_bio_embeddings(sequence)
+ #     if protein_embedding is None:
+ #         return None, "Embedding generation failed!"
+
+ #     # LOAD TRAINED CVanilla_RNN_Builder MODEL
+ #     model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
+
+ #     # MOLECULAR GRAPH GENERATION
+ #     samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
+ #     valid_samples = [sample for sample in samples if sample is not None]
+
+ #     # CONVERT TO SMILES
+ #     smiles_list = [
+ #         Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
+ #     ]
+
+ #     if not smiles_list:
+ #         return None, "No valid SMILES generated!"
+
+ #     # SAVE TO FILE
+ #     filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     with open(filename, "w") as file:
+ #         file.write("\n".join(smiles_list))
+
+ #     elapsed_time = time.time() - start_time
+ #     return filename, elapsed_time
+
+ # @app.route("/", methods=["GET", "POST"])
+ # def index():
+ #     if request.method == "POST":
+ #         sequence = request.form["sequence"].strip()
+ #         if not sequence:
+ #             return render_template("index.html", message="Please enter a valid sequence.")
+
+ #         file_path, result = generate_smiles(sequence)
+ #         if file_path is None:
+ #             return render_template("index.html", message=f"Error: {result}")
+
+ #         return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
+
+ #     return render_template("index.html")
+
+ # @app.route("/download")
+ # def download_file():
+ #     file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
+ #     return send_file(file_path, as_attachment=True)
+
+ # if __name__ == "__main__":
+ #     app.run(host="0.0.0.0", port=8000, debug=True)
+
+
+
+ # import os
+ # import time
+ # import gdown
+ # import numpy as np
+ # from flask import Flask, render_template, request, send_file
+ # from rdkit import Chem
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
+
+ # # DIRECTORIES
+ # bio_model_dir = "/app/modelsBioembed"
+ # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
+ # upload_folder = "Samples"
+
+ # # Create directories if they don't exist
+ # os.makedirs(bio_model_dir, exist_ok=True)
+ # os.makedirs(cvn_model_dir, exist_ok=True)
+ # os.makedirs(upload_folder, exist_ok=True)
+
+ # # Google Drive file IDs for the model files
+ # MODEL_FILES = {
+ #     "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
+ #     "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
+ #     "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
+ #     "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
+ #     "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
+ # }
+
+ # # Function to download missing files from Google Drive
+ # def download_model_files():
+ #     for filename, file_id in MODEL_FILES.items():
+ #         file_path = os.path.join(bio_model_dir, filename)
+ #         if not os.path.exists(file_path):
+ #             print(f"Downloading {filename} from Google Drive...")
+ #             gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
870
+
871
+ # # Download required model files
872
+ # download_model_files()
873
+ # print("All model files are ready!")
874
+
875
+ # # Load the ProtTrans-BERT-BFD Model
876
+ # try:
877
+ # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
878
+ # print("ProtTrans-BERT-BFD model loaded successfully!")
879
+ # except Exception as e:
880
+ # print(f"Error loading model: {e}")
881
+ # embedder = None
882
+
883
+ # # Function to generate protein embeddings
884
+ # def generate_bio_embeddings(sequence):
885
+ # if embedder is None:
886
+ # return None
887
+ # try:
888
+ # embedding_protein = embedder.embed(sequence)
889
+ # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
890
+ # return np.array(embedding_per_protein).reshape(1, -1)
891
+ # except Exception as e:
892
+ # print(f"Embedding Error: {e}")
893
+ # return None
894
+
895
+ # # Function to generate SMILES from a protein sequence
896
+ # def generate_smiles(sequence, n_samples=100):
897
+ # start_time = time.time()
898
+
899
+ # protein_embedding = generate_bio_embeddings(sequence)
900
+ # if protein_embedding is None:
901
+ # return None, "Embedding generation failed!"
902
+
903
+ # # Load the trained CVanilla_RNN_Builder model
904
+ # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
905
+
906
+ # # Generate molecular graphs
907
+ # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
908
+ # valid_samples = [sample for sample in samples if sample is not None]
909
+
910
+ # # Convert to SMILES format
911
+ # smiles_list = [
912
+ # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
913
+ # ]
914
+
915
+ # if not smiles_list:
916
+ # return None, "No valid SMILES generated!"
917
+
918
+ # # Save SMILES to a file
919
+ # filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
920
+ # with open(filename, "w") as file:
921
+ # file.write("\n".join(smiles_list))
922
+
923
+ # elapsed_time = time.time() - start_time
924
+ # return filename, elapsed_time
925
+
926
+ # # Initialize Flask App
927
+ # app = Flask(__name__)
928
+
929
+ # @app.route("/", methods=["GET", "POST"])
930
+ # def index():
931
+ # if request.method == "POST":
932
+ # sequence = request.form["sequence"].strip()
933
+ # if not sequence:
934
+ # return render_template("index.html", message="Please enter a valid sequence.")
935
+
936
+ # file_path, result = generate_smiles(sequence)
937
+ # if file_path is None:
938
+ # return render_template("index.html", message=f"Error: {result}")
939
+
940
+ # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
941
+
942
+ # return render_template("index.html")
943
+
944
+ # @app.route("/download")
945
+ # def download_file():
946
+ # file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
947
+ # return send_file(file_path, as_attachment=True)
948
+
949
+ # if __name__ == "__main__":
950
+ # app.run(host="0.0.0.0", port=8000)
951
+
952
+
953
+
954
+ # import os
955
+ # import time
956
+ # import requests
957
+ # from flask import Flask, render_template, request, send_file
958
+ # from rdkit import Chem
959
+ # from bio_embeddings.embed import ProtTransBertBFDEmbedder
960
+ # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
961
+
962
+ # # DIRECTORIES
963
+ # bio_model_dir = "/app/modelsBioembed"
964
+ # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
965
+ # upload_folder = "Samples"
966
+
967
+ # # Create directories if they don't exist
968
+ # os.makedirs(bio_model_dir, exist_ok=True)
969
+ # os.makedirs(cvn_model_dir, exist_ok=True)
970
+ # os.makedirs(upload_folder, exist_ok=True)
971
+
972
+ # # Google Drive file IDs for the model files
973
+ # MODEL_FILES = {
974
+ # "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
975
+ # "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
976
+ # "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
977
+ # "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
978
+ # "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
979
+ # }
980
+
981
+ # # Function to download a file from Google Drive
982
+ # def download_file_from_google_drive(file_id, destination):
983
+ # URL = f"https://drive.google.com/uc?export=download&id={file_id}"
984
+ # session = requests.Session()
985
+ # response = session.get(URL, stream=True)
986
+
987
+ # # Check if the request was successful
988
+ # if response.status_code == 200:
989
+ # with open(destination, "wb") as f:
990
+ # for chunk in response.iter_content(chunk_size=128):
991
+ # f.write(chunk)
992
+ # print(f"Downloaded {destination}")
993
+ # else:
994
+ # print(f"Failed to download {destination}")
995
+
996
+ # # Function to download missing files from Google Drive
997
+ # def download_model_files():
998
+ # for filename, file_id in MODEL_FILES.items():
999
+ # file_path = os.path.join(bio_model_dir, filename)
1000
+ # if not os.path.exists(file_path):
1001
+ # print(f"Downloading {filename} from Google Drive...")
1002
+ # download_file_from_google_drive(file_id, file_path)
1003
+
1004
+ # # Download required model files
1005
+ # download_model_files()
1006
+ # print("All model files are ready!")
1007
+
1008
+ # # Load the ProtTrans-BERT-BFD Model
1009
+ # try:
1010
+ # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
1011
+ # print("ProtTrans-BERT-BFD model loaded successfully!")
1012
+ # except Exception as e:
1013
+ # print(f"Error loading model: {e}")
1014
+ # embedder = None
1015
+
1016
+ # # Function to generate protein embeddings
1017
+ # def generate_bio_embeddings(sequence):
1018
+ # if embedder is None:
1019
+ # return None
1020
+ # try:
1021
+ # embedding_protein = embedder.embed(sequence)
1022
+ # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
1023
+ # return np.array(embedding_per_protein).reshape(1, -1)
1024
+ # except Exception as e:
1025
+ # print(f"Embedding Error: {e}")
1026
+ # return None
1027
+
1028
+ # # Function to generate SMILES from a protein sequence
1029
+ # def generate_smiles(sequence, n_samples=100):
1030
+ # start_time = time.time()
1031
+
1032
+ # protein_embedding = generate_bio_embeddings(sequence)
1033
+ # if protein_embedding is None:
1034
+ # return None, "Embedding generation failed!"
1035
+
1036
+ # # Load the trained CVanilla_RNN_Builder model
1037
+ # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
1038
+
1039
+ # # Generate molecular graphs
1040
+ # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
1041
+ # valid_samples = [sample for sample in samples if sample is not None]
1042
+
1043
+ # # Convert to SMILES format
1044
+ # smiles_list = [
1045
+ # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
1046
+ # ]
1047
+
1048
+ # if not smiles_list:
1049
+ # return None, "No valid SMILES generated!"
1050
+
1051
+ # # Save SMILES to a file
1052
+ # filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1053
+ # with open(filename, "w") as file:
1054
+ # file.write("\n".join(smiles_list))
1055
+
1056
+ # elapsed_time = time.time() - start_time
1057
+ # return filename, elapsed_time
1058
+
1059
+ # # Initialize Flask App
1060
+ # app = Flask(__name__)
1061
+
1062
+ # @app.route("/", methods=["GET", "POST"])
1063
+ # def index():
1064
+ # if request.method == "POST":
1065
+ # sequence = request.form["sequence"].strip()
1066
+ # if not sequence:
1067
+ # return render_template("index.html", message="Please enter a valid sequence.")
1068
+
1069
+ # file_path, result = generate_smiles(sequence)
1070
+ # if file_path is None:
1071
+ # return render_template("index.html", message=f"Error: {result}")
1072
+
1073
+ # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
1074
+
1075
+ # return render_template("index.html")
1076
+
1077
+ # @app.route("/download")
1078
+ # def download_file():
1079
+ # file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1080
+ # return send_file(file_path, as_attachment=True)
1081
+
1082
+ # if __name__ == "__main__":
1083
+ # app.run(host="0.0.0.0", port=8000)
1084
+
modelsBioembed/.gitattributes ADDED
@@ -0,0 +1,8 @@
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
modelsBioembed/README.md ADDED
@@ -0,0 +1,141 @@
1
+ ---
2
+ language: protein
3
+ tags:
4
+ - protein language model
5
+ datasets:
6
+ - BFD
7
+ ---
8
+
9
+ # ProtBert-BFD model
10
+
11
+ Pretrained model on protein sequences using a masked language modeling (MLM) objective. It was introduced in
12
+ [this paper](https://doi.org/10.1101/2020.07.12.199554) and first released in
13
+ [this repository](https://github.com/agemagician/ProtTrans). This model is trained on uppercase amino acids: it only works with capital letter amino acids.
14
+
15
+
16
+ ## Model description
17
+
18
+ ProtBert-BFD is based on the BERT model and was pretrained on a large corpus of protein sequences in a self-supervised fashion.
19
+ This means it was pretrained on the raw protein sequences only, with no humans labelling them in any way (which is why it can use lots of
20
+ publicly available data), with an automatic process to generate inputs and labels from those protein sequences.
21
+
22
+ One important difference between our BERT model and the original BERT version is the way of dealing with sequences as separate documents.
23
+ This means that next-sentence prediction is not used, as each sequence is treated as a complete document.
24
+ The masking follows the original BERT training and randomly masks 15% of the amino acids in the input.
25
+
26
+ In the end, the features extracted from this model revealed that the LM-embeddings from unlabeled data (only protein sequences) captured important biophysical properties governing protein
27
+ shape.
28
+ This implied learning some of the grammar of the language of life realized in protein sequences.
29
+
30
+ ## Intended uses & limitations
31
+
32
+ The model can be used for protein feature extraction or fine-tuned on downstream tasks.
33
+ We have noticed that for some tasks you can gain more accuracy by fine-tuning the model rather than using it as a feature extractor.
34
+
35
+ ### How to use
36
+
37
+ You can use this model directly with a pipeline for masked language modeling:
38
+
39
+ ```python
40
+ >>> from transformers import BertForMaskedLM, BertTokenizer, pipeline
41
+ >>> tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
42
+ >>> model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert_bfd")
43
+ >>> unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)
44
+ >>> unmasker('D L I P T S S K L V V [MASK] D T S L Q V K K A F F A L V T')
45
+
46
+ [{'score': 0.1165614128112793,
47
+ 'sequence': '[CLS] D L I P T S S K L V V L D T S L Q V K K A F F A L V T [SEP]',
48
+ 'token': 5,
49
+ 'token_str': 'L'},
50
+ {'score': 0.08976086974143982,
51
+ 'sequence': '[CLS] D L I P T S S K L V V V D T S L Q V K K A F F A L V T [SEP]',
52
+ 'token': 8,
53
+ 'token_str': 'V'},
54
+ {'score': 0.08864385634660721,
55
+ 'sequence': '[CLS] D L I P T S S K L V V S D T S L Q V K K A F F A L V T [SEP]',
56
+ 'token': 10,
57
+ 'token_str': 'S'},
58
+ {'score': 0.06227643042802811,
59
+ 'sequence': '[CLS] D L I P T S S K L V V A D T S L Q V K K A F F A L V T [SEP]',
60
+ 'token': 6,
61
+ 'token_str': 'A'},
62
+ {'score': 0.06194969266653061,
63
+ 'sequence': '[CLS] D L I P T S S K L V V T D T S L Q V K K A F F A L V T [SEP]',
64
+ 'token': 15,
65
+ 'token_str': 'T'}]
66
+ ```
67
+
68
+ Here is how to use this model to get the features of a given protein sequence in PyTorch:
69
+
70
+ ```python
71
+ from transformers import BertModel, BertTokenizer
72
+ import re
73
+ tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
74
+ model = BertModel.from_pretrained("Rostlab/prot_bert_bfd")
75
+ sequence_Example = "A E T C Z A O"
76
+ sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
77
+ encoded_input = tokenizer(sequence_Example, return_tensors='pt')
78
+ output = model(**encoded_input)
79
+ ```
80
+
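+ In this repository the same weights are also loaded through the `bio_embeddings` wrapper used by the app code shown earlier in this commit; a minimal sketch, assuming the files in this folder sit in `modelsBioembed/` (the toy sequence is just an example):
+
+ ```python
+ from bio_embeddings.embed import ProtTransBertBFDEmbedder
+
+ embedder = ProtTransBertBFDEmbedder(model_directory="modelsBioembed")
+ per_residue = embedder.embed("MKTAYIAKQR")              # one vector per residue
+ per_protein = embedder.reduce_per_protein(per_residue)  # one 1024-d vector (hidden_size in config.json)
+ ```
+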
81
+ ## Training data
82
+
83
+ The ProtBert-BFD model was pretrained on [BFD](https://bfd.mmseqs.com/), a dataset consisting of 2.1 billion protein sequences.
84
+
85
+ ## Training procedure
86
+
87
+ ### Preprocessing
88
+
89
+ The protein sequences are uppercased and tokenized using a single space and a vocabulary size of 21.
90
+ The inputs of the model are then of the form:
91
+
92
+ ```
93
+ [CLS] Protein Sequence A [SEP] Protein Sequence B [SEP]
94
+ ```
95
+
96
+ Furthermore, each protein sequence was treated as a separate document.
97
+ The preprocessing step was performed twice, once for a combined length (2 sequences) of less than 512 amino acids, and another time using a combined length (2 sequences) of less than 2048 amino acids.
98
+
99
+ The details of the masking procedure for each sequence followed the original BERT model, as follows (an illustrative sketch appears after the list):
100
+ - 15% of the amino acids are masked.
101
+ - In 80% of the cases, the masked amino acids are replaced by `[MASK]`.
102
+ - In 10% of the cases, the masked amino acids are replaced by a random amino acid, different from the one they replace.
103
+ - In the 10% remaining cases, the masked amino acids are left as is.
104
+
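+ A minimal sketch of this 80/10/10 scheme in plain Python, using the residue letters from `vocab.txt` (illustrative only, not the exact training code):
+
+ ```python
+ import random
+
+ AMINO_ACIDS = list("LAGVESIKRDTPNQFYMHCWXUBZO")  # the 25 residue tokens in vocab.txt
+
+ def mask_tokens(tokens, mask_rate=0.15):
+     out = list(tokens)
+     for i, t in enumerate(out):
+         if random.random() < mask_rate:
+             r = random.random()
+             if r < 0.8:    # 80% of masked positions become [MASK]
+                 out[i] = "[MASK]"
+             elif r < 0.9:  # 10% become a different random residue
+                 out[i] = random.choice([a for a in AMINO_ACIDS if a != t])
+             # remaining 10%: left as is
+     return out
+ ```
+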
105
+ ### Pretraining
106
+
107
+ The model was trained on a single TPU Pod V3-1024 for one million steps in total.
108
+ 800k steps used sequence length 512 (batch size 32k), and 200k steps used sequence length 2048 (batch size 6k).
109
+ The optimizer used was Lamb with a learning rate of 0.002, a weight decay of 0.01, learning-rate warmup for 140k steps, and linear decay of the learning rate afterwards.
110
+
111
+ ## Evaluation results
112
+
113
+ When fine-tuned on downstream tasks, this model achieves the following results:
114
+
115
+ Test results:
116
+
117
+ | Task/Dataset | secondary structure (3-states) | secondary structure (8-states) | Localization | Membrane |
118
+ |:-----:|:-----:|:-----:|:-----:|:-----:|
119
+ | CASP12 | 76 | 65 | | |
120
+ | TS115 | 84 | 73 | | |
121
+ | CB513 | 83 | 70 | | |
122
+ | DeepLoc | | | 78 | 91 |
123
+
124
+ ### BibTeX entry and citation info
125
+
126
+ ```bibtex
127
+ @article {Elnaggar2020.07.12.199554,
128
+ author = {Elnaggar, Ahmed and Heinzinger, Michael and Dallago, Christian and Rehawi, Ghalia and Wang, Yu and Jones, Llion and Gibbs, Tom and Feher, Tamas and Angerer, Christoph and Steinegger, Martin and BHOWMIK, DEBSINDHU and Rost, Burkhard},
129
+ title = {ProtTrans: Towards Cracking the Language of Life{\textquoteright}s Code Through Self-Supervised Deep Learning and High Performance Computing},
130
+ elocation-id = {2020.07.12.199554},
131
+ year = {2020},
132
+ doi = {10.1101/2020.07.12.199554},
133
+ publisher = {Cold Spring Harbor Laboratory},
134
+ abstract = {Computational biology and bioinformatics provide vast data gold-mines from protein sequences, ideal for Language Models (LMs) taken from Natural Language Processing (NLP). These LMs reach for new prediction frontiers at low inference costs. Here, we trained two auto-regressive language models (Transformer-XL, XLNet) and two auto-encoder models (Bert, Albert) on data from UniRef and BFD containing up to 393 billion amino acids (words) from 2.1 billion protein sequences (22- and 112 times the entire English Wikipedia). The LMs were trained on the Summit supercomputer at Oak Ridge National Laboratory (ORNL), using 936 nodes (total 5616 GPUs) and one TPU Pod (V3-512 or V3-1024). We validated the advantage of up-scaling LMs to larger models supported by bigger data by predicting secondary structure (3-states: Q3=76-84, 8 states: Q8=65-73), sub-cellular localization for 10 cellular compartments (Q10=74) and whether a protein is membrane-bound or water-soluble (Q2=89). Dimensionality reduction revealed that the LM-embeddings from unlabeled data (only protein sequences) captured important biophysical properties governing protein shape. This implied learning some of the grammar of the language of life realized in protein sequences. The successful up-scaling of protein LMs through HPC to larger data sets slightly reduced the gap between models trained on evolutionary information and LMs. Availability ProtTrans: \<a href="https://github.com/agemagician/ProtTrans"\>https://github.com/agemagician/ProtTrans\</a\>Competing Interest StatementThe authors have declared no competing interest.},
135
+ URL = {https://www.biorxiv.org/content/early/2020/07/21/2020.07.12.199554},
136
+ eprint = {https://www.biorxiv.org/content/early/2020/07/21/2020.07.12.199554.full.pdf},
137
+ journal = {bioRxiv}
138
+ }
139
+ ```
140
+
141
+ > Created by [Ahmed Elnaggar/@Elnaggar_AI](https://twitter.com/Elnaggar_AI) | [LinkedIn](https://www.linkedin.com/in/prof-ahmed-elnaggar/)
modelsBioembed/config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.0,
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 4096,
11
+ "max_position_embeddings": 40000,
12
+ "num_attention_heads": 16,
13
+ "num_hidden_layers": 30,
14
+ "type_vocab_size": 2,
15
+ "vocab_size": 30
16
+ }
modelsBioembed/special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
modelsBioembed/tokenizer_config.json ADDED
@@ -0,0 +1 @@
1
+ {"do_lower_case": false, "special_tokens_map_file": null, "full_tokenizer_file": null}
modelsBioembed/vocab.txt ADDED
@@ -0,0 +1,30 @@
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ L
7
+ A
8
+ G
9
+ V
10
+ E
11
+ S
12
+ I
13
+ K
14
+ R
15
+ D
16
+ T
17
+ P
18
+ N
19
+ Q
20
+ F
21
+ Y
22
+ M
23
+ H
24
+ C
25
+ W
26
+ X
27
+ U
28
+ B
29
+ Z
30
+ O
models_folder/atom_types.txt ADDED
@@ -0,0 +1,67 @@
1
+ P,0,0
2
+ F,0,0
3
+ O,0,1
4
+ Sn,0,0
5
+ K,0,0
6
+ S,0,2
7
+ Fe,-2,0
8
+ Cl,3,0
9
+ Zn,2,0
10
+ S,-1,0
11
+ O,-1,0
12
+ N,1,1
13
+ Cl,1,0
14
+ C,-1,0
15
+ N,1,2
16
+ Na,0,0
17
+ V,0,0
18
+ C,0,1
19
+ N,1,0
20
+ N,-1,1
21
+ Li,0,0
22
+ N,1,3
23
+ I,1,0
24
+ B,-1,0
25
+ N,0,0
26
+ Re,0,0
27
+ O,1,0
28
+ S,1,1
29
+ N,0,1
30
+ B,0,0
31
+ N,-2,0
32
+ N,0,3
33
+ F,0,1
34
+ O,0,0
35
+ S,0,0
36
+ O,0,2
37
+ Br,0,1
38
+ Ca,2,0
39
+ Se,0,0
40
+ Ru,0,0
41
+ P,0,1
42
+ B,-1,3
43
+ Fe,0,0
44
+ S,-1,1
45
+ C,1,0
46
+ C,0,0
47
+ Si,0,0
48
+ O,1,1
49
+ I,0,0
50
+ Hg,0,0
51
+ C,0,3
52
+ C,-1,1
53
+ Br,0,0
54
+ H,0,0
55
+ Na,1,0
56
+ I,0,1
57
+ Sb,0,0
58
+ Te,0,0
59
+ C,0,2
60
+ S,0,1
61
+ P,0,2
62
+ P,-1,0
63
+ Cl,0,0
64
+ P,1,0
65
+ C,-1,2
66
+ S,1,0
67
+ N,-1,0
models_folder/configs.json ADDED
@@ -0,0 +1 @@
1
+ {"N_C": 1024, "F_e": 16, "F_h": [32, 64, 128, 128, 256, 256], "F_skip": 256, "F_c": [512], "Fh_policy": 128, "activation": "relu", "rename": false, "N_rnn": 3}
modelstrc.py ADDED
@@ -0,0 +1,1331 @@
1
+ import rdkit
2
+ from rdkit import Chem
3
+ from rdkit.Chem import Draw
4
+ from rdkit import DataStructs
5
+ from rdkit.Chem import AllChem
6
+ from rdkit.Chem import rdmolfiles
7
+ from rdkit.Chem.Draw import IPythonConsole
8
+ from molvs import standardize_smiles
9
+ import os
10
+ import gc
11
+ import sys
12
+ import time
13
+ import json
14
+ import math
15
+ import random
16
+ import argparse
17
+ import itertools
18
+ import numpy as np
19
+ import mxnet as mx
20
+ import pandas as pd
21
+ import networkx as nx
22
+ from scipy import sparse
23
+ from mxnet.gluon import nn
24
+ from collections import Counter
25
+ from mxnet.autograd import Function
26
+ from mxnet.gluon.data import Dataset
27
+ from mxnet import gluon, autograd, nd
28
+ from mxnet.gluon.data import DataLoader
29
+ from abc import ABCMeta, abstractmethod
30
+ from mxnet.gluon.data.sampler import Sampler
31
+
32
+ class MoleculeSpec(object):
33
+
34
+ def __init__(self, file_name='models_folder/atom_types.txt'):
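+ # Each line of models_folder/atom_types.txt encodes one atom type as "symbol,formal_charge,num_explicit_Hs" (e.g. "N,1,2").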
35
+ self.atom_types = []
36
+ self.atom_symbols = []
37
+ with open(file_name) as f:
38
+ for line in f:
39
+ atom_type_i = line.strip('\n').split(',')
40
+ self.atom_types.append((atom_type_i[0], int(atom_type_i[1]), int(atom_type_i[2])))
41
+ if atom_type_i[0] not in self.atom_symbols:
42
+ self.atom_symbols.append(atom_type_i[0])
43
+ self.bond_orders = [Chem.BondType.AROMATIC,
44
+ Chem.BondType.SINGLE,
45
+ Chem.BondType.DOUBLE,
46
+ Chem.BondType.TRIPLE]
47
+ self.max_iter = 120
48
+
49
+ def get_atom_type(self, atom):
50
+ atom_symbol = atom.GetSymbol()
51
+ atom_charge = atom.GetFormalCharge()
52
+ atom_hs = atom.GetNumExplicitHs()
53
+ return self.atom_types.index((atom_symbol, atom_charge, atom_hs))
54
+
55
+ def get_bond_type(self, bond):
56
+ return self.bond_orders.index(bond.GetBondType())
57
+
58
+ def index_to_atom(self, idx):
59
+ atom_symbol, atom_charge, atom_hs = self.atom_types[idx]
60
+ a = Chem.Atom(atom_symbol)
61
+ a.SetFormalCharge(atom_charge)
62
+ a.SetNumExplicitHs(atom_hs)
63
+ return a
64
+
65
+ def index_to_bond(self, mol, begin_id, end_id, idx):
66
+ mol.AddBond(begin_id, end_id, self.bond_orders[idx])
67
+
68
+ @property
69
+ def num_atom_types(self):
70
+ return len(self.atom_types)
71
+
72
+ @property
73
+ def num_bond_types(self):
74
+ return len(self.bond_orders)
75
+
76
+ _mol_spec = None
77
+
78
+ def get_mol_spec():
79
+ global _mol_spec
80
+ if _mol_spec is None:
81
+ _mol_spec = MoleculeSpec()
82
+ return _mol_spec
83
+
84
+
85
+ def get_graph_from_smiles(smiles):
86
+ mol = Chem.MolFromSmiles(smiles)
87
+
88
+ # build graph
89
+ atom_types, atom_ranks, bonds, bond_types = [], [], [], []
90
+ for a, r in zip(mol.GetAtoms(), Chem.CanonicalRankAtoms(mol)):
91
+ atom_types.append(get_mol_spec().get_atom_type(a))
92
+ atom_ranks.append(r)
93
+ for b in mol.GetBonds():
94
+ idx_1, idx_2, bt = b.GetBeginAtomIdx(), b.GetEndAtomIdx(), get_mol_spec().get_bond_type(b)
95
+ bonds.append([idx_1, idx_2])
96
+ bond_types.append(bt)
97
+
98
+ # build nx graph
99
+ graph = nx.Graph()
100
+ graph.add_nodes_from(range(len(atom_types)))
101
+ graph.add_edges_from(bonds)
102
+
103
+ return graph, atom_types, atom_ranks, bonds, bond_types
104
+
105
+
106
+ def get_graph_from_smiles_list(smiles_list):
107
+ graph_list = []
108
+ for smiles in smiles_list:
109
+ mol = Chem.MolFromSmiles(smiles)
110
+
111
+ # build graph
112
+ atom_types, bonds, bond_types = [], [], []
113
+ for a in mol.GetAtoms():
114
+ atom_types.append(get_mol_spec().get_atom_type(a))
115
+ for b in mol.GetBonds():
116
+ idx_1, idx_2, bt = b.GetBeginAtomIdx(), b.GetEndAtomIdx(), get_mol_spec().get_bond_type(b)
117
+ bonds.append([idx_1, idx_2])
118
+ bond_types.append(bt)
119
+
120
+ X_0 = np.array(atom_types, dtype=np.int64)
121
+ A_0 = np.concatenate([np.array(bonds, dtype=np.int64),
122
+ np.array(bond_types, dtype=np.int64)[:, np.newaxis]],
123
+ axis=1)
124
+ graph_list.append([X_0, A_0])
125
+ return graph_list
126
+
127
+
128
+ def traverse_graph(graph, atom_ranks, current_node=None, step_ids=None, p=0.9, log_p=0.0):
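+ # Assign a generation step id to every atom by a randomized depth-first traversal:
+ # with probability p the lowest-canonical-rank unvisited candidate is taken, otherwise
+ # another candidate is chosen at random; returns the step ids and the log-probability
+ # of the sampled ordering.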
129
+ if current_node is None:
130
+ next_nodes = range(len(atom_ranks))
131
+ step_ids = [-1, ] * len(next_nodes)
132
+ next_node_ranks = atom_ranks
133
+ else:
134
+ next_nodes = graph.neighbors(current_node) # get neighbor nodes
135
+ next_nodes = [n for n in next_nodes if step_ids[n] < 0] # filter visited nodes
136
+ next_node_ranks = [atom_ranks[n] for n in next_nodes] # get ranks for neighbors
137
+ next_nodes = [n for n, r in sorted(zip(next_nodes, next_node_ranks), key=lambda _x:_x[1])] # sort by rank
138
+
139
+ # iterate through neighbors
140
+ while len(next_nodes) > 0:
141
+ if len(next_nodes)==1:
142
+ next_node = next_nodes[0]
143
+ elif random.random() >= (1 - p):
144
+ next_node = next_nodes[0]
145
+ log_p += np.log(p)
146
+ else:
147
+ next_node = next_nodes[random.randint(1, len(next_nodes) - 1)]
148
+ log_p += np.log((1.0 - p) / (len(next_nodes) - 1))
149
+ step_ids[next_node] = max(step_ids) + 1
150
+ _, log_p = traverse_graph(graph, atom_ranks, next_node, step_ids, p, log_p)
151
+ next_nodes = [n for n in next_nodes if step_ids[n] < 0] # filter visited nodes
152
+
153
+ return step_ids, log_p
154
+
155
+
156
+ def single_reorder(X_0, A_0, step_ids):
157
+ X_0, A_0 = np.copy(X_0), np.copy(A_0)
158
+
159
+ step_ids = np.array(step_ids, dtype=np.int64)
160
+
161
+ # sort by step_ids
162
+ sorted_ids = np.argsort(step_ids)
163
+ X_0 = X_0[sorted_ids]
164
+ A_0[:, 0], A_0[:, 1] = step_ids[A_0[:, 0]], step_ids[A_0[:, 1]]
165
+ max_b, min_b = np.amax(A_0[:, :2], axis=1), np.amin(A_0[:, :2], axis=1)
166
+ A_0 = A_0[np.lexsort([-min_b, max_b]), :]
167
+
168
+ # separate append and connect
169
+ max_b, min_b = np.amax(A_0[:, :2], axis=1), np.amin(A_0[:, :2], axis=1)
170
+ is_append = np.concatenate([np.array([True]), max_b[1:] > max_b[:-1]])
171
+ A_0 = np.concatenate([np.where(is_append[:, np.newaxis],
172
+ np.stack([min_b, max_b], axis=1),
173
+ np.stack([max_b, min_b], axis=1)),
174
+ A_0[:, -1:]], axis=1)
175
+
176
+ return X_0, A_0
177
+
178
+
179
+ def single_expand(X_0, A_0):
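+ # Unroll a reordered graph into per-step decoder inputs: the expanded node/edge tensors
+ # plus one (action_type, atom_type, bond_type, append_pos, connect_pos) action per step.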
180
+ X_0, A_0 = np.copy(X_0), np.copy(A_0)
181
+
182
+ # expand X
183
+ is_append_iter = np.less(A_0[:, 0], A_0[:, 1]).astype(np.int64)
184
+ NX = np.cumsum(np.pad(is_append_iter, [[1, 0]], mode='constant', constant_values=1))
185
+ shift = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')[:-1])
186
+ X_index = np.arange(NX.sum(), dtype=np.int64) - np.repeat(shift, NX)
187
+ X = X_0[X_index]
188
+
189
+ # expand A
190
+ _, A_index = np.tril_indices(A_0.shape[0])
191
+ A = A_0[A_index, :]
192
+ NA = np.arange(A_0.shape[0] + 1)
193
+
194
+ # get action
195
+ # action_type, atom_type, bond_type, append_pos, connect_pos
196
+ action_type = 1 - is_append_iter
197
+ atom_type = np.where(action_type == 0, X_0[A_0[:, 1]], 0)
198
+ bond_type = A_0[:, 2]
199
+ append_pos = np.where(action_type == 0, A_0[:, 0], 0)
200
+ connect_pos = np.where(action_type == 1, A_0[:, 1], 0)
201
+ actions = np.stack([action_type, atom_type, bond_type, append_pos, connect_pos],
202
+ axis=1)
203
+ last_action = [[2, 0, 0, 0, 0]]
204
+ actions = np.append(actions, last_action, axis=0)
205
+
206
+ action_0 = np.array([X_0[0]], dtype=np.int64)
207
+
208
+ # }}}
209
+
210
+ # {{{ Get mask
211
+ last_atom_index = shift + NX - 1
212
+ last_atom_mask = np.zeros_like(X)
213
+ last_atom_mask[last_atom_index] = np.where(
214
+ np.pad(is_append_iter, [[1, 0]], mode='constant', constant_values=1) == 1,
215
+ np.ones_like(last_atom_index),
216
+ np.ones_like(last_atom_index) * 2)
217
+ # }}}
218
+
219
+ return action_0, X, NX, A, NA, actions, last_atom_mask
220
+
221
+
222
+ def get_d(A, X):
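+ # Index pairs of atoms at graph distance 2 (D_2) and 3 (D_3), computed from powers of
+ # the sparse adjacency matrix; used as additional edge sets next to the per-bond-type
+ # adjacencies built in merge_single_0.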
223
+ _to_sparse = lambda _A, _X: sparse.coo_matrix((np.ones([_A.shape[0] * 2], dtype=np.int64),
224
+ (np.concatenate([_A[:, 0], _A[:, 1]], axis=0),
225
+ np.concatenate([_A[:, 1], _A[:, 0]], axis=0))),
226
+ shape=[_X.shape[0], ] * 2)
227
+ A_sparse = _to_sparse(A, X)
228
+
229
+ d2 = A_sparse * A_sparse
230
+ d3 = d2 * A_sparse
231
+
232
+ # get D_2
233
+ D_2 = np.stack(d2.nonzero(), axis=1)
234
+ D_2 = D_2[D_2[:, 0] < D_2[:, 1], :]
235
+
236
+ # get D_3
237
+ D_3 = np.stack(d3.nonzero(), axis=1)
238
+ D_3 = D_3[D_3[:, 0] < D_3[:, 1], :]
239
+
240
+ # remove D_1 elements from D_3
241
+ D_3_sparse = _to_sparse(D_3, X)
242
+ D_3_sparse = D_3_sparse - D_3_sparse.multiply(A_sparse)
243
+ D_3 = np.stack(D_3_sparse.nonzero(), axis=1)
244
+ D_3 = D_3[D_3[:, 0] < D_3[:, 1], :]
245
+
246
+ return D_2, D_3
247
+
248
+
249
+ def merge_single_0(X_0, A_0, NX_0, NA_0):
250
+ # shift_ids
251
+ cumsum = np.cumsum(np.pad(NX_0, [[1, 0]], mode='constant')[:-1])
252
+ A_0[:, :2] += np.stack([np.repeat(cumsum, NA_0), ] * 2, axis=1)
253
+
254
+ # get D
255
+ D_0_2, D_0_3 = get_d(A_0, X_0)
256
+
257
+ # split A
258
+ A_split = []
259
+ for i in range(get_mol_spec().num_bond_types):
260
+ A_i = A_0[A_0[:, 2] == i, :2]
261
+ A_split.append(A_i)
262
+ A_split.extend([D_0_2, D_0_3])
263
+ A_0 = A_split
264
+
265
+ # NX_rep
266
+ NX_rep_0 = np.repeat(np.arange(NX_0.shape[0]), NX_0)
267
+
268
+ return X_0, A_0, NX_0, NX_rep_0
269
+
270
+
271
+ def merge_single(X, A,
272
+ NX, NA,
273
+ mol_ids, rep_ids, iw_ids,
274
+ action_0, actions,
275
+ last_append_mask,
276
+ log_p):
277
+ X, A, NX, NX_rep = merge_single_0(X, A, NX, NA)
278
+ cumsum = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')[:-1])
279
+ actions[:, -2] += cumsum * (actions[:, 0] == 0)
280
+ actions[:, -1] += cumsum * (actions[:, 0] == 1)
281
+ mol_ids_rep = np.repeat(mol_ids, NX)
282
+ rep_ids_rep = np.repeat(rep_ids, NX)
283
+
284
+ return X, A,\
285
+ mol_ids_rep, rep_ids_rep, iw_ids,\
286
+ last_append_mask,\
287
+ NX, NX_rep,\
288
+ action_0, actions, \
289
+ log_p
290
+
291
+ def process_single(smiles, k, p):
292
+ graph, atom_types, atom_ranks, bonds, bond_types = get_graph_from_smiles(smiles)
293
+
294
+ # original
295
+ X_0 = np.array(atom_types, dtype=np.int64)
296
+ A_0 = np.concatenate([np.array(bonds, dtype=np.int64),
297
+ np.array(bond_types, dtype=np.int64)[:, np.newaxis]],
298
+ axis=1)
299
+
300
+ X, A = [], []
301
+ NX, NA = [], []
302
+ mol_ids, rep_ids, iw_ids = [], [], []
303
+ action_0, actions = [], []
304
+ last_append_mask = []
305
+ log_p = []
306
+
307
+ # random sampling decoding route
308
+ for i in range(k):
309
+ step_ids_i, log_p_i = traverse_graph(graph, atom_ranks, p=p)
310
+ X_i, A_i = single_reorder(X_0, A_0, step_ids_i)
311
+ action_0_i, X_i, NX_i, A_i, NA_i, actions_i, last_atom_mask_i = single_expand(X_i, A_i)
312
+
313
+ # appends
314
+ X.append(X_i)
315
+ A.append(A_i)
316
+ NX.append(NX_i)
317
+ NA.append(NA_i)
318
+ action_0.append(action_0_i)
319
+ actions.append(actions_i)
320
+ last_append_mask.append(last_atom_mask_i)
321
+
322
+ mol_ids.append(np.zeros_like(NX_i, dtype=np.int64))
323
+ rep_ids.append(np.ones_like(NX_i, dtype=np.int64) * i)
324
+ iw_ids.append(np.ones_like(NX_i, dtype=np.int64) * i)
325
+
326
+ log_p.append(log_p_i)
327
+
328
+ # concatenate
329
+ X = np.concatenate(X, axis=0)
330
+ A = np.concatenate(A, axis = 0)
331
+ NX = np.concatenate(NX, axis = 0)
332
+ NA = np.concatenate(NA, axis = 0)
333
+ action_0 = np.concatenate(action_0, axis = 0)
334
+ actions = np.concatenate(actions, axis = 0)
335
+ last_append_mask = np.concatenate(last_append_mask, axis = 0)
336
+ mol_ids = np.concatenate(mol_ids, axis = 0)
337
+ rep_ids = np.concatenate(rep_ids, axis = 0)
338
+ iw_ids = np.concatenate(iw_ids, axis = 0)
339
+ log_p = np.array(log_p, dtype=np.float32)
340
+
341
+ return X, A, NX, NA, mol_ids, rep_ids, iw_ids, action_0, actions, last_append_mask, log_p
342
+
343
+
344
+ # noinspection PyArgumentList
345
+ def get_mol_from_graph(X, A, sanitize=True):
346
+ try:
347
+ mol = Chem.RWMol(Chem.Mol())
348
+
349
+ X, A = X.tolist(), A.tolist()
350
+ for i, atom_type in enumerate(X):
351
+ mol.AddAtom(get_mol_spec().index_to_atom(atom_type))
352
+
353
+ for atom_id1, atom_id2, bond_type in A:
354
+ get_mol_spec().index_to_bond(mol, atom_id1, atom_id2, bond_type)
355
+ except Exception:
356
+ return None
357
+
358
+ if sanitize:
359
+ try:
360
+ mol = mol.GetMol()
361
+ Chem.SanitizeMol(mol)
362
+ return mol
363
+ except Exception:
364
+ return None
365
+ else:
366
+ return mol
367
+
368
+ def get_mol_from_graph_list(graph_list, sanitize=True):
369
+ mol_list = [get_mol_from_graph(X, A, sanitize) for X, A in graph_list]
370
+ return mol_list
371
+
372
+ class GraphConvFn(Function):
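+ # Custom autograd Function: forward is a sparse matrix product A·X; backward multiplies
+ # the gradient by A_T (A is assumed symmetric here; subclasses may override A_T).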
373
+
374
+ def __init__(self, A):
375
+ self.A = A # type: nd.sparse.CSRNDArray
376
+ self.A_T = self.A # assume symmetric
377
+ super(GraphConvFn, self).__init__()
378
+
379
+ def forward(self, X):
380
+ if self.A is not None:
381
+ if len(X.shape) > 2:
382
+ X_resized = X.reshape((X.shape[0], -1))
383
+ output = nd.sparse.dot(self.A, X_resized)
384
+ output = output.reshape([-1, ] + [X.shape[i] for i in range(1, len(X.shape))])
385
+ else:
386
+ output = nd.sparse.dot(self.A, X)
387
+ return output
388
+ else:
389
+ return nd.zeros_like(X)
390
+
391
+ def backward(self, grad_output):
392
+
393
+ if self.A is not None:
394
+ if len(grad_output.shape) > 2:
395
+ grad_output_resized = grad_output.reshape((grad_output.shape[0], -1))
396
+ grad_input = nd.sparse.dot(self.A_T, grad_output_resized)
397
+ grad_input = grad_input.reshape([-1] + [grad_output.shape[i]
398
+ for i in range(1, len(grad_output.shape))])
399
+ else:
400
+ grad_input = nd.sparse.dot(self.A_T, grad_output)
401
+ return grad_input
402
+ else:
403
+ return nd.zeros_like(grad_output)
404
+
405
+
406
+ class EfficientGraphConvFn(Function):
407
+ """Save memory by re-computation"""
408
+
409
+ def __init__(self, A_list):
410
+ self.A_list = A_list
411
+ super(EfficientGraphConvFn, self).__init__()
412
+
413
+ def forward(self, X, W):
414
+ X_list = [X]
415
+ for A in self.A_list:
416
+ if A is not None:
417
+ X_list.append(nd.sparse.dot(A, X))
418
+ else:
419
+ X_list.append(nd.zeros_like(X))
420
+ X_out = nd.concat(*X_list, dim=1)
421
+ self.save_for_backward(X, W)
422
+
423
+ return nd.dot(X_out, W)
424
+
425
+ def backward(self, grad_output):
426
+ X, W = self.saved_tensors
427
+
428
+ # recompute X_out
429
+ X_list = [X, ]
430
+ for A in self.A_list:
431
+ if A is not None:
432
+ X_list.append(nd.sparse.dot(A, X))
433
+ else:
434
+ X_list.append(nd.zeros_like(X))
435
+ X_out = nd.concat(*X_list, dim=1)
436
+
437
+ grad_W = nd.dot(X_out.T, grad_output)
438
+
439
+ grad_X_out = nd.dot(grad_output, W.T)
440
+ grad_X_out_list = nd.split(grad_X_out, num_outputs=len(self.A_list) + 1)
441
+
442
+
443
+ grad_X = [grad_X_out_list[0], ]
444
+ for A, grad_X_out in zip(self.A_list, grad_X_out_list[1:]):
445
+ if A is not None:
446
+ grad_X.append(nd.sparse.dot(A, grad_X_out))
447
+ else:
448
+ grad_X.append(nd.zeros_like(grad_X_out))
449
+
450
+ grad_X = sum(grad_X)
451
+
452
+ return grad_X, grad_W
453
+
454
+
455
+ class SegmentSumFn(GraphConvFn):
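+ # Segment sum: builds a (num_seg x N) one-hot CSR matrix from idx so that the forward
+ # pass sums the rows of X belonging to each segment; the transposed matrix A_T routes
+ # gradients back to the individual rows.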
456
+
457
+ def __init__(self, idx, num_seg):
458
+ # build A
459
+ # construct coo
460
+ data = nd.ones(idx.shape[0], ctx=idx.context, dtype='int64')
461
+ row, col = idx, nd.arange(idx.shape[0], ctx=idx.context, dtype='int64')
462
+ shape = (num_seg, int(idx.shape[0]))
463
+ sparse = nd.sparse.csr_matrix((data, (row, col)), shape=shape,
464
+ ctx=idx.context, dtype='float32')
465
+ super(SegmentSumFn, self).__init__(sparse)
466
+
467
+ sparse = nd.sparse.csr_matrix((data, (col, row)), shape=(shape[1], shape[0]),
468
+ ctx=idx.context, dtype='float32')
469
+ self.A_T = sparse
470
+
471
+
472
+ def squeeze(input, axis):
473
+ assert input.shape[axis] == 1
474
+
475
+ new_shape = list(input.shape)
476
+ del new_shape[axis]
477
+
478
+ return input.reshape(new_shape)
479
+
480
+
481
+ def unsqueeze(input, axis):
482
+ return nd.expand_dims(input, axis=axis)
483
+
484
+
485
+ def logsumexp(inputs, axis=None, keepdims=False):
486
+ """Numerically stable logsumexp.
487
+ Args:
488
+ inputs: A Variable with any shape.
489
+ axis: An integer.
490
+ keepdims: A boolean.
491
+ Returns:
492
+ Equivalent of log(sum(exp(inputs), axis=axis, keepdims=keepdims)).
493
+ Adopted from: https://github.com/pytorch/pytorch/issues/2591
494
+ """
495
+ # For a 1-D array x (any array along a single dimension),
496
+ # log sum exp(x) = s + log sum exp(x - s)
497
+ # with s = max(x) being a common choice.
498
+ if axis is None:
499
+ inputs = inputs.reshape([-1])
500
+ axis = 0
501
+ s = nd.max(inputs, axis=axis, keepdims=True)
502
+ outputs = s + (inputs - s).exp().sum(axis=axis, keepdims=True).log()
503
+ if not keepdims:
504
+ outputs = nd.sum(outputs, axis=axis, keepdims=False)
505
+ return outputs
506
+
507
+
508
+ def get_activation(name):
509
+ activation_dict = {
510
+ 'relu':nd.relu,
511
+ 'tanh':nd.tanh
512
+ }
513
+ return activation_dict[name]
514
+
515
+
516
+
517
+ class Linear_BN(nn.Sequential):
518
+ def __init__(self, F_in, F_out):
519
+ super(Linear_BN, self).__init__()
520
+ self.add(nn.Dense(F_out, in_units=F_in, use_bias=False))
521
+ self.add(BatchNorm(in_channels=F_out))
522
+
523
+
524
+ class GraphConv(nn.Block):
525
+
526
+ def __init__(self, Fin, Fout, D):
527
+ super(GraphConv, self).__init__()
528
+
529
+ # model settings
530
+ self.Fin = Fin
531
+ self.Fout = Fout
532
+ self.D = D
533
+
534
+ # model parameters
535
+ self.W = self.params.get('w', shape=(self.Fin * (self.D + 1), self.Fout),
536
+ init=None, allow_deferred_init=False)
537
+
538
+ def forward(self, X, A_list):
539
+ try:
540
+ assert len(A_list) == self.D
541
+ except AssertionError as e:
542
+ print(self.D, len(A_list))
543
+ raise e
544
+ return EfficientGraphConvFn(A_list)(X, self.W.data(X.context))
545
+
546
+
547
+ class Policy(nn.Block):
548
+
549
+ def __init__(self, F_in, F_h, N_A, N_B, k=1):
550
+ super(Policy, self).__init__()
551
+ self.F_in = F_in # number of input features for each atom
552
+ self.F_h = F_h # number of context variables
553
+ self.N_A = N_A # number of atom types
554
+ self.N_B = N_B # number of bond types
555
+ self.k = k # number of softmax used in the mixture
556
+
557
+
558
+ with self.name_scope():
559
+ self.linear_h = Linear_BN(F_in * 2, self.F_h * k)
560
+ self.linear_h_t = Linear_BN(F_in, self.F_h * k)
561
+
562
+ self.linear_x = nn.Dense(self.N_B + self.N_B*self.N_A, in_units=self.F_h)
563
+ self.linear_x_t = nn.Dense(1, in_units=self.F_h)
564
+
565
+ if self.k > 1:
566
+ self.linear_pi = nn.Dense(self.k, in_units=self.F_in)
567
+ else:
568
+ self.linear_pi = None
569
+
570
+ def forward(self, X, NX, NX_rep, X_end=None):
571
+ # segment mean for X
572
+ if X_end is None:
573
+ X_end = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')  # fixed: unsqueeze is the module-level helper; no `fn` module is in scope
574
+ X = nd.concat(X, X_end[NX_rep, :], dim=1)
575
+
576
+ X_h = nd.relu(self.linear_h(X)).reshape([-1, self.F_h])
577
+ X_h_end = nd.relu(self.linear_h_t(X_end)).reshape([-1, self.F_h])
578
+
579
+ X_x = nd.exp(self.linear_x(X_h)).reshape([-1, self.k, self.N_B + self.N_B*self.N_A])
580
+ X_x_end = nd.exp(self.linear_x_t(X_h_end)).reshape([-1, self.k, 1])
581
+
582
+ X_sum = nd.sum(SegmentSumFn(NX_rep, NX.shape[0])(X_x), -1, keepdims=True) + X_x_end
583
+ X_sum_gathered = X_sum[NX_rep, :, :]
584
+
585
+ X_softmax = X_x / X_sum_gathered
586
+ X_softmax_end = X_x_end/ X_sum
587
+
588
+ if self.k > 1:
589
+ pi = unsqueeze(nd.softmax(self.linear_pi(X_end), axis=1), -1)
590
+ pi_gathered = pi[NX_rep, :, :]
591
+
592
+ X_softmax = nd.sum(X_softmax * pi_gathered, axis=1)
593
+ X_softmax_end = nd.sum(X_softmax_end * pi, axis=1)
594
+ else:
595
+ X_softmax = squeeze(X_softmax, 1)
596
+ X_softmax_end = squeeze(X_softmax_end, 1)
597
+
598
+ # generate output probabilities
599
+ connect, append = X_softmax[:, :self.N_B], X_softmax[:, self.N_B:]
600
+ append = append.reshape([-1, self.N_A, self.N_B])
601
+ end = squeeze(X_softmax_end, -1)
602
+
603
+ return append, connect, end
604
+
605
+
606
+ class BatchNorm(nn.Block):
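+ # Hand-rolled BatchNorm so that training uses batch statistics while inference
+ # explicitly uses the running statistics (use_global_stats=True).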
607
+
608
+ def __init__(self, in_channels, momentum=0.9, eps=1e-5):
609
+ super(BatchNorm, self).__init__()
610
+ self.F = in_channels
611
+
612
+ self.bn_weight = self.params.get('bn_weight', shape=(self.F,), init=mx.init.One(),
613
+ allow_deferred_init=False)
614
+ self.bn_bias = self.params.get('bn_bias', shape=(self.F,), init=mx.init.Zero(),
615
+ allow_deferred_init=False)
616
+
617
+ self.running_mean = self.params.get('running_mean', grad_req='null',
618
+ shape=(self.F,),
619
+ init=mx.init.Zero(),
620
+ allow_deferred_init=False,
621
+ differentiable=False)
622
+ self.running_var = self.params.get('running_var', grad_req='null',
623
+ shape=(self.F,),
624
+ init=mx.init.One(),
625
+ allow_deferred_init=False,
626
+ differentiable=False)
627
+ self.momentum = momentum
628
+ self.eps = eps
629
+
630
+ def forward(self, x):
631
+ if autograd.is_training():
632
+ return nd.BatchNorm(x,
633
+ gamma=self.bn_weight.data(x.context),
634
+ beta=self.bn_bias.data(x.context),
635
+ moving_mean=self.running_mean.data(x.context),
636
+ moving_var=self.running_var.data(x.context),
637
+ eps=self.eps, momentum=self.momentum,
638
+ use_global_stats=False)
639
+ else:
640
+ return nd.BatchNorm(x,
641
+ gamma=self.bn_weight.data(x.context),
642
+ beta=self.bn_bias.data(x.context),
643
+ moving_mean=self.running_mean.data(x.context),
644
+ moving_var=self.running_var.data(x.context),
645
+ eps=self.eps, momentum=self.momentum,
646
+ use_global_stats=True)
647
+
648
+ class MoleculeGenerator(nn.Block):
649
+
650
+ __metaclass__ = ABCMeta
651
+
652
+ def __init__(self, N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
653
+ *args, **kwargs):
654
+ super(MoleculeGenerator, self).__init__()
655
+ self.N_A = N_A
656
+ self.N_B = N_B
657
+ self.D = D
658
+ self.F_e = F_e
659
+ self.F_skip = F_skip
660
+ self.F_c = list(F_c) if isinstance(F_c, tuple) else F_c
661
+ self.Fh_policy = Fh_policy
662
+ self.activation = get_activation(activation)
663
+
664
+ with self.name_scope():
665
+ # embeddings
666
+ self.embedding_atom = nn.Embedding(self.N_A, self.F_e)
667
+ self.embedding_mask = nn.Embedding(3, self.F_e)
668
+
669
+ # graph conv
670
+ self._build_graph_conv(*args, **kwargs)
671
+
672
+ # fully connected
673
+ self.dense = nn.Sequential()
674
+ for i, (f_in, f_out) in enumerate(zip([self.F_skip, ] + self.F_c[:-1], self.F_c)):
675
+ self.dense.add(Linear_BN(f_in, f_out))
676
+
677
+ # policy
678
+ self.policy_0 = self.params.get('policy_0', shape=[self.N_A, ],
679
+ init=mx.init.Zero(),
680
+ allow_deferred_init=False)
681
+ self.policy_h = Policy(self.F_c[-1], self.Fh_policy, self.N_A, self.N_B)
682
+
683
+ self.mode = 'loss'
684
+
685
+ @abstractmethod
686
+ def _build_graph_conv(self, *args, **kwargs):
687
+ raise NotImplementedError
688
+
689
+ @abstractmethod
690
+ def _graph_conv_forward(self, X, A):
691
+ raise NotImplementedError
692
+
693
+ def _policy_0(self, ctx):
694
+ policy_0 = nd.exp(self.policy_0.data(ctx))
695
+ policy_0 = policy_0/policy_0.sum()
696
+ return policy_0
697
+
698
+ def _policy(self, X, A, NX, NX_rep, last_append_mask):
699
+ # get initial embedding
700
+ X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
701
+
702
+ # convolution
703
+ X = self._graph_conv_forward(X, A)
704
+
705
+ # linear
706
+ X = self.dense(X)
707
+
708
+ # policy
709
+ append, connect, end = self.policy_h(X, NX, NX_rep)
710
+
711
+ return append, connect, end
712
+
713
+ def _likelihood(self, init, append, connect, end,
714
+ action_0, actions, iw_ids, log_p_sigma,
715
+ batch_size, iw_size):
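+ # Importance-weighted log-likelihood: per-step log-probs are segment-summed per sample,
+ # the proposal log-prob log_p_sigma is subtracted, and a logsumexp over the iw_size
+ # importance samples (minus log(iw_size)) yields the estimate for each molecule.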
716
+
717
+ # decompose action:
718
+ action_type, node_type, edge_type, append_pos, connect_pos = \
719
+ actions[:, 0], actions[:, 1], actions[:, 2], actions[:, 3], actions[:, 4]
720
+ _log_mask = lambda _x, _mask: _mask * nd.log(_x + 1e-10) + (1- _mask) * nd.zeros_like(_x)
721
+
722
+ # init
723
+ init = init.reshape([batch_size * iw_size, self.N_A])
724
+ index = nd.stack(nd.arange(action_0.shape[0], ctx=action_0.context, dtype='int64'), action_0, axis=0)
725
+ loss_init = nd.log(nd.gather_nd(init, index) + 1e-10)
726
+
727
+ # end
728
+ loss_end = _log_mask(end, nd.cast(action_type == 2, 'float32'))
729
+
730
+ # append
731
+ index = nd.stack(append_pos, node_type, edge_type, axis=0)
732
+ loss_append = _log_mask(nd.gather_nd(append, index), nd.cast(action_type == 0, 'float32'))
733
+
734
+ # connect
735
+ index = nd.stack(connect_pos, edge_type, axis=0)
736
+ loss_connect = _log_mask(nd.gather_nd(connect, index), nd.cast(action_type == 1, 'float32'))
737
+
738
+ # sum up results
739
+ log_p_x = loss_end + loss_append + loss_connect
740
+ log_p_x = squeeze(SegmentSumFn(iw_ids, batch_size*iw_size)(unsqueeze(log_p_x, -1)), -1)
741
+ log_p_x = log_p_x + loss_init
742
+
743
+ # reshape
744
+ log_p_x = log_p_x.reshape([batch_size, iw_size])
745
+ log_p_sigma = log_p_sigma.reshape([batch_size, iw_size])
746
+ l = log_p_x - log_p_sigma
747
+ l = logsumexp(l, axis=1) - math.log(float(iw_size))
748
+ return l
749
+
750
+ def forward(self, *input):
751
+ if self.mode=='loss' or self.mode=='likelihood':
752
+ X, A, iw_ids, last_append_mask, \
753
+ NX, NX_rep, action_0, actions, log_p, \
754
+ batch_size, iw_size = input
755
+
756
+ init = self._policy_0(X.context).tile([batch_size * iw_size, 1])
757
+ append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask)
758
+ l = self._likelihood(init, append, connect, end, action_0, actions, iw_ids, log_p, batch_size, iw_size)
759
+ if self.mode=='likelihood':
760
+ return l
761
+ else:
762
+ return -l.mean()
763
+ elif self.mode == 'decode_0':
764
+ return self._policy_0(input[0])
765
+ elif self.mode == 'decode_step':
766
+ X, A, NX, NX_rep, last_append_mask = input
767
+ return self._policy(X, A, NX, NX_rep, last_append_mask)
768
+
769
+
770
+ class MoleculeGenerator_RNN(MoleculeGenerator):
771
+
772
+ __metaclass__ = ABCMeta
773
+
774
+ def __init__(self, N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
775
+ N_rnn, *args, **kwargs):
776
+ super(MoleculeGenerator_RNN, self).__init__(N_A, N_B, D, F_e, F_skip, F_c, Fh_policy, activation,
777
+ *args, **kwargs)
778
+ self.N_rnn = N_rnn
779
+
780
+ with self.name_scope():
781
+ self.rnn = gluon.rnn.GRU(hidden_size=self.F_c[-1],
782
+ num_layers=self.N_rnn,
783
+ layout='NTC', input_size=self.F_c[-1] * 2)
784
+
785
+ def _rnn_train(self, X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum):
786
+ X_avg = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')
787
+ X_curr = nd.take(X, indices=NX_cum-1)
788
+ X = nd.concat(X_avg, X_curr, dim=1)
789
+
790
+ # rnn
791
+ X = nd.take(X, indices=graph_to_rnn) # batch_size, iw_size, length, num_features
792
+ batch_size, iw_size, length, num_features = X.shape
793
+ X = X.reshape([batch_size*iw_size, length, num_features])
794
+ X = self.rnn(X)
795
+
796
+ X = X.reshape([batch_size, iw_size, length, -1])
797
+ X = nd.gather_nd(X, indices=rnn_to_graph)
798
+
799
+ return X
800
+
801
+ def _rnn_test(self, X, NX, NX_rep, NX_cum, h):
802
+ # note: one partition for one molecule
803
+ X_avg = SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(unsqueeze(NX, 1), 'float32')
804
+ X_curr = nd.take(X, indices=NX_cum - 1)
805
+ X = nd.concat(X_avg, X_curr, dim=1) # size: [NX, F_in * 2]
806
+
807
+ # rnn
808
+ X = unsqueeze(X, axis=1)
809
+ X, h = self.rnn(X, h)
810
+
811
+ X = squeeze(X, axis=1)
812
+ return X, h
813
+
814
+ def _policy(self, X, A, NX, NX_rep, last_append_mask, graph_to_rnn, rnn_to_graph, NX_cum):
815
+ # get initial embedding
816
+ X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
817
+
818
+ # convolution
819
+ X = self._graph_conv_forward(X, A)
820
+
821
+ # linear
822
+ X = self.dense(X)
823
+
824
+ # rnn
825
+ X_mol = self._rnn_train(X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum)
826
+
827
+ # policy
828
+ append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
829
+
830
+ return append, connect, end
831
+
832
+ def _decode_step(self, X, A, NX, NX_rep, last_append_mask, NX_cum, h):
833
+ # get initial embedding
834
+ X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
835
+
836
+ # convolution
837
+ X = self._graph_conv_forward(X, A)
838
+
839
+ # linear
840
+ X = self.dense(X)
841
+
842
+ # rnn
843
+ X_mol, h = self._rnn_test(X, NX, NX_rep, NX_cum, h)
844
+
845
+ # policy
846
+ append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
847
+
848
+ return append, connect, end, h
849
+
850
+ def forward(self, *input):
851
+ if self.mode=='loss' or self.mode=='likelihood':
852
+ X, A, iw_ids, last_append_mask, \
853
+ NX, NX_rep, action_0, actions, log_p, \
854
+ batch_size, iw_size, \
855
+ graph_to_rnn, rnn_to_graph, NX_cum = input
856
+
857
+ init = self._policy_0(X.context).tile([batch_size * iw_size, 1])
858
+ append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask, graph_to_rnn, rnn_to_graph, NX_cum)
859
+ l = self._likelihood(init, append, connect, end, action_0, actions, iw_ids, log_p, batch_size, iw_size)
860
+ if self.mode=='likelihood':
861
+ return l
862
+ else:
863
+ return -l.mean()
864
+ elif self.mode == 'decode_0':
865
+ return self._policy_0(input[0])
866
+ elif self.mode == 'decode_step':
867
+ X, A, NX, NX_rep, last_append_mask, NX_cum, h = input
868
+ return self._decode_step(X, A, NX, NX_rep, last_append_mask, NX_cum, h)
869
+ else:
870
+ raise ValueError
871
+
872
+ class _TwoLayerDense(nn.Block):
+
+     def __init__(self, input_size, hidden_size, output_size):
+         super(_TwoLayerDense, self).__init__()
+
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.input_size = input_size
+
+         with self.name_scope():
+             # config 1
+             self.input = nn.Dense(self.hidden_size, use_bias=False, in_units=self.input_size)
+             self.bn_input = BatchNorm(in_channels=hidden_size)
+             self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.hidden_size)
+
+             # config 2
+             # self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.input_size)
+
+             # config 3
+             # self.input1 = nn.Dense(self.hidden_size, use_bias=False, in_units=self.input_size)
+             # self.bn_input1 = BatchNorm(in_channels=self.hidden_size)
+             # self.input2 = nn.Dense(self.hidden_size, use_bias=False, in_units=self.hidden_size)
+             # self.bn_input2 = BatchNorm(in_channels=self.hidden_size)
+             # self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.hidden_size)
+
+             # config 4
+             # self.bn_input = BatchNorm(in_channels=self.input_size)
+             # self.output = nn.Dense(self.output_size, use_bias=True, in_units=self.input_size)
+
+             # config 5
+             # self.bn_input = BatchNorm(in_channels=1024)
+             # self.output = nn.Dense(self.output_size, use_bias=True, in_units=1024)
+
+     def forward(self, c):
+         # config 1
+         return nd.softmax(self.output(nd.relu(self.bn_input(self.input(c)))), axis=-1)
+
+         # config 2
+         # return nd.softmax(self.output(c), axis=-1)
+
+         # config 3
+         # return nd.softmax(self.output(nd.relu(self.bn_input2(self.input2(nd.relu(self.bn_input1(self.input1(c))))))), axis=-1)
+
+         # config 4
+         # return nd.softmax(self.output(nd.relu(self.bn_input(c))), axis=-1)
+
+         # config 5
+         # return nd.softmax(self.output(c), axis=-1)
+
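`_TwoLayerDense` maps a condition vector to a softmax distribution over first-atom types; the commented `config` blocks are alternative depths kept for reference. A toy shape check under assumed sizes (relies on the mxnet imports already used in this file):

from mxnet import nd

net = _TwoLayerDense(input_size=8, hidden_size=16, output_size=5)
net.initialize()
p = net(nd.random.randn(2, 8))   # two toy condition vectors
print(p.shape)                   # (2, 5)
print(p.sum(axis=1))             # each row sums to ~1 (softmax output)
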
+ class CMoleculeGenerator_RNN(MoleculeGenerator_RNN, metaclass=ABCMeta):
+
+     def __init__(self, N_A, N_B, N_C, D,
+                  F_e, F_skip, F_c, Fh_policy,
+                  activation, N_rnn,
+                  *args, **kwargs):
+         self.N_C = N_C  # number of conditional variables
+         super(CMoleculeGenerator_RNN, self).__init__(N_A, N_B, D,
+                                                      F_e, F_skip, F_c, Fh_policy,
+                                                      activation, N_rnn,
+                                                      *args, **kwargs)
+         with self.name_scope():
+             self.dense_policy_0 = _TwoLayerDense(self.N_C, self.N_A * 3, self.N_A)
+
+     @abstractmethod
+     def _graph_conv_forward(self, X, A, c, ids):
+         raise NotImplementedError
+
+     def _policy_0(self, c):
+         # the zero-scaled term keeps the (otherwise unused) unconditional policy_0 parameter in the graph
+         return self.dense_policy_0(c) + 0.0 * self.policy_0.data(c.context)
+
+     def _policy(self, X, A, NX, NX_rep, last_append_mask,
+                 graph_to_rnn, rnn_to_graph, NX_cum,
+                 c, ids):
+         # get initial embedding
+         X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
+
+         # convolution (conditioned on c)
+         X = self._graph_conv_forward(X, A, c, ids)
+
+         # linear
+         X = self.dense(X)
+
+         # rnn
+         X_mol = self._rnn_train(X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum)
+
+         # policy
+         append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
+
+         return append, connect, end
+
+     def _decode_step(self, X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids):
+         # get initial embedding
+         X = self.embedding_atom(X) + self.embedding_mask(last_append_mask)
+
+         # convolution (conditioned on c)
+         X = self._graph_conv_forward(X, A, c, ids)
+
+         # linear
+         X = self.dense(X)
+
+         # rnn
+         X_mol, h = self._rnn_test(X, NX, NX_rep, NX_cum, h)
+
+         # policy
+         append, connect, end = self.policy_h(X, NX, NX_rep, X_mol)
+
+         return append, connect, end, h
+
+     def forward(self, *input):
+         if self.mode == 'loss' or self.mode == 'likelihood':
+             X, A, iw_ids, last_append_mask, \
+             NX, NX_rep, action_0, actions, log_p, \
+             batch_size, iw_size, \
+             graph_to_rnn, rnn_to_graph, NX_cum, \
+             c, ids = input
+
+             init = nd.tile(unsqueeze(self._policy_0(c), axis=1), [1, iw_size, 1])
+             append, connect, end = self._policy(X, A, NX, NX_rep, last_append_mask,
+                                                 graph_to_rnn, rnn_to_graph, NX_cum,
+                                                 c, ids)
+             l = self._likelihood(init, append, connect, end,
+                                  action_0, actions, iw_ids, log_p,
+                                  batch_size, iw_size)
+             if self.mode == 'likelihood':
+                 return l
+             else:
+                 return -l.mean()
+         elif self.mode == 'decode_0':
+             return self._policy_0(*input)
+         elif self.mode == 'decode_step':
+             X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids = input
+             return self._decode_step(X, A, NX, NX_rep, last_append_mask, NX_cum, h, c, ids)
+         else:
+             raise ValueError('Unsupported mode: {}'.format(self.mode))
+
+ class CVanillaMolGen_RNN(CMoleculeGenerator_RNN):
+
+     def __init__(self, N_A, N_B, N_C, D,
+                  F_e, F_h, F_skip, F_c, Fh_policy,
+                  activation, N_rnn, rename=False):
+         self.rename = rename
+         super(CVanillaMolGen_RNN, self).__init__(N_A, N_B, N_C, D,
+                                                  F_e, F_skip, F_c, Fh_policy,
+                                                  activation, N_rnn,
+                                                  F_h)
+
+     def _build_graph_conv(self, F_h):
+         self.F_h = list(F_h) if isinstance(F_h, tuple) else F_h
+         self.conv, self.bn = [], []
+         for i, (f_in, f_out) in enumerate(zip([self.F_e] + self.F_h[:-1], self.F_h)):
+             conv = GraphConv(f_in, f_out, self.N_B + self.D)
+             self.conv.append(conv)
+             self.register_child(conv)
+
+             if i != 0:
+                 bn = BatchNorm(in_channels=f_in)
+                 self.register_child(bn)
+             else:
+                 bn = None
+             self.bn.append(bn)
+
+         self.bn_skip = BatchNorm(in_channels=sum(self.F_h))
+         self.linear_skip = Linear_BN(sum(self.F_h), self.F_skip)
+
+         # projectors for the conditional variable (protein embedding), one per conv layer
+         self.linear_c = []
+         for i, f_out in enumerate(self.F_h):
+             if self.rename:
+                 linear_c = nn.Dense(f_out, use_bias=False, in_units=self.N_C, prefix='cond_{}'.format(i))
+             else:
+                 linear_c = nn.Dense(f_out, use_bias=False, in_units=self.N_C)
+             self.register_child(linear_c)
+             self.linear_c.append(linear_c)
+
+     def _graph_conv_forward(self, X, A, c, ids):
+         X_out = [X]
+         for conv, bn, linear_c in zip(self.conv, self.bn, self.linear_c):
+             X = X_out[-1]
+             if bn is not None:
+                 X_out.append(conv(self.activation(bn(X)), A) + linear_c(c)[ids, :])
+             else:
+                 X_out.append(conv(X, A) + linear_c(c)[ids, :])
+         X_out = nd.concat(*X_out[1:], dim=1)
+         return self.activation(self.linear_skip(self.activation(self.bn_skip(X_out))))
+
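The conditioning in `_graph_conv_forward` hinges on `linear_c(c)[ids, :]`: each molecule's condition vector is projected once, then gathered out to all of that molecule's atoms through the atom-to-molecule index `ids`. A minimal numpy sketch of the same gather, with toy sizes that are purely illustrative:

import numpy as np

N_C, F_out = 3, 4                  # toy condition / feature sizes
W = np.random.randn(N_C, F_out)    # stands in for linear_c's weight
c = np.random.randn(2, N_C)        # 2 molecules, one condition vector each
ids = np.array([0, 0, 0, 1, 1])    # 5 atoms -> index of their owning molecule

proj = c @ W                       # (2, F_out): one projection per molecule
per_atom = proj[ids, :]            # (5, F_out): replicated per atom, as in the code above
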
+ def _decode_step(X, A, NX, NA, last_action, finished,
+                  get_init, get_action,
+                  random=False, n_node_types=get_mol_spec().num_atom_types,
+                  n_edge_types=get_mol_spec().num_bond_types):
+     if X is None:
+         # first step: pick the initial atom of every molecule
+         init = get_init()
+
+         if random:
+             X = []
+             for i in range(init.shape[0]):
+                 # initial probabilities (for the first atom)
+                 p = init[i, :]
+                 # random sampling from the init probability distribution
+                 selected_atom = np.random.choice(np.arange(init.shape[1]), 1, p=p)[0]
+                 X.append(selected_atom)
+             X = np.array(X, dtype=np.int64)
+         else:
+             X = np.argmax(init, axis=1)
+         A = np.zeros((0, 3), dtype=np.int64)
+         NX = last_action = np.ones([X.shape[0]], dtype=np.int64)
+         NA = np.zeros([X.shape[0]], dtype=np.int64)
+         finished = np.array([False, ] * X.shape[0], dtype=bool)
+
+         return X, A, NX, NA, last_action, finished
+     else:
+         # restrict the batch to unfinished molecules
+         X_u = X[np.repeat(np.logical_not(finished), NX)]
+         A_u = A[np.repeat(np.logical_not(finished), NA), :]
+         NX_u = NX[np.logical_not(finished)]
+         NA_u = NA[np.logical_not(finished)]
+         last_action_u = last_action[np.logical_not(finished)]
+
+         # conv
+         mol_ids_rep = NX_rep = np.repeat(np.arange(NX_u.shape[0]), NX_u)
+         rep_ids_rep = np.zeros_like(mol_ids_rep)
+
+         if A.shape[0] == 0:
+             D_2 = D_3 = np.zeros((0, 2), dtype=np.int64)
+             A_u = [np.zeros((0, 2), dtype=np.int64) for _ in range(get_mol_spec().num_bond_types)]
+             A_u += [D_2, D_3]
+         else:
+             cumsum = np.cumsum(np.pad(NX_u, [[1, 0]], mode='constant')[:-1])
+             shift = np.repeat(cumsum, NA_u)
+             A_u[:, :2] += np.stack([shift, ] * 2, axis=1)
+             D_2, D_3 = get_d(A_u, X_u)
+             A_u = [A_u[A_u[:, 2] == _i, :2] for _i in range(n_edge_types)]
+             A_u += [D_2, D_3]
+
+         mask = np.zeros([X_u.shape[0]], dtype=np.int64)
+         last_append_index = np.cumsum(NX_u) - 1
+         mask[last_append_index] = np.where(last_action_u == 1,
+                                            np.ones_like(last_append_index, dtype=np.int64),
+                                            np.ones_like(last_append_index, dtype=np.int64) * 2)
+
+         decode_input = [X_u, A_u, NX_u, NX_rep, mask, mol_ids_rep, rep_ids_rep]
+         append, connect, end = get_action(decode_input)
+
+         if A.shape[0] == 0:
+             # second step: every molecule gets its first bond
+             max_index = np.argmax(np.reshape(append, [-1, n_node_types * n_edge_types]), axis=1)
+             atom_type, bond_type = np.unravel_index(max_index, [n_node_types, n_edge_types])
+             X = np.reshape(np.stack([X, atom_type], axis=1), [-1])
+             NX = np.array([2, ] * len(finished), dtype=np.int64)
+             A = np.stack([np.zeros([len(finished), ], dtype=np.int64),
+                           np.ones([len(finished), ], dtype=np.int64),
+                           bond_type], axis=1)
+             NA = np.ones([len(finished), ], dtype=np.int64)
+             last_action = np.ones_like(NX, dtype=np.int64)
+         else:
+             # process each molecule separately
+             append, connect = np.split(append, np.cumsum(NX_u)), np.split(connect, np.cumsum(NX_u))
+             end = end.tolist()
+
+             unfinished_ids = np.where(np.logical_not(finished))[0].tolist()
+             cumsum = np.cumsum(NX)
+             cumsum_a = np.cumsum(NA)
+
+             X_insert = []
+             X_insert_ids = []
+             A_insert = []
+             A_insert_ids = []
+             finished_ids = []
+
+             for i, (unfinished_id, append_i, connect_i, end_i) \
+                     in enumerate(zip(unfinished_ids, append, connect, end)):
+                 if random:
+                     def _rand_id(*_x):
+                         _x_reshaped = [np.reshape(_xi, [-1]) for _xi in _x]
+                         _x_length = np.array([_x_reshape_i.shape[0] for _x_reshape_i in _x_reshaped],
+                                              dtype=np.int64)
+                         _begin = np.cumsum(np.pad(_x_length, [[1, 0]], mode='constant')[:-1])
+                         _end = np.cumsum(_x_length) - 1
+                         _p = np.concatenate(_x_reshaped)
+                         _p = _p / np.sum(_p)
+                         num_nan = np.isnan(_p).sum()
+                         if num_nan > 0:
+                             # degenerate distribution: fall back to uniform sampling
+                             print(f'Number of NaN values in _p: {num_nan}')
+                             _rand_index = np.random.choice(np.arange(len(_p)), 1)[0]
+                         else:
+                             _rand_index = np.random.choice(np.arange(_p.shape[0]), 1, p=_p)[0]
+
+                         _p_step = _p[_rand_index]
+                         _x_index = np.where(np.logical_and(_begin <= _rand_index, _end >= _rand_index))[0][0]
+                         _rand_index = _rand_index - _begin[_x_index]
+                         _rand_index = np.unravel_index(_rand_index, _x[_x_index].shape)
+                         return _x_index, _rand_index, _p_step
+
+                     action_type, action_index, p_step = _rand_id(append_i, connect_i, np.array([end_i]))
+                 else:
+                     _argmax = lambda _x: np.unravel_index(np.argmax(_x), _x.shape)
+                     append_id, append_val = _argmax(append_i), np.max(append_i)
+                     connect_id, connect_val = _argmax(connect_i), np.max(connect_i)
+                     end_val = end_i
+                     if end_val >= append_val and end_val >= connect_val:
+                         action_type = 2
+                         action_index = None
+                     elif append_val >= connect_val and append_val >= end_val:
+                         action_type = 0
+                         action_index = append_id
+                     else:
+                         action_type = 1
+                         action_index = connect_id
+                 if action_type == 2:
+                     # finish growth
+                     finished_ids.append(unfinished_id)
+                 elif action_type == 0:
+                     # append a new atom
+                     append_pos, atom_type, bond_type = action_index
+                     X_insert.append(atom_type)
+                     X_insert_ids.append(unfinished_id)
+                     A_insert.append([append_pos, NX[unfinished_id], bond_type])
+                     A_insert_ids.append(unfinished_id)
+                 else:
+                     # connect to an existing atom
+                     connect_ps, bond_type = action_index
+                     A_insert.append([NX[unfinished_id] - 1, connect_ps, bond_type])
+                     A_insert_ids.append(unfinished_id)
+             if len(A_insert_ids) > 0:
+                 A = np.insert(A, cumsum_a[A_insert_ids], A_insert, axis=0)
+                 NA[A_insert_ids] += 1
+                 last_action[A_insert_ids] = 0
+             if len(X_insert_ids) > 0:
+                 X = np.insert(X, cumsum[X_insert_ids], X_insert, axis=0)
+                 NX[X_insert_ids] += 1
+                 last_action[X_insert_ids] = 1
+             if len(finished_ids) > 0:
+                 finished[finished_ids] = True
+
+         return X, A, NX, NA, last_action, finished
+
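The batched update at the end of `_decode_step` grows every unfinished molecule in one call by pairing `np.insert` with cumulative offsets. A self-contained toy illustration of that bookkeeping (values invented for the example):

import numpy as np

X = np.array([6, 6, 7, 8])   # atoms of two molecules, flattened back to back
NX = np.array([2, 2])        # number of atoms per molecule
cumsum = np.cumsum(NX)       # [2, 4]: insertion point at the end of each molecule

# append atom type 6 to molecule 0 and atom type 7 to molecule 1 in a single call
X = np.insert(X, cumsum[[0, 1]], [6, 7], axis=0)
NX[[0, 1]] += 1
print(X)                     # [6 6 6 7 8 7]
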
+ class Builder(metaclass=ABCMeta):
+
+     def __init__(self, model_loc, gpu_id=None):
+         with open(os.path.join(model_loc, 'configs.json')) as f:
+             configs = json.load(f)
+
+         self.mdl = self.__class__._get_model(configs)
+
+         self.ctx = mx.gpu(gpu_id) if gpu_id is not None else mx.cpu()
+         self.mdl.load_parameters(os.path.join(model_loc, 'ckpt.params'), ctx=self.ctx, allow_missing=True)
+
+     @staticmethod
+     def _get_model(configs):
+         raise NotImplementedError
+
+     @abstractmethod
+     def sample(self, num_samples, *args, **kwargs):
+         raise NotImplementedError
+
+
+ class CVanilla_RNN_Builder(Builder):
+
+     @staticmethod
+     def _get_model(configs):
+         return CVanillaMolGen_RNN(get_mol_spec().num_atom_types, get_mol_spec().num_bond_types, D=2, **configs)
+
+     def sample(self, num_samples, c, output_type='mol', sanitize=True, random=True):
+         if len(c.shape) == 1:
+             c = np.stack([c, ] * num_samples, axis=0)
+
+         with autograd.predict_mode():
+             # step one: sample the first atom of every molecule
+             finished = [False, ] * num_samples
+
+             def get_init():
+                 self.mdl.mode = 'decode_0'
+                 _c = nd.array(c, dtype='float32', ctx=self.ctx)
+                 init = self.mdl(_c).asnumpy()
+                 return init
+
+             outputs = _decode_step(X=None, A=None, NX=None, NA=None, last_action=None, finished=finished,
+                                    get_init=get_init, get_action=None,
+                                    n_node_types=self.mdl.N_A, n_edge_types=self.mdl.N_B,
+                                    random=random)
+
+             # defensive check: propagate a failed first step
+             if outputs is None:
+                 return None
+             X, A, NX, NA, last_action, finished = outputs
+
+             count = 1
+             h = np.zeros([self.mdl.N_rnn, num_samples, self.mdl.F_c[-1]], dtype=np.float32)
+             # grow the graphs step by step, with a hard cap of 100 decoding steps
+             while not np.all(finished) and count < 100:
+                 def get_action(inputs):
+                     self.mdl.mode = 'decode_step'
+                     _h = nd.array(h[:, np.logical_not(finished), :], ctx=self.ctx, dtype='float32')
+                     _c = nd.array(c[np.logical_not(finished), :], ctx=self.ctx, dtype='float32')
+                     _X, _A_sparse, _NX, _NX_rep, _mask, _NX_cum = self.to_nd(inputs)
+                     _append, _connect, _end, _h = self.mdl(_X, _A_sparse, _NX, _NX_rep, _mask, _NX_cum, _h, _c, _NX_rep)
+                     h[:, np.logical_not(finished), :] = _h[0].asnumpy()
+                     return _append.asnumpy(), _connect.asnumpy(), _end.asnumpy()
+
+                 outputs = _decode_step(X, A, NX, NA, last_action, finished,
+                                        get_init=None, get_action=get_action,
+                                        n_node_types=self.mdl.N_A, n_edge_types=self.mdl.N_B,
+                                        random=random)
+                 X, A, NX, NA, last_action, finished = outputs
+
+                 count += 1
+
+             # split the flat arrays back into one (atoms, bonds) pair per molecule
+             graph_list = []
+
+             cumsum_X_ = np.cumsum(np.pad(NX, [[1, 0]], mode='constant')).tolist()
+             cumsum_A_ = np.cumsum(np.pad(NA, [[1, 0]], mode='constant')).tolist()
+
+             for cumsum_A_pre, cumsum_A_post, \
+                 cumsum_X_pre, cumsum_X_post in zip(cumsum_A_[:-1], cumsum_A_[1:],
+                                                    cumsum_X_[:-1], cumsum_X_[1:]):
+                 graph_list.append([X[cumsum_X_pre:cumsum_X_post], A[cumsum_A_pre:cumsum_A_post, :]])
+
+             if output_type == 'graph':
+                 return graph_list
+             elif output_type == 'mol':
+                 return get_mol_from_graph_list(graph_list, sanitize)
+             elif output_type == 'smiles':
+                 mol_list = get_mol_from_graph_list(graph_list, sanitize=True)
+                 smiles_list = [Chem.MolToSmiles(m) if m is not None else None for m in mol_list]
+                 return smiles_list
+             else:
+                 raise ValueError('Unrecognized output type')
+
+     def to_nd(self, inputs):
+         X, A, NX, NX_rep, mask = inputs[:-2]
+         NX_cum = np.cumsum(NX)
+
+         # convert to ndarray
+         _to_ndarray = lambda _x: nd.array(_x, self.ctx, 'int64')
+         X, NX, NX_rep, mask, NX_cum = \
+             _to_ndarray(X), _to_ndarray(NX), _to_ndarray(NX_rep), _to_ndarray(mask), _to_ndarray(NX_cum)
+         A_sparse = []
+         for _A_i in A:
+             if _A_i.shape[0] == 0:
+                 A_sparse.append(None)
+             else:
+                 # symmetrize the edge list (transpose may not be supported on GPU)
+                 _A_i = np.concatenate([_A_i, _A_i[:, [1, 0]]], axis=0)
+
+                 # construct the csr adjacency matrix
+                 _data = np.ones((_A_i.shape[0],), dtype=np.float32)
+                 _row, _col = _A_i[:, 0], _A_i[:, 1]
+                 _A_sparse_i = nd.sparse.csr_matrix((_data, (_row, _col)),
+                                                    shape=tuple([int(X.shape[0]), ] * 2),
+                                                    ctx=self.ctx, dtype='float32')
+
+                 # append to list
+                 A_sparse.append(_A_sparse_i)
+         return X, A_sparse, NX, NX_rep, mask, NX_cum
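End to end, a builder instance loads the checkpoint and drives the decode loop above. A hedged usage sketch; the checkpoint directory and the embedding values are assumptions, not something this diff defines:

import numpy as np

# hypothetical directory holding configs.json and ckpt.params
builder = CVanilla_RNN_Builder('models_folder', gpu_id=None)

# one conditional (protein) embedding; its length must match the model's N_C
c = np.random.randn(builder.mdl.N_C).astype(np.float32)

smiles = builder.sample(10, c, output_type='smiles', random=True)
print([s for s in smiles if s is not None])
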
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ Flask
+ gunicorn
+ rdkit
+ transformers
+ bio-embeddings
+ torch
+ huggingface_hub
+ molvs
+ numpy==1.23.5
+ mxnet==1.8
+ networkx
+ scipy
+ pandas
+ ipython
+ accelerate>=0.26.0
+ gdown==4.6.0
+ requests
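numpy is pinned at 1.23.5 most likely because mxnet 1.8 still touches aliases such as np.bool, which numpy removed in 1.24. A quick post-install sanity check (assumed, not part of this commit):

import numpy as np
import mxnet as mx
print(mx.__version__, np.__version__)   # expect 1.8.x alongside 1.23.5
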
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.8.3
templates/index.html ADDED
@@ -0,0 +1,93 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Protein to SMILES Generator</title>
+     <style>
+         body {
+             font-family: 'Arial', sans-serif;
+             background: linear-gradient(to right, #4facfe, #00f2fe);
+             display: flex;
+             justify-content: center;
+             align-items: center;
+             height: 100vh;
+             margin: 0;
+         }
+         .container {
+             background: white;
+             padding: 40px;
+             border-radius: 15px;
+             box-shadow: 0px 6px 15px rgba(0, 0, 0, 0.3);
+             width: 90%;
+             max-width: 1200px;
+             text-align: center;
+         }
+         h2 {
+             font-size: 32px;
+             color: #333;
+             margin-bottom: 20px;
+         }
+         textarea {
+             width: 100%;
+             height: 300px;
+             border: 2px solid #ddd;
+             border-radius: 10px;
+             padding: 15px;
+             resize: none;
+             font-size: 18px;
+         }
+         button {
+             margin-top: 20px;
+             padding: 14px 24px;
+             font-size: 18px;
+             background-color: #007bff;
+             color: white;
+             border: none;
+             border-radius: 8px;
+             cursor: pointer;
+             transition: background 0.3s ease;
+         }
+         button:hover {
+             background-color: #0056b3;
+         }
+         .message {
+             margin-top: 20px;
+             font-size: 18px;
+             color: #28a745;
+         }
+         .download-btn {
+             display: inline-block;
+             margin-top: 20px;
+             padding: 14px 24px;
+             font-size: 18px;
+             background-color: #28a745;
+             color: white;
+             text-decoration: none;
+             border-radius: 8px;
+             transition: background 0.3s ease;
+         }
+         .download-btn:hover {
+             background-color: #218838;
+         }
+     </style>
+ </head>
+ <body>
+     <div class="container">
+         <h2>Protein to SMILES Generator</h2>
+         <form method="POST">
+             <textarea name="sequence" placeholder="Enter a raw protein sequence (no FASTA header)..."></textarea><br>
+             <button type="submit">Generate SMILES</button>
+         </form>
+
+         {% if message %}
+         <p class="message">{{ message }}</p>
+         {% endif %}
+
+         {% if file_path %}
+         <p>Time taken: {{ time_taken }} seconds</p>
+         <a href="{{ url_for('download_file') }}" class="download-btn">Download SMILES</a>
+         {% endif %}
+     </div>
+ </body>
+ </html>
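The template fixes the contract for the Flask side: a POST with a `sequence` form field, context variables `message`, `file_path`, and `time_taken`, and a `download_file` endpoint. A minimal sketch of an app satisfying that contract; this is an assumed outline, not the app.py uploaded in this commit, and `generate_smiles` is a placeholder:

import os
import time
from flask import Flask, render_template, request, send_file

app = Flask(__name__)
# assumed output location, resolved to an absolute path for send_file
OUTPUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Samples', 'generated_smiles.txt')

def generate_smiles(sequence):
    # placeholder for the real embedding + generation pipeline
    return ['CCO']

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        sequence = request.form.get('sequence', '').strip()
        if not sequence:
            return render_template('index.html', message='Please enter a protein sequence.')
        start = time.time()
        smiles_list = generate_smiles(sequence)
        os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
        with open(OUTPUT_PATH, 'w') as f:
            f.write('\n'.join(smiles_list))
        return render_template('index.html', message='SMILES generated successfully.',
                               file_path=OUTPUT_PATH, time_taken=round(time.time() - start, 2))
    return render_template('index.html')

@app.route('/download')
def download_file():
    return send_file(OUTPUT_PATH, as_attachment=True)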