
Commit 719da58

Update pipeline (#9)

1 parent 4e99832 commit 719da58

File tree: 2 files changed, +44 -26 lines


Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,15 +1,15 @@
 # Base image containing the installed gt4sd environment
 #FROM drugilsberg/gt4sd-base:v1.4.2-cpu
-FROM quay.io/gt4sd/gt4sd-base:v1.4.2-cpu
+FROM quay.io/gt4sd/gt4sd-base:v1.5.0-cpu


 # Certs for git clone
 RUN apt-get update && \
     apt-get install -y git ca-certificates && \
     apt-get clean

-RUN git clone https://github.com/GT4SD/molecular-design.git
 WORKDIR /workspace/molecular-design
+COPY . .

 # hack: We need to use the pypi toxsmi package, not the default one
 RUN pip uninstall --yes toxsmi && pip install toxsmi && mkdir data
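Since the sources are now copied from the Docker build context (COPY . .) rather than cloned during the build, the image presumably has to be built from a checkout of the repository root, for example (image tag illustrative):

    docker build -t molecular-design .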

scripts/load_data.py

Lines changed: 42 additions & 24 deletions
@@ -1,9 +1,11 @@
+from typing import Optional
 import requests
 import os
 import pandas as pd
 import argparse
 from sklearn.model_selection import train_test_split
 from helpers import utils
+from time import sleep


 parser = argparse.ArgumentParser()
@@ -38,19 +40,27 @@
     action="store_true",
     help="Enable binary classification. If not specified, the default mode is regression.",
 )
+parser.add_argument(
+    "--max_retries",
+    type=int,
+    default=10,
+    help="Maximum number of retries to fetch data from the fickle BindingDB API",
+)


 def fetch(
     uniprot: str,
     affinity_cutoff: int,
     affinity_type: str,
-) -> pd.DataFrame:
-    url = f"https://bindingdb.org/rest/getLigandsByUniprots?uniprot={uniprot}&cutoff={affinity_cutoff}&response=application/json"
+) -> Optional[pd.DataFrame]:
+    url = f"https://www.bindingdb.org/rest/getLigandsByUniprots?uniprot={uniprot}&cutoff={affinity_cutoff}&response=application/json"
     response = requests.get(url)
-    assert response.status_code == 200, "[x] Failed to fetch data from bindingdb"
+    assert response.status_code == 200, f"Response {response.status_code}: Failed to fetch data from bindingdb"

     data = response.json()
-    affinities = data["getLigandsByUniprotsResponse"]["affinities"]
+    if "getLigandsByUniprotsResponse" not in data:
+        return
+    affinities = data["getLigandsByUniprotsResponse"]["affinities"]
     df = pd.DataFrame(affinities)
     df = df[df["affinity_type"] == affinity_type]
     df = df[["smile", "monomerid", "affinity"]]
@@ -64,25 +74,33 @@ def fetch(

 if __name__ == "__main__":
     args = parser.parse_args()
-    dataset = fetch(args.uniprot, args.affinity_cutoff, args.affinity_type)

-    # Three files: mols.smi lists all the SMILES; then we have train.csv and valid.csv.
-    mol_path = os.path.join(args.output_dir, "mols.smi")
-    train_path = os.path.join(args.output_dir, "train.csv")
-    val_path = os.path.join(args.output_dir, "valid.csv")
-    # Save smiles and ids without a header. Note that this dataset uses a tab delimiter.
-    dataset[["smile", "monomerid"]].to_csv(
-        mol_path, index=False, header=False, sep="\t"
-    )
-    # The training dataset has columns Label,sampling_frequency,mol_id.
-    dataset = dataset.rename(columns={"affinity": "Label", "monomerid": "mol_id"})
-    dataset["sampling_frequency"] = "high"
-    dataset = dataset[["Label", "sampling_frequency", "mol_id"]]
+    for attempt in range(args.max_retries):
+        dataset = fetch(args.uniprot, args.affinity_cutoff, args.affinity_type)
+        if dataset is not None:
+            break
+        sleep(5)
+
+    if dataset is None:
+        print(f"BindingDB API did not respond even after {args.max_retries} attempts.")
+    else:
+        # Three files: mols.smi lists all the SMILES; then we have train.csv and valid.csv.
+        mol_path = os.path.join(args.output_dir, "mols.smi")
+        train_path = os.path.join(args.output_dir, "train.csv")
+        val_path = os.path.join(args.output_dir, "valid.csv")
+        # Save smiles and ids without a header. Note that this dataset uses a tab delimiter.
+        dataset[["smile", "monomerid"]].to_csv(
+            mol_path, index=False, header=False, sep="\t"
+        )
+        # The training dataset has columns Label,sampling_frequency,mol_id.
+        dataset = dataset.rename(columns={"affinity": "Label", "monomerid": "mol_id"})
+        dataset["sampling_frequency"] = "high"
+        dataset = dataset[["Label", "sampling_frequency", "mol_id"]]

-    if args.binary_labels:
-        dataset["Label"] = dataset["Label"].apply(lambda x: 1 if x > 6 else 0)
-    train, validation = train_test_split(
-        dataset, train_size=args.train_size, random_state=1911
-    )
-    train.to_csv(train_path, index=False, header=True)
-    validation.to_csv(val_path, index=False, header=True)
+        if args.binary_labels:
+            dataset["Label"] = dataset["Label"].apply(lambda x: 1 if x > 6 else 0)
+        train, validation = train_test_split(
+            dataset, train_size=args.train_size, random_state=1911
+        )
+        train.to_csv(train_path, index=False, header=True)
+        validation.to_csv(val_path, index=False, header=True)
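The key change above is a bounded retry loop around the flaky BindingDB endpoint. Below is a minimal, self-contained sketch of the same pattern; fetch_once, MAX_RETRIES, and the example UniProt ID are illustrative, not part of the repository, and unlike the script's assert this sketch also retries on non-200 responses:

    from time import sleep
    from typing import Optional

    import pandas as pd
    import requests

    MAX_RETRIES = 10  # mirrors the script's --max_retries default


    def fetch_once(uniprot: str, cutoff: int) -> Optional[pd.DataFrame]:
        """Return the affinity table, or None on any transient failure."""
        url = (
            "https://www.bindingdb.org/rest/getLigandsByUniprots"
            f"?uniprot={uniprot}&cutoff={cutoff}&response=application/json"
        )
        response = requests.get(url)
        if response.status_code != 200:
            return None
        data = response.json()
        # The API sometimes answers 200 without the expected payload.
        if "getLigandsByUniprotsResponse" not in data:
            return None
        return pd.DataFrame(data["getLigandsByUniprotsResponse"]["affinities"])


    dataset = None
    for attempt in range(MAX_RETRIES):
        dataset = fetch_once("P05067", 1000)  # example UniProt ID and affinity cutoff
        if dataset is not None:
            break
        sleep(5)  # fixed five-second pause between attempts, as in the script

A fixed five-second sleep keeps the change simple; exponential back-off would be the usual refinement if the endpoint stays slow under load.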
