
Commit 719da58

Update pipeline (#9)

1 parent 4e99832 commit 719da58

File tree: 2 files changed, +44 -26 lines


Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,15 +1,15 @@
 # Base image containing the installed gt4sd environment
 #FROM drugilsberg/gt4sd-base:v1.4.2-cpu
-FROM quay.io/gt4sd/gt4sd-base:v1.4.2-cpu
+FROM quay.io/gt4sd/gt4sd-base:v1.5.0-cpu


 # Certs for git clone
 RUN apt-get update && \
     apt-get install -y git ca-certificates && \
     apt-get clean

-RUN git clone https://github.com/GT4SD/molecular-design.git
 WORKDIR /workspace/molecular-design
+COPY . .

 # hack: We need to use the pypi toxsmi package, not the default one
 RUN pip uninstall --yes toxsmi && pip install toxsmi && mkdir data
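Since the sources are now copied from the Docker build context (COPY . .) rather than cloned during the build, the image presumably has to be built from a checkout of the repository root, for example (image tag illustrative):

    docker build -t molecular-design .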

scripts/load_data.py

Lines changed: 42 additions & 24 deletions
@@ -1,9 +1,11 @@
+from typing import Optional
 import requests
 import os
 import pandas as pd
 import argparse
 from sklearn.model_selection import train_test_split
 from helpers import utils
+from time import sleep


 parser = argparse.ArgumentParser()
@@ -38,19 +40,27 @@
     action="store_true",
     help="Enable binary classification. If not specified, the default mode is regression.",
 )
+parser.add_argument(
+    "--max_retries",
+    type=int,
+    default=10,
+    help="Maximum number of retries to fetch data from the fickle BindingDB API",
+)


 def fetch(
     uniprot: str,
     affinity_cutoff: int,
     affinity_type: str,
-) -> pd.DataFrame:
-    url = f"https://bindingdb.org/rest/getLigandsByUniprots?uniprot={uniprot}&cutoff={affinity_cutoff}&response=application/json"
+) -> Optional[pd.DataFrame]:
+    url = f"https://www.bindingdb.org/rest/getLigandsByUniprots?uniprot={uniprot}&cutoff={affinity_cutoff}&response=application/json"
     response = requests.get(url)
-    assert response.status_code == 200, "[x] Failed to fetch data from bindingdb"
+    assert response.status_code == 200, f"Response {response.status_code}: Failed to fetch data from bindingdb"

     data = response.json()
-    affinities = data["getLigandsByUniprotsResponse"]["affinities"]
+    if "getLigandsByUniprotsResponse" not in data:
+        return
+    affinities = data["getLigandsByUniprotsResponse"]["affinities"]
     df = pd.DataFrame(affinities)
     df = df[df["affinity_type"] == affinity_type]
     df = df[["smile", "monomerid", "affinity"]]
@@ -64,25 +74,33 @@ def fetch(

 if __name__ == "__main__":
     args = parser.parse_args()
-    dataset = fetch(args.uniprot, args.affinity_cutoff, args.affinity_type)

-    # Three files: mols.smi lists all the SMILES; then we have train.csv and valid.csv.
-    mol_path = os.path.join(args.output_dir, "mols.smi")
-    train_path = os.path.join(args.output_dir, "train.csv")
-    val_path = os.path.join(args.output_dir, "valid.csv")
-    # Save smiles and ids without a header. Note that this dataset uses a tab delimiter.
-    dataset[["smile", "monomerid"]].to_csv(
-        mol_path, index=False, header=False, sep="\t"
-    )
-    # The training dataset has columns Label,sampling_frequency,mol_id.
-    dataset = dataset.rename(columns={"affinity": "Label", "monomerid": "mol_id"})
-    dataset["sampling_frequency"] = "high"
-    dataset = dataset[["Label", "sampling_frequency", "mol_id"]]
+    for attempt in range(args.max_retries):
+        dataset = fetch(args.uniprot, args.affinity_cutoff, args.affinity_type)
+        if dataset is not None:
+            break
+        sleep(5)
+
+    if dataset is None:
+        print(f"BindingDB API did not respond even after {args.max_retries} attempts.")
+    else:
+        # Three files: mols.smi lists all the SMILES; then we have train.csv and valid.csv.
+        mol_path = os.path.join(args.output_dir, "mols.smi")
+        train_path = os.path.join(args.output_dir, "train.csv")
+        val_path = os.path.join(args.output_dir, "valid.csv")
+        # Save smiles and ids without a header. Note that this dataset uses a tab delimiter.
+        dataset[["smile", "monomerid"]].to_csv(
+            mol_path, index=False, header=False, sep="\t"
+        )
+        # The training dataset has columns Label,sampling_frequency,mol_id.
+        dataset = dataset.rename(columns={"affinity": "Label", "monomerid": "mol_id"})
+        dataset["sampling_frequency"] = "high"
+        dataset = dataset[["Label", "sampling_frequency", "mol_id"]]

-    if args.binary_labels:
-        dataset["Label"] = dataset["Label"].apply(lambda x: 1 if x > 6 else 0)
-    train, validation = train_test_split(
-        dataset, train_size=args.train_size, random_state=1911
-    )
-    train.to_csv(train_path, index=False, header=True)
-    validation.to_csv(val_path, index=False, header=True)
+        if args.binary_labels:
+            dataset["Label"] = dataset["Label"].apply(lambda x: 1 if x > 6 else 0)
+        train, validation = train_test_split(
+            dataset, train_size=args.train_size, random_state=1911
+        )
+        train.to_csv(train_path, index=False, header=True)
+        validation.to_csv(val_path, index=False, header=True)
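The key change above is a bounded retry loop around the flaky BindingDB endpoint. Below is a minimal, self-contained sketch of the same pattern; fetch_once, MAX_RETRIES, and the example UniProt ID are illustrative, not part of the repository, and unlike the script's assert this sketch also retries on non-200 responses:

    from time import sleep
    from typing import Optional

    import pandas as pd
    import requests

    MAX_RETRIES = 10  # mirrors the script's --max_retries default


    def fetch_once(uniprot: str, cutoff: int) -> Optional[pd.DataFrame]:
        """Return the affinity table, or None on any transient failure."""
        url = (
            "https://www.bindingdb.org/rest/getLigandsByUniprots"
            f"?uniprot={uniprot}&cutoff={cutoff}&response=application/json"
        )
        response = requests.get(url)
        if response.status_code != 200:
            return None
        data = response.json()
        # The API sometimes answers 200 without the expected payload.
        if "getLigandsByUniprotsResponse" not in data:
            return None
        return pd.DataFrame(data["getLigandsByUniprotsResponse"]["affinities"])


    dataset = None
    for attempt in range(MAX_RETRIES):
        dataset = fetch_once("P05067", 1000)  # example UniProt ID and affinity cutoff
        if dataset is not None:
            break
        sleep(5)  # fixed five-second pause between attempts, as in the script

A fixed five-second sleep keeps the change simple; exponential back-off would be the usual refinement if the endpoint stays slow under load.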
