from typing import Optional
import requests
import os
import pandas as pd
import argparse
from sklearn.model_selection import train_test_split
from helpers import utils
from time import sleep


parser = argparse.ArgumentParser()
parser.add_argument(
    "--binary_labels",
    action="store_true",
    help="Enable binary classification. If not specified, the default mode is regression.",
)
parser.add_argument(
    "--max_retries",
    type=int,
    default=10,
    help="Maximum number of retries when fetching data from the fickle BindingDB API.",
)
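# Example invocation (hypothetical script name and argument values; --uniprot,
# --affinity_cutoff, --affinity_type, --output_dir, and --train_size are defined
# earlier in this file and referenced below):
#
#   python fetch_bindingdb.py --uniprot P00533 --affinity_cutoff 10000 \
#       --affinity_type IC50 --output_dir data --train_size 0.9 --binary_labels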


def fetch(
    uniprot: str,
    affinity_cutoff: int,
    affinity_type: str,
) -> Optional[pd.DataFrame]:
    url = f"https://www.bindingdb.org/rest/getLigandsByUniprots?uniprot={uniprot}&cutoff={affinity_cutoff}&response=application/json"
    response = requests.get(url)
    assert response.status_code == 200, f"Response {response.status_code}: Failed to fetch data from BindingDB"

    data = response.json()
    # The API spells this response key "getLindsByUniprotsResponse"; when it is
    # missing, the request failed, so return None and let the caller retry.
    if "getLindsByUniprotsResponse" not in data:
        return None
    affinities = data["getLindsByUniprotsResponse"]["affinities"]
    df = pd.DataFrame(affinities)
    df = df[df["affinity_type"] == affinity_type]
    df = df[["smile", "monomerid", "affinity"]]
    # ...
    return df


if __name__ == "__main__":
    args = parser.parse_args()

    # Retry with a fixed 5-second pause: the BindingDB API is flaky and sometimes
    # returns a payload without the expected response key.
    dataset = None
    for attempt in range(args.max_retries):
        dataset = fetch(args.uniprot, args.affinity_cutoff, args.affinity_type)
        if dataset is not None:
            break
        sleep(5)

    if dataset is None:
        print(f"BindingDB API did not respond after {args.max_retries} attempts.")
    else:
        # Three output files: mols.smi (all the SMILES), train.csv, and valid.csv.
        mol_path = os.path.join(args.output_dir, "mols.smi")
        train_path = os.path.join(args.output_dir, "train.csv")
        val_path = os.path.join(args.output_dir, "valid.csv")
        # Save the SMILES and ids without a header. Note that this file uses a tab delimiter.
        dataset[["smile", "monomerid"]].to_csv(
            mol_path, index=False, header=False, sep="\t"
        )
        # The training dataset has the columns Label, sampling_frequency, mol_id.
        dataset = dataset.rename(columns={"affinity": "Label", "monomerid": "mol_id"})
        dataset["sampling_frequency"] = "high"
        dataset = dataset[["Label", "sampling_frequency", "mol_id"]]

        if args.binary_labels:
            dataset["Label"] = dataset["Label"].apply(lambda x: 1 if x > 6 else 0)
        train, validation = train_test_split(
            dataset, train_size=args.train_size, random_state=1911
        )
        train.to_csv(train_path, index=False, header=True)
        validation.to_csv(val_path, index=False, header=True)
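# mols.smi is tab-delimited with no header; each row is a SMILES string and its
# BindingDB monomer id (illustrative values only):
#
#   CC(=O)Oc1ccccc1C(=O)O	50123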