|
| 1 | +# |
| 2 | +# This script can be used for any purpose without limitation subject to the |
| 3 | +# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx |
| 4 | +# |
| 5 | +# This permission notice and the following statement of attribution must be |
| 6 | +# included in all copies or substantial portions of this script. |
| 7 | +# |
| 8 | +# 2022-02-28: Submitted by Abhik Mukhopadhyay , The Cambridge Crystallographic Data Centre |
| 9 | +# |
| 10 | + |
| 11 | +import requests |
| 12 | +import urllib.request, urllib.parse, urllib.error |
| 13 | +import os |
| 14 | +import argparse |
| 15 | + |
| 16 | +from ccdc import io |
| 17 | +from ccdc import conformer |
| 18 | +from ccdc.molecule import Molecule |
| 19 | +from ccdc.descriptors import MolecularDescriptors |
| 20 | + |
| 21 | +__doc__ = """ |
| 22 | +Starting from a list of PDB-codes, the script generates idealized conformers |
| 23 | +for ligands and evaluates their RMSD to the conformation in the PDB. |
| 24 | +""" |
| 25 | + |
| 26 | +__author__ = 'Brandl, Giangreco, Higueruelo, Schaerfer and Sykes' |
| 27 | + |
| 28 | + |
| 29 | +excluded_hetids = set(['NAP', 'MA4', 'EOH', 'AGC', 'EOM', 'BGC', 'HEM', 'CPS', 'CPT', |
| 30 | + 'TAU', 'TAR', 'TAS', 'SPK', '1FH', 'SPM', 'VA3', 'SPD', |
| 31 | + 'XPE', 'GD', 'GA', 'EHN', 'CO5', 'LAK', 'V70', 'PE4', |
| 32 | + 'MAC', 'LAT', 'CP2', 'CP3', 'AR', 'MAN', 'CMO', 'TSM', |
| 33 | + 'DTN', 'TSD', 'TSE', 'YBT', 'EDO', 'PCL', 'CB5', 'BEQ', |
| 34 | + 'F09', 'DTV', 'NO2', 'NO3', 'DZZ', 'IUM', 'UVW', 'NME', |
| 35 | + 'ZN', 'NFC', 'NFB', 'NFO', 'K', 'OH', 'NFS', 'NFR', |
| 36 | + 'MQD', 'MG', 'CBG', 'NOE', 'MO', 'MN', 'PC4', 'CBM', |
| 37 | + 'SE', 'CBX', 'MXE', 'BE7', 'BCT', 'DCE', 'W', 'GLO', |
| 38 | + 'MM4', 'GLC', 'DOD', 'SMO', 'GLY', 'DOX', 'FE', 'OF2', |
| 39 | + 'DET', 'C2C', 'FCY', 'MDD', 'AU3', 'FCL', 'FCO', 'HTG', |
| 40 | + 'AUC', 'VER', 'F', 'NFE', 'FMT', 'FMS', 'JEF', 'V', |
| 41 | + 'TOU', 'SX', 'OF3', 'SBT', 'SR', 'SBY', 'LMT', 'LMU', |
| 42 | + 'T3A', 'SM', 'SBE', 'SB', 'SBO', 'MMC', 'YH', 'DPR', |
| 43 | + 'YB', 'CAT', 'DPG', 'DPO', 'CAC', 'TF4', 'CAD', 'NHE', |
| 44 | + 'PGE', 'LA', 'PGO', 'PGL', 'HG2', 'LI', 'LU', 'PGR', |
| 45 | + 'PGQ', 'NHY', '144', 'NMU', 'PG6', 'NH4', 'WCC', |
| 46 | + 'PG5', 'ZRC', 'PG0', 'B7G', 'HGC', 'CFN', 'CFO', 'CFM', |
| 47 | + 'TFH', 'NCP', 'I42', 'TFA', 'PNL', 'NCO', 'Y1', 'CFT', |
| 48 | + 'KO4', 'SF4', 'ZNH', 'ZNO', 'SF3', 'FNE', 'EGL', '7PE', |
| 49 | + 'RU', 'LBT', 'SAL', 'RE', 'RB', 'BF4', 'URE', 'MRY', |
| 50 | + 'CHM', 'ACM', 'MRD', 'IPA', 'IPH', 'ZN3', 'ZN2', 'ZN1', |
| 51 | + 'LI1', '4OA', 'RHD', 'OCM', 'OCL', 'OCO', 'OCN', 'NGN', |
| 52 | + 'HDZ', 'P2K', 'V40', 'HDD', 'HDN', 'MG8', 'BMA', '2FU', |
| 53 | + 'BME', 'WO5', 'WO4', 'OX', 'YT3', 'WO2', 'CEQ', '2FH', |
| 54 | + 'AF3', 'ENC', 'GCD', '2HP', 'B3P', 'CE1', 'EU', 'XCC', |
| 55 | + 'MGF', 'TBR', 'TBU', 'PG4', 'OC8', 'BTC', 'BTB', 'OC1', |
| 56 | + 'OC3', 'OC2', 'OC5', 'OC4', 'OC7', 'OC6', 'NH3', '3OF', |
| 57 | + 'ATO', 'HGI', 'DMF', 'AST', 'MTG', 'SEA', 'SEK', 'GBL', |
| 58 | + 'EU3', 'CLN', 'CLO', 'DUM', 'UNX', 'BEF', 'CLF', 'PDT', |
| 59 | + 'XE', 'PDO', 'TLA', 'CLX', 'DMN', 'FRU', 'BBX', 'BBU', |
| 60 | + 'CLP', 'OMO', 'H2S', 'ZEM', 'KR', 'SRM', 'SE4', 'P6G', |
| 61 | + '2ME', '2MO', 'CYS', '3PO', 'POR', 'POP', 'PON', '2BM', |
| 62 | + 'CYN', 'GOL', '202', 'F50', 'DHE', 'DHD', 'PO2', 'PO3', |
| 63 | + 'MLP', 'OTE', 'TFP', 'E1H', 'MCR', 'NRU', '3CO', '3CN', |
| 64 | + '2PA', '2PE', 'NI2', 'NI3', 'NI1', '2PO', '2PN', 'TCN', |
| 65 | + 'HSG', 'HSH', 'TZZ', '12P', '543', 'OF0', 'OF1', 'HSW', |
| 66 | + 'C2O', 'MEE', 'N2O', 'PD', 'VXA', 'GPX', 'THJ', 'BNG', |
| 67 | + 'NIK', 'F2O', 'SYL', 'CCH', 'FDE', 'FDD', 'EMC', 'MO3', |
| 68 | + 'MO2', 'MO1', 'CRY', 'MO7', 'MO6', 'MO5', 'MO4', 'DMS', |
| 69 | + 'DMU', 'DMX', 'RGI', 'AEM', 'PS5', 'PR', 'LDA', 'PT', |
| 70 | + 'PB', 'ZO3', 'TCH', 'PI', 'MH2', 'MH3', 'MHM', 'BU2', |
| 71 | + 'BU3', 'DDQ', 'BU1', 'MHA', 'S', 'DDH', 'T1A', 'LCP', |
| 72 | + 'MOS', 'O4M', 'MOH', 'MOO', 'LCO', 'BCN', 'UMQ', 'NH2', |
| 73 | + 'HNI', 'PEU', 'PER', 'FSO', 'TMA', 'ETA', 'ETF', 'CD1', |
| 74 | + 'PEJ', 'CD3', 'FSX', 'DVT', 'PEG', 'COM', 'RU7', 'CON', |
| 75 | + 'HE5', 'HE6', 'BF2', '16D', '16A', 'P33', 'ND4', 'HEV', |
| 76 | + 'CO', 'CN', 'CM', 'HES', 'CA', 'HEZ', 'CD', 'HEG', |
| 77 | + 'HEA', 'HEB', 'HEC', 'CS', 'CR', 'CU', 'IOD', 'PE8', |
| 78 | + 'PSL', 'OES', 'PE5', 'PE7', 'PE3', 'FS1', 'FS2', 'FS3', |
| 79 | + 'FS4', 'PC3', 'OEC', 'NMO', 'NML', 'DIO', '6MO', 'CIT', |
| 80 | + 'DIA', 'C8E', '1PS', 'C10', 'C15', 'DIS', 'PPF', 'SOG', |
| 81 | + 'PPK', 'SOH', 'PPI', 'IR3', 'SOM', 'TBA', 'PPM', 'SOR', |
| 82 | + 'PPV', 'GAI', 'XL1', 'MBR', 'IR', '3NI', 'IRI', 'SO3', |
| 83 | + 'SO2', 'SO4', 'MTL', 'IN', 'VX', 'PP9', 'MTF', 'CM5', |
| 84 | + 'MTD', 'ARS', 'ART', 'SDS', 'ARF', '4MO', 'BA', 'NAO', |
| 85 | + 'CCN', 'NAW', 'BR', 'EPE', 'O2', 'BOG', 'PIN', 'MYQ', |
| 86 | + 'MYR', 'POL', 'PIG', 'N8E', 'F3S', 'IOH', 'PIS', 'CD5', |
| 87 | + 'BO3', 'BO4', 'DR6', 'NA2', 'PEO', 'NA6', 'CXE', 'NA5', |
| 88 | + 'OS', 'REO', 'U1', 'VO4', 'FLH', 'HO', 'FLO', 'COH', |
| 89 | + 'EEE', 'HG', 'PTN', 'ETI', 'SFO', 'ETN', 'MNR', 'VO3', |
| 90 | + 'MNH', 'OSM', 'ROP', 'SFN', 'MNC', 'PO4', '3BR', 'FMI', |
| 91 | + 'MN3', 'TRE', 'MN5', 'O', 'PBM', 'TRS', 'PT4', 'TRT', |
| 92 | + 'MW3', 'MW2', 'MPD', 'HTO', 'MPO', 'MTO', '6WO', 'MPR', |
| 93 | + 'MPS', 'AU', 'OXY', 'ACA', 'GD3', 'SUC', 'NEH', 'IDT', |
| 94 | + 'SUL', 'P4C', 'ACN', '2OF', 'S0H', 'IDO', 'HFM', 'ACU', |
| 95 | + 'ACT', '2OP', 'ACY', '2OS', 'NI', 'CU1', 'NO', 'NA', |
| 96 | + 'MW1', 'TEE', 'FE2', 'TEP', '1BO', 'FEC', 'FEA', 'FEO', |
| 97 | + 'FEL', 'CUZ', 'FES', 'CUA', 'CUO', 'CUN', 'CUM', 'CUL', |
| 98 | + 'PTL', 'MN6', 'BRP', 'BRO', 'BRM', 'BRJ', 'MES', 'OUT', |
| 99 | + 'AZI', 'MP1', '1PG', '1PE', 'MSM', 'HLT', 'MSF', 'CN1', |
| 100 | + 'CL', 'TL', 'SGM', 'HO3', 'TE', 'TB', 'ICI', 'AG', |
| 101 | + 'FPO', '1CP', 'AL', 'HOH', 'CNF', 'SCN', 'CNB', 'CE', |
| 102 | + 'CNN', 'CNM', 'ICT', '15P', 'ZH3', 'RTC', '2NO', 'DXG', |
| 103 | + 'DXE', 'VSO', 'MSE', 'FAR', 'FII', 'FPP', 'HFP', 'A4P', 'NAD', |
| 104 | + 'ADP', 'AMP', 'APC', 'ATP', 'IMP', 'RVP', 'XMP', 'SAH', 'SAM', |
| 105 | + 'FAD', 'NAP', 'NDP', 'UDP', 'ANP', 'COA', 'CME', 'UMP', 'NAI']) |
| 106 | + |
| 107 | + |
| 108 | + |
| 109 | +class SearchAPI: |
| 110 | + def __init__(self, search_url): |
| 111 | + self.search_url = search_url |
| 112 | + self.search_options = '&wt=json&rows=100000' |
| 113 | + |
| 114 | + |
| 115 | + def url_response(self, url): |
| 116 | + """ |
| 117 | + Getting JSON response from URL |
| 118 | + :param url: String |
| 119 | + :return: JSON |
| 120 | + """ |
| 121 | + r = requests.get(url=url) |
| 122 | + # Status code 200 means 'OK' |
| 123 | + if r.status_code == 200: |
| 124 | + json_result = r.json() |
| 125 | + return json_result |
| 126 | + else: |
| 127 | + print(r.status_code, r.reason) |
| 128 | + return None |
| 129 | + |
| 130 | + def run_search(self): |
| 131 | + """ |
| 132 | + Running search with terms |
| 133 | + Check pdbe search api documentation for more details |
| 134 | + :param pdbe_search_term: String |
| 135 | + :return: JSON |
| 136 | + """ |
| 137 | + full_query = self.search_url |
| 138 | + response = self.url_response(full_query) |
| 139 | + return response |
| 140 | + |
| 141 | + |
| 142 | +def get_sdf_hetid(pdbid, hetid, pdb_lig_filename): |
| 143 | + |
| 144 | + """ |
| 145 | + downloads ligand sdf file using RCSB ligand expo API |
| 146 | + """ |
| 147 | + |
| 148 | + hetid = str(hetid) |
| 149 | + sflag = False |
| 150 | + url = "http://ligand-expo.rcsb.org/files/%s/%s/isdf/" % ( |
| 151 | + hetid[0], hetid) |
| 152 | + |
| 153 | + try: |
| 154 | + response = urllib.request.urlopen(url) |
| 155 | + except Exception: |
| 156 | + print(" can't find the ligand for " + str(pdbid)) |
| 157 | + else: |
| 158 | + lines = response.readlines() |
| 159 | + sdf_file = '' |
| 160 | + for line in lines: |
| 161 | + try: |
| 162 | + sdf_filename = line.decode().split('<li><a href="')[1].split('"')[0] |
| 163 | + if sdf_filename.startswith(pdbid.lower()) and sdf_filename[-7] != 'D': |
| 164 | + sdf_url = url + sdf_filename |
| 165 | + sdf_file = urllib.request.urlopen(sdf_url).read() |
| 166 | + break |
| 167 | + except Exception: |
| 168 | + continue |
| 169 | + if sdf_file: |
| 170 | + with open(pdb_lig_filename, 'wb') as f: |
| 171 | + f.write(sdf_file) |
| 172 | + sflag = True |
| 173 | + else: |
| 174 | + print(" can't find a ligand for " + str(pdbid) + " or it is disordered") |
| 175 | + return sflag |
| 176 | + |
| 177 | + |
| 178 | +def get_smi_from_hetid(hetid): |
| 179 | + """ |
| 180 | + obtain SMILES string for hetid using PDBe REST API |
| 181 | + """ |
| 182 | + smi_query = SearchAPI( |
| 183 | + search_url='https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{}'.format(hetid)) |
| 184 | + try: |
| 185 | + smi_call = smi_query.run_search() |
| 186 | + except Exception: |
| 187 | + print("problem accessing the compound API service") |
| 188 | + else: |
| 189 | + for x, y in smi_call.items(): |
| 190 | + smi = (y[0]['smiles'][0]['name']) |
| 191 | + |
| 192 | + return smi |
| 193 | + |
| 194 | + |
| 195 | +def get_3d_sdf_from_smi(smi, sdf_filename): |
| 196 | + |
| 197 | + """ |
| 198 | + Generate 3D coordinate from SMILES string |
| 199 | + """ |
| 200 | + |
| 201 | + sflag = False |
| 202 | + conformer_generator = conformer.ConformerGenerator() |
| 203 | + conformer_generator.settings.max_conformers = 1 |
| 204 | + m = Molecule.from_string(smi) |
| 205 | + conformers = conformer_generator.generate(m) |
| 206 | + with io.EntryWriter(sdf_filename) as writer: |
| 207 | + writer.write(conformers[0].molecule) |
| 208 | + sflag = True |
| 209 | + |
| 210 | + return sflag |
| 211 | + |
| 212 | + |
| 213 | +def get_hetids_from_pdbid(pdbid): |
| 214 | + """ |
| 215 | + Find hetids in PDB entry using PDBe REST API and then return unique hetid that |
| 216 | + are not present in the excluded_hetids list above |
| 217 | + """ |
| 218 | + chem_comp_query = SearchAPI( |
| 219 | + search_url='https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/{}'.format(pdbid)) |
| 220 | + try: |
| 221 | + cc_call = chem_comp_query.run_search() |
| 222 | + except Exception: |
| 223 | + print("problem accesing the entry API") |
| 224 | + else: |
| 225 | + all_cc_list = [] |
| 226 | + for x, y in cc_call.items(): |
| 227 | + for z in y: |
| 228 | + cc_id = (z['chem_comp_id']) |
| 229 | + all_cc_list.append(cc_id) |
| 230 | + cc_list = list(set(all_cc_list)) |
| 231 | + |
| 232 | + hetids = [x for x in cc_list if x not in excluded_hetids] |
| 233 | + |
| 234 | + return hetids |
| 235 | + |
| 236 | + |
| 237 | +def write_conformers(mol_input, mol_reference, conf_file): |
| 238 | + """ |
| 239 | + Generate conformer and calculate rmsd wrt reference |
| 240 | + """ |
| 241 | + top_rmsd = -1 |
| 242 | + conformer_generator = conformer.ConformerGenerator() |
| 243 | + conformer_generator.settings.superimpose_conformers_onto_reference = True |
| 244 | + conformers = conformer_generator.generate(mol_input) |
| 245 | + if conformers: |
| 246 | + best_rmsd = [ |
| 247 | + MolecularDescriptors.rmsd(mol_reference, c.molecule, None, |
| 248 | + True, True, True) |
| 249 | + for c in conformers |
| 250 | + ] |
| 251 | + best_rmsd.sort() |
| 252 | + top_rmsd = best_rmsd[0] |
| 253 | + with io.MoleculeWriter(conf_file) as mol_writer: |
| 254 | + for c in conformers: |
| 255 | + mol_writer.write(c.molecule) |
| 256 | + pass |
| 257 | + rmsd, norm_score = round(conformers[0].rmsd(), 2), conformers[ |
| 258 | + 0].normalised_score |
| 259 | + else: |
| 260 | + rmsd, norm_score = '', '' |
| 261 | + return rmsd, norm_score, top_rmsd |
| 262 | + |
| 263 | + |
| 264 | +def generate_conformers_for_all_hetids(pdbid, folder_path='conformers_'): |
| 265 | + """ |
| 266 | + generate conformer for all hetid |
| 267 | + """ |
| 268 | + working_dir = folder_path + pdbid |
| 269 | + if not os.path.exists(working_dir): |
| 270 | + os.makedirs(working_dir) |
| 271 | + |
| 272 | + list_hetids = get_hetids_from_pdbid(pdbid) |
| 273 | + results = [] |
| 274 | + for hetid in list_hetids: |
| 275 | + print('doing hetid ' + str(hetid)) |
| 276 | + reference_flag, input_flag = False, False |
| 277 | + pdb_lig_filename = os.path.join(working_dir, str(hetid) + '_reference.sdf') |
| 278 | + model_lig_filename = os.path.join(working_dir, str(hetid) + '_model.sdf') |
| 279 | + reference_flag = get_sdf_hetid(pdbid, hetid, pdb_lig_filename) |
| 280 | + input_smi = get_smi_from_hetid(hetid) |
| 281 | + if input_smi: |
| 282 | + input_flag = get_3d_sdf_from_smi(input_smi, model_lig_filename) |
| 283 | + if reference_flag: |
| 284 | + mol_reader_ref = io.MoleculeReader(pdb_lig_filename) |
| 285 | + try: |
| 286 | + mol_reference = mol_reader_ref[0] |
| 287 | + except Exception: |
| 288 | + continue |
| 289 | + if input_flag: |
| 290 | + mol_reader_input = io.MoleculeReader(model_lig_filename) |
| 291 | + try: |
| 292 | + mol_input = mol_reader_input[0] |
| 293 | + except Exception: |
| 294 | + continue |
| 295 | + count = 0 |
| 296 | + if reference_flag and input_flag: |
| 297 | + if len(mol_input.heavy_atoms) == len(mol_reference.heavy_atoms): |
| 298 | + rb = len([b for b in mol_input.bonds if b.is_rotatable]) |
| 299 | + if len(mol_input.heavy_atoms) > 6 and rb > 0: |
| 300 | + # molecule needs hydrogens, set atom types, |
| 301 | + # and aromatic bonds to match mogul libraries |
| 302 | + mol_input.add_hydrogens() |
| 303 | + mol_reference.add_hydrogens() |
| 304 | + mol_input.standardise_delocalised_bonds() |
| 305 | + mol_reference.standardise_delocalised_bonds() |
| 306 | + mol_input.assign_bond_types(which='unknown') |
| 307 | + mol_reference.assign_bond_types(which='unknown') |
| 308 | + mol_input.standardise_aromatic_bonds() |
| 309 | + mol_reference.standardise_aromatic_bonds() |
| 310 | + mw = int(round(mol_input.molecular_weight, 0)) |
| 311 | + conf_file = model_lig_filename.replace('.sdf', '_conformers.sdf') |
| 312 | + rmsd, norm_score, top_rmsd = write_conformers(mol_input, |
| 313 | + mol_reference, |
| 314 | + conf_file) |
| 315 | + output_list = ','.join(map(str, |
| 316 | + [pdbid, hetid, mw, rmsd, |
| 317 | + norm_score, top_rmsd])) |
| 318 | + results.append(output_list) |
| 319 | + return results |
| 320 | + |
| 321 | + |
| 322 | +############################################################################### |
| 323 | +# main |
| 324 | +############################################################################### |
| 325 | + |
| 326 | + |
| 327 | +def main(): |
| 328 | + parser = argparse.ArgumentParser(description=__doc__) |
| 329 | + parser.add_argument("targets", help="list of PDB-codes", type=str) |
| 330 | + |
| 331 | + args = parser.parse_args() |
| 332 | + input_filename = args.targets |
| 333 | + base_name = os.path.splitext(input_filename)[0] |
| 334 | + output_filename = base_name + "_CSD_Conformer_Results.csv" |
| 335 | + |
| 336 | + with open(input_filename) as input_file: |
| 337 | + pdbs_raw = input_file.readlines() |
| 338 | + |
| 339 | + pdbs = [] |
| 340 | + for pdb_raw in pdbs_raw: |
| 341 | + pdb = pdb_raw.strip() |
| 342 | + if pdb and len(pdb) == 4: |
| 343 | + pdbs.append(pdb) |
| 344 | + else: |
| 345 | + print('this line is not a pdbid ' + str(pdb_raw)) |
| 346 | + |
| 347 | + total = str(len(pdbs)) |
| 348 | + i = 1 |
| 349 | + |
| 350 | + with open(output_filename, 'w') as output_file: |
| 351 | + output_file.write('pdbid,hetid,mw,rmsd_lowest_energy_conformer,norm_score,lowest_rmsd\n') |
| 352 | + for pdbid in pdbs: |
| 353 | + print('processing ' + str(i) + ' of ' + total + ' PDB: ' + pdbid) |
| 354 | + i += 1 |
| 355 | + results = generate_conformers_for_all_hetids(pdbid.strip()) |
| 356 | + for result in results: |
| 357 | + output_file.write(result + '\n') |
| 358 | + |
| 359 | + |
| 360 | +if __name__ == '__main__': |
| 361 | + main() |
| 362 | + |
| 363 | + |
| 364 | + |
0 commit comments