Skip to content

Commit 0865ab9

Browse files
authored
Merge pull request #6 from ccdc-opensource/conf_gen_script_commit
2 parents 067cde8 + 7eba4ce commit 0865ab9

File tree

3 files changed

+398
-0
lines changed

3 files changed

+398
-0
lines changed
Lines changed: 364 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,364 @@
1+
#
2+
# This script can be used for any purpose without limitation subject to the
3+
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
4+
#
5+
# This permission notice and the following statement of attribution must be
6+
# included in all copies or substantial portions of this script.
7+
#
8+
# 2022-02-28: Submitted by Abhik Mukhopadhyay , The Cambridge Crystallographic Data Centre
9+
#
10+
11+
import requests
12+
import urllib.request, urllib.parse, urllib.error
13+
import os
14+
import argparse
15+
16+
from ccdc import io
17+
from ccdc import conformer
18+
from ccdc.molecule import Molecule
19+
from ccdc.descriptors import MolecularDescriptors
20+
21+
__doc__ = """
22+
Starting from a list of PDB-codes, the script generates idealized conformers
23+
for ligands and evaluates their RMSD to the conformation in the PDB.
24+
"""
25+
26+
__author__ = 'Brandl, Giangreco, Higueruelo, Schaerfer and Sykes'
27+
28+
29+
excluded_hetids = set(['NAP', 'MA4', 'EOH', 'AGC', 'EOM', 'BGC', 'HEM', 'CPS', 'CPT',
30+
'TAU', 'TAR', 'TAS', 'SPK', '1FH', 'SPM', 'VA3', 'SPD',
31+
'XPE', 'GD', 'GA', 'EHN', 'CO5', 'LAK', 'V70', 'PE4',
32+
'MAC', 'LAT', 'CP2', 'CP3', 'AR', 'MAN', 'CMO', 'TSM',
33+
'DTN', 'TSD', 'TSE', 'YBT', 'EDO', 'PCL', 'CB5', 'BEQ',
34+
'F09', 'DTV', 'NO2', 'NO3', 'DZZ', 'IUM', 'UVW', 'NME',
35+
'ZN', 'NFC', 'NFB', 'NFO', 'K', 'OH', 'NFS', 'NFR',
36+
'MQD', 'MG', 'CBG', 'NOE', 'MO', 'MN', 'PC4', 'CBM',
37+
'SE', 'CBX', 'MXE', 'BE7', 'BCT', 'DCE', 'W', 'GLO',
38+
'MM4', 'GLC', 'DOD', 'SMO', 'GLY', 'DOX', 'FE', 'OF2',
39+
'DET', 'C2C', 'FCY', 'MDD', 'AU3', 'FCL', 'FCO', 'HTG',
40+
'AUC', 'VER', 'F', 'NFE', 'FMT', 'FMS', 'JEF', 'V',
41+
'TOU', 'SX', 'OF3', 'SBT', 'SR', 'SBY', 'LMT', 'LMU',
42+
'T3A', 'SM', 'SBE', 'SB', 'SBO', 'MMC', 'YH', 'DPR',
43+
'YB', 'CAT', 'DPG', 'DPO', 'CAC', 'TF4', 'CAD', 'NHE',
44+
'PGE', 'LA', 'PGO', 'PGL', 'HG2', 'LI', 'LU', 'PGR',
45+
'PGQ', 'NHY', '144', 'NMU', 'PG6', 'NH4', 'WCC',
46+
'PG5', 'ZRC', 'PG0', 'B7G', 'HGC', 'CFN', 'CFO', 'CFM',
47+
'TFH', 'NCP', 'I42', 'TFA', 'PNL', 'NCO', 'Y1', 'CFT',
48+
'KO4', 'SF4', 'ZNH', 'ZNO', 'SF3', 'FNE', 'EGL', '7PE',
49+
'RU', 'LBT', 'SAL', 'RE', 'RB', 'BF4', 'URE', 'MRY',
50+
'CHM', 'ACM', 'MRD', 'IPA', 'IPH', 'ZN3', 'ZN2', 'ZN1',
51+
'LI1', '4OA', 'RHD', 'OCM', 'OCL', 'OCO', 'OCN', 'NGN',
52+
'HDZ', 'P2K', 'V40', 'HDD', 'HDN', 'MG8', 'BMA', '2FU',
53+
'BME', 'WO5', 'WO4', 'OX', 'YT3', 'WO2', 'CEQ', '2FH',
54+
'AF3', 'ENC', 'GCD', '2HP', 'B3P', 'CE1', 'EU', 'XCC',
55+
'MGF', 'TBR', 'TBU', 'PG4', 'OC8', 'BTC', 'BTB', 'OC1',
56+
'OC3', 'OC2', 'OC5', 'OC4', 'OC7', 'OC6', 'NH3', '3OF',
57+
'ATO', 'HGI', 'DMF', 'AST', 'MTG', 'SEA', 'SEK', 'GBL',
58+
'EU3', 'CLN', 'CLO', 'DUM', 'UNX', 'BEF', 'CLF', 'PDT',
59+
'XE', 'PDO', 'TLA', 'CLX', 'DMN', 'FRU', 'BBX', 'BBU',
60+
'CLP', 'OMO', 'H2S', 'ZEM', 'KR', 'SRM', 'SE4', 'P6G',
61+
'2ME', '2MO', 'CYS', '3PO', 'POR', 'POP', 'PON', '2BM',
62+
'CYN', 'GOL', '202', 'F50', 'DHE', 'DHD', 'PO2', 'PO3',
63+
'MLP', 'OTE', 'TFP', 'E1H', 'MCR', 'NRU', '3CO', '3CN',
64+
'2PA', '2PE', 'NI2', 'NI3', 'NI1', '2PO', '2PN', 'TCN',
65+
'HSG', 'HSH', 'TZZ', '12P', '543', 'OF0', 'OF1', 'HSW',
66+
'C2O', 'MEE', 'N2O', 'PD', 'VXA', 'GPX', 'THJ', 'BNG',
67+
'NIK', 'F2O', 'SYL', 'CCH', 'FDE', 'FDD', 'EMC', 'MO3',
68+
'MO2', 'MO1', 'CRY', 'MO7', 'MO6', 'MO5', 'MO4', 'DMS',
69+
'DMU', 'DMX', 'RGI', 'AEM', 'PS5', 'PR', 'LDA', 'PT',
70+
'PB', 'ZO3', 'TCH', 'PI', 'MH2', 'MH3', 'MHM', 'BU2',
71+
'BU3', 'DDQ', 'BU1', 'MHA', 'S', 'DDH', 'T1A', 'LCP',
72+
'MOS', 'O4M', 'MOH', 'MOO', 'LCO', 'BCN', 'UMQ', 'NH2',
73+
'HNI', 'PEU', 'PER', 'FSO', 'TMA', 'ETA', 'ETF', 'CD1',
74+
'PEJ', 'CD3', 'FSX', 'DVT', 'PEG', 'COM', 'RU7', 'CON',
75+
'HE5', 'HE6', 'BF2', '16D', '16A', 'P33', 'ND4', 'HEV',
76+
'CO', 'CN', 'CM', 'HES', 'CA', 'HEZ', 'CD', 'HEG',
77+
'HEA', 'HEB', 'HEC', 'CS', 'CR', 'CU', 'IOD', 'PE8',
78+
'PSL', 'OES', 'PE5', 'PE7', 'PE3', 'FS1', 'FS2', 'FS3',
79+
'FS4', 'PC3', 'OEC', 'NMO', 'NML', 'DIO', '6MO', 'CIT',
80+
'DIA', 'C8E', '1PS', 'C10', 'C15', 'DIS', 'PPF', 'SOG',
81+
'PPK', 'SOH', 'PPI', 'IR3', 'SOM', 'TBA', 'PPM', 'SOR',
82+
'PPV', 'GAI', 'XL1', 'MBR', 'IR', '3NI', 'IRI', 'SO3',
83+
'SO2', 'SO4', 'MTL', 'IN', 'VX', 'PP9', 'MTF', 'CM5',
84+
'MTD', 'ARS', 'ART', 'SDS', 'ARF', '4MO', 'BA', 'NAO',
85+
'CCN', 'NAW', 'BR', 'EPE', 'O2', 'BOG', 'PIN', 'MYQ',
86+
'MYR', 'POL', 'PIG', 'N8E', 'F3S', 'IOH', 'PIS', 'CD5',
87+
'BO3', 'BO4', 'DR6', 'NA2', 'PEO', 'NA6', 'CXE', 'NA5',
88+
'OS', 'REO', 'U1', 'VO4', 'FLH', 'HO', 'FLO', 'COH',
89+
'EEE', 'HG', 'PTN', 'ETI', 'SFO', 'ETN', 'MNR', 'VO3',
90+
'MNH', 'OSM', 'ROP', 'SFN', 'MNC', 'PO4', '3BR', 'FMI',
91+
'MN3', 'TRE', 'MN5', 'O', 'PBM', 'TRS', 'PT4', 'TRT',
92+
'MW3', 'MW2', 'MPD', 'HTO', 'MPO', 'MTO', '6WO', 'MPR',
93+
'MPS', 'AU', 'OXY', 'ACA', 'GD3', 'SUC', 'NEH', 'IDT',
94+
'SUL', 'P4C', 'ACN', '2OF', 'S0H', 'IDO', 'HFM', 'ACU',
95+
'ACT', '2OP', 'ACY', '2OS', 'NI', 'CU1', 'NO', 'NA',
96+
'MW1', 'TEE', 'FE2', 'TEP', '1BO', 'FEC', 'FEA', 'FEO',
97+
'FEL', 'CUZ', 'FES', 'CUA', 'CUO', 'CUN', 'CUM', 'CUL',
98+
'PTL', 'MN6', 'BRP', 'BRO', 'BRM', 'BRJ', 'MES', 'OUT',
99+
'AZI', 'MP1', '1PG', '1PE', 'MSM', 'HLT', 'MSF', 'CN1',
100+
'CL', 'TL', 'SGM', 'HO3', 'TE', 'TB', 'ICI', 'AG',
101+
'FPO', '1CP', 'AL', 'HOH', 'CNF', 'SCN', 'CNB', 'CE',
102+
'CNN', 'CNM', 'ICT', '15P', 'ZH3', 'RTC', '2NO', 'DXG',
103+
'DXE', 'VSO', 'MSE', 'FAR', 'FII', 'FPP', 'HFP', 'A4P', 'NAD',
104+
'ADP', 'AMP', 'APC', 'ATP', 'IMP', 'RVP', 'XMP', 'SAH', 'SAM',
105+
'FAD', 'NAP', 'NDP', 'UDP', 'ANP', 'COA', 'CME', 'UMP', 'NAI'])
106+
107+
108+
109+
class SearchAPI:
110+
def __init__(self, search_url):
111+
self.search_url = search_url
112+
self.search_options = '&wt=json&rows=100000'
113+
114+
115+
def url_response(self, url):
116+
"""
117+
Getting JSON response from URL
118+
:param url: String
119+
:return: JSON
120+
"""
121+
r = requests.get(url=url)
122+
# Status code 200 means 'OK'
123+
if r.status_code == 200:
124+
json_result = r.json()
125+
return json_result
126+
else:
127+
print(r.status_code, r.reason)
128+
return None
129+
130+
def run_search(self):
131+
"""
132+
Running search with terms
133+
Check pdbe search api documentation for more details
134+
:param pdbe_search_term: String
135+
:return: JSON
136+
"""
137+
full_query = self.search_url
138+
response = self.url_response(full_query)
139+
return response
140+
141+
142+
def get_sdf_hetid(pdbid, hetid, pdb_lig_filename):
143+
144+
"""
145+
downloads ligand sdf file using RCSB ligand expo API
146+
"""
147+
148+
hetid = str(hetid)
149+
sflag = False
150+
url = "http://ligand-expo.rcsb.org/files/%s/%s/isdf/" % (
151+
hetid[0], hetid)
152+
153+
try:
154+
response = urllib.request.urlopen(url)
155+
except Exception:
156+
print(" can't find the ligand for " + str(pdbid))
157+
else:
158+
lines = response.readlines()
159+
sdf_file = ''
160+
for line in lines:
161+
try:
162+
sdf_filename = line.decode().split('<li><a href="')[1].split('"')[0]
163+
if sdf_filename.startswith(pdbid.lower()) and sdf_filename[-7] != 'D':
164+
sdf_url = url + sdf_filename
165+
sdf_file = urllib.request.urlopen(sdf_url).read()
166+
break
167+
except Exception:
168+
continue
169+
if sdf_file:
170+
with open(pdb_lig_filename, 'wb') as f:
171+
f.write(sdf_file)
172+
sflag = True
173+
else:
174+
print(" can't find a ligand for " + str(pdbid) + " or it is disordered")
175+
return sflag
176+
177+
178+
def get_smi_from_hetid(hetid):
179+
"""
180+
obtain SMILES string for hetid using PDBe REST API
181+
"""
182+
smi_query = SearchAPI(
183+
search_url='https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{}'.format(hetid))
184+
try:
185+
smi_call = smi_query.run_search()
186+
except Exception:
187+
print("problem accessing the compound API service")
188+
else:
189+
for x, y in smi_call.items():
190+
smi = (y[0]['smiles'][0]['name'])
191+
192+
return smi
193+
194+
195+
def get_3d_sdf_from_smi(smi, sdf_filename):
196+
197+
"""
198+
Generate 3D coordinate from SMILES string
199+
"""
200+
201+
sflag = False
202+
conformer_generator = conformer.ConformerGenerator()
203+
conformer_generator.settings.max_conformers = 1
204+
m = Molecule.from_string(smi)
205+
conformers = conformer_generator.generate(m)
206+
with io.EntryWriter(sdf_filename) as writer:
207+
writer.write(conformers[0].molecule)
208+
sflag = True
209+
210+
return sflag
211+
212+
213+
def get_hetids_from_pdbid(pdbid):
214+
"""
215+
Find hetids in PDB entry using PDBe REST API and then return unique hetid that
216+
are not present in the excluded_hetids list above
217+
"""
218+
chem_comp_query = SearchAPI(
219+
search_url='https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/{}'.format(pdbid))
220+
try:
221+
cc_call = chem_comp_query.run_search()
222+
except Exception:
223+
print("problem accesing the entry API")
224+
else:
225+
all_cc_list = []
226+
for x, y in cc_call.items():
227+
for z in y:
228+
cc_id = (z['chem_comp_id'])
229+
all_cc_list.append(cc_id)
230+
cc_list = list(set(all_cc_list))
231+
232+
hetids = [x for x in cc_list if x not in excluded_hetids]
233+
234+
return hetids
235+
236+
237+
def write_conformers(mol_input, mol_reference, conf_file):
238+
"""
239+
Generate conformer and calculate rmsd wrt reference
240+
"""
241+
top_rmsd = -1
242+
conformer_generator = conformer.ConformerGenerator()
243+
conformer_generator.settings.superimpose_conformers_onto_reference = True
244+
conformers = conformer_generator.generate(mol_input)
245+
if conformers:
246+
best_rmsd = [
247+
MolecularDescriptors.rmsd(mol_reference, c.molecule, None,
248+
True, True, True)
249+
for c in conformers
250+
]
251+
best_rmsd.sort()
252+
top_rmsd = best_rmsd[0]
253+
with io.MoleculeWriter(conf_file) as mol_writer:
254+
for c in conformers:
255+
mol_writer.write(c.molecule)
256+
pass
257+
rmsd, norm_score = round(conformers[0].rmsd(), 2), conformers[
258+
0].normalised_score
259+
else:
260+
rmsd, norm_score = '', ''
261+
return rmsd, norm_score, top_rmsd
262+
263+
264+
def generate_conformers_for_all_hetids(pdbid, folder_path='conformers_'):
265+
"""
266+
generate conformer for all hetid
267+
"""
268+
working_dir = folder_path + pdbid
269+
if not os.path.exists(working_dir):
270+
os.makedirs(working_dir)
271+
272+
list_hetids = get_hetids_from_pdbid(pdbid)
273+
results = []
274+
for hetid in list_hetids:
275+
print('doing hetid ' + str(hetid))
276+
reference_flag, input_flag = False, False
277+
pdb_lig_filename = os.path.join(working_dir, str(hetid) + '_reference.sdf')
278+
model_lig_filename = os.path.join(working_dir, str(hetid) + '_model.sdf')
279+
reference_flag = get_sdf_hetid(pdbid, hetid, pdb_lig_filename)
280+
input_smi = get_smi_from_hetid(hetid)
281+
if input_smi:
282+
input_flag = get_3d_sdf_from_smi(input_smi, model_lig_filename)
283+
if reference_flag:
284+
mol_reader_ref = io.MoleculeReader(pdb_lig_filename)
285+
try:
286+
mol_reference = mol_reader_ref[0]
287+
except Exception:
288+
continue
289+
if input_flag:
290+
mol_reader_input = io.MoleculeReader(model_lig_filename)
291+
try:
292+
mol_input = mol_reader_input[0]
293+
except Exception:
294+
continue
295+
count = 0
296+
if reference_flag and input_flag:
297+
if len(mol_input.heavy_atoms) == len(mol_reference.heavy_atoms):
298+
rb = len([b for b in mol_input.bonds if b.is_rotatable])
299+
if len(mol_input.heavy_atoms) > 6 and rb > 0:
300+
# molecule needs hydrogens, set atom types,
301+
# and aromatic bonds to match mogul libraries
302+
mol_input.add_hydrogens()
303+
mol_reference.add_hydrogens()
304+
mol_input.standardise_delocalised_bonds()
305+
mol_reference.standardise_delocalised_bonds()
306+
mol_input.assign_bond_types(which='unknown')
307+
mol_reference.assign_bond_types(which='unknown')
308+
mol_input.standardise_aromatic_bonds()
309+
mol_reference.standardise_aromatic_bonds()
310+
mw = int(round(mol_input.molecular_weight, 0))
311+
conf_file = model_lig_filename.replace('.sdf', '_conformers.sdf')
312+
rmsd, norm_score, top_rmsd = write_conformers(mol_input,
313+
mol_reference,
314+
conf_file)
315+
output_list = ','.join(map(str,
316+
[pdbid, hetid, mw, rmsd,
317+
norm_score, top_rmsd]))
318+
results.append(output_list)
319+
return results
320+
321+
322+
###############################################################################
323+
# main
324+
###############################################################################
325+
326+
327+
def main():
328+
parser = argparse.ArgumentParser(description=__doc__)
329+
parser.add_argument("targets", help="list of PDB-codes", type=str)
330+
331+
args = parser.parse_args()
332+
input_filename = args.targets
333+
base_name = os.path.splitext(input_filename)[0]
334+
output_filename = base_name + "_CSD_Conformer_Results.csv"
335+
336+
with open(input_filename) as input_file:
337+
pdbs_raw = input_file.readlines()
338+
339+
pdbs = []
340+
for pdb_raw in pdbs_raw:
341+
pdb = pdb_raw.strip()
342+
if pdb and len(pdb) == 4:
343+
pdbs.append(pdb)
344+
else:
345+
print('this line is not a pdbid ' + str(pdb_raw))
346+
347+
total = str(len(pdbs))
348+
i = 1
349+
350+
with open(output_filename, 'w') as output_file:
351+
output_file.write('pdbid,hetid,mw,rmsd_lowest_energy_conformer,norm_score,lowest_rmsd\n')
352+
for pdbid in pdbs:
353+
print('processing ' + str(i) + ' of ' + total + ' PDB: ' + pdbid)
354+
i += 1
355+
results = generate_conformers_for_all_hetids(pdbid.strip())
356+
for result in results:
357+
output_file.write(result + '\n')
358+
359+
360+
if __name__ == '__main__':
361+
main()
362+
363+
364+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
3zds
2+
3pka
3+
3i4b
4+
2viq

0 commit comments

Comments
 (0)