Skip to content

Commit b23ec52

Browse files
TablewareBoxwanghan-iapcmHan Wang
authored
add sanitize_guanidine_Catom, and formatting code (#231)
Co-authored-by: Han Wang <[email protected]> Co-authored-by: Han Wang <[email protected]>
1 parent 6d62862 commit b23ec52

File tree

1 file changed

+89
-19
lines changed

1 file changed

+89
-19
lines changed

dpdata/rdkit/sanitize.py

Lines changed: 89 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,29 @@
33
from rdkit.Chem.rdchem import Atom, Bond, Mol, BondType
44
import os
55
import time
6+
67
# openbabel
78
try:
89
from openbabel import openbabel
10+
911
USE_OBABEL = True
1012
except ModuleNotFoundError as e:
1113
USE_OBABEL = False
1214

15+
1316
def get_explicit_valence(atom, verbose=False):
1417
exp_val_calculated_from_bonds = int(sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()]))
1518
try:
1619
exp_val = atom.GetExplicitValence()
1720
if exp_val != exp_val_calculated_from_bonds:
1821
if verbose:
19-
print(f"Explicit valence given by GetExplicitValence() and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx()+1}, using sum of bond order.")
22+
print(
23+
f"Explicit valence given by GetExplicitValence() and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx() + 1}, using sum of bond order.")
2024
return exp_val_calculated_from_bonds
2125
except:
2226
return exp_val_calculated_from_bonds
2327

28+
2429
def regularize_formal_charges(mol, sanitize=True, verbose=False):
2530
"""
2631
Regularize formal charges of atoms
@@ -37,6 +42,7 @@ def regularize_formal_charges(mol, sanitize=True, verbose=False):
3742
else:
3843
return mol
3944

45+
4046
def assign_formal_charge_for_atom(atom, verbose=False):
4147
"""
4248
assigen formal charge according to 8-electron rule for element B,C,N,O,S,P,As
@@ -48,12 +54,13 @@ def assign_formal_charge_for_atom(atom, verbose=False):
4854
elif atom.GetSymbol() == "C":
4955
atom.SetFormalCharge(valence - 4)
5056
if valence == 3:
51-
print(f"Detect a valence of 3 on #C{atom.GetIdx()+1}, the formal charge of this atom will be assigned to -1")
57+
print(
58+
f"Detect a valence of 3 on #C{atom.GetIdx() + 1}, the formal charge of this atom will be assigned to -1")
5259
elif valence > 4:
53-
raise ValueError(f"#C{atom.GetIdx()+1} has a valence larger than 4")
60+
raise ValueError(f"#C{atom.GetIdx() + 1} has a valence larger than 4")
5461
elif atom.GetSymbol() == "N":
5562
if valence > 4:
56-
raise ValueError(f"#N{atom.GetIdx()+1} has a valence larger than 4")
63+
raise ValueError(f"#N{atom.GetIdx() + 1} has a valence larger than 4")
5764
else:
5865
atom.SetFormalCharge(valence - 3)
5966
elif atom.GetSymbol() == "O":
@@ -64,34 +71,35 @@ def assign_formal_charge_for_atom(atom, verbose=False):
6471
elif valence == 3:
6572
atom.SetFormalCharge(1)
6673
elif valence > 6:
67-
raise ValueError(f"#S{atom.GetIdx()+1} has a valence larger than 6")
74+
raise ValueError(f"#S{atom.GetIdx() + 1} has a valence larger than 6")
6875
else:
6976
atom.SetFormalCharge(0)
7077
elif atom.GetSymbol() == "P" or atom.GetSymbol() == "As":
7178
if valence == 5:
7279
atom.SetFormalCharge(0)
7380
elif valence > 5:
74-
raise ValueError(f"#{atom.GetSymbol()}{atom.GetIdx()+1} has a valence larger than 5")
81+
raise ValueError(f"#{atom.GetSymbol()}{atom.GetIdx() + 1} has a valence larger than 5")
7582
else:
7683
atom.SetFormalCharge(valence - 3)
7784

85+
7886
# print bond and atom information (for debugger)
7987
def print_bonds(mol):
8088
for bond in mol.GetBonds():
8189
begin_atom = bond.GetBeginAtom()
8290
end_atom = bond.GetEndAtom()
83-
print(f'{begin_atom.GetSymbol()}{begin_atom.GetIdx()+1} {end_atom.GetSymbol()}{end_atom.GetIdx()+1} {bond.GetBondType()}')
91+
print(
92+
f'{begin_atom.GetSymbol()}{begin_atom.GetIdx() + 1} {end_atom.GetSymbol()}{end_atom.GetIdx() + 1} {bond.GetBondType()}')
93+
8494

8595
def print_atoms(mol):
8696
for atom in mol.GetAtoms():
87-
print(f'{atom.GetSymbol()}{atom.GetIdx()+1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}')
97+
print(f'{atom.GetSymbol()}{atom.GetIdx() + 1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}')
8898

8999

90100
def is_terminal_oxygen(O_atom):
91-
if len(O_atom.GetNeighbors()) == 1:
92-
return True
93-
else:
94-
return False
101+
return len(O_atom.GetNeighbors()) == 1
102+
95103

96104
def get_terminal_oxygens(atom):
97105
terminal_oxygens = []
@@ -101,6 +109,21 @@ def get_terminal_oxygens(atom):
101109
terminal_oxygens.append(nei)
102110
return terminal_oxygens
103111

112+
113+
def is_terminal_NR2(N_atom):
114+
return len(N_atom.GetNeighbors()) == 3
115+
116+
117+
def get_terminal_NR2s(atom):
118+
terminal_NR2s = []
119+
for nei in atom.GetNeighbors():
120+
if nei.GetSymbol() == "N":
121+
if is_terminal_NR2(nei):
122+
terminal_NR2s.append(nei)
123+
terminal_NR2s.sort(key=lambda N_atom: len([atom for atom in N_atom.GetNeighbors() if atom.GetSymbol() == 'H']))
124+
return terminal_NR2s
125+
126+
104127
def sanitize_phosphate_Patom(P_atom, verbose=True):
105128
if P_atom.GetSymbol() == "P":
106129
terminal_oxygens = get_terminal_oxygens(P_atom)
@@ -116,11 +139,13 @@ def sanitize_phosphate_Patom(P_atom, verbose=True):
116139
bond.SetBondType(Chem.rdchem.BondType.SINGLE)
117140
terminal_oxygens[ii].SetFormalCharge(-1)
118141

142+
119143
def sanitize_phosphate(mol):
120144
for atom in mol.GetAtoms():
121145
sanitize_phosphate_Patom(atom)
122146
return mol
123147

148+
124149
def sanitize_sulfate_Satom(S_atom, verbose=True):
125150
if S_atom.GetSymbol() == "S":
126151
terminal_oxygens = get_terminal_oxygens(S_atom)
@@ -136,11 +161,13 @@ def sanitize_sulfate_Satom(S_atom, verbose=True):
136161
bond = mol.GetBondBetweenAtoms(S_atom.GetIdx(), terminal_oxygens[ii].GetIdx())
137162
bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
138163

164+
139165
def sanitize_sulfate(mol):
140166
for atom in mol.GetAtoms():
141167
sanitize_sulfate_Satom(atom)
142168
return mol
143169

170+
144171
def sanitize_carboxyl_Catom(C_atom, verbose=True):
145172
if C_atom.GetSymbol() == "C":
146173
terminal_oxygens = get_terminal_oxygens(C_atom)
@@ -157,11 +184,40 @@ def sanitize_carboxyl_Catom(C_atom, verbose=True):
157184
bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
158185
terminal_oxygens[1].SetFormalCharge(0)
159186

187+
160188
def sanitize_carboxyl(mol):
161189
for atom in mol.GetAtoms():
162190
sanitize_carboxyl_Catom(atom)
163191
return mol
164192

193+
194+
def sanitize_guanidine_Catom(C_atom, verbose=True):
195+
if C_atom.GetSymbol() == "C":
196+
terminal_NR2s = get_terminal_NR2s(C_atom)
197+
mol = C_atom.GetOwningMol()
198+
if len(terminal_NR2s) == 3:
199+
if verbose:
200+
print("Guanidyl group detected, sanitizing it...")
201+
# set two C-N and one C=N+
202+
bond1 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[0].GetIdx())
203+
bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
204+
terminal_NR2s[0].SetFormalCharge(-1)
205+
206+
bond2 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[1].GetIdx())
207+
bond2.SetBondType(Chem.rdchem.BondType.SINGLE)
208+
terminal_NR2s[1].SetFormalCharge(0)
209+
210+
bond3 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[2].GetIdx())
211+
bond3.SetBondType(Chem.rdchem.BondType.DOUBLE)
212+
terminal_NR2s[2].SetFormalCharge(1)
213+
214+
215+
def sanitize_guanidine(mol):
216+
for atom in mol.GetAtoms():
217+
sanitize_guanidine_Catom(atom)
218+
return mol
219+
220+
165221
def sanitize_nitro_Natom(N_atom, verbose=True):
166222
if N_atom.GetSymbol() == "N":
167223
terminal_oxygens = get_terminal_oxygens(N_atom)
@@ -178,17 +234,20 @@ def sanitize_nitro_Natom(N_atom, verbose=True):
178234
bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
179235
terminal_oxygens[1].SetFormalCharge(0)
180236

237+
181238
def sanitize_nitro(mol):
182239
for atom in mol.GetAtoms():
183240
sanitize_nitro_Natom(atom)
184241
return mol
185242

243+
186244
def is_terminal_nitrogen(N_atom):
187245
if N_atom.GetSymbol() == 'N' and len(N_atom.GetNeighbors()) == 1:
188246
return True
189247
else:
190248
return False
191249

250+
192251
def sanitize_nitrine_Natom(atom, verbose=True):
193252
if atom.GetSymbol() == "N" and len(atom.GetNeighbors()) == 2:
194253
mol = atom.GetOwningMol()
@@ -213,7 +272,8 @@ def sanitize_nitrine_Natom(atom, verbose=True):
213272

214273
bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_non_terminal.GetIdx())
215274
bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
216-
atom.SetFormalCharge(1)
275+
atom.SetFormalCharge(1)
276+
217277

218278
def contain_hetero_aromatic(mol):
219279
flag = False
@@ -223,6 +283,7 @@ def contain_hetero_aromatic(mol):
223283
break
224284
return flag
225285

286+
226287
# for carbon with explicit valence > 4
227288
def regularize_carbon_bond_order(atom, verbose=True):
228289
if atom.GetSymbol() == "C" and get_explicit_valence(atom) > 4:
@@ -240,6 +301,7 @@ def regularize_carbon_bond_order(atom, verbose=True):
240301
if bond.GetIdx() != double_bond_idx:
241302
bond.SetBondType(Chem.rdchem.BondType.SINGLE)
242303

304+
243305
# for nitrogen with explicit valence > 4
244306
def regularize_nitrogen_bond_order(atom, verbose=True):
245307
mol = atom.GetOwningMol()
@@ -255,6 +317,7 @@ def regularize_nitrogen_bond_order(atom, verbose=True):
255317
def sanitize_mol(mol, verbose=False):
256318
for atom in mol.GetAtoms():
257319
sanitize_carboxyl_Catom(atom, verbose)
320+
sanitize_guanidine_Catom(atom, verbose)
258321
sanitize_phosphate_Patom(atom, verbose)
259322
sanitize_sulfate_Satom(atom, verbose)
260323
sanitize_nitro_Natom(atom, verbose)
@@ -272,6 +335,7 @@ def mol_edit_log(mol, i, j):
272335
edited = mol.GetProp("edit")
273336
mol.SetProp("edit", edited + ",%d_%d" % (i, j))
274337

338+
275339
def kekulize_aromatic_heterocycles(mol_in, assign_formal_charge=True, sanitize=True):
276340
mol = Chem.RWMol(mol_in)
277341
rings = Chem.rdmolops.GetSymmSSSR(mol)
@@ -345,7 +409,7 @@ def hetero_priority(idx, mol):
345409
elif bAllAr and not bAllC:
346410
HAr.append(ring)
347411

348-
if len(HAr) == 0:
412+
if len(HAr) == 0:
349413
# no hetrerocycles
350414
return mol_in
351415
else:
@@ -364,7 +428,7 @@ def hetero_priority(idx, mol):
364428
fuseCAr[i] = j
365429
break
366430
if i > 1:
367-
if (fuseCAr[i] == fuseCAr[i-1]) & (fuseCAr[i] >= 0):
431+
if (fuseCAr[i] == fuseCAr[i - 1]) & (fuseCAr[i] >= 0):
368432
fuseDouble.append(i)
369433
atom = mol.GetAtomWithIdx(ring[i])
370434
if atom.GetSymbol() != 'C':
@@ -376,7 +440,7 @@ def hetero_priority(idx, mol):
376440
hasDouble.append(i)
377441
bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])
378442

379-
if (fuseCAr[0] == fuseCAr[lring-1]) & (fuseCAr[0] >= 0):
443+
if (fuseCAr[0] == fuseCAr[lring - 1]) & (fuseCAr[0] >= 0):
380444
fuseDouble.append(0)
381445

382446
if (len(hetero) > 0) | (len(hasDouble) > 0):
@@ -446,6 +510,7 @@ def hetero_priority(idx, mol):
446510
except Exception as e:
447511
raise RuntimeError(f"Manual kekulization for aromatic heterocycles failed, below are errors:\n\t {e}")
448512

513+
449514
def convert_by_obabel(mol, cache_dir=os.path.join(os.getcwd(), '.cache'), obabel_path="obabel"):
450515
if not os.path.exists(cache_dir):
451516
os.mkdir(cache_dir)
@@ -464,6 +529,7 @@ def convert_by_obabel(mol, cache_dir=os.path.join(os.getcwd(), '.cache'), obabel
464529
mol_obabel = Chem.MolFromMolFile(mol_file_out, removeHs=False, sanitize=False)
465530
return mol_obabel
466531

532+
467533
def super_sanitize_mol(mol, name=None, verbose=True):
468534
if name is None:
469535
if mol.HasProp("_Name"):
@@ -484,11 +550,13 @@ def super_sanitize_mol(mol, name=None, verbose=True):
484550
except Exception as e:
485551
try:
486552
if verbose:
487-
print("Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel")
553+
print(
554+
"Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel")
488555
print("=====Stage 2: re-try with obabel=====")
489556
mol = convert_by_obabel(mol)
490557
mol = sanitize_mol(mol, verbose)
491-
mol = kekulize_aromatic_heterocycles(mol, assign_formal_charge=False, sanitize=False) # aromatic heterocycles
558+
mol = kekulize_aromatic_heterocycles(mol, assign_formal_charge=False,
559+
sanitize=False) # aromatic heterocycles
492560
mol = regularize_formal_charges(mol, sanitize=False)
493561
mol_copy = deepcopy(mol)
494562
Chem.SanitizeMol(mol_copy)
@@ -501,6 +569,7 @@ def super_sanitize_mol(mol, name=None, verbose=True):
501569
print(name, "Failed!")
502570
return None
503571

572+
504573
class Sanitizer(object):
505574
def __init__(self, level='medium', raise_errors=True, verbose=False):
506575
'''
@@ -526,7 +595,7 @@ def _check_level(self, level):
526595
else:
527596
if level == 'high' and not USE_OBABEL:
528597
raise ModuleNotFoundError("obabel not installed, high level sanitizer cannot work")
529-
598+
530599
def _handle_exception(self, error_info):
531600
if self.raise_errors:
532601
raise SanitizeError(error_info)
@@ -561,6 +630,7 @@ def sanitize(self, mol):
561630
self._handle_exception(error_info)
562631
return mol
563632

633+
564634
class SanitizeError(Exception):
565635
def __init__(self, content="Sanitization Failed."):
566636
self.content = content

0 commit comments

Comments
 (0)