Skip to content

Commit d4a7c61

Browse files
Merge pull request PolusAI#225 from misterbrandonwalker/diffdock_sanitize
remove ligands with kekulization error in diffdock workflow
2 parents f205a65 + 8539e33 commit d4a7c61

File tree

8 files changed

+260
-10
lines changed

8 files changed

+260
-10
lines changed

cwl_adapters/sanitize_ligand.cwl

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env cwl-runner
2+
cwlVersion: v1.0
3+
4+
class: CommandLineTool
5+
6+
label: Sanitize input ligand
7+
8+
doc: |-
9+
Sanitize input ligand
10+
11+
baseCommand: ["python", "/sanitize_ligand.py"]
12+
13+
hints:
14+
DockerRequirement:
15+
dockerPull: mrbrandonwalker/sanitize_ligand
16+
17+
requirements:
18+
InlineJavascriptRequirement: {}
19+
InitialWorkDirRequirement: # conditionally overwrite the input ligand, otherwise cwltool will symlink to the original
20+
listing:
21+
- $(inputs.input_small_mol_ligand)
22+
23+
inputs:
24+
25+
input_small_mol_ligand:
26+
type: File
27+
format:
28+
- edam:format_3814
29+
inputBinding:
30+
prefix: --input_small_mol_ligand
31+
32+
output_ligand:
33+
type: string?
34+
35+
valid_ligand:
36+
type: string?
37+
38+
outputs:
39+
40+
output_ligand:
41+
type: File
42+
format: edam:format_3814
43+
outputBinding:
44+
glob: "*.sdf"
45+
46+
valid_ligand:
47+
type: boolean
48+
outputBinding:
49+
glob: valid.txt
50+
loadContents: true
51+
outputEval: |
52+
${
53+
// Read the contents of the file
54+
const lines = self[0].contents.split("\n");
55+
// Read boolean value from the first line
56+
const valid = lines[0].trim() === "True";
57+
return valid;
58+
59+
}
60+
61+
$namespaces:
62+
edam: https://edamontology.org/
63+
64+
$schemas:
65+
- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl

docker/dockerBuild.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,5 @@ sudo docker build --no-cache --pull -f Dockerfile_diffdock_cpu -t mrbrandonwalke
4545
sudo docker build --no-cache --pull -f Dockerfile_diffdock_gpu -t mrbrandonwalker/diffdock_gpu .
4646
sudo docker build --no-cache --pull -f Dockerfile_rmsd_pose_cluster -t mrbrandonwalker/rmsd_pose_cluster .
4747
sudo docker build --no-cache --pull -f Dockerfile_rank_diffdock_poses -t mrbrandonwalker/rank_diffdock_poses .
48-
cd ../..
48+
sudo docker build --no-cache --pull -f Dockerfile_sanitize_ligand -t mrbrandonwalker/sanitize_ligand .
49+
cd ../..

examples/diffdock/Dockerfile_diffdock_cpu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ RUN python -m inference --protein_ligand_csv data/protein_ligand_example_csv.csv
3434
# Delete output results so not in same output folder as future runs
3535
RUN rm -r results/user_predictions_small
3636

37+
# Clean up temp files
3738
RUN mamba clean --all --yes
3839

3940
RUN pip cache purge

examples/diffdock/Dockerfile_diffdock_gpu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ RUN python -m inference --protein_ligand_csv data/protein_ligand_example_csv.csv
4343
# Delete output results so not in same output folder as future runs
4444
RUN rm -r results/user_predictions_small
4545

46+
# Clean up temp files
47+
RUN mamba clean --all --yes
48+
4649
# Now copy everything into a minimal cuda runtime base image.
4750
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04 as runtime
4851

@@ -52,4 +55,4 @@ COPY --from=devel mambaforge/ mambaforge/
5255
# shell file to copy cached files, run diffdock and remove large cached files after execution
5356
ADD diffdock_cmds.sh /DiffDock/
5457

55-
ENV PATH /mambaforge/bin:$PATH
58+
ENV PATH /mambaforge/bin:$PATH
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
2+
# docker build -f Dockerfile_sanitize_ligand -t mrbrandonwalker/sanitize_ligand .
3+
4+
FROM condaforge/mambaforge
5+
# NOT mambaforge-pypy3 (rdkit is incompatible with pypy)
6+
7+
RUN conda install -c conda-forge rdkit --yes
8+
9+
RUN conda init bash
10+
11+
COPY sanitize_ligand.py /
12+
13+
RUN mamba clean --all --yes
14+
15+
ADD Dockerfile_sanitize_ligand .

examples/diffdock/run_diffdock.yml

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,47 @@ steps:
1111
convert_Kd_dG: True
1212
output_pdb_paths: '&pdbbind_pdbs'
1313
output_sdf_paths: '&pdbbind_sdfs'
14+
experimental_dGs: '&exp_dGs'
1415

1516
- fix_side_chain:
1617
scatter: [input_pdb_path]
1718
in:
1819
input_pdb_path: '*pdbbind_pdbs'
1920
output_pdb_path: '&pdbbind_pdbs.pdb'
2021

22+
- sanitize_ligand:
23+
scatter: [input_small_mol_ligand]
24+
in:
25+
input_small_mol_ligand: "*pdbbind_sdfs"
26+
output_ligand: "&sanitized_sdfs"
27+
valid_ligand: "&valid_ligands"
28+
29+
- filter_array: # remove invalid ligands from sanitized_ligand, avoid using null
30+
in:
31+
input_array: "*sanitized_sdfs"
32+
input_bool_array: "*valid_ligands"
33+
output_array: "&final_sanitized_sdfs"
34+
35+
- filter_array: # remove proteins corresponding to invalid ligands from sanitized_ligand
36+
in:
37+
input_array: "*pdbbind_pdbs.pdb"
38+
input_bool_array: "*valid_ligands"
39+
output_array: "&final_pdbbind_pdbs.pdb"
40+
41+
- filter_array: # remove dGs corresponding to invalid ligands from sanitized_ligand
42+
in:
43+
input_array: "*exp_dGs"
44+
input_bool_array: "*valid_ligands"
45+
output_array: "&final_exp_dGs"
46+
2147
- diffdock:
2248
scatter: [protein_path, ligand_path]
2349
scatterMethod: dotproduct
2450
in:
25-
protein_path: "*pdbbind_pdbs.pdb"
26-
ligand_path: "*pdbbind_sdfs"
51+
protein_path: "*final_pdbbind_pdbs.pdb"
52+
ligand_path: "*final_sanitized_sdfs"
2753
samples_per_complex: 20 # figure 3 left in DiffDock paper
2854
inference_steps: 20 # figure S11 in DiffDock paper
29-
batch_size: 16 # section D.3 in DiffDock paper
3055
output_files: "&diffdock_poses"
3156

3257
- rank_diffdock_poses:
@@ -57,16 +82,32 @@ wic:
5782
wic:
5883
graphviz:
5984
label: Fix Side Chains
60-
(3, diffdock):
85+
(3, sanitize_ligand):
86+
wic:
87+
graphviz:
88+
label: Sanitize Ligands
89+
(4, filter_array):
90+
wic:
91+
graphviz:
92+
label: Filter Ligands
93+
(5, filter_array):
94+
wic:
95+
graphviz:
96+
label: Filter Proteins
97+
(6, filter_array):
98+
wic:
99+
graphviz:
100+
label: Filter dG's
101+
(7, diffdock):
61102
wic:
62103
namespace: gpu
63104
graphviz:
64105
label: Executing DiffDock
65-
(4, rank_diffdock_poses):
106+
(8, rank_diffdock_poses):
66107
wic:
67108
graphviz:
68109
label: Rank all poses
69-
(5, pose_cluster_filter):
110+
(9, pose_cluster_filter):
70111
wic:
71112
graphviz:
72-
label: Cluster poses
113+
label: Cluster poses
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# pylint: disable=E0401,E1101,I1101
2+
"""Filter molecules with kekulization errors.
3+
Handle molecules with sanitization errors by assign formal charge based on valence."""
4+
# https://depth-first.com/articles/2020/02/10/a-comprehensive-treatment-of-aromaticity-in-the-smiles-language/
5+
import argparse
6+
from pathlib import Path
7+
from typing import Dict, Tuple
8+
9+
import rdkit
10+
from rdkit import Chem
11+
from rdkit.Chem import AllChem
12+
13+
parser = argparse.ArgumentParser()
14+
parser.add_argument('--input_small_mol_ligand', type=str, help='Ligand file')
15+
args = parser.parse_args()
16+
input_small_mol_ligand = Path(args.input_small_mol_ligand).name
17+
18+
19+
def adjust_formal_charges(molecule: Chem.SDMolSupplier) -> Chem.SDMolSupplier:
20+
"""Sometimes input structures do not have correct formal charges corresponding
21+
to bond order topology. So choose to trust bond orders assigned and generate formal
22+
charges based on that.
23+
Explicit valence determined what the formal charge should be from dictionary of valence
24+
to formal charge for that atomic number. Special case if atom == carbon or nitrogen
25+
and if neighbors contain nitrogen, oyxgen or sulfur (polarizable atoms) then if carbon
26+
and explicit valence only 3, give formal charge of +1 (more stable then -1 case).
27+
28+
Args:
29+
molecule (Chem.SDMolSupplier): The rdkit molecule object
30+
31+
Returns:
32+
Chem.SDMolSupplier: Molecule object with adjusted formal charges
33+
"""
34+
# 1=H, 5=B, 6=C, 7=N, 8=O, 15=P, 16=S, 17=Cl, 9=F, 35=Br, 53=I
35+
atomicnumtoformalchg: Dict[int, Dict[int, int]] = {1: {2: 1}, 5: {4: 1}, 6: {3: -1}, 7: {2: -1, 4: 1},
36+
8: {1: -1, 3: 1}, 15: {4: 1}, 16: {1: -1, 3: 1, 5: -1},
37+
17: {0: -1, 4: 3}, 9: {0: -1}, 35: {0: -1}, 53: {0: -1}}
38+
for atom in molecule.GetAtoms():
39+
atomnum = atom.GetAtomicNum()
40+
val = atom.GetExplicitValence()
41+
if atomicnumtoformalchg.get(atomnum) is None:
42+
continue
43+
valtochg = atomicnumtoformalchg[atomnum]
44+
chg = valtochg.get(val, 0)
45+
# special case of polar neighbors surrounding carbon or nitrogen
46+
# https://docs.eyesopen.com/toolkits/cpp/oechemtk/valence.html#openeye-charge-model
47+
#
48+
# See Section 6: Factors That Stabilize Carbocations – Adjacent Lone Pairs
49+
# https://www.masterorganicchemistry.com/2011/03/11/3-factors-that-stabilize-carbocations/#six
50+
polneighb = False
51+
if atomnum in (6, 7):
52+
for natom in atom.GetNeighbors():
53+
natomicnum = natom.GetAtomicNum()
54+
if natomicnum in (7, 8, 16):
55+
polneighb = True
56+
if polneighb and val == 3 and atomnum == 6:
57+
chg = 1
58+
59+
atom.SetFormalCharge(chg)
60+
return molecule
61+
62+
63+
def generate_conformer(molecule: Chem.SDMolSupplier) -> None:
64+
""" Generate conformer for molecule. Sometimes rdkit embedding can fail,
65+
so use random coordinates if failed at first.
66+
67+
Args:
68+
molecule (Chem.SDMolSupplier): _description_
69+
"""
70+
ps = AllChem.ETKDGv2()
71+
confid = AllChem.EmbedMolecule(molecule, ps)
72+
if confid == -1:
73+
ps.useRandomCoords = True
74+
AllChem.EmbedMolecule(molecule, ps)
75+
# only want to catch error Bad Conformation Id, dont need to spend time optimizing
76+
AllChem.MMFFOptimizeMolecule(molecule, confId=0, maxIters=1)
77+
78+
79+
def is_valid_ligand(molecule: Chem.SDMolSupplier) -> Tuple[bool, bool, Chem.SDMolSupplier]:
80+
"""Check for sanitization errors, attempt to fix formal charges/valence consistency errors.
81+
DiffDock uses rdkit to generate a seed conformation that will sometimes crash, so generating
82+
conformations here to catch that error and prevent DiffDock from running that ligand.
83+
84+
Args:
85+
mol (Chem.SDMolSupplier): The rdkit small molecule object
86+
87+
Returns:
88+
bool: if ligand is valid
89+
bool: if symlink should be used
90+
Chem.SDMolSupplier: molecule object
91+
"""
92+
valid_lig = True
93+
use_symlink = True
94+
try:
95+
Chem.SanitizeMol(molecule)
96+
generate_conformer(molecule)
97+
except rdkit.Chem.rdchem.KekulizeException as e:
98+
valid_lig = False
99+
# Not handling kekulization error now so just remove file to prevent DiffDock execution
100+
except rdkit.Chem.rdchem.MolSanitizeException as e:
101+
# can also be explicit valence error (i.e.) formal charge not consistent with bond topology
102+
# choose to trust bond topology around atom and add formal charge based on that
103+
molecule = adjust_formal_charges(molecule)
104+
use_symlink = False
105+
except ValueError:
106+
# assuming this is Bad Conformer Id error from generate_conformer
107+
valid_lig = False
108+
except Exception: # pylint: disable=broad-exception-caught
109+
# catch *all* exceptions rdkit can throw
110+
valid_lig = False
111+
112+
return valid_lig, use_symlink, molecule
113+
114+
115+
mol: Chem.SDMolSupplier = Chem.SDMolSupplier(args.input_small_mol_ligand, sanitize=False, removeHs=False)[0]
116+
117+
valid_ligand, symlink, rdkit_mol = is_valid_ligand(mol)
118+
119+
if not symlink and valid_ligand:
120+
with Chem.SDWriter(input_small_mol_ligand) as w:
121+
w.write(rdkit_mol)
122+
123+
with open('valid.txt', 'w', encoding="utf-8") as f:
124+
f.write(str(valid_ligand))

gpu/diffdock.cwl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ inputs:
5656
type: int?
5757
inputBinding:
5858
prefix: --batch_size
59-
default: 10
59+
default: 1
6060

6161
out_dir:
6262
label: where output from diffdock is saved

0 commit comments

Comments
 (0)