Skip to content

Commit 9f7abf3

Browse files
author
Alan Christie
committed
- Adds a number of modules from original pipelines
1 parent 645e90f commit 9f7abf3

File tree

4 files changed

+415
-0
lines changed

4 files changed

+415
-0
lines changed

src/python/rdkit_utils/filter.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2017 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import argparse
18+
from rdkit.Chem import Descriptors
19+
from pipelines.utils import utils
20+
from pipelines.rdkit import mol_utils
21+
22+
23+
### start function definitions #########################################
24+
25+
26+
27+
def filter_by_heavy_atom_count(mol, minCount, maxCount, quiet=False):
    """Check a molecule's heavy atom count against optional bounds.

    :param mol: Molecule exposing GetNumHeavyAtoms()
    :param minCount: Inclusive lower bound, or None for no lower bound
    :param maxCount: Inclusive upper bound, or None for no upper bound
    :param quiet: When True the reason for rejection is not logged
    :return: True if the count is within the bounds, otherwise False
    """
    atom_count = mol.GetNumHeavyAtoms()

    # Work out which bound (if any) is violated; the lower bound is tested first.
    if minCount is not None and atom_count < minCount:
        relation, bound = "<", minCount
    elif maxCount is not None and atom_count > maxCount:
        relation, bound = ">", maxCount
    else:
        return True

    if not quiet:
        utils.log("HAC", atom_count, relation, bound)
    return False
38+
39+
def filter_by_molwt(mol, minMw, maxMw, quiet=False):
    """Check a molecule's molecular weight against optional bounds.

    :param mol: Molecule passed to Descriptors.MolWt()
    :param minMw: Inclusive lower bound, or None for no lower bound
    :param maxMw: Inclusive upper bound, or None for no upper bound
    :param quiet: When True the reason for rejection is not logged
    :return: True if the weight is within the bounds, otherwise False
    """
    weight = Descriptors.MolWt(mol)

    # Work out which bound (if any) is violated; the lower bound is tested first.
    if minMw is not None and weight < minMw:
        relation, bound = "<", minMw
    elif maxMw is not None and weight > maxMw:
        relation, bound = ">", maxMw
    else:
        return True

    if not quiet:
        utils.log("MolWt", weight, relation, bound)
    return False
50+
51+
def filter(mol, minHac=None, maxHac=None, minMw=None, maxMw=None, quiet=False):
    """Apply the heavy atom count and molecular weight filters to a molecule.

    A filter is only evaluated when at least one of its bounds is supplied.

    :param mol: The molecule to test
    :param minHac: Minimum heavy atom count, or None for no lower bound
    :param maxHac: Maximum heavy atom count, or None for no upper bound
    :param minMw: Minimum molecular weight, or None for no lower bound
    :param maxMw: Maximum molecular weight, or None for no upper bound
    :param quiet: When True the reason for rejection is not logged
    :return: True if the molecule passes every requested filter, otherwise False
    """
    # Compare against None rather than relying on truthiness: a bound of 0
    # (or 0.0) is a legitimate value and must not disable the filter.
    if minHac is not None or maxHac is not None:
        if not filter_by_heavy_atom_count(mol, minHac, maxHac, quiet):
            return False
    if minMw is not None or maxMw is not None:
        if not filter_by_molwt(mol, minMw, maxMw, quiet):
            return False
    return True
59+
60+
### start main execution #########################################
61+
62+
def main():
    """Command line entry point for the RDKit filter.

    Reads molecules from the configured input, optionally reduces each one to
    its biggest fragment, drops those that fail the heavy atom count / mol
    weight bounds, renames or deletes fields, and writes the survivors either
    to a single output or to numbered chunk files. Metrics are written when
    args.meta is set.

    :raises ValueError: if a --rename argument is not of the form fromname:toname
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument('-f', '--fragment', choices=['hac', 'mw'], help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('-l', '--limit', type=int, help='Limit output to this many records')
    parser.add_argument('-c', '--chunksize', type=int, help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument('-d', '--digits', type=int, default=0, help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument('-r', '--rename', action='append', help='Rename field (fromname:toname)')
    parser.add_argument('--delete', action='append', help='Delete field')
    parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    # Map of source field name -> new field name; a value of None means "delete".
    field_renames = {}
    if args.rename:
        for spec in args.rename:
            parts = spec.split(':')
            if len(parts) != 2:
                # Build a single message string; passing two arguments to
                # ValueError renders the error as a tuple.
                raise ValueError('Invalid field rename argument: ' + spec)
            field_renames[parts[0]] = parts[1]
    if args.delete:
        for field in args.delete:
            field_renames[field] = None

    # Named "source" rather than "input" to avoid shadowing the builtin.
    source, suppl = utils.default_open_input(args.input, args.informat)

    if args.chunksize:
        chunkNum = 1
        output_base = args.output or 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        output, writer, output_base_chunk = utils.default_open_output(output_base_chunk, output_base_chunk, args.outformat, compress=not args.no_gzip)
    else:
        output, writer, output_base_chunk = utils.default_open_output(args.output, "filter", args.outformat, compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    i = 0        # molecules read from the input (including unparsable ones)
    count = 0    # molecules written to the output
    chunkNum = 1
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter(mol, minHac=args.hacmin, maxHac=args.hacmax, minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
            continue
        if args.chunksize:
            if count > 0 and count % args.chunksize == 0:
                # new chunk, so create new writer
                writer.close()
                output.close()
                chunkNum += 1
                output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
                utils.log("Writing to " + output_base_chunk)
                output, writer, output_base_chunk = utils.default_open_output(output_base_chunk, output_base_chunk, args.outformat, compress=not args.no_gzip)

        for from_name, to_name in field_renames.items():
            if mol.HasProp(from_name):
                val = mol.GetProp(from_name)
                mol.ClearProp(from_name)
                # A falsy target name means the field is deleted, not renamed.
                if to_name:
                    mol.SetProp(to_name, val)

        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")
        if (args.digits > 0 and len(str(chunkNum)) > args.digits):
            utils.log("WARNING: not enough digits specified for the number of chunks")

    writer.flush()
    writer.close()
    source.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, 'RDKitFilter':i})
158+
159+
160+
if __name__ == "__main__":
161+
main()
162+
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2017 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import logging
18+
from rdkit import Chem
19+
from rdkit.Chem import Descriptors
20+
from pipelines.utils import utils
21+
22+
def fragment(mol, mode, quiet=False):
    """Reduce a multi-fragment molecule to its biggest fragment.

    :param mol: The molecule to fragment
    :param mode: How to rank fragments: 'hac' (heavy atom count) or 'mw' (mol weight)
    :param quiet: When True the chosen fragment is not logged
    :return: The original molecule if it has at most one fragment, otherwise
             the biggest fragment with the original's properties copied onto it
    :raises ValueError: if there are multiple fragments and mode is not 'hac' or 'mw'
    """
    frags = Chem.GetMolFrags(mol, asMols=True)

    # Nothing to choose between. Using <= 1 also covers a molecule that yields
    # no fragments at all, which previously fell through to an unbound variable.
    if len(frags) <= 1:
        return mol

    if mode == 'hac':
        sizes = [frag.GetNumHeavyAtoms() for frag in frags]
        basis = "based on HAC"
    elif mode == 'mw':
        sizes = [Descriptors.MolWt(frag) for frag in frags]
        basis = "based on MW"
    else:
        raise ValueError('Invalid fragment mode: ' + str(mode))

    # TODO - handle ties
    # max() returns the first maximal element, so ties resolve to the earliest
    # fragment; a fragment is always selected even if every size is zero.
    biggest_index = max(range(len(frags)), key=sizes.__getitem__)
    biggest_mol = frags[biggest_index]
    if not quiet:
        utils.log("Chose fragment", biggest_index, "from", len(frags), basis)

    # copy the properties across
    for name in mol.GetPropNames():
        biggest_mol.SetProp(name, mol.GetProp(name))
    # The title is copied separately - presumably because GetPropNames() does
    # not report "_Name" by default (TODO confirm). Guard with HasProp so that
    # an unnamed molecule does not raise.
    if mol.HasProp("_Name"):
        name = mol.GetProp("_Name")
        if name:
            biggest_mol.SetProp("_Name", name)
    return biggest_mol
63+
64+
def fragmentAndFingerprint(suppl, mols, fps, descriptor, fragmentMethod='hac', outputFragment=False, quiet=False):
    """
    Fragment the molecule if it has multiple fragments and generate fingerprints on the fragment.

    :param suppl: MolSupplier from which to read the molecules
    :param mols: List to which the molecules are added
    :param fps: List to which the fingerprints are added
    :param descriptor: Function to generate the fingerprints from the molecule
    :param fragmentMethod: The fragmentation method to use when there are multiple fragments (hac or mw)
    :param outputFragment: Boolean that specifies whether to write the fragment or the original molecule to the mols list
    :param quiet: Quiet mode
    :return: The number of errors encountered, i.e. the number of entries that
             were None, raised an exception, or produced a falsy fingerprint
    """
    errors = 0
    for mol in suppl:
        try:
            if mol is not None:
                frag = fragment(mol, fragmentMethod, quiet)
                d = descriptor(frag)
                if d:
                    mols.append(frag if outputFragment else mol)
                    fps.append(d)
                    # Success: skip the error accounting below.
                    continue
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt and SystemExit still propagate.
            logging.exception('')
        errors += 1
    return errors

src/python/rdkit_utils/sdf2json.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2017 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import argparse
18+
19+
from pipelines.utils import utils
20+
21+
22+
def main():
    """Command line entry point that converts an SD file to JSON.

    Reads molecules from the input (or STDIN), optionally strips the fields
    named in --exclude, writes each molecule through the JSON writer and then
    writes the conversion metrics.

    :return: The number of molecules written
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit Sdf2Json')
    parser.add_argument('-i', '--input', help="Input SD file, if not defined the STDIN is used")
    parser.add_argument('-o', '--output', help="Base name for output json file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('--exclude', help="Optional list of fields (comma separated) to exclude from the output.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    # Default the base name up front so it is always bound, including when no
    # input file is given or its name has an unrecognised extension.
    base = "json"
    if args.input:
        lowered = args.input.lower()
        if lowered.endswith(".sdf"):
            base = args.input[:-4]
        elif lowered.endswith(".sdf.gz"):
            base = args.input[:-7]
    utils.log("Base:", base)

    # Named "source" rather than "input" to avoid shadowing the builtin.
    source, output, suppl, writer, output_base = utils.default_open_input_output(args.input, "sdf", args.output, base, "json")
    if args.exclude:
        excludes = args.exclude.split(",")
        utils.log("Excluding", excludes)
    else:
        excludes = None

    i = 0        # records read from the input (including unparsable ones)
    count = 0    # molecules written to the output
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if excludes:
            for exclude in excludes:
                if mol.HasProp(exclude):
                    mol.ClearProp(exclude)
        writer.write(mol)
        count += 1

    utils.log("Converted", count, " molecules")

    writer.flush()
    writer.close()
    source.close()
    output.close()

    utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, 'RDKitSdf2Json':count})

    return count
73+
74+
if __name__ == "__main__":
75+
main()
76+

0 commit comments

Comments
 (0)