|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2017 Informatics Matters Ltd. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +import argparse |
| 18 | +from rdkit.Chem import Descriptors |
| 19 | +from pipelines.utils import utils |
| 20 | +from pipelines.rdkit import mol_utils |
| 21 | + |
| 22 | + |
| 23 | +### start function definitions ######################################### |
| 24 | + |
| 25 | + |
| 26 | + |
| 27 | +def filter_by_heavy_atom_count(mol, minCount, maxCount, quiet=False): |
| 28 | + hac = mol.GetNumHeavyAtoms() |
| 29 | + if minCount is not None and hac < minCount: |
| 30 | + if not quiet: |
| 31 | + utils.log("HAC", hac, "<", minCount) |
| 32 | + return False |
| 33 | + if maxCount is not None and hac > maxCount: |
| 34 | + if not quiet: |
| 35 | + utils.log("HAC", hac, ">", maxCount) |
| 36 | + return False |
| 37 | + return True |
| 38 | + |
| 39 | +def filter_by_molwt(mol, minMw, maxMw, quiet=False): |
| 40 | + mw = Descriptors.MolWt(mol) |
| 41 | + if minMw is not None and mw < minMw: |
| 42 | + if not quiet: |
| 43 | + utils.log("MolWt", mw, "<", minMw) |
| 44 | + return False |
| 45 | + if maxMw is not None and mw > maxMw: |
| 46 | + if not quiet: |
| 47 | + utils.log("MolWt", mw, ">", maxMw) |
| 48 | + return False |
| 49 | + return True |
| 50 | + |
| 51 | +def filter(mol, minHac=None, maxHac=None, minMw=None, maxMw=None, quiet=False): |
| 52 | + if minHac or maxHac: |
| 53 | + if not filter_by_heavy_atom_count(mol, minHac, maxHac, quiet): |
| 54 | + return False |
| 55 | + if minMw or maxMw: |
| 56 | + if not filter_by_molwt(mol, minMw, maxMw, quiet): |
| 57 | + return False |
| 58 | + return True |
| 59 | + |
| 60 | +### start main execution ######################################### |
| 61 | + |
| 62 | +def main(): |
| 63 | + |
| 64 | + ### command line args defintions ######################################### |
| 65 | + |
| 66 | + parser = argparse.ArgumentParser(description='RDKit filter') |
| 67 | + parser.add_argument('-f', '--fragment', choices=['hac', 'mw'], help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)') |
| 68 | + parser.add_argument('--hacmin', type=int, help='Min heavy atom count') |
| 69 | + parser.add_argument('--hacmax', type=int, help='Max heavy atom count') |
| 70 | + parser.add_argument('--mwmin', type=float, help='Min mol weight') |
| 71 | + parser.add_argument('--mwmax', type=float, help='Max mol weight') |
| 72 | + parser.add_argument('-l', '--limit', type=int, help='Limit output to this many records') |
| 73 | + parser.add_argument('-c', '--chunksize', type=int, help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...') |
| 74 | + parser.add_argument('-d', '--digits', type=int, default=0, help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...') |
| 75 | + parser.add_argument('-r', '--rename', action='append', help='Rename field (fromname:toname)') |
| 76 | + parser.add_argument('--delete', action='append', help='Delete field') |
| 77 | + parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed') |
| 78 | + # WARNING: thin output is not appropriate when using --fragment |
| 79 | + parser.add_argument('--thin', action='store_true', help='Thin output mode') |
| 80 | + parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode - suppress reporting reason for filtering') |
| 81 | + utils.add_default_io_args(parser) |
| 82 | + args = parser.parse_args() |
| 83 | + utils.log("Filter Args: ", args) |
| 84 | + |
| 85 | + field_renames = {} |
| 86 | + if args.rename: |
| 87 | + for t in args.rename: |
| 88 | + parts = t.split(':') |
| 89 | + if len(parts) != 2: |
| 90 | + raise ValueError('Invalid field rename argument:',t) |
| 91 | + field_renames[parts[0]] = parts[1] |
| 92 | + if args.delete: |
| 93 | + for f in args.delete: |
| 94 | + field_renames[f] = None |
| 95 | + |
| 96 | + input,suppl = utils.default_open_input(args.input, args.informat) |
| 97 | + |
| 98 | + if args.chunksize: |
| 99 | + chunkNum = 1 |
| 100 | + if args.output: |
| 101 | + output_base = args.output |
| 102 | + else: |
| 103 | + output_base = 'filter' |
| 104 | + output_base_chunk = output_base + str(chunkNum).zfill(args.digits) |
| 105 | + output,writer,output_base_chunk = utils.default_open_output(output_base_chunk, output_base_chunk, args.outformat, compress=not args.no_gzip) |
| 106 | + else: |
| 107 | + output,writer,output_base_chunk = utils.default_open_output(args.output, "filter", args.outformat, compress=not args.no_gzip) |
| 108 | + output_base = output_base_chunk |
| 109 | + |
| 110 | + utils.log("Writing to " + output_base_chunk) |
| 111 | + |
| 112 | + i=0 |
| 113 | + count = 0 |
| 114 | + chunkNum = 1 |
| 115 | + for mol in suppl: |
| 116 | + if args.limit and count >= args.limit: |
| 117 | + break |
| 118 | + i +=1 |
| 119 | + if mol is None: continue |
| 120 | + if args.fragment: |
| 121 | + mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet) |
| 122 | + if not filter(mol, minHac=args.hacmin, maxHac=args.hacmax, minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet): |
| 123 | + continue |
| 124 | + if args.chunksize: |
| 125 | + if count > 0 and count % args.chunksize == 0: |
| 126 | + # new chunk, so create new writer |
| 127 | + writer.close() |
| 128 | + output.close() |
| 129 | + chunkNum += 1 |
| 130 | + output_chunk_base = output_base + str(chunkNum).zfill(args.digits) |
| 131 | + utils.log("Writing to " + output_chunk_base) |
| 132 | + output,writer,output_chunk_base = utils.default_open_output(output_chunk_base, output_chunk_base, args.outformat, compress=not args.no_gzip) |
| 133 | + |
| 134 | + for from_name in field_renames: |
| 135 | + to_name = field_renames[from_name] |
| 136 | + if mol.HasProp(from_name): |
| 137 | + val = mol.GetProp(from_name) |
| 138 | + mol.ClearProp(from_name) |
| 139 | + if to_name: |
| 140 | + mol.SetProp(to_name, val) |
| 141 | + |
| 142 | + count += 1 |
| 143 | + writer.write(mol) |
| 144 | + |
| 145 | + utils.log("Filtered", i, "down to", count, "molecules") |
| 146 | + if args.chunksize: |
| 147 | + utils.log("Wrote", chunkNum, "chunks") |
| 148 | + if (args.digits > 0 and len(str(chunkNum)) > args.digits): |
| 149 | + utils.log("WARNING: not enough digits specified for the number of chunks") |
| 150 | + |
| 151 | + writer.flush() |
| 152 | + writer.close() |
| 153 | + input.close() |
| 154 | + output.close() |
| 155 | + |
| 156 | + if args.meta: |
| 157 | + utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, 'RDKitFilter':i}) |
| 158 | + |
| 159 | + |
| 160 | +if __name__ == "__main__": |
| 161 | + main() |
| 162 | + |
0 commit comments