|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2018 Informatics Matters Ltd. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | + |
| 18 | +from __future__ import print_function |
| 19 | +import sys, gzip, json, uuid |
| 20 | +from math import log10, floor |
| 21 | +from rdkit_utils import BasicObjectWriter, TsvWriter # neither of these are RDKit dependent so should be moved |
| 22 | + |
def log(*args, **kwargs):
    """
    Print a message to STDERR instead of STDOUT.

    All positional and keyword arguments are handed to print() unchanged,
    so options such as sep and end work as they normally do.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
| 28 | + |
def round_sig(x, sig):
    """Round the number to the specified number of significant figures

    :param x: The number to round
    :param sig: The number of significant figures to keep
    :return: x rounded to sig significant figures. Zero is returned unchanged
        because it has no order of magnitude to round against.
    """
    if x == 0:
        # log10(0) is undefined and would raise ValueError; zero needs no rounding
        return x
    # Position of the most significant digit, e.g. 2 for 123.4 and -3 for 0.00123
    magnitude = int(floor(log10(abs(x))))
    return round(x, sig - magnitude - 1)
| 32 | + |
def add_default_input_args(parser):
    """Register the standard input options on an argparse-style parser.

    Adds -i/--input (the input file) and -if/--informat (one of 'sdf' or 'json').

    :param parser: The parser to add the options to
    """
    input_help = "Input file, if not defined the STDIN is used"
    format_help = "Input format. When using STDIN this must be specified."
    parser.add_argument('-i', '--input', help=input_help)
    parser.add_argument('-if', '--informat', choices=['sdf', 'json'], help=format_help)
| 36 | + |
def add_default_output_args(parser):
    """Register the standard output options on an argparse-style parser.

    Adds -o/--output (base name of the output files), -of/--outformat
    (one of 'sdf' or 'json') and the --meta flag.

    :param parser: The parser to add the options to
    """
    parser.add_argument('-o', '--output', help="Base name for output file (no extension). If not defined then STDOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('-of', '--outformat', choices=['sdf', 'json'], help="Output format. Defaults to 'sdf'.")
    parser.add_argument('--meta', action='store_true', help='Write metadata and metrics files')
| 41 | + |
def add_default_io_args(parser):
    """Register both the default input options and the default output options.

    :param parser: The parser to add the options to
    """
    for register in (add_default_input_args, add_default_output_args):
        register(parser)
| 45 | + |
| 46 | + |
def default_open_input_output(inputDef, inputFormat, outputDef, defaultOutput, outputFormat, thinOutput=False, valueClassMappings=None,
                              datasetMetaProps=None, fieldMetaProps=None):
    """Open the input and the output in a single call using the default handling.

    inputDef and inputFormat are passed to default_open_input(); all remaining
    arguments are passed to default_open_output().

    :return: A 5-tuple made of the two values returned by default_open_input() and the
        three returned by default_open_output(), ordered as
        (input, output, supplier, writer, outputBase)
    """
    source, supplier = default_open_input(inputDef, inputFormat)
    output_options = dict(
        thinOutput=thinOutput,
        valueClassMappings=valueClassMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
    )
    sink, writer, base = default_open_output(outputDef, defaultOutput, outputFormat, **output_options)
    return source, sink, supplier, writer, base
| 54 | + |
| 55 | + |
def default_open_input(inputDef, inputFormat):
    """Open the input, choosing the reader from the explicit format or the file extension.

    SDF is selected when inputFormat is 'sdf' or the file name ends with .sdf or .sdf.gz.
    JSON is selected when inputFormat is 'json' or the file name ends with .data or .data.gz.
    The SDF test is made first, so a file name ending in .sdf selects SDF whatever
    inputFormat says.

    :param inputDef: The input file name, may be undefined if inputFormat is given
    :param inputFormat: The input format, may be undefined if inputDef is given
    :return: The pair of values returned by the selected opener
    :raises ValueError: If neither argument is given or no supported format can be determined
    """
    if not (inputDef or inputFormat):
        raise ValueError('Must specify either an input file name or an input format (or both)')

    def name_ends_with(suffixes):
        # Evaluated lazily: the file name is only inspected when the explicit format did not match
        return inputDef and inputDef.lower().endswith(suffixes)

    if inputFormat == 'sdf' or name_ends_with(('.sdf', '.sdf.gz')):
        opener = default_open_input_sdf
    elif inputFormat == 'json' or name_ends_with(('.data', '.data.gz')):
        opener = default_open_input_json
    else:
        raise ValueError('Unsupported input format')

    handle, supplier = opener(inputDef)
    return handle, supplier
| 67 | + |
| 68 | + |
def open_file(filename):
    """Open a file for reading, gunzipping it when the name ends with .gz (case-insensitive).

    Plain files are opened with mode 'r'; gzipped files are opened with
    gzip.open() using its default mode.

    :param filename: The name of the file to open
    :return: The opened file object
    """
    if not filename.lower().endswith('.gz'):
        return open(filename, 'r')
    return gzip.open(filename)
| 75 | + |
| 76 | + |
def create_simple_writer(outputDef, defaultOutput, outputFormat, fieldNames, compress=True, valueClassMappings=None, datasetMetaProps=None, fieldMetaProps=None):
    """Create a simple writer suitable for writing flat data e.g. as BasicObject or TSV

    :param outputDef: Base name for the output file. If not defined the data goes to whatever
        open_output() returns for an undefined base name
    :param defaultOutput: Base name to report (and to use for the metadata file) when outputDef is not defined
    :param outputFormat: Either 'json' or 'tsv'
    :param fieldNames: The field names passed to the TsvWriter (only used for 'tsv')
    :param compress: Whether the output file should be gzipped
    :param valueClassMappings: Passed to write_squonk_datasetmetadata() (only used for 'json')
    :param datasetMetaProps: Passed to write_squonk_datasetmetadata() (only used for 'json')
    :param fieldMetaProps: Passed to write_squonk_datasetmetadata() (only used for 'json')
    :return: A tuple of (writer, outputBase)
    :raises ValueError: If outputFormat is not supported, including when it is not defined
    """
    outputBase = outputDef or defaultOutput

    if outputFormat == 'json':
        # The metadata file is always named after outputBase, and is written as thin
        # output (thinOutput=True) since flat data carries no structures
        write_squonk_datasetmetadata(outputBase, True, valueClassMappings, datasetMetaProps, fieldMetaProps)
        return BasicObjectWriter(open_output(outputDef, 'data', compress)), outputBase

    if outputFormat == 'tsv':
        return TsvWriter(open_output(outputDef, 'tsv', compress), fieldNames), outputBase

    # str() so that an undefined format (None) is reported as the intended ValueError
    # rather than a TypeError from concatenating str and None
    raise ValueError("Unsupported format: " + str(outputFormat))
| 96 | + |
def open_output(basename, ext, compress):
    """Open the destination that output should be written to.

    With a base name the file basename.ext is opened with mode 'w+', or
    basename.ext.gz through gzip.open() when compress is set. Without a base
    name STDOUT is returned, and compress is currently ignored.

    :param basename: Base name of the file, or undefined to use STDOUT
    :param ext: The file extension (without the leading dot)
    :param compress: Whether to gzip the file
    :return: The opened file object or sys.stdout
    """
    if not basename:
        # TODO - work out how to write compressed data to STDOUT
        return sys.stdout

    target = basename + '.' + ext
    if not compress:
        return open(target, 'w+')
    return gzip.open(target + ".gz", 'w+')
| 111 | + |
def write_squonk_datasetmetadata(outputBase, thinOutput, valueClassMappings, datasetMetaProps, fieldMetaProps):
    """This is a temp hack to write the minimal metadata that Squonk needs.
    Will need to be replaced with something that allows something more complete to be written.

    The metadata is written as JSON to the file outputBase + '.metadata'.

    :param outputBase: Base name for the file to write to
    :param thinOutput: Write only new data, not structures. Result type will be BasicObject
    :param valueClassMappings: A dict that describes the Java class of the value properties (used by Squonk)
    :param datasetMetaProps: A dict with metadata properties that describe the dataset as a whole.
            The keys used for these metadata are up to the user, but common ones include source, description, created, history.
    :param fieldMetaProps: A list of dicts with the additional field metadata. Each dict has a key named fieldName whose value
            is the name of the field being described, and a key named values whose value is a map of metadata properties.
            The keys used for these metadata are up to the user, but common ones include source, description, created, history.
    """
    meta = {}
    # TODO add created property - how to handle date formats?
    # Copy the dataset properties so the caller's dict is not shared with the metadata
    props = dict(datasetMetaProps) if datasetMetaProps else {}

    # Empty or undefined sections are left out of the metadata entirely
    if fieldMetaProps:
        meta["fieldMetaProps"] = fieldMetaProps

    if props:
        meta["properties"] = props

    if valueClassMappings:
        meta["valueClassMappings"] = valueClassMappings

    if thinOutput:
        meta['type'] = 'org.squonk.types.BasicObject'
    else:
        meta['type'] = 'org.squonk.types.MoleculeObject'

    # Serialise before opening so a failure here does not leave a truncated file behind
    serialised = json.dumps(meta)
    # The with-block closes the file even if the write fails
    with open(outputBase + '.metadata', 'w') as metadata_file:
        metadata_file.write(serialised)
| 147 | + |
| 148 | + |
def write_metrics(baseName, values):
    """Write the metrics data

    Each entry is written on its own line as key=value to the file baseName + '_metrics.txt'.

    :param baseName: The base name of the output files. e.g. extensions will be appended to this base name
    :param values: dictionary of values to write
    """
    # The with-block flushes and closes the file even if writing one of the entries fails
    with open(baseName + '_metrics.txt', 'w') as metrics_file:
        for key in values:
            metrics_file.write(key + '=' + str(values[key]) + "\n")
| 160 | + |
| 161 | + |
def generate_molecule_object_dict(source, format, values):
    """Build the dictionary form of a Squonk MoleculeObject, ready to be written as JSON

    A fresh random UUID is generated for every call.

    :param source: Molecules in molfile or smiles format
    :param format: The format of the molecule. Either 'mol' or 'smiles'
    :param values: Optional dict of values (properties) for the MoleculeObject.
        Left out of the result when empty or not defined
    :return: A dict with the keys uuid, source and format, plus values when given
    """
    identifier = str(uuid.uuid4())
    molecule = dict(uuid=identifier, source=source, format=format)
    if not values:
        return molecule
    molecule["values"] = values
    return molecule
0 commit comments