
Commit b160d6e
Commit message: new files added!
1 parent e14819b commit b160d6e

File tree

2 files changed: +613 -0 lines changed

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
#!/usr/bin/env python

# Copyright 2018 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from __future__ import print_function
import sys, gzip, json, uuid
from math import log10, floor
from rdkit_utils import BasicObjectWriter, TsvWriter  # neither of these are RDKit dependent so should be moved

def log(*args, **kwargs):
    """
    Log output to STDERR
    """
    print(*args, file=sys.stderr, **kwargs)

def round_sig(x, sig):
    """Round the number to the specified number of significant figures"""
    return round(x, sig - int(floor(log10(abs(x)))) - 1)

def add_default_input_args(parser):
    parser.add_argument('-i', '--input', help="Input file, if not defined then STDIN is used")
    parser.add_argument('-if', '--informat', choices=['sdf', 'json'], help="Input format. When using STDIN this must be specified.")

def add_default_output_args(parser):
    parser.add_argument('-o', '--output', help="Base name for output file (no extension). If not defined then STDOUT is used for the structures and 'output' is used as the base name of the other files.")
    parser.add_argument('-of', '--outformat', choices=['sdf', 'json'], help="Output format. Defaults to 'sdf'.")
    parser.add_argument('--meta', action='store_true', help='Write metadata and metrics files')

def add_default_io_args(parser):
    add_default_input_args(parser)
    add_default_output_args(parser)


def default_open_input_output(inputDef, inputFormat, outputDef, defaultOutput, outputFormat, thinOutput=False, valueClassMappings=None,
                              datasetMetaProps=None, fieldMetaProps=None):
    """Default approach to handling the inputs and outputs"""
    input, suppl = default_open_input(inputDef, inputFormat)
    output, writer, outputBase = default_open_output(outputDef, defaultOutput, outputFormat, thinOutput=thinOutput,
                                                     valueClassMappings=valueClassMappings, datasetMetaProps=datasetMetaProps, fieldMetaProps=fieldMetaProps)
    return input, output, suppl, writer, outputBase


def default_open_input(inputDef, inputFormat):
    if not inputDef and not inputFormat:
        raise ValueError('Must specify either an input file name or an input format (or both)')
    elif inputFormat == 'sdf' or (inputDef and (inputDef.lower().endswith('.sdf') or inputDef.lower().endswith('.sdf.gz'))):
        input, suppl = default_open_input_sdf(inputDef)
    elif inputFormat == 'json' or (inputDef and (inputDef.lower().endswith('.data') or inputDef.lower().endswith('.data.gz'))):
        input, suppl = default_open_input_json(inputDef)
    else:
        raise ValueError('Unsupported input format')

    return input, suppl


def open_file(filename):
    """Open the file, gunzipping it if it ends with .gz"""
    if filename.lower().endswith('.gz'):
        return gzip.open(filename)
    else:
        return open(filename, 'r')


def create_simple_writer(outputDef, defaultOutput, outputFormat, fieldNames, compress=True, valueClassMappings=None, datasetMetaProps=None, fieldMetaProps=None):
    """Create a simple writer suitable for writing flat data e.g. as BasicObject or TSV"""

    if not outputDef:
        outputBase = defaultOutput
    else:
        outputBase = outputDef

    if outputFormat == 'json':

        write_squonk_datasetmetadata(outputBase, True, valueClassMappings, datasetMetaProps, fieldMetaProps)

        return BasicObjectWriter(open_output(outputDef, 'data', compress)), outputBase

    elif outputFormat == 'tsv':
        return TsvWriter(open_output(outputDef, 'tsv', compress), fieldNames), outputBase

    else:
        raise ValueError("Unsupported format: " + outputFormat)

def open_output(basename, ext, compress):
    if basename:
        fname = basename + '.' + ext
        if compress:
            fname += ".gz"
            return gzip.open(fname, 'w+')
        else:
            return open(fname, 'w+')
    else:
        if compress:
            # TODO - work out how to write compressed data to STDOUT
            return sys.stdout
        else:
            return sys.stdout

def write_squonk_datasetmetadata(outputBase, thinOutput, valueClassMappings, datasetMetaProps, fieldMetaProps):
    """This is a temp hack to write the minimal metadata that Squonk needs.
    Will need to be replaced with something that allows something more complete to be written.

    :param outputBase: Base name for the file to write to
    :param thinOutput: Write only new data, not structures. Result type will be BasicObject
    :param valueClassMappings: A dict that describes the Java class of the value properties (used by Squonk)
    :param datasetMetaProps: A dict with metadata properties that describe the dataset as a whole.
        The keys used for these metadata are up to the user, but common ones include source, description, created, history.
    :param fieldMetaProps: A list of dicts with the additional field metadata. Each dict has a key named fieldName whose value
        is the name of the field being described, and a key named values whose value is a map of metadata properties.
        The keys used for these metadata are up to the user, but common ones include source, description, created, history.
    """
    meta = {}
    props = {}
    # TODO add created property - how to handle date formats?
    if datasetMetaProps:
        props.update(datasetMetaProps)

    if fieldMetaProps:
        meta["fieldMetaProps"] = fieldMetaProps

    if len(props) > 0:
        meta["properties"] = props

    if valueClassMappings:
        meta["valueClassMappings"] = valueClassMappings
    if thinOutput:
        meta['type'] = 'org.squonk.types.BasicObject'
    else:
        meta['type'] = 'org.squonk.types.MoleculeObject'
    s = json.dumps(meta)
    meta = open(outputBase + '.metadata', 'w')
    meta.write(s)
    meta.close()


def write_metrics(baseName, values):
    """Write the metrics data

    :param baseName: The base name of the output files, e.g. extensions will be appended to this base name
    :param values: Dictionary of values to write
    """
    m = open(baseName + '_metrics.txt', 'w')
    for key in values:
        m.write(key + '=' + str(values[key]) + "\n")
    m.flush()
    m.close()


def generate_molecule_object_dict(source, format, values):
    """Generate a dictionary that represents a Squonk MoleculeObject when written as JSON

    :param source: Molecule in molfile or smiles format
    :param format: The format of the molecule. Either 'mol' or 'smiles'
    :param values: Optional dict of values (properties) for the MoleculeObject
    """
    m = {"uuid": str(uuid.uuid4()), "source": source, "format": format}
    if values:
        m["values"] = values
    return m
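
A minimal usage sketch of the command-line and molecule helpers added in this file. The import name `utils` is an assumption for illustration (the new file's path is not shown in this view), and the metric keys are made-up examples; adjust to the actual module name.

# hypothetical usage sketch - 'utils' is an assumed import name for the module above
import argparse
import utils

parser = argparse.ArgumentParser(description='example pipeline')
utils.add_default_io_args(parser)  # adds -i/--input, -if/--informat, -o/--output, -of/--outformat, --meta
args = parser.parse_args(['-if', 'sdf', '-o', 'results', '-of', 'json'])

score = utils.round_sig(0.0123456, 3)  # round to 3 significant figures -> 0.0123

# build a Squonk MoleculeObject-style dict from a SMILES string and attach the score as a value
mo = utils.generate_molecule_object_dict('c1ccccc1', 'smiles', {'score': score})

# write simple key=value metrics alongside the output (creates results_metrics.txt)
utils.write_metrics(args.output, {'ExampleCount': 1, 'score': score})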

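A sketch of the metadata file produced by write_squonk_datasetmetadata, following the function body above. The argument values are invented examples and `utils` is again an assumed module name.

# hypothetical example of writing Squonk dataset metadata with this commit's helper
import utils

utils.write_squonk_datasetmetadata(
    'results',   # outputBase -> writes results.metadata
    True,        # thinOutput -> type becomes org.squonk.types.BasicObject
    valueClassMappings={'score': 'java.lang.Float'},
    datasetMetaProps={'source': 'example.sdf', 'description': 'demo run'},
    fieldMetaProps=[{'fieldName': 'score', 'values': {'description': 'example score'}}])

# results.metadata then holds JSON along the lines of:
# {"fieldMetaProps": [...], "properties": {"source": "example.sdf", ...},
#  "valueClassMappings": {"score": "java.lang.Float"}, "type": "org.squonk.types.BasicObject"}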