Skip to content

Commit 4dc1587

Browse files
committed
Improvements
Fixed many issues and made improvements based on problems found while generating results for the paper. Added a CLI option to save out the validation JSON. Added a --silent CLI option for validate. Added a --force CLI option for validate and convert. Added some validations and removed some from the JSON Schema to reduce spurious messages.
1 parent dd864fd commit 4dc1587

File tree

8 files changed

+375
-152
lines changed

8 files changed

+375
-152
lines changed

src/mwtab/cli.py

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,22 @@
1010
Usage:
1111
mwtab -h | --help
1212
mwtab --version
13-
mwtab convert (<from-path> <to-path>) [--from-format=<format>] [--to-format=<format>] [--mw-rest=<url>] [--verbose]
14-
mwtab validate <from-path> [--mw-rest=<url>]
13+
mwtab convert (<from-path> <to-path>) [--from-format=<format>] [--to-format=<format>] [--mw-rest=<url>] [--force] [--verbose]
14+
mwtab validate <from-path> [--to-path=<path>] [--mw-rest=<url>] [--force] [--silent]
1515
mwtab download url <url> [--to-path=<path>] [--verbose]
1616
mwtab download study all [--to-path=<path>] [--input-item=<item>] [--output-format=<format>] [--mw-rest=<url>] [--verbose]
1717
mwtab download study <input-value> [--to-path=<path>] [--input-item=<item>] [--output-item=<item>] [--output-format=<format>] [--mw-rest=<url>] [--verbose]
1818
mwtab download (study | compound | refmet | gene | protein) <input-item> <input-value> <output-item> [--output-format=<format>] [--to-path=<path>] [--mw-rest=<url>] [--verbose]
1919
mwtab download moverz <input-item> <m/z-value> <ion-type-value> <m/z-tolerance-value> [--to-path=<path>] [--mw-rest=<url>] [--verbose]
2020
mwtab download exactmass <LIPID-abbreviation> <ion-type-value> [--to-path=<path>] [--mw-rest=<url>] [--verbose]
21-
mwtab extract metadata <from-path> <to-path> <key> ... [--to-format=<format>] [--no-header]
22-
mwtab extract metabolites <from-path> <to-path> (<key> <value>) ... [--to-format=<format>] [--no-header]
21+
mwtab extract metadata <from-path> <to-path> <key> ... [--to-format=<format>] [--no-header] [--force]
22+
mwtab extract metabolites <from-path> <to-path> (<key> <value>) ... [--to-format=<format>] [--no-header] [--force]
2323
2424
Options:
2525
-h, --help Show this screen.
2626
--version Show version.
2727
--verbose Print which files are being processed.
28+
--silent Silence all standard output.
2829
--from-format=<format> Input file format, available formats: mwtab, json [default: mwtab].
2930
--to-format=<format> Output file format [default: json].
3031
Available formats for convert:
@@ -34,6 +35,9 @@
3435
--mw-rest=<url> URL to MW REST interface
3536
[default: https://www.metabolomicsworkbench.org/rest/].
3637
--to-path=<path> Directory to save outputs into. Defaults to the current working directory.
38+
For the validate command, if the given path ends in '.json', then
39+
all JSON file outputs will be condensed into that 1 file. Also for
40+
the validate command no output files are saved unless this option is given.
3741
--prefix=<prefix> Prefix to add at the beginning of the output file name. Defaults to no prefix.
3842
--suffix=<suffix> Suffix to add at the end of the output file name. Defaults to no suffix.
3943
--context=<context> Type of resource to access from MW REST interface, available contexts: study,
@@ -42,14 +46,14 @@
4246
--output-item=<item> Item to be retrieved from Metabolomics Workbench.
4347
--output-format=<format> Format for item to be retrieved in, available formats: mwtab, json.
4448
--no-header Do not include a header at the top of csv formatted files.
49+
--force Ignore non-dictionary values in METABOLITES_DATA, METABOLITES, and EXTENDED tables for JSON files.
4550
4651
For extraction <to-path> can take a "-" which will use stdout.
4752
All <from-path>'s can be single files, directories, or URLs.
4853
4954
Documentation webpage: https://moseleybioinformaticslab.github.io/mwtab/
5055
GitHub webpage: https://github.com/MoseleyBioinformaticsLab/mwtab
5156
"""
52-
## TODO add options to vlaidate to save out the new JSON.
5357

5458
from os import getcwd
5559
from os.path import join, isfile
@@ -66,6 +70,7 @@
6670
from .converter import Converter
6771
from .validator import validate_file
6872
from .mwschema import ms_required_schema, nmr_required_schema
73+
from .mwtab import MWTabFile
6974

7075

7176
OUTPUT_FORMATS = {
@@ -205,7 +210,9 @@ def cli(cmdargs):
205210
"""
206211

207212
VERBOSE = cmdargs["--verbose"]
213+
force = cmdargs['--force']
208214
fileio.VERBOSE = cmdargs["--verbose"]
215+
silent = cmdargs['--silent']
209216
mwrest_base_url = cmdargs['--mw-rest']
210217
fileio.MWREST_URL = mwrest_base_url
211218
mwrest.BASE_URL = mwrest_base_url
@@ -229,19 +236,43 @@ def cli(cmdargs):
229236

230237
# mwtab validate ...
231238
elif cmdargs["validate"]:
232-
for i, (mwfile, e) in enumerate(fileio.read_files(cmdargs["<from-path>"], return_exceptions=True)):
239+
save_files = False
240+
consolidate_files = False
241+
consolidated_json = {}
242+
if optional_to_path:
243+
save_files = True
244+
fileio._create_save_path(optional_to_path)
245+
to_path = pathlib.Path(optional_to_path)
246+
if pathlib.Path(to_path).suffix == '.json':
247+
consolidate_files = True
248+
249+
for i, (mwfile, e) in enumerate(fileio.read_with_class(cmdargs["<from-path>"],
250+
MWTabFile,
251+
{'duplicate_keys':True, 'force':force},
252+
return_exceptions=True)):
233253
if e is not None:
234254
file_source = mwfile if isinstance(mwfile, str) else cmdargs["<from-path>"]
235255
print("Something went wrong when trying to read " + file_source)
236256
traceback.print_exception(e, file=sys.stdout)
237257
print()
238258
continue
239-
validate_file(
240-
mwtabfile = mwfile,
241-
ms_schema = ms_required_schema,
242-
nmr_schema = nmr_required_schema,
243-
verbose = True
244-
)
259+
_, errors_list = validate_file(
260+
mwtabfile = mwfile,
261+
ms_schema = ms_required_schema,
262+
nmr_schema = nmr_required_schema,
263+
verbose = not silent
264+
)
265+
if save_files:
266+
if consolidate_files:
267+
consolidated_json[pathlib.Path(mwfile.source).stem] = errors_list
268+
else:
269+
filename = pathlib.Path(mwfile.source).stem + '_validations.json'
270+
with open(join(to_path, filename), "w", encoding="utf-8") as fh:
271+
fh.write(json.dumps(errors_list, indent=2))
272+
273+
if consolidate_files:
274+
with open(to_path, "w", encoding="utf-8") as fh:
275+
fh.write(json.dumps(consolidated_json, indent=2))
245276

246277
# mwtab download ...
247278
elif cmdargs["download"]:
@@ -349,7 +380,10 @@ def cli(cmdargs):
349380

350381
# mwtab extract ...
351382
elif cmdargs["extract"]:
352-
mwfile_generator = fileio.read_files(cmdargs["<from-path>"], return_exceptions=True)
383+
mwfile_generator = fileio.read_with_class(cmdargs["<from-path>"],
384+
MWTabFile,
385+
{'duplicate_keys':True, "force": force},
386+
return_exceptions=True)
353387
if cmdargs["metabolites"]:
354388
metabolites_dict = mwextract.extract_metabolites(
355389
mwfile_generator,

src/mwtab/converter.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
import sys
113113

114114
from . import fileio
115+
from .mwtab import MWTabFile
115116

116117

117118
class Translator(object):
@@ -142,21 +143,26 @@ class MWTabFileToMWTabFile(Translator):
142143
file_extension = {"json": ".json",
143144
"mwtab": ".txt"}
144145

145-
def __init__(self, from_path, to_path, from_format=None, to_format=None):
146+
def __init__(self, from_path, to_path, from_format=None, to_format=None, force=False):
146147
"""MWTabFileToMWTabFile translator initializer.
147148
:param str from_path: Path to input file(s).
148149
:param str to_path: Path to output file(s).
149150
:param str from_format: Input format: `mwtab` or `json`.
150151
:param str to_format: Output format: `mwtab` or `json`.
152+
:param bool force: If True, replace non-dictionary values in METABOLITES_DATA, METABOLITES,
153+
and EXTENDED with empty dicts on read in for JSON.
151154
"""
152-
super(MWTabFileToMWTabFile, self).__init__(from_path, to_path, from_format, to_format)
155+
super(MWTabFileToMWTabFile, self).__init__(from_path, to_path, from_format, to_format, force)
153156

154157
def __iter__(self):
155158
"""Iterator that yields instances of :class:`~mwtab.mwtab.MWTabFile` instances.
156159
:return: a :class:`~mwtab.mwtab.MWTabFile` instance.
157160
:rtype: :class:`~mwtab.mwtab.MWTabFile`
158161
"""
159-
for mwtabfile, e in fileio.read_files(self.from_path, return_exceptions=True):
162+
for mwtabfile, e in fileio.read_with_class(self.from_path,
163+
MWTabFile,
164+
{'duplicate_keys':True, 'force':self.force},
165+
return_exceptions=True):
160166
if e is not None:
161167
file_source = mwtabfile if isinstance(mwtabfile, str) else self.from_path
162168
print("Something went wrong when trying to read " + file_source)
@@ -169,14 +175,16 @@ def __iter__(self):
169175
class Converter(object):
170176
"""Converter class to convert ``mwTab`` files from ``mwTab`` to ``JSON`` or from ``JSON`` to ``mwTab`` format."""
171177

172-
def __init__(self, from_path, to_path, from_format="mwtab", to_format="json"):
178+
def __init__(self, from_path, to_path, from_format="mwtab", to_format="json", force=False):
173179
"""Converter initializer.
174180
:param str from_path: Path to input file(s).
175181
:param str to_path: Path to output file(s).
176182
:param str from_format: Input format: `mwtab` or `json`.
177183
:param str to_format: Output format: `mwtab` or `json`.
184+
:param bool force: If True, replace non-dictionary values in METABOLITES_DATA, METABOLITES,
185+
and EXTENDED with empty dicts on read in for JSON.
178186
"""
179-
self.file_generator = MWTabFileToMWTabFile(from_path, to_path, from_format, to_format)
187+
self.file_generator = MWTabFileToMWTabFile(from_path, to_path, from_format, to_format, force)
180188

181189
def convert(self):
182190
"""Convert file(s) from ``mwTab`` format to ``JSON`` format or from ``JSON`` format to ``mwTab`` format.

src/mwtab/fileio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def read_with_class(sources: str|list[str], read_class: type, class_kwds: dict,
155155
Returns:
156156
Returns the instantiated class and any exceptions, or None and any exceptions, or the source and any exceptions.
157157
"""
158-
sources = [sources] if isinstance(sources, str) else sources
158+
sources = [sources] if not isinstance(sources, list) else sources
159159
try:
160160
filenames = _generate_filenames(sources, True)
161161
filehandles = _generate_handles(filenames, True)

src/mwtab/metadata_column_matching.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def __init__(self, values_type: None|str = None, values_regex: None|str = None,
291291
self.values_regex = values_regex if isinstance(values_regex, str) else ''
292292
self.values_inverse_regex = values_inverse_regex if isinstance(values_inverse_regex, str) else ''
293293

294-
def series_match(self, series: pandas.Series, na_values: list|None = None) -> pandas.Series:
294+
def series_match(self, series: pandas.Series, na_values: list|None = None, match_na_values: bool = True) -> pandas.Series:
295295
"""Return a mask for the series based on type and regex matching.
296296
297297
"values_regex" and "values_inverse_regex" are mutually exclusive and "values_regex" will take precedence if both are given.
@@ -300,7 +300,8 @@ def series_match(self, series: pandas.Series, na_values: list|None = None) -> pa
300300
301301
Args:
302302
series: series to match values based on type and/or regex.
303-
na_values: list of values to consider NA values. NA values are ignored for type and regex matching.
303+
na_values: list of values to consider NA values.
304+
match_na_values: if True, NA values will be considered a match and return True, False otherwise.
304305
305306
Returns:
306307
A pandas Series the same length as "series" with Boolean values that can be used to select the matching values in the series.
@@ -319,7 +320,8 @@ def series_match(self, series: pandas.Series, na_values: list|None = None) -> pa
319320
else:
320321
regex_match = pandas.Series([True]*len(series), index=series.index)
321322

322-
regex_match = regex_match | old_NAs
323+
if match_na_values:
324+
regex_match = regex_match | old_NAs
323325

324326

325327
column_to_numeric = pandas.to_numeric(stripped_series, errors='coerce')
@@ -389,10 +391,10 @@ def name_dict_match(self, name_map):
389391
"""
390392
return self.name_matcher.dict_match(name_map)
391393

392-
def values_series_match(self, series, na_values = None):
394+
def values_series_match(self, series, na_values = None, match_na_values = True):
393395
"""Convenience method to use the series_match method for value_matcher.
394396
"""
395-
return self.value_matcher.series_match(series, na_values)
397+
return self.value_matcher.series_match(series, na_values, match_na_values)
396398

397399

398400

@@ -752,7 +754,7 @@ def make_list_regex(element_regex: str, delimiter: str , quoted_elements: bool =
752754
ColumnFinder("name",
753755
NameMatcher(in_strings = ['name'],
754756
in_string_sets = [['name', 'refmet']],
755-
not_in_strings = ['adduct', 'named', 'internal', 'ion'],),
757+
not_in_strings = ['adduct', 'named', 'internal', 'ion', 'metabolite_name'],),
756758
ValueMatcher(values_type = 'non-numeric',)),
757759

758760
ColumnFinder("refmet",

src/mwtab/mwschema.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -577,26 +577,28 @@ def _create_int_regex_and_message(can_be_range: bool = False) -> str:
577577
'SPECTROMETER_FREQUENCY'],
578578
'additionalProperties': False}
579579

580+
# The 'properties' and 'required' keys are commented out because they are validated in separate
581+
# functions. Doing it here causes many more spurious errors to be printed.
580582
data_schema = \
581583
{'type': 'array',
582584
'items': {'type': 'object',
583-
'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}}},
584-
'required': ['Metabolite'],
585+
# 'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}}},
586+
# 'required': ['Metabolite'],
585587
'additionalProperties': True}}
586588

587589
extended_schema = \
588590
{'type': 'array',
589591
'items': {'type': 'object',
590-
'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}},
591-
'sample_id': {'type': 'string', 'not':{'enum': NA_VALUES}}},
592-
'required': ['Metabolite', 'sample_id'],
592+
# 'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}},
593+
# 'sample_id': {'type': 'string', 'not':{'enum': NA_VALUES}}},
594+
# 'required': ['Metabolite', 'sample_id'],
593595
'additionalProperties': True}}
594596

595597
binned_data_schema = \
596598
{'type': 'array',
597599
'items': {'type': 'object',
598-
'properties': {'Bin range(ppm)': {'type': 'string', 'not':{'enum': NA_VALUES}}},
599-
'required': ['Bin range(ppm)'],
600+
# 'properties': {'Bin range(ppm)': {'type': 'string', 'not':{'enum': NA_VALUES}}},
601+
# 'required': ['Bin range(ppm)'],
600602
'additionalProperties': True}}
601603

602604
ms_metabolite_data_schema = \

0 commit comments

Comments
 (0)