MoseleyBioinformaticsLab
diff --git a/src/mwtab/cli.py b/src/mwtab/cli.py
Lines changed: 47 additions & 13 deletions
diff --git a/src/mwtab/converter.py b/src/mwtab/converter.py
Lines changed: 13 additions & 5 deletions
diff --git a/src/mwtab/fileio.py b/src/mwtab/fileio.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/mwtab/metadata_column_matching.py b/src/mwtab/metadata_column_matching.py
Lines changed: 8 additions & 6 deletions
diff --git a/src/mwtab/mwschema.py b/src/mwtab/mwschema.py
Lines changed: 9 additions & 7 deletions
@@ -10,21 +10,22 @@
     Usage:
         mwtab -h | --help
         mwtab --version
-        mwtab convert (<from-path> <to-path>) [--from-format=<format>] [--to-format=<format>] [--mw-rest=<url>] [--verbose]
-        mwtab validate <from-path> [--mw-rest=<url>]
+        mwtab convert (<from-path> <to-path>) [--from-format=<format>] [--to-format=<format>] [--mw-rest=<url>] [--force] [--verbose]
+        mwtab validate <from-path> [--to-path=<path>] [--mw-rest=<url>] [--force] [--silent]
         mwtab download url <url> [--to-path=<path>] [--verbose]
         mwtab download study all [--to-path=<path>] [--input-item=<item>] [--output-format=<format>] [--mw-rest=<url>] [--verbose]
         mwtab download study <input-value> [--to-path=<path>] [--input-item=<item>] [--output-item=<item>] [--output-format=<format>] [--mw-rest=<url>] [--verbose]
         mwtab download (study | compound | refmet | gene | protein) <input-item> <input-value> <output-item> [--output-format=<format>] [--to-path=<path>] [--mw-rest=<url>] [--verbose]
         mwtab download moverz <input-item> <m/z-value> <ion-type-value> <m/z-tolerance-value> [--to-path=<path>] [--mw-rest=<url>] [--verbose]
         mwtab download exactmass <LIPID-abbreviation> <ion-type-value> [--to-path=<path>] [--mw-rest=<url>] [--verbose]
-        mwtab extract metadata <from-path> <to-path> <key> ... [--to-format=<format>] [--no-header]
-        mwtab extract metabolites <from-path> <to-path> (<key> <value>) ... [--to-format=<format>] [--no-header]
+        mwtab extract metadata <from-path> <to-path> <key> ... [--to-format=<format>] [--no-header] [--force]
+        mwtab extract metabolites <from-path> <to-path> (<key> <value>) ... [--to-format=<format>] [--no-header] [--force]
     
     Options:
         -h, --help                           Show this screen.
         --version                            Show version.
         --verbose                            Print what files are processing.
+        --silent                             Silence all standard output.
         --from-format=<format>               Input file format, available formats: mwtab, json [default: mwtab].
         --to-format=<format>                 Output file format [default: json].
                                              Available formats for convert:
@@ -34,6 +35,9 @@
         --mw-rest=<url>                      URL to MW REST interface
                                                 [default: https://www.metabolomicsworkbench.org/rest/].
         --to-path=<path>                     Directory to save outputs into. Defaults to the current working directory.
+                                             For the validate command, if the given path ends in '.json', then 
+                                             all JSON file outputs will be condensed into that 1 file. Also for 
+                                             the validate command no output files are saved unless this option is given.
         --prefix=<prefix>                    Prefix to add at the beginning of the output file name. Defaults to no prefix.
         --suffix=<suffix>                    Suffix to add at the end of the output file name. Defaults to no suffix.
         --context=<context>                  Type of resource to access from MW REST interface, available contexts: study,
@@ -42,14 +46,14 @@
         --output-item=<item>                 Item to be retrieved from Metabolomics Workbench.
         --output-format=<format>             Format for item to be retrieved in, available formats: mwtab, json.
         --no-header                          Include header at the top of csv formatted files.
+        --force                              Ignore non-dictionary values in METABOLITES_DATA, METABOLITES, and EXTENDED tables for JSON files.
     
         For extraction <to-path> can take a "-" which will use stdout.
         All <from-path>'s can be single files, directories, or URLs.
     
     Documentation webpage: https://moseleybioinformaticslab.github.io/mwtab/
     GitHub webpage: https://github.com/MoseleyBioinformaticsLab/mwtab
 """
-## TODO add options to vlaidate to save out the new JSON.
 
 from os import getcwd
 from os.path import join, isfile
@@ -66,6 +70,7 @@
 from .converter import Converter
 from .validator import validate_file
 from .mwschema import ms_required_schema, nmr_required_schema
+from .mwtab import MWTabFile
 
 
 OUTPUT_FORMATS = {
@@ -205,7 +210,9 @@ def cli(cmdargs):
     """
 
     VERBOSE = cmdargs["--verbose"]
+    force = cmdargs['--force']
     fileio.VERBOSE = cmdargs["--verbose"]
+    silent = cmdargs['--silent']
     mwrest_base_url = cmdargs['--mw-rest']
     fileio.MWREST_URL = mwrest_base_url
     mwrest.BASE_URL = mwrest_base_url
@@ -229,19 +236,43 @@ def cli(cmdargs):
 
     # mwtab validate ...
     elif cmdargs["validate"]:
-        for i, (mwfile, e) in enumerate(fileio.read_files(cmdargs["<from-path>"], return_exceptions=True)):
+        save_files = False
+        consolidate_files = False
+        consolidated_json = {}
+        if optional_to_path:
+            save_files = True
+            fileio._create_save_path(optional_to_path)
+            to_path = pathlib.Path(optional_to_path)
+            if pathlib.Path(to_path).suffix == '.json':
+                consolidate_files = True
+        
+        for i, (mwfile, e) in enumerate(fileio.read_with_class(cmdargs["<from-path>"], 
+                                                               MWTabFile, 
+                                                               {'duplicate_keys':True, 'force':force}, 
+                                                               return_exceptions=True)):
             if e is not None:
                 file_source = mwfile if isinstance(mwfile, str) else cmdargs["<from-path>"]
                 print("Something went wrong when trying to read " + file_source)
                 traceback.print_exception(e, file=sys.stdout)
                 print()
                 continue
-            validate_file(
-                mwtabfile = mwfile,
-                ms_schema = ms_required_schema, 
-                nmr_schema = nmr_required_schema,
-                verbose = True
-            )
+            _, errors_list = validate_file(
+                                            mwtabfile = mwfile,
+                                            ms_schema = ms_required_schema, 
+                                            nmr_schema = nmr_required_schema,
+                                            verbose = not silent
+                                            )
+            if save_files:
+                if consolidate_files:
+                    consolidated_json[pathlib.Path(mwfile.source).stem] = errors_list
+                else:
+                    filename = pathlib.Path(mwfile.source).stem + '_validations.json'
+                    with open(join(to_path, filename), "w", encoding="utf-8") as fh:
+                        fh.write(json.dumps(errors_list, indent=2))
+        
+        if consolidate_files:
+            with open(to_path, "w", encoding="utf-8") as fh:
+                fh.write(json.dumps(consolidated_json, indent=2))
 
     # mwtab download ...
     elif cmdargs["download"]:
@@ -349,7 +380,10 @@ def cli(cmdargs):
 
     # mwtab extract ...
     elif cmdargs["extract"]:
-        mwfile_generator = fileio.read_files(cmdargs["<from-path>"], return_exceptions=True)
+        mwfile_generator = fileio.read_with_class(cmdargs["<from-path>"], 
+                                                  MWTabFile, 
+                                                  {'duplicate_keys':True, "force": force}, 
+                                                  return_exceptions=True)
         if cmdargs["metabolites"]:
             metabolites_dict = mwextract.extract_metabolites(
                 mwfile_generator,
 
@@ -112,6 +112,7 @@
 import sys
 
 from . import fileio
+from .mwtab import MWTabFile
 
 
 class Translator(object):
@@ -142,21 +143,26 @@ class MWTabFileToMWTabFile(Translator):
     file_extension = {"json": ".json",
                       "mwtab": ".txt"}
 
-    def __init__(self, from_path, to_path, from_format=None, to_format=None):
+    def __init__(self, from_path, to_path, from_format=None, to_format=None, force=False):
         """MWTabFileToMWTabFile translator initializer.
         :param str from_path: Path to input file(s).
         :param str to_path: Path to output file(s).
         :param str from_format: Input format: `mwtab` or `json`.
         :param str to_format: Output format: `mwtab` or `json`.
+        :param bool force: If True, replace non-dictionary values in METABOLITES_DATA, METABOLITES,
+                           and EXTENDED with empty dicts when reading JSON input.
         """
-        super(MWTabFileToMWTabFile, self).__init__(from_path, to_path, from_format, to_format)
+        super(MWTabFileToMWTabFile, self).__init__(from_path, to_path, from_format, to_format, force)
 
     def __iter__(self):
         """Iterator that yields :class:`~mwtab.mwtab.MWTabFile` instances.
         :return: instance of :class:`~mwtab.mwtab.MWTabFile`.
         :rtype: :class:`~mwtab.mwtab.MWTabFile`
         """
-        for mwtabfile, e in fileio.read_files(self.from_path, return_exceptions=True):
+        for mwtabfile, e in fileio.read_with_class(self.from_path, 
+                                                   MWTabFile, 
+                                                   {'duplicate_keys':True, 'force':self.force}, 
+                                                   return_exceptions=True):
             if e is not None:
                 file_source = mwtabfile if isinstance(mwtabfile, str) else self.from_path
                 print("Something went wrong when trying to read " + file_source)
@@ -169,14 +175,16 @@ def __iter__(self):
 class Converter(object):
     """Converter class to convert ``mwTab`` files from ``mwTab`` to ``JSON`` or from ``JSON`` to ``mwTab`` format."""
 
-    def __init__(self, from_path, to_path, from_format="mwtab", to_format="json"):
+    def __init__(self, from_path, to_path, from_format="mwtab", to_format="json", force=False):
         """Converter initializer.
         :param str from_path: Path to input file(s).
         :param str to_path: Path to output file(s).
         :param str from_format: Input format: `mwtab` or `json`.
         :param str to_format: Output format: `mwtab` or `json`.
+        :param bool force: If True, replace non-dictionary values in METABOLITES_DATA, METABOLITES,
+                           and EXTENDED with empty dicts when reading JSON input.
         """
-        self.file_generator = MWTabFileToMWTabFile(from_path, to_path, from_format, to_format)
+        self.file_generator = MWTabFileToMWTabFile(from_path, to_path, from_format, to_format, force)
 
     def convert(self):
         """Convert file(s) from ``mwTab`` format to ``JSON`` format or from ``JSON`` format to ``mwTab`` format.
 
@@ -155,7 +155,7 @@ def read_with_class(sources: str|list[str], read_class: type, class_kwds: dict,
     Returns:
         Returns the instantiated class and any exceptions, or None and any exceptions, or the source and any exceptions.
     """
-    sources = [sources] if isinstance(sources, str) else sources
+    sources = [sources] if not isinstance(sources, list) else sources
     try:
         filenames = _generate_filenames(sources, True)
         filehandles = _generate_handles(filenames, True)
 
@@ -291,7 +291,7 @@ def __init__(self, values_type: None|str = None, values_regex: None|str = None,
         self.values_regex = values_regex if isinstance(values_regex, str) else ''
         self.values_inverse_regex = values_inverse_regex if isinstance(values_inverse_regex, str) else ''
 
-    def series_match(self, series: pandas.Series, na_values: list|None = None) -> pandas.Series:
+    def series_match(self, series: pandas.Series, na_values: list|None = None, match_na_values: bool = True) -> pandas.Series:
         """Return a mask for the series based on type and regex matching.
         
         "values_regex" and "values_inverse_regex" are mutually exclusive and "values_regex" will take precedence if both are given. 
@@ -300,7 +300,8 @@ def series_match(self, series: pandas.Series, na_values: list|None = None) -> pa
         
         Args:
             series: series to match values based on type and/or regex.
-            na_values: list of values to consider NA values. NA values are ignored for type and regex matching.
+            na_values: list of values to consider NA values.
+            match_na_values: if True, NA values will be considered a match and return True, False otherwise.
         
         Returns:
             A pandas Series the same length as "series" with Boolean values that can be used to select the matching values in the series.
@@ -319,7 +320,8 @@ def series_match(self, series: pandas.Series, na_values: list|None = None) -> pa
         else:
             regex_match = pandas.Series([True]*len(series), index=series.index)
 
-        regex_match = regex_match | old_NAs
+        if match_na_values:
+            regex_match = regex_match | old_NAs
 
 
         column_to_numeric = pandas.to_numeric(stripped_series, errors='coerce')
@@ -389,10 +391,10 @@ def name_dict_match(self, name_map):
         """
         return self.name_matcher.dict_match(name_map)
 
-    def values_series_match(self, series, na_values = None):
+    def values_series_match(self, series, na_values = None, match_na_values = True):
         """Convenience method to use the series_match method for value_matcher.
         """
-        return self.value_matcher.series_match(series, na_values)
+        return self.value_matcher.series_match(series, na_values, match_na_values)
 
 
 
@@ -752,7 +754,7 @@ def make_list_regex(element_regex: str, delimiter: str , quoted_elements: bool =
     ColumnFinder("name",
                  NameMatcher(in_strings = ['name'],
                              in_string_sets = [['name', 'refmet']],
-                             not_in_strings = ['adduct', 'named', 'internal', 'ion'],),
+                             not_in_strings = ['adduct', 'named', 'internal', 'ion', 'metabolite_name'],),
                  ValueMatcher(values_type = 'non-numeric',)),
 
     ColumnFinder("refmet",
 
@@ -577,26 +577,28 @@ def _create_int_regex_and_message(can_be_range: bool = False) -> str:
               'SPECTROMETER_FREQUENCY'],
  'additionalProperties': False}
 
+# The 'properties' and 'required' keys are commented out because they are validated in separate 
+# functions. Doing it here causes many more spurious errors to be printed.
 data_schema = \
 {'type': 'array',
  'items': {'type': 'object',
-           'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}}},
-           'required': ['Metabolite'],
+           # 'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}}},
+           # 'required': ['Metabolite'],
            'additionalProperties': True}}
 
 extended_schema = \
 {'type': 'array',
  'items': {'type': 'object',
-           'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}},
-                          'sample_id': {'type': 'string', 'not':{'enum': NA_VALUES}}},
-           'required': ['Metabolite', 'sample_id'],
+           # 'properties': {'Metabolite': {'type': 'string', 'not':{'enum': METABOLITE_NA_VALUES}},
+           #                'sample_id': {'type': 'string', 'not':{'enum': NA_VALUES}}},
+           # 'required': ['Metabolite', 'sample_id'],
            'additionalProperties': True}}
 
 binned_data_schema = \
 {'type': 'array',
  'items': {'type': 'object',
-           'properties': {'Bin range(ppm)': {'type': 'string', 'not':{'enum': NA_VALUES}}},
-           'required': ['Bin range(ppm)'],
+           # 'properties': {'Bin range(ppm)': {'type': 'string', 'not':{'enum': NA_VALUES}}},
+           # 'required': ['Bin range(ppm)'],
            'additionalProperties': True}}
 
 ms_metabolite_data_schema = \