Mostly cleanup

ptth222 · ptth222 · commit a45ed176a06d · 2025-11-03T20:09:10.000-05:00
A few cleanup changes in preparation for the next release. This commit also adds read_lines back to fileio: it turns out rcha_metab uses it, and it is easier to keep it in the mwtab package.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,13 +1,14 @@
 Release History
 ===============
 
-1.2.6
-~~~~
+2.0.0
+~~~~~
 -Can now read duplicate keys in "Additional sample data" and reproduce it in write, will validate it as an error.
 -Writing out now ensures correct key ordering for JSON.
 -Validation now validates the main sections not just the internals of them.
 -Validate now checks that metabolites in the Data section are in the Metabolites section and vice versa.
 -Batch processing from the command line is more fault tolerant and won't stop the batch for 1 bad file.
+-Improved tokenizer so more files can be read in without error.
 -Changed schema validation to use jsonschema instead of schema.
 -Added validations for METABOLITES columns that try to give warnings for bad values, for example 'kegg_id' column should all be something like C00000.
 -Expanded the standard column name functionality to look for many more column names than in the previous version and do it in a much more robust way.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 The Clear BSD License
 
-Copyright (c) 2020, Christian D. Powell, Andrey Smelter, Hunter N.B. Moseley
+Copyright (c) 2025, P. Travis Thompson, Christian D. Powell, Andrey Smelter, Hunter N.B. Moseley
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/docs/todo.rst b/docs/todo.rst
@@ -1,6 +1,8 @@
 TODO
 ====
 
+Add options to the validate CLI command and to the validate method in mwtab to save out the new JSON file.
+
 Add --limit or --ignore option to validate to filter out certain types of errors/warnings. Need to first create some classifications to tag them with.
 
 Think about extending METABOLITES and EXTENDED blocks with an "Attributes" line like "Factors" in DATA block as a way to add more information about the columns themselves.
diff --git a/src/mwtab/__main__.py b/src/mwtab/__main__.py
@@ -10,7 +10,7 @@
 def main():
     doc = [line for line in cli.__doc__.split('\n')]
     doc = doc[:3] + [line.lstrip() for line in doc[5:]]
-    doc = doc.join('\n')
+    doc = '\n'.join(doc)
     args = docopt.docopt(cli.__doc__, version=__version__)
     cli.cli(args)
 
diff --git a/src/mwtab/cli.py b/src/mwtab/cli.py
@@ -49,6 +49,7 @@
     Documentation webpage: https://moseleybioinformaticslab.github.io/mwtab/
     GitHub webpage: https://github.com/MoseleyBioinformaticsLab/mwtab
 """
+## TODO add options to validate to save out the new JSON.
 
 from os import getcwd
 from os.path import join, isfile
diff --git a/src/mwtab/fileio.py b/src/mwtab/fileio.py
@@ -27,8 +27,6 @@
 from functools import partial
 
 from . import mwtab
-from . import validator
-from . import mwschema
 from . import mwrest
 
 from urllib.request import urlopen
@@ -195,123 +193,27 @@ def read_with_class(sources: str|list[str], read_class: type, class_kwds: dict,
 read_files = partial(read_with_class, read_class = mwtab.MWTabFile, class_kwds = {"duplicate_keys": True})
 read_mwrest = partial(read_with_class, read_class = mwrest.MWRESTFile, class_kwds = {})
 
-
-# TODO delete this functions after testing.
-# def read_files(sources, return_exceptions=False):
-#     """Construct a generator that yields file instances.
-
-#     :param sources: One or more strings representing path to file(s).
-#     :param bool return_exceptions: Whether to yield a tuple with file instance and exception or just the file instance.
-#     """
-#     try:
-#         filenames = _generate_filenames(sources, True)
-#         filehandles = _generate_handles(filenames, True)
-#     except Exception as e:
-#         yield _return_correct_yield(None, 
-#                                     exception=e, 
-#                                     return_exceptions=return_exceptions)
-#     for fh, source, exc in filehandles:
-#         if exc is not None:
-#             yield _return_correct_yield(source, 
-#                                         exception=exc, 
-#                                         return_exceptions=return_exceptions)
-#             continue
-#         try:
-#             f = mwtab.MWTabFile(source, duplicate_keys=True)
-#             f.read(fh)
-#             fh.close()
-
-#             if VERBOSE:
-#                 print("Processed file: {}".format(os.path.abspath(source)))
-            
-#             yield _return_correct_yield(f, 
-#                                         exception=None, 
-#                                         return_exceptions=return_exceptions)
-
-#         except Exception as e:
-#             fh.close()
-#             if VERBOSE:
-#                 print("Error processing file: ", os.path.abspath(source), "\nReason:", e)
-#             yield _return_correct_yield(source, 
-#                                         exception=e, 
-#                                         return_exceptions=return_exceptions)
-
-
-# def read_mwrest(*sources, return_exceptions=False):
-#     """Construct a generator that yields file instances.
-
-#     :param sources: One or more strings representing path to file(s).
-#     :param bool return_exceptions: Whether to yield a tuple with file instance and exception or just the file instance.
-#     """
-#     try:
-#         filenames = _generate_filenames(sources, True)
-#         filehandles = _generate_handles(filenames, True)
-#     except Exception as e:
-#         yield _return_correct_yield(None, 
-#                                     exception=e, 
-#                                     return_exceptions=return_exceptions)
-#     for fh, source, exc in filehandles:
-#         try:
-#             f = mwrest.MWRESTFile(source)
-#             f.read(fh)
-#             fh.close()
-
-#             if VERBOSE:
-#                 print("Processed url: {}".format(source))
-
-#             yield _return_correct_yield(f, 
-#                                         exception=None, 
-#                                         return_exceptions=return_exceptions)
-
-#         except Exception as e:
-#             fh.close()
-#             if VERBOSE:
-#                 print("Error processing url: ", source, "\nReason:", e)
-#             yield _return_correct_yield(None, 
-#                                         exception=e, 
-#                                         return_exceptions=return_exceptions)
-
-# Unused function, leaving here for now.
-# def read_lines(*sources, return_exceptions=False):
-#     """Construct a generator that yields file instances.
-
-#     :param sources: One or more strings representing path to file(s).
-#     :param bool return_exceptions: Whether to yield a tuple with file instance and exception or just the file instance.
-#     """
-#     try:
-#         filenames = _generate_filenames(sources, True)
-#         filehandles = _generate_handles(filenames, True)
-#     except Exception as e:
-#         yield _return_correct_yield(None, 
-#                                     exception=e, 
-#                                     return_exceptions=return_exceptions)
-#     for fh, source, exc in filehandles:
-#         try:
-#             string = fh.read()
-#             fh.close()
-#             if isinstance(string, str):
-#                 lines = string.replace("\r", "\n").split("\n")
-#             elif isinstance(string, bytes):
-#                 lines = string.decode("utf-8").replace("\r", "\n").split("\n")
-#             else:
-#                 raise TypeError("Expecting <class 'str'> or <class 'bytes'>, but {} was passed".format(type(string)))
-
-#             lines = [line for line in lines if line]
-            
-#             if VERBOSE:
-#                 print("Processed file: {}".format(os.path.abspath(source)))
-            
-#             yield _return_correct_yield((lines, source), 
-#                                         exception=None, 
-#                                         return_exceptions=return_exceptions)
+class ReadLines():
+    def __init__(self, source, *args, **kwargs):
+        self.source = source
+        self.lines = []
+    
+    def read(self, filehandle):
+        """Read all of filehandle (str or UTF-8 bytes), close it, and store its non-empty lines in self.lines.
+        """
+        string = filehandle.read()
+        filehandle.close()
+        if isinstance(string, str):
+            lines = string.replace("\r", "\n").split("\n")
+        elif isinstance(string, bytes):
+            lines = string.decode("utf-8").replace("\r", "\n").split("\n")
+        else:
+            raise TypeError("Expecting <class 'str'> or <class 'bytes'>, but {} was passed".format(type(string)))
         
-#         except Exception as e:
-#             fh.close()
-#             if VERBOSE:
-#                 print("Error processing file: ", source, "\nReason:", e)
-#             yield _return_correct_yield(source, 
-#                                         exception=e, 
-#                                         return_exceptions=return_exceptions)
+        self.lines = [line for line in lines if line]
+        
+
+read_lines = partial(read_with_class, read_class = ReadLines, class_kwds = {})
 
 
 class GenericFilePath(object):
diff --git a/src/mwtab/mwextract.py b/src/mwtab/mwextract.py
@@ -166,10 +166,6 @@ def write_metadata_csv(to_path, extracted_values, no_header=False):
     :return: None
     :rtype: :py:obj:`None`
     """
-    # if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])):
-    #     dirname = os.path.dirname(to_path)
-    #     if dirname:
-    #         os.makedirs(dirname)
     fileio._create_save_path(to_path)
 
     if not os.path.splitext(to_path)[1]:
@@ -219,10 +215,6 @@ def write_metabolites_csv(to_path, extracted_values, no_header=False):
             num_samples
         ])
 
-    # if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])):
-    #     dirname = os.path.dirname(to_path)
-    #     if dirname:
-    #         os.makedirs(dirname)
     fileio._create_save_path(to_path)
 
     if not os.path.splitext(to_path)[1]:
@@ -286,10 +278,6 @@ def write_json(to_path, extracted_dict):
     :return: None
     :rtype: :py:obj:`None`
     """
-    # if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])):
-    #     dirname = os.path.dirname(to_path)
-    #     if dirname:
-    #         os.makedirs(dirname)
     fileio._create_save_path(to_path)
 
     if not os.path.splitext(to_path)[1]:
diff --git a/src/mwtab/mwrest.py b/src/mwtab/mwrest.py
@@ -137,7 +137,7 @@ def generate_mwtab_urls(input_items, base_url=BASE_URL, output_format='txt', ret
                                                return_exceptions=return_exceptions)
 
 
-# Unused funciton. Leaving here for now.
+# Unused function. Leaving here for now.
 # def generate_urls(input_items, base_url=BASE_URL, return_exceptions=False, **kwds):
 #     """
 #     Method for creating a generator which yields validated Metabolomics Workbench REST urls.
diff --git a/src/mwtab/mwtab.py b/src/mwtab/mwtab.py
@@ -348,14 +348,16 @@ def get_metabolites_data_as_pandas(self):
         """
         return self.get_table_as_pandas('Data')
     
-    def validate(self, ms_schema = ms_required_schema, nmr_schema = nmr_required_schema, verbose = True):
+    def validate(self, ms_schema: dict = ms_required_schema, nmr_schema: dict = nmr_required_schema, verbose: bool = True) -> (str, list[dict]):
         """Validate the instance.
         
-        :param dict ms_schema: jsonschema to validate both the base parts of the file and the MS specific parts of the file.
-        :param dict nmr_schema: jsonschema to validate both the base parts of the file and the NMR specific parts of the file.
-        :param bool verbose: whether to be verbose or not.
-        :return: Validated file and errors if verbose is False.
-        :rtype: :py:class:`~mwtab.mwtab.MWTabFile`, _io.StringIO
+        Args:
+            ms_schema: jsonschema to validate both the base parts of the file and the MS specific parts of the file.
+            nmr_schema: jsonschema to validate both the base parts of the file and the NMR specific parts of the file.
+            verbose: whether to be verbose or not.
+        
+        Returns: 
+            Error messages as a single string and error messages in JSON form. If verbose is True, then the single string will be None.
         """
         return validate_file(
                     mwtabfile=self,
diff --git a/src/mwtab/validator.py b/src/mwtab/validator.py
diff --git a/tests/test_fileio.py b/tests/test_fileio.py