Bug fixes

ptth222 · ptth222 · commit 28c6f7c089d3 · 2025-12-11T12:56:27.000-05:00
Some small bug fixes to make some previous major changes work.
diff --git a/src/mwtab/cli.py b/src/mwtab/cli.py
@@ -231,7 +231,8 @@ def cli(cmdargs):
         converter = Converter(from_path=cmdargs["<from-path>"],
                               to_path=cmdargs["<to-path>"],
                               from_format=cmdargs["--from-format"],
-                              to_format=cmdargs["--to-format"])
+                              to_format=cmdargs["--to-format"],
+                              force=force)
         converter.convert()
 
     # mwtab validate ...
diff --git a/src/mwtab/converter.py b/src/mwtab/converter.py
@@ -118,7 +118,7 @@
 class Translator(object):
     """Translator abstract class."""
 
-    def __init__(self, from_path, to_path, from_format=None, to_format=None):
+    def __init__(self, from_path, to_path, from_format=None, to_format=None, force=False):
         """Translator initializer.
         :param str from_path: Path to input file(s).
         :param str to_path: Path to output file(s).
@@ -131,6 +131,7 @@ def __init__(self, from_path, to_path, from_format=None, to_format=None):
         self.to_format = to_format
         self.from_path_compression = fileio.GenericFilePath.is_compressed(from_path)
         self.to_path_compression = fileio.GenericFilePath.is_compressed(to_path)
+        self.force = force
 
     def __iter__(self):
         """Abstract iterator must be implemented in a subclass."""
@@ -264,6 +265,9 @@ def _to_dir(self, file_generator):
                 print("Something went wrong when trying to convert " + f.source)
                 traceback.print_exception(e, file=sys.stdout)
                 print()
+                
+                if os.path.exists(outpath):
+                    os.remove(outpath)
 
     def _to_zipfile(self, file_generator):
         """Convert files to zip archive.
@@ -344,14 +348,18 @@ def _to_textfile(self, file_generator):
             if file_generator.to_path.endswith(file_generator.file_extension[file_generator.to_format]) \
             else file_generator.to_path + file_generator.file_extension[file_generator.to_format]
 
-        with open(to_path, mode="w", encoding="utf-8") as outfile:
-            for f in file_generator:
-                try:
+        
+        for f in file_generator:
+            try:
+                with open(to_path, mode="w", encoding="utf-8") as outfile:
                     outfile.write(f.writestr(file_generator.to_format))
-                except Exception as e:
-                    print("Something went wrong when trying to convert " + f.source)
-                    traceback.print_exception(e, file=sys.stdout)
-                    print()
+            except Exception as e:
+                print("Something went wrong when trying to convert " + f.source)
+                traceback.print_exception(e, file=sys.stdout)
+                print()
+                
+                if os.path.exists(to_path):
+                    os.remove(to_path)
 
     def _output_path(self, input_path, to_format, archive=False):
         """Construct an output path string from an input path string.
diff --git a/src/mwtab/duplicates_dict.py b/src/mwtab/duplicates_dict.py
@@ -64,6 +64,9 @@ def __contains__(self, key):
     def __eq__(self, compare):
         return self.data == compare
     
+    def __ne__(self, compare):
+        return self.data != compare
+    
     def __repr__(self):
         return self.data.__repr__().replace('OrderedDict', 'DuplicatesDict')
     
diff --git a/src/mwtab/mwtab.py b/src/mwtab/mwtab.py
@@ -581,8 +581,8 @@ def _build_block(self, name, lexer):
                             self._raw_binned_headers = token_value
                             self._raw_samples = self._raw_binned_headers
                         if token.key == "Bin range(ppm)":
-                            self._binned_header = token_value[1:]
-                            self._samples = self._binned_header
+                            # self._binned_header = token_value[1:]
+                            # self._samples = self._binned_header
                             is_header = True
                     # Have seen Factors section in incorrect sections such as METABOLITES, 
                     # and seen multiple Factors sections in a single METABOLITE_DATA section.
@@ -602,24 +602,24 @@ def _build_block(self, name, lexer):
                         if loop_count < 1:
                             self._raw_samples = token_value
                         # The last check for len(token_value) == 1 is for ones like AN000788.
-                        if self._samples is None and \
-                           (any(sample in ssf_samples for sample in token_value[1:]) or \
-                           (len(token_value) == 1 and token_value[0] in ['Samples', 'metabolite name', 'metabolite_name'])):
-                            self._samples = token_value[1:]
+                        if (any(sample in ssf_samples for sample in token_value[1:]) or \
+                           (len(token_value) == 1 and token_value[0] in ['Samples', 'metabolite name', 'metabolite_name']) or \
+                           (len(ssf_samples) == 0 and token_value[0] in ['Samples', 'metabolite name', 'metabolite_name'])):
+                            # self._samples = token_value[1:]
                             is_header = True
                     
                     elif "METABOLITES" in section_name and loop_count < 2:
                         if loop_count < 1:
                             self._raw_metabolite_header = token_value
                         if token.key.lower() == "metabolite_name":
-                            self._metabolite_header = token_value[1:]
+                            # self._metabolite_header = token_value[1:]
                             is_header = True
                     
                     elif "EXTENDED" in section_name and loop_count < 2:
                         if loop_count < 1:
                             self._raw_extended_metabolite_header = token_value
                         if token.key.lower() == "metabolite_name":
-                            self._extended_metabolite_header = token_value[1:]
+                            # self._extended_metabolite_header = token_value[1:]
                             is_header = True
                     
                     
@@ -666,12 +666,30 @@ def _build_block(self, name, lexer):
                     token = next(lexer)
                     loop_count += 1
                 
+                # This makes it so all dictionaries have the same number of values.
+                # Let's say row 3 looks like {'Metabolite': 'asdf', 'col1': 'qwer', '': 2345}
+                # The rows above don't have the '' entry; this code makes it so they do.
+                if self._duplicate_keys:
+                    data = [duplicates_dict.data for duplicates_dict in data]
+                data_df = pandas.DataFrame.from_records(data).fillna('').astype(str)
+                data = data_df.to_dict(orient='records')
+                if self._duplicate_keys:
+                    data = [DuplicatesDict(data_dict) for data_dict in data]
+                min_header = [column if not column.endswith('}}}') else re.match(DUPLICATE_KEY_REGEX, column).group(1) 
+                              for column in data_df.columns]
+                min_header = min_header[1:] if min_header[1:] else None
+                
                 if token.key.startswith("METABOLITES"):
                     section["Metabolites"] = data
+                    self._metabolite_header = min_header
                 elif token.key.startswith("EXTENDED_"):
                     section["Extended"] = data
+                    self._extended_metabolite_header = min_header
                 else:
                     section["Data"] = data
+                    self._samples = min_header
+                    if "BINNED_DATA" in section_name:
+                        self._binned_header = min_header
 
             elif token.key.endswith("_RESULTS_FILE"):
                 key, results_file_dict = token
@@ -716,8 +734,11 @@ def print_file(self, f=sys.stdout, file_format="mwtab"):
                         print("#NMR", file=f)
                     else:
                         print("#{}".format(key), file=f)
-
-                    self.print_block(key, f=f, file_format=file_format)
+                    
+                    if isinstance(self[key], dict):
+                        self.print_block(key, f=f, file_format=file_format)
+                    else:
+                        raise TypeError(f'Key/section "{key}" is not a dictionary. It cannot be translated to the mwTab format.')
             print("#END", file=f)
 
         elif file_format == "json":
diff --git a/src/mwtab/validator.py b/src/mwtab/validator.py