Add char arrays output (#43)

MelReyCG · web-flow · commit a88a0934b552 · 2024-11-05T08:27:11.000-08:00
* ✨ add char comparison when encountering int8 arrays

* adding detection of wider strings

* better management of strings separators &amp; whitespaces

* removed potencially disturbing character

* 🎨 code style

* 🎨 code style + msg fix
diff --git a/geos-ats/src/geos/ats/helpers/restart_check.py b/geos-ats/src/geos/ats/helpers/restart_check.py
@@ -7,6 +7,7 @@
 import argparse
 import logging
 import time
+import string
 from pathlib import Path
 try:
     from geos.ats.helpers.permute_array import permuteArray  # type: ignore[import]
@@ -375,35 +376,100 @@ def compareIntArrays( self, path, arr, base_arr ):
         ARR [in]: The hdf5 Dataset to compare.
         BASE_ARR [in]: The hdf5 Dataset to compare against.
         """
-        # If the shapes are different they can't be compared.
+        message = ""
         if arr.shape != base_arr.shape:
-            msg = "Datasets have different shapes and therefore can't be compared: %s, %s.\n" % ( arr.shape,
-                                                                                                  base_arr.shape )
-            self.errorMsg( path, msg, True )
-            return
+            message = "Datasets have different shapes and therefore can't be compared statistically: %s, %s.\n" % (
+                arr.shape, base_arr.shape )
+        else:
+            # Calculate the absolute difference.
+            difference = np.subtract( arr, base_arr )
+            np.abs( difference, out=difference )
 
-        # Create a copy of the arrays.
+            offenders = difference != 0.0
+            n_offenders = np.sum( offenders )
 
-        # Calculate the absolute difference.
-        difference = np.subtract( arr, base_arr )
-        np.abs( difference, out=difference )
+            if n_offenders != 0:
+                max_index = np.unravel_index( np.argmax( difference ), difference.shape )
+                max_difference = difference[ max_index ]
+                offenders_mean = np.mean( difference[ offenders ] )
+                offenders_std = np.std( difference[ offenders ] )
 
-        offenders = difference != 0.0
-        n_offenders = np.sum( offenders )
+                message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
+                    arr.dtype, base_arr.dtype, offenders.size, n_offenders )
+                message += "Statistics of the differences greater than 0:\n"
+                message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
+                                                                                   offenders_mean, offenders_std )
 
-        if n_offenders != 0:
-            max_index = np.unravel_index( np.argmax( difference ), difference.shape )
-            max_difference = difference[ max_index ]
-            offenders_mean = np.mean( difference[ offenders ] )
-            offenders_std = np.std( difference[ offenders ] )
+        # actually, int8 arrays are almost always char arrays, so we sould add a character comparison.
+        if arr.dtype == np.int8 and base_arr.dtype == np.int8:
+            message += self.compareCharArrays( arr, base_arr )
 
-            message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
-                arr.dtype, base_arr.dtype, offenders.size, n_offenders )
-            message += "Statistics of the differences greater than 0:\n"
-            message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
-                                                                               offenders_mean, offenders_std )
+        if message != "":
             self.errorMsg( path, message, True )
 
+    def compareCharArrays( self, comp_arr, base_arr ):
+        """
+        Compare the valid characters of two arrays and return a formatted string showing differences.
+
+        COMP_ARR [in]: The hdf5 Dataset to compare.
+        BASE_ARR [in]: The hdf5 Dataset to compare against.
+
+        Returns a formatted string highlighting the differing characters.
+        """
+        comp_ndarr = np.array( comp_arr ).flatten()
+        base_ndarr = np.array( base_arr ).flatten()
+
+        # Replace invalid characters by group-separator characters ('\x1D')
+        valid_chars = set( string.printable )
+        invalid_char = '\x1D'
+        comp_str = "".join(
+            [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in comp_ndarr ] )
+        base_str = "".join(
+            [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in base_ndarr ] )
+
+        # replace whitespaces sequences by only one space (preventing indentation / spacing changes detection)
+        whitespace_pattern = r"[ \t\n\r\v\f]+"
+        comp_str = re.sub( whitespace_pattern, " ", comp_str )
+        base_str = re.sub( whitespace_pattern, " ", base_str )
+        # replace invalid characters sequences by a double space (for clear display)
+        invalid_char_pattern = r"\x1D+"
+        comp_str_display = re.sub( invalid_char_pattern, "  ", comp_str )
+        base_str_display = re.sub( invalid_char_pattern, "  ", base_str )
+
+        message = ""
+
+        def limited_display( n, string ):
+            return string[ :n ] + f"... ({len(string)-n} omitted chars)" if len( string ) > n else string
+
+        if len( comp_str ) != len( base_str ):
+            max_display = 250
+            message = f"Character arrays have different sizes: {len( comp_str )}, {len( base_str )}.\n"
+            message += f"  {limited_display( max_display, comp_str_display )}\n"
+            message += f"  {limited_display( max_display, base_str_display )}\n"
+        else:
+            # We need to trim arrays to the length of the shortest one for the comparisons
+            min_length = min( len( comp_str_display ), len( base_str_display ) )
+            comp_str_trim = comp_str_display[ :min_length ]
+            base_str_trim = base_str_display[ :min_length ]
+
+            differing_indices = np.where( np.array( list( comp_str_trim ) ) != np.array( list( base_str_trim ) ) )[ 0 ]
+            if differing_indices.size != 0:
+                # check for reordering
+                arr_set = sorted( set( comp_str.split( invalid_char ) ) )
+                base_arr_set = sorted( set( base_str.split( invalid_char ) ) )
+                reordering_detected = arr_set == base_arr_set
+
+                max_display = 110 if reordering_detected else 250
+                message = "Differing valid characters"
+                message += " (substrings reordering detected):\n" if reordering_detected else ":\n"
+
+                message += f"  {limited_display( max_display, comp_str_display )}\n"
+                message += f"  {limited_display( max_display, base_str_display )}\n"
+                message += "  " + "".join(
+                    [ "^" if i in differing_indices else " " for i in range( min( max_display, min_length ) ) ] ) + "\n"
+
+        return message
+
     def compareStringArrays( self, path, arr, base_arr ):
         """
         Compare two string datasets. Exact equality is used as the acceptance criteria.