|
7 | 7 | import argparse |
8 | 8 | import logging |
9 | 9 | import time |
| 10 | +import string |
10 | 11 | from pathlib import Path |
11 | 12 | try: |
12 | 13 | from geos.ats.helpers.permute_array import permuteArray # type: ignore[import] |
@@ -375,35 +376,100 @@ def compareIntArrays( self, path, arr, base_arr ): |
375 | 376 | ARR [in]: The hdf5 Dataset to compare. |
376 | 377 | BASE_ARR [in]: The hdf5 Dataset to compare against. |
377 | 378 | """ |
378 | | - # If the shapes are different they can't be compared. |
| 379 | + message = "" |
379 | 380 | if arr.shape != base_arr.shape: |
380 | | - msg = "Datasets have different shapes and therefore can't be compared: %s, %s.\n" % ( arr.shape, |
381 | | - base_arr.shape ) |
382 | | - self.errorMsg( path, msg, True ) |
383 | | - return |
| 381 | + message = "Datasets have different shapes and therefore can't be compared statistically: %s, %s.\n" % ( |
| 382 | + arr.shape, base_arr.shape ) |
| 383 | + else: |
| 384 | + # Calculate the absolute difference. |
| 385 | + difference = np.subtract( arr, base_arr ) |
| 386 | + np.abs( difference, out=difference ) |
384 | 387 |
|
385 | | - # Create a copy of the arrays. |
| 388 | + offenders = difference != 0.0 |
| 389 | + n_offenders = np.sum( offenders ) |
386 | 390 |
|
387 | | - # Calculate the absolute difference. |
388 | | - difference = np.subtract( arr, base_arr ) |
389 | | - np.abs( difference, out=difference ) |
| 391 | + if n_offenders != 0: |
| 392 | + max_index = np.unravel_index( np.argmax( difference ), difference.shape ) |
| 393 | + max_difference = difference[ max_index ] |
| 394 | + offenders_mean = np.mean( difference[ offenders ] ) |
| 395 | + offenders_std = np.std( difference[ offenders ] ) |
390 | 396 |
|
391 | | - offenders = difference != 0.0 |
392 | | - n_offenders = np.sum( offenders ) |
| 397 | + message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % ( |
| 398 | + arr.dtype, base_arr.dtype, offenders.size, n_offenders ) |
| 399 | + message += "Statistics of the differences greater than 0:\n" |
| 400 | + message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference, |
| 401 | + offenders_mean, offenders_std ) |
393 | 402 |
|
394 | | - if n_offenders != 0: |
395 | | - max_index = np.unravel_index( np.argmax( difference ), difference.shape ) |
396 | | - max_difference = difference[ max_index ] |
397 | | - offenders_mean = np.mean( difference[ offenders ] ) |
398 | | - offenders_std = np.std( difference[ offenders ] ) |
| 403 | + # actually, int8 arrays are almost always char arrays, so we should add a character comparison. |
| 404 | + if arr.dtype == np.int8 and base_arr.dtype == np.int8: |
| 405 | + message += self.compareCharArrays( arr, base_arr ) |
399 | 406 |
|
400 | | - message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % ( |
401 | | - arr.dtype, base_arr.dtype, offenders.size, n_offenders ) |
402 | | - message += "Statistics of the differences greater than 0:\n" |
403 | | - message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference, |
404 | | - offenders_mean, offenders_std ) |
| 407 | + if message != "": |
405 | 408 | self.errorMsg( path, message, True ) |
406 | 409 |
|
| 410 | + def compareCharArrays( self, comp_arr, base_arr ): |
| 411 | + """ |
| 412 | + Compare the valid characters of two arrays and return a formatted string showing differences. |
| 413 | +
|
| 414 | + COMP_ARR [in]: The hdf5 Dataset to compare. |
| 415 | + BASE_ARR [in]: The hdf5 Dataset to compare against. |
| 416 | +
|
| 417 | + Returns a formatted string highlighting the differing characters. |
| 418 | + """ |
| 419 | + comp_ndarr = np.array( comp_arr ).flatten() |
| 420 | + base_ndarr = np.array( base_arr ).flatten() |
| 421 | + |
| 422 | + # Replace invalid characters by group-separator characters ('\x1D') |
| 423 | + valid_chars = set( string.printable ) |
| 424 | + invalid_char = '\x1D' |
| 425 | + comp_str = "".join( |
| 426 | + [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in comp_ndarr ] ) |
| 427 | + base_str = "".join( |
| 428 | + [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in base_ndarr ] ) |
| 429 | + |
| 430 | + # replace each whitespace sequence by a single space (so that indentation / spacing changes are not detected as differences) |
| 431 | + whitespace_pattern = r"[ \t\n\r\v\f]+" |
| 432 | + comp_str = re.sub( whitespace_pattern, " ", comp_str ) |
| 433 | + base_str = re.sub( whitespace_pattern, " ", base_str ) |
| 434 | + # replace each sequence of invalid characters by a double space (for clear display) |
| 435 | + invalid_char_pattern = r"\x1D+" |
| 436 | + comp_str_display = re.sub( invalid_char_pattern, " ", comp_str ) |
| 437 | + base_str_display = re.sub( invalid_char_pattern, " ", base_str ) |
| 438 | + |
| 439 | + message = "" |
| 440 | + |
| 441 | + def limited_display( n, string ): |
| 442 | + return string[ :n ] + f"... ({len(string)-n} omitted chars)" if len( string ) > n else string |
| 443 | + |
| 444 | + if len( comp_str ) != len( base_str ): |
| 445 | + max_display = 250 |
| 446 | + message = f"Character arrays have different sizes: {len( comp_str )}, {len( base_str )}.\n" |
| 447 | + message += f" {limited_display( max_display, comp_str_display )}\n" |
| 448 | + message += f" {limited_display( max_display, base_str_display )}\n" |
| 449 | + else: |
| 450 | + # We need to trim both strings to the length of the shorter one for the comparison |
| 451 | + min_length = min( len( comp_str_display ), len( base_str_display ) ) |
| 452 | + comp_str_trim = comp_str_display[ :min_length ] |
| 453 | + base_str_trim = base_str_display[ :min_length ] |
| 454 | + |
| 455 | + differing_indices = np.where( np.array( list( comp_str_trim ) ) != np.array( list( base_str_trim ) ) )[ 0 ] |
| 456 | + if differing_indices.size != 0: |
| 457 | + # check for reordering |
| 458 | + arr_set = sorted( set( comp_str.split( invalid_char ) ) ) |
| 459 | + base_arr_set = sorted( set( base_str.split( invalid_char ) ) ) |
| 460 | + reordering_detected = arr_set == base_arr_set |
| 461 | + |
| 462 | + max_display = 110 if reordering_detected else 250 |
| 463 | + message = "Differing valid characters" |
| 464 | + message += " (substrings reordering detected):\n" if reordering_detected else ":\n" |
| 465 | + |
| 466 | + message += f" {limited_display( max_display, comp_str_display )}\n" |
| 467 | + message += f" {limited_display( max_display, base_str_display )}\n" |
| 468 | + message += " " + "".join( |
| 469 | + [ "^" if i in differing_indices else " " for i in range( min( max_display, min_length ) ) ] ) + "\n" |
| 470 | + |
| 471 | + return message |
| 472 | + |
407 | 473 | def compareStringArrays( self, path, arr, base_arr ): |
408 | 474 | """ |
409 | 475 | Compare two string datasets. Exact equality is used as the acceptance criteria. |
|
0 commit comments