diff --git a/engine/fof_compare.py b/engine/fof_compare.py index ae47e99..73b051a 100644 --- a/engine/fof_compare.py +++ b/engine/fof_compare.py @@ -8,12 +8,15 @@ import click import xarray as xr +import pandas as pd from util.fof_utils import ( compare_var_and_attr_ds, primary_check, split_feedback_dataset, ) +from util.dataframe_ops import check_file_with_tolerances, parse_check +from util.utils import FileInfo @click.command() @@ -46,44 +49,40 @@ default=None, help="If specified, location where to save the CSV file with the differences.", ) +@click.option( + "--tol", + default=1e-12, +) def fof_compare( - file1, file2, print_lines, lines, output, location + file1, file2, print_lines, lines, output, location, tol ): # pylint: disable=too-many-positional-arguments if not primary_check(file1, file2): print("Different types of files") return - ds1 = xr.open_dataset(file1) - ds2 = xr.open_dataset(file2) + n_righe = xr.open_dataset(file1).sizes["d_body"] + tolerance_file = "tolerance_file.csv" - ds_reports1_sorted, ds_obs1_sorted = split_feedback_dataset(ds1) - ds_reports2_sorted, ds_obs2_sorted = split_feedback_dataset(ds2) + def create_tolerance_csv(n_righe, tol, tolerance_file_name): + df = pd.DataFrame( + {"tolerance": [tol] * n_righe} + ) + df.to_csv(tolerance_file_name) - total_elements_all, equal_elements_all = 0, 0 + create_tolerance_csv(n_righe, tol, tolerance_file) - if print_lines: - nl = lines - else: - nl = 0 + out, err, tol = check_file_with_tolerances( + tolerance_file, + FileInfo(file1), + FileInfo(file2), + factor=4, + rules="", + ) + # print(out) + # print(err) - for ds1, ds2 in [ - (ds_reports1_sorted, ds_reports2_sorted), - (ds_obs1_sorted, ds_obs2_sorted), - ]: - t, e = compare_var_and_attr_ds(ds1, ds2, nl, output, location) - total_elements_all += t - equal_elements_all += e - if total_elements_all > 0: - percent_equal_all = (equal_elements_all / total_elements_all) * 100 - percent_diff_all = 100 - percent_equal_all - print(f"Total percentage of 
equality: {percent_equal_all:.2f}%") - print(f"Total percentage of difference: {percent_diff_all:.2f}%") - if equal_elements_all == total_elements_all: - print("Files are consistent!") - else: - print("Files are NOT consistent!") if __name__ == "__main__": diff --git a/tests/util/test_fof_utils.py b/tests/util/test_fof_utils.py index bc57bec..9f7c91a 100644 --- a/tests/util/test_fof_utils.py +++ b/tests/util/test_fof_utils.py @@ -164,8 +164,10 @@ def test_compare_array_equal(arr1, arr2, arr1_nan, arr2_nan): - they have the same content - they have nan values in the same positions """ - total, equal, diff = compare_arrays(arr1, arr2, "var_name") - total_nan, equal_nan, diff_nan = compare_arrays(arr1_nan, arr2_nan, "var_name") + total, equal, diff = compare_arrays(arr1, arr2, "var_name", tol=1e-12) + total_nan, equal_nan, diff_nan = compare_arrays( + arr1_nan, arr2_nan, "var_name", tol=1e-12 + ) assert (total, equal, total_nan, equal_nan, diff.size, diff_nan.size) == ( 5, @@ -181,7 +183,7 @@ def test_compare_array_diff(arr1, arr3): """ Test that if I compare two different arrays I get the number of total and equal vales and the number of the position where values are different.""" - total, equal, diff = compare_arrays(arr1, arr3, "var_name") + total, equal, diff = compare_arrays(arr1, arr3, "var_name", tol=1e-12) assert (total, equal, diff.tolist()) == (5, 3, [0, 3]) @@ -320,9 +322,11 @@ def test_compare_var_and_attr_ds(ds1, ds2, tmp_path): file_path = tmp_path / "differences.csv" total1, equal1 = compare_var_and_attr_ds( - ds1, ds2, nl=0, output=True, location=file_path + ds1, ds2, nl=0, output=True, location=file_path, tol=1e-12 + ) + total2, equal2 = compare_var_and_attr_ds( + ds1, ds2, nl=4, output=True, location=None, tol=1e-12 ) - total2, equal2 = compare_var_and_attr_ds(ds1, ds2, nl=4, output=True, location=None) assert (total1, equal1) == (104, 103) assert (total2, equal2) == (104, 103) diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py index 
85e46ed..b5a0fb1 100644 --- a/util/dataframe_ops.py +++ b/util/dataframe_ops.py @@ -471,7 +471,7 @@ def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules): cur_df_xr = cur_df[cols_other].to_xarray() t, e = compare_var_and_attr_ds( - ref_df_xr, cur_df_xr, nl=5, output=False, location=None + ref_df_xr, cur_df_xr, nl=5, output=False, location=None, tol=0 ) if t != e: return errors == 1 diff --git a/util/fof_utils.py b/util/fof_utils.py index 9400b00..1d4ab28 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -3,12 +3,15 @@ """ import os +import re import shutil import numpy as np import pandas as pd import xarray as xr +from util.constants import CHECK_THRESHOLD + def get_report_variables(ds): """ @@ -69,13 +72,29 @@ def split_feedback_dataset(ds): return ds_report_sorted, ds_obs_sorted -def compare_arrays(arr1, arr2, var_name): +def compare_arrays(arr1, arr2, var_name, tol): """ Comparison of two arrays containing the values of the same variable. If not the same, it tells you in percentage terms how different they are. """ total = arr1.size + if var_name == "veri_data": + diff_rel = np.abs((arr1 - arr2) / (1.0 + np.abs(arr1))) + diff_rel_df = pd.DataFrame(diff_rel) + + diff = diff_rel_df - tol + + selector = (diff > CHECK_THRESHOLD).any(axis=1) + + out = (~selector).all() + diff_err = diff.index[selector].to_numpy() + + if out: + return total, total, np.array([]) + equal = total - len(diff_err) + return total, equal, diff_err + if np.array_equal(arr1, arr2): equal = total diff = np.array([]) @@ -96,8 +115,7 @@ def compare_arrays(arr1, arr2, var_name): f"Differences in '{var_name}': {percent:.2f}% equal. 
" f"{total} total entries for this variable" ) - diff_idx = np.where(~mask_equal.ravel())[0] - diff = diff_idx + diff = np.where(~mask_equal.ravel())[0] return total, equal, diff @@ -191,21 +209,31 @@ def write_lines(ds1, ds2, diff, path_name): f.write(f"diff : {row_diff}" + "\n") -def write_different_size(output, nl, path_name, var, sizes): +def write_different_size(output, var, sizes, path_name=None): + """ + This function appends a message to a file (and optionally prints it) warning + that a given variable cannot be compared because two datasets have different + lengths. The message is written only if output is enabled, and printed to the + console if nl is not zero. + """ + # print(sizes) + print(var) if output: with open(path_name, "a", encoding="utf-8") as f: f.write( f"variable : {var} -> datasets have different lengths " f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" + "\n" ) - if nl != 0: - print( - f"\033[1mvar\033[0m : {var} -> datasets have different lengths " - f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" - ) + else: + print( + f"\033[1mvar\033[0m : {var} -> datasets have different lengths " + f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" + ) -def compare_var_and_attr_ds(ds1, ds2, nl, output, location): +def compare_var_and_attr_ds( + ds1, ds2, nl, output, location, tol +): # pylint: disable=too-many-positional-arguments """ Variable by variable and attribute by attribute, comparison of the two files. 
def primary_check(file1, file2):
    """
    Tell whether two file names match once the timestamp is left out.

    Each path is reduced to its base name without the final extension. A
    "_" followed by exactly 14 digits is then removed when it sits at the end
    of the name or directly before a trailing "_ens<digits>" suffix. The
    function returns True when the two reduced names are identical, so the
    prefix, the observation type and the ensemble suffix (if any) must agree.
    """
    timestamp_pattern = r"_(\d{14})(?=(_ens\d+)?$)"

    reduced = []
    for path in (file1, file2):
        stem, _extension = os.path.splitext(os.path.basename(path))
        reduced.append(re.sub(timestamp_pattern, "", stem))

    first, second = reduced
    return first == second