Skip to content
51 changes: 25 additions & 26 deletions engine/fof_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@

import click
import xarray as xr
import pandas as pd

from util.fof_utils import (
compare_var_and_attr_ds,
primary_check,
split_feedback_dataset,
)
from util.dataframe_ops import check_file_with_tolerances, parse_check
from util.utils import FileInfo


@click.command()
Expand Down Expand Up @@ -46,44 +49,40 @@
default=None,
help="If specified, location where to save the CSV file with the differences.",
)
@click.option(
"--tol",
default=1e-12,
)
def fof_compare(
file1, file2, print_lines, lines, output, location
file1, file2, print_lines, lines, output, location, tol
): # pylint: disable=too-many-positional-arguments

if not primary_check(file1, file2):
print("Different types of files")
return

ds1 = xr.open_dataset(file1)
ds2 = xr.open_dataset(file2)
n_righe = xr.open_dataset(file1).sizes["d_body"]
tolerance_file = "tolerance_file.csv"

ds_reports1_sorted, ds_obs1_sorted = split_feedback_dataset(ds1)
ds_reports2_sorted, ds_obs2_sorted = split_feedback_dataset(ds2)
def create_tolerance_csv(n_righe, tol, tolerance_file_name):
df = pd.DataFrame(
{"tolerance": [tol] * n_righe}
)
df.to_csv(tolerance_file_name)

total_elements_all, equal_elements_all = 0, 0
create_tolerance_csv(n_righe, tol, tolerance_file)

if print_lines:
nl = lines
else:
nl = 0
out, err, tol = check_file_with_tolerances(
tolerance_file,
FileInfo(file1),
FileInfo(file2),
factor=4,
rules="",
)
# print(out)
# print(err)

for ds1, ds2 in [
(ds_reports1_sorted, ds_reports2_sorted),
(ds_obs1_sorted, ds_obs2_sorted),
]:
t, e = compare_var_and_attr_ds(ds1, ds2, nl, output, location)
total_elements_all += t
equal_elements_all += e

if total_elements_all > 0:
percent_equal_all = (equal_elements_all / total_elements_all) * 100
percent_diff_all = 100 - percent_equal_all
print(f"Total percentage of equality: {percent_equal_all:.2f}%")
print(f"Total percentage of difference: {percent_diff_all:.2f}%")
if equal_elements_all == total_elements_all:
print("Files are consistent!")
else:
print("Files are NOT consistent!")


if __name__ == "__main__":
Expand Down
14 changes: 9 additions & 5 deletions tests/util/test_fof_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,10 @@ def test_compare_array_equal(arr1, arr2, arr1_nan, arr2_nan):
- they have the same content
- they have nan values in the same positions
"""
total, equal, diff = compare_arrays(arr1, arr2, "var_name")
total_nan, equal_nan, diff_nan = compare_arrays(arr1_nan, arr2_nan, "var_name")
total, equal, diff = compare_arrays(arr1, arr2, "var_name", tol=1e-12)
total_nan, equal_nan, diff_nan = compare_arrays(
arr1_nan, arr2_nan, "var_name", tol=1e-12
)

assert (total, equal, total_nan, equal_nan, diff.size, diff_nan.size) == (
5,
Expand All @@ -181,7 +183,7 @@ def test_compare_array_diff(arr1, arr3):
"""
Test that if I compare two different arrays I get the number of total and equal
values and the indices of the positions where the values differ."""
total, equal, diff = compare_arrays(arr1, arr3, "var_name")
total, equal, diff = compare_arrays(arr1, arr3, "var_name", tol=1e-12)

assert (total, equal, diff.tolist()) == (5, 3, [0, 3])

Expand Down Expand Up @@ -320,9 +322,11 @@ def test_compare_var_and_attr_ds(ds1, ds2, tmp_path):
file_path = tmp_path / "differences.csv"

total1, equal1 = compare_var_and_attr_ds(
ds1, ds2, nl=0, output=True, location=file_path
ds1, ds2, nl=0, output=True, location=file_path, tol=1e-12
)
total2, equal2 = compare_var_and_attr_ds(
ds1, ds2, nl=4, output=True, location=None, tol=1e-12
)
total2, equal2 = compare_var_and_attr_ds(ds1, ds2, nl=4, output=True, location=None)

assert (total1, equal1) == (104, 103)
assert (total2, equal2) == (104, 103)
Expand Down
2 changes: 1 addition & 1 deletion util/dataframe_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules):
cur_df_xr = cur_df[cols_other].to_xarray()

t, e = compare_var_and_attr_ds(
ref_df_xr, cur_df_xr, nl=5, output=False, location=None
ref_df_xr, cur_df_xr, nl=5, output=False, location=None, tol=0
)
if t != e:
return errors == 1
Expand Down
74 changes: 54 additions & 20 deletions util/fof_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
"""

import os
import re
import shutil

import numpy as np
import pandas as pd
import xarray as xr

from util.constants import CHECK_THRESHOLD


def get_report_variables(ds):
"""
Expand Down Expand Up @@ -69,13 +72,29 @@ def split_feedback_dataset(ds):
return ds_report_sorted, ds_obs_sorted


def compare_arrays(arr1, arr2, var_name):
def compare_arrays(arr1, arr2, var_name, tol):
"""
Comparison of two arrays containing the values of the same variable.
If not the same, it tells you in percentage terms how different they are.
"""
total = arr1.size

if var_name == "veri_data":
diff_rel = np.abs((arr1 - arr2) / (1.0 + np.abs(arr1)))
diff_rel_df = pd.DataFrame(diff_rel)

diff = diff_rel_df - tol

selector = (diff > CHECK_THRESHOLD).any(axis=1)

out = (~selector).all()
diff_err = diff.index[selector].to_numpy()

if out:
return total, total, np.array([])
equal = total - len(diff_err)
return total, equal, diff_err

if np.array_equal(arr1, arr2):
equal = total
diff = np.array([])
Expand All @@ -96,8 +115,7 @@ def compare_arrays(arr1, arr2, var_name):
f"Differences in '{var_name}': {percent:.2f}% equal. "
f"{total} total entries for this variable"
)
diff_idx = np.where(~mask_equal.ravel())[0]
diff = diff_idx
diff = np.where(~mask_equal.ravel())[0]

return total, equal, diff

Expand Down Expand Up @@ -191,28 +209,39 @@ def write_lines(ds1, ds2, diff, path_name):
f.write(f"diff : {row_diff}" + "\n")


def write_different_size(output, var, sizes, path_name=None):
    """
    Report that a variable cannot be compared because the two datasets hold a
    different number of entries for it.

    output    : if enabled (and path_name is given) the message is appended to
                the file path_name; otherwise it is printed to the console.
    var       : name of the variable or attribute that could not be compared.
    sizes     : the two sizes; sizes[0] is reported for the first dataset and
                sizes[1] for the second.
    path_name : file the message is appended to when output is enabled.
    """
    detail = (
        f"{var} -> datasets have different lengths "
        f"({sizes[0]} vs. {sizes[1]} ), comparison not possible"
    )

    # Without a target file there is nothing to append to, so the message goes
    # to the console instead of failing inside open().
    if output and path_name:
        with open(path_name, "a", encoding="utf-8") as f:
            f.write(f"variable : {detail}" + "\n")
    else:
        print(f"\033[1mvar\033[0m : {detail}")


def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
def compare_var_and_attr_ds(
ds1, ds2, nl, output, location, tol
): # pylint: disable=too-many-positional-arguments
"""
Variable by variable and attribute by attribute,
comparison of the two files.
"""

total_all, equal_all = 0, 0
list_to_skip = ["source", "i_body", "l_body"]
path_name = ""

if output:
if location:
Expand All @@ -231,7 +260,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
arr2 = fill_nans_for_float32(ds2[var].values)

if arr1.size == arr2.size:
t, e, diff = compare_arrays(arr1, arr2, var)
t, e, diff = compare_arrays(arr1, arr2, var, tol)

if output:
write_lines(ds1, ds2, diff, path_name)
Expand All @@ -242,7 +271,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):

else:
t, e = max(arr1.size, arr2.size), 0
write_different_size(output, nl, path_name, var, [arr1.size, arr2.size])
write_different_size(output, var, [arr1.size, arr2.size], path_name)

total_all += t
equal_all += e
Expand All @@ -251,7 +280,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
arr1 = np.array(ds1.attrs[var], dtype=object)
arr2 = np.array(ds2.attrs[var], dtype=object)
if arr1.size == arr2.size:
t, e, diff = compare_arrays(arr1, arr2, var)
t, e, diff = compare_arrays(arr1, arr2, var, tol)

if output:
write_lines(ds1, ds2, diff, path_name)
Expand All @@ -262,7 +291,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):

else:
t, e = max(arr1.size, arr2.size), 0
write_different_size(output, nl, path_name, var, [arr1.size, arr2.size])
write_different_size(output, var, [arr1.size, arr2.size], path_name)

total_all += t
equal_all += e
Expand All @@ -272,12 +301,17 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):

def primary_check(file1, file2):
    """
    Check that two files are of the same observation type, ignoring timestamp
    differences.

    Only the file names are compared: the directory and the extension are
    dropped, and a 14-digit group introduced by an underscore is removed when
    it ends the name or is followed only by an ensemble suffix (_ens<digits>).
    Everything else (prefix, observation type and the ensemble suffix, if
    present) has to match.

    Returns True if the reduced names are identical, False otherwise.
    """

    def core_name(path):
        # File name without directory and without extension
        stem = os.path.splitext(os.path.basename(path))[0]
        # The lookahead keeps an optional ensemble suffix in the name while
        # the timestamp in front of it is removed.
        return re.sub(r"_(\d{14})(?=(_ens\d+)?$)", "", stem)

    return core_name(file1) == core_name(file2)
Loading