MeteoSwiss · cghielmini · Dec 1, 2025 · Dec 1, 2025 · Dec 3, 2025 · Dec 12, 2025
diff --git a/engine/fof_compare.py b/engine/fof_compare.py
@@ -8,12 +8,15 @@
 
 import click
 import xarray as xr
+import pandas as pd
 
 from util.fof_utils import (
     compare_var_and_attr_ds,
     primary_check,
     split_feedback_dataset,
 )
+from util.dataframe_ops import check_file_with_tolerances, parse_check
+from util.utils import FileInfo
 
 
 @click.command()
@@ -46,44 +49,40 @@
     default=None,
     help="If specified, location where to save the CSV file with the differences.",
 )
+@click.option(
+    "--tol",
+    default=1e-12,
+)
 def fof_compare(
-    file1, file2, print_lines, lines, output, location
+    file1, file2, print_lines, lines, output, location, tol
 ):  # pylint: disable=too-many-positional-arguments
 
     if not primary_check(file1, file2):
         print("Different types of files")
         return
 
-    ds1 = xr.open_dataset(file1)
-    ds2 = xr.open_dataset(file2)
+    n_righe = xr.open_dataset(file1).sizes["d_body"]
+    tolerance_file = "tolerance_file.csv"
 
-    ds_reports1_sorted, ds_obs1_sorted = split_feedback_dataset(ds1)
-    ds_reports2_sorted, ds_obs2_sorted = split_feedback_dataset(ds2)
+    def create_tolerance_csv(n_righe, tol, tolerance_file_name):
+        df = pd.DataFrame(
+            {"tolerance": [tol] * n_righe}
+        )
+        df.to_csv(tolerance_file_name)
 
-    total_elements_all, equal_elements_all = 0, 0
+    create_tolerance_csv(n_righe, tol, tolerance_file)
 
-    if print_lines:
-        nl = lines
-    else:
-        nl = 0
+    out, err, tol = check_file_with_tolerances(
+            tolerance_file,
+            FileInfo(file1),
+            FileInfo(file2),
+            factor=4,
+            rules="",
+        )
+    # print(out)
+    # print(err)
 
-    for ds1, ds2 in [
-        (ds_reports1_sorted, ds_reports2_sorted),
-        (ds_obs1_sorted, ds_obs2_sorted),
-    ]:
-        t, e = compare_var_and_attr_ds(ds1, ds2, nl, output, location)
-        total_elements_all += t
-        equal_elements_all += e
 
-    if total_elements_all > 0:
-        percent_equal_all = (equal_elements_all / total_elements_all) * 100
-        percent_diff_all = 100 - percent_equal_all
-        print(f"Total percentage of equality: {percent_equal_all:.2f}%")
-        print(f"Total percentage of difference: {percent_diff_all:.2f}%")
-        if equal_elements_all == total_elements_all:
-            print("Files are consistent!")
-        else:
-            print("Files are NOT consistent!")
 
 
 if __name__ == "__main__":

diff --git a/tests/util/test_fof_utils.py b/tests/util/test_fof_utils.py
@@ -164,8 +164,10 @@ def test_compare_array_equal(arr1, arr2, arr1_nan, arr2_nan):
     - they have the same content
     - they have nan values in the same positions
     """
-    total, equal, diff = compare_arrays(arr1, arr2, "var_name")
-    total_nan, equal_nan, diff_nan = compare_arrays(arr1_nan, arr2_nan, "var_name")
+    total, equal, diff = compare_arrays(arr1, arr2, "var_name", tol=1e-12)
+    total_nan, equal_nan, diff_nan = compare_arrays(
+        arr1_nan, arr2_nan, "var_name", tol=1e-12
+    )
 
     assert (total, equal, total_nan, equal_nan, diff.size, diff_nan.size) == (
         5,
@@ -181,7 +183,7 @@ def test_compare_array_diff(arr1, arr3):
     """
     Test that if I compare two different arrays I get the number of total and equal
     vales and the number of the position where values are different."""
-    total, equal, diff = compare_arrays(arr1, arr3, "var_name")
+    total, equal, diff = compare_arrays(arr1, arr3, "var_name", tol=1e-12)
 
     assert (total, equal, diff.tolist()) == (5, 3, [0, 3])
 
@@ -320,9 +322,11 @@ def test_compare_var_and_attr_ds(ds1, ds2, tmp_path):
     file_path = tmp_path / "differences.csv"
 
     total1, equal1 = compare_var_and_attr_ds(
-        ds1, ds2, nl=0, output=True, location=file_path
+        ds1, ds2, nl=0, output=True, location=file_path, tol=1e-12
+    )
+    total2, equal2 = compare_var_and_attr_ds(
+        ds1, ds2, nl=4, output=True, location=None, tol=1e-12
     )
-    total2, equal2 = compare_var_and_attr_ds(ds1, ds2, nl=4, output=True, location=None)
 
     assert (total1, equal1) == (104, 103)
     assert (total2, equal2) == (104, 103)

diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py
@@ -471,7 +471,7 @@ def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules):
             cur_df_xr = cur_df[cols_other].to_xarray()
 
             t, e = compare_var_and_attr_ds(
-                ref_df_xr, cur_df_xr, nl=5, output=False, location=None
+                ref_df_xr, cur_df_xr, nl=5, output=False, location=None, tol=0
             )
             if t != e:
                 return errors == 1

diff --git a/util/fof_utils.py b/util/fof_utils.py
@@ -3,12 +3,15 @@
 """
 
 import os
+import re
 import shutil
 
 import numpy as np
 import pandas as pd
 import xarray as xr
 
+from util.constants import CHECK_THRESHOLD
+
 
 def get_report_variables(ds):
     """
@@ -69,13 +72,29 @@ def split_feedback_dataset(ds):
     return ds_report_sorted, ds_obs_sorted
 
 
-def compare_arrays(arr1, arr2, var_name):
+def compare_arrays(arr1, arr2, var_name, tol):
     """
     Comparison of two arrays containing the values of the same variable.
     If not the same, it tells you in percentage terms how different they are.
     """
     total = arr1.size
 
+    if var_name == "veri_data":
+        diff_rel = np.abs((arr1 - arr2) / (1.0 + np.abs(arr1)))
+        diff_rel_df = pd.DataFrame(diff_rel)
+
+        diff = diff_rel_df - tol
+
+        selector = (diff > CHECK_THRESHOLD).any(axis=1)
+
+        out = (~selector).all()
+        diff_err = diff.index[selector].to_numpy()
+
+        if out:
+            return total, total, np.array([])
+        equal = total - len(diff_err)
+        return total, equal, diff_err
+
     if np.array_equal(arr1, arr2):
         equal = total
         diff = np.array([])
@@ -96,8 +115,7 @@ def compare_arrays(arr1, arr2, var_name):
             f"Differences in '{var_name}': {percent:.2f}% equal. "
             f"{total} total entries for this variable"
         )
-        diff_idx = np.where(~mask_equal.ravel())[0]
-        diff = diff_idx
+        diff = np.where(~mask_equal.ravel())[0]
 
     return total, equal, diff
 
@@ -191,28 +209,39 @@ def write_lines(ds1, ds2, diff, path_name):
                 f.write(f"diff : {row_diff}" + "\n")
 
 
-def write_different_size(output, nl, path_name, var, sizes):
+def write_different_size(output, var, sizes, path_name=None):
+    """
+    Report that a variable cannot be compared because the two datasets have
+    different lengths. The variable name is always printed; the full message is
+    appended to the file at path_name if output is enabled, and printed to the
+    console otherwise.
+    """
+    # print(sizes)
+    print(var)
     if output:
         with open(path_name, "a", encoding="utf-8") as f:
             f.write(
                 f"variable  : {var} -> datasets have different lengths "
                 f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" + "\n"
             )
-        if nl != 0:
-            print(
-                f"\033[1mvar\033[0m : {var} -> datasets have different lengths "
-                f"({sizes[0]} vs. {sizes[1]} ), comparison not possible"
-            )
+    else:
+        print(
+            f"\033[1mvar\033[0m : {var} -> datasets have different lengths "
+            f"({sizes[0]} vs. {sizes[1]} ), comparison not possible"
+        )
 
 
-def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
+def compare_var_and_attr_ds(
+    ds1, ds2, nl, output, location, tol
+):  # pylint: disable=too-many-positional-arguments
     """
     Variable by variable and attribute by attribute,
     comparison of the two files.
     """
 
     total_all, equal_all = 0, 0
     list_to_skip = ["source", "i_body", "l_body"]
+    path_name = ""
 
     if output:
         if location:
@@ -231,7 +260,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
             arr2 = fill_nans_for_float32(ds2[var].values)
 
             if arr1.size == arr2.size:
-                t, e, diff = compare_arrays(arr1, arr2, var)
+                t, e, diff = compare_arrays(arr1, arr2, var, tol)
 
                 if output:
                     write_lines(ds1, ds2, diff, path_name)
@@ -242,7 +271,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
 
             else:
                 t, e = max(arr1.size, arr2.size), 0
-                write_different_size(output, nl, path_name, var, [arr1.size, arr2.size])
+                write_different_size(output, var, [arr1.size, arr2.size], path_name)
 
             total_all += t
             equal_all += e
@@ -251,7 +280,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
             arr1 = np.array(ds1.attrs[var], dtype=object)
             arr2 = np.array(ds2.attrs[var], dtype=object)
             if arr1.size == arr2.size:
-                t, e, diff = compare_arrays(arr1, arr2, var)
+                t, e, diff = compare_arrays(arr1, arr2, var, tol)
 
                 if output:
                     write_lines(ds1, ds2, diff, path_name)
@@ -262,7 +291,7 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
 
             else:
                 t, e = max(arr1.size, arr2.size), 0
-                write_different_size(output, nl, path_name, var, [arr1.size, arr2.size])
+                write_different_size(output, var, [arr1.size, arr2.size], path_name)
 
             total_all += t
             equal_all += e
@@ -272,12 +301,17 @@ def compare_var_and_attr_ds(ds1, ds2, nl, output, location):
 
 def primary_check(file1, file2):
     """
-    Test that the two files are of the same type.
+    Check if two files are of the same observation type, ignoring timestamp
+    differences. The check includes the prefix, the observation type and the
+    ensemble suffix if present.
     """
-    name1 = os.path.basename(file1)
-    name2 = os.path.basename(file2)
 
-    name1_core = name1.replace("fof", "").replace(".nc", "")
-    name2_core = name2.replace("fof", "").replace(".nc", "")
+    def core_name(path):
+        # Filename without directory
+        name = os.path.basename(path)
+        # Remove extension
+        name = os.path.splitext(name)[0]
+        # Remove timestamp
+        return re.sub(r"_(\d{14})(?=(_ens\d+)?$)", "", name)
 
-    return name1_core == name2_core
+    return core_name(file1) == core_name(file2)