Merge pull request #51 from Quantmetry/angoho_benchmarks

JulienRoussel77 · web-flow · commit 2b74646b267f · 2023-07-07T18:13:31.000+02:00
Feat: add distance correlation and pattern-based metric
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -8,6 +8,7 @@
 from sklearn import metrics as skm
 from sklearn.ensemble import BaseEnsemble
 from sklearn.preprocessing import StandardScaler
+import dcor
 
 EPS = np.finfo(float).eps
 
@@ -835,6 +836,98 @@ def frechet_distance(
         return pd.Series(np.repeat(frechet_dist, len(df1.columns)))
 
 
+def distance_correlation_complement(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> pd.Series:
+    """Complement (1 - value) of the distance correlation between 2 dataframes.
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+        true dataframe
+    df2 : pd.DataFrame
+        predicted dataframe
+    df_mask : pd.DataFrame
+        Elements of the dataframes to compute on; other elements and NaN are replaced by 0.0
+
+    Returns
+    -------
+    pd.Series
+        1 - distance correlation, as a single value indexed by "All"
+    """
+    # Mask and fill, for the case where this function is used outside the pattern-based metric
+    df1 = df1[df_mask].fillna(0.0)
+    df2 = df2[df_mask].fillna(0.0)
+
+    return 1.0 - pd.Series([dcor.distance_correlation(df1.values, df2.values)], index=["All"])
+
+
+def pattern_based_weighted_mean_metric(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    metric: Callable,
+    min_num_row: int = 10,
+    **kwargs,
+) -> pd.Series:
+    """Compute a weighted mean of per-pattern scores over the missing patterns.
+    For each pattern, the score is the first value returned by the function metric.
+    This code is based on https://www.statsmodels.org/
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+        true dataframe
+    df2 : pd.DataFrame
+        predicted dataframe
+    df_mask : pd.DataFrame
+        Mask whose rows define the patterns; rows encoded as 0 are skipped
+    metric : Callable
+        metric function, called as metric(df1, df2, mask, **kwargs)
+    min_num_row : int, optional
+        minimum number of rows a pattern needs in order to be scored, by default 10
+
+    Returns
+    -------
+    pd.Series
+        Weighted mean of the pattern scores, as a single value indexed by "All"
+    """
+    # Identify all distinct missing patterns: each mask row is encoded as a weighted sum
+    z = 1 + np.log(1 + np.arange(df_mask.shape[1]))  # one weight >= 1 per column
+    c = np.dot(df_mask, z)  # one code per row; rows with equal codes are grouped together
+    row_map: Dict = {}  # code -> positions of the rows sharing that code
+    for i, v in enumerate(c):
+        if v == 0:
+            # No missing values
+            continue
+        if v not in row_map:
+            row_map[v] = []
+        row_map[v].append(i)
+    patterns = [np.asarray(v) for v in row_map.values()]
+    scores = []
+    weights = []
+    for pattern in patterns:
+        df1_pattern = df1.iloc[pattern].dropna(axis=1)  # drop columns containing NaN
+        if len(df1_pattern.columns) == 0:
+            df1_pattern = df1.iloc[pattern].dropna(axis=0)  # fallback: drop rows containing NaN
+
+        if len(df1_pattern) >= min_num_row:
+            df2_pattern = df2.loc[df1_pattern.index, df1_pattern.columns]
+            weights.append(1.0 / len(df1_pattern))  # NOTE(review): inverse row count; confirm
+            scores.append(
+                metric(df1_pattern, df2_pattern, ~df1_pattern.isna(), **kwargs).values[0]
+            )
+
+    if len(scores) == 0:  # no pattern had at least min_num_row rows
+        raise Exception(
+            "Not found enough patterns. "
+            + f"Number of row for each pattern must be larger than min_num_row={min_num_row}."
+        )
+
+    weighted_scores = np.array(scores) * np.array(weights)
+    return pd.Series(np.sum(weighted_scores) / np.sum(weights), index=["All"])
+
+
 def get_metric(name: str) -> Callable:
     dict_metrics: Dict[str, Callable] = {
         "mse": mean_squared_error,
@@ -849,5 +942,9 @@ def get_metric(name: str) -> Callable:
         "pairwise_dist": sum_pairwise_distances,
         "energy": sum_energy_distances,
         "frechet": frechet_distance,
+        "dist_corr_pattern": partial(
+            pattern_based_weighted_mean_metric,
+            metric=distance_correlation_complement,
+        ),
     }
     return dict_metrics[name]
diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py
@@ -333,3 +333,32 @@ def test_value_error_get_correlation_f_oneway_matrix(
     assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
         df1, df2, df_mask
     ).equals(pd.Series([np.nan], index=["col1"]))
+
+
+@pytest.mark.parametrize("df1", [df_incomplete])
+@pytest.mark.parametrize("df2", [df_imputed])
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_distance_correlation_complement(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> None:
+    result = metrics.distance_correlation_complement(df1, df2, df_mask)
+    expected = pd.Series([0.001559], index=["All"])
+    np.testing.assert_allclose(result, expected, atol=1e-3)  # NOTE(review): atol ~ expected
+
+
+@pytest.mark.parametrize("df1", [df_incomplete])
+@pytest.mark.parametrize("df2", [df_imputed])
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_pattern_based_weighted_mean_metric(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> None:
+    with pytest.raises(Exception):  # presumably no fixture pattern reaches 5 rows; confirm
+        metrics.pattern_based_weighted_mean_metric(
+            df1, df2, df_mask, metric=metrics.distance_correlation_complement, min_num_row=5
+        )
+
+    expected = pd.Series([2 / 3], index=["All"])
+    result = metrics.pattern_based_weighted_mean_metric(
+        df1, df2, df_mask, metric=metrics.distance_correlation_complement, min_num_row=1
+    )
+    np.testing.assert_allclose(result, expected, atol=1e-3)