@@ -801,6 +801,44 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
     return scipy.stats.entropy(p + EPS, q + EPS)
 
 
+def kl_divergence_gaussian_exact(
+    mean1: pd.Series, cov1: pd.DataFrame, mean2: pd.Series, cov2: pd.DataFrame
+) -> float:
+    """Exact Kullback-Leibler divergence between two multivariate normal distributions.
+
+    Parameters
+    ----------
+    mean1: pd.Series
+        Mean of the first distribution
+    cov1: pd.DataFrame
+        Covariance matrix of the first distribution
+    mean2: pd.Series
+        Mean of the second distribution
+    cov2: pd.DataFrame
+        Covariance matrix of the second distribution
+
+    Returns
+    -------
+    float
+        Kullback-Leibler divergence
+    """
+    n_variables = len(mean1)
+    # Lower-triangular Cholesky factors, cov = L @ L.T (cholesky zeroes the
+    # unused triangle, unlike cho_factor, so triangular solves are safe)
+    L1 = scipy.linalg.cholesky(cov1, lower=True)
+    L2 = scipy.linalg.cholesky(cov2, lower=True)
+    M = scipy.linalg.solve_triangular(L2, L1, lower=True)
+    y = scipy.linalg.solve_triangular(L2, mean2 - mean1, lower=True)
+    # ||M||_F^2 = tr(cov2^-1 @ cov1)
+    norm_M = (M ** 2).sum()
+    # ||y||^2 = (mean2 - mean1)^T cov2^-1 (mean2 - mean1)
+    norm_y = (y ** 2).sum()
+    # log(det cov2 / det cov1), from the Cholesky diagonals
+    term_diag_L = 2 * np.sum(np.log(np.diagonal(L2) / np.diagonal(L1)))
+    div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L)
+    return div_kl
+
+
 def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
     """Kullback-Leibler divergence estimation based on a Gaussian approximation of both empirical
     distributions
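For reference, the new helper evaluates the standard closed-form expression for the divergence between two k-variate Gaussians (a textbook identity, stated here for readability rather than taken from the diff):

\[
D_{\mathrm{KL}}\bigl(\mathcal{N}(\mu_1, \Sigma_1) \,\|\, \mathcal{N}(\mu_2, \Sigma_2)\bigr)
= \frac{1}{2} \Bigl[ \operatorname{tr}(\Sigma_2^{-1} \Sigma_1)
+ (\mu_2 - \mu_1)^\top \Sigma_2^{-1} (\mu_2 - \mu_1)
- k + \ln \frac{\det \Sigma_2}{\det \Sigma_1} \Bigr]
\]

With the lower-triangular Cholesky factors Sigma_i = L_i L_i^T, norm_M is the trace term, norm_y the quadratic term, and term_diag_L the log-determinant term, so neither covariance matrix is ever inverted explicitly.
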
@@ -821,20 +859,12 @@ def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
821854 """
822855 df1 = df1 [df_mask .any (axis = 1 )]
823856 df2 = df2 [df_mask .any (axis = 1 )]
824- n_variables = len (df1 .columns )
825857 cov1 = df1 .cov ()
826858 cov2 = df2 .cov ()
827859 mean1 = df1 .mean ()
828860 mean2 = df2 .mean ()
829- L1 , lower1 = scipy .linalg .cho_factor (cov1 )
830- L2 , lower2 = scipy .linalg .cho_factor (cov2 )
831- M = scipy .linalg .solve (L2 , L1 )
832- y = scipy .linalg .solve (L2 , mean2 - mean1 )
833- norm_M = (M ** 2 ).sum ().sum ()
834- norm_y = (y ** 2 ).sum ()
835- term_diag_L = 2 * np .sum (np .log (np .diagonal (L2 ) / np .diagonal (L1 )))
836- print (norm_M , "-" , n_variables , "+" , norm_y , "+" , term_diag_L )
837- div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L )
861+
862+ div_kl = kl_divergence_gaussian_exact (mean1 , cov1 , mean2 , cov2 )
838863 return div_kl
839864
840865
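A minimal smoke test for the refactored wrapper, written as a sketch: it assumes numpy and pandas are installed and that kl_divergence_gaussian from this diff is in scope (the module import is omitted). Comparing a sample with itself should give a divergence near zero, and a shifted copy a positive one:

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.normal(size=(500, 3)), columns=["a", "b", "c"])
df2 = pd.DataFrame(rng.normal(loc=0.5, size=(500, 3)), columns=["a", "b", "c"])
# All-True mask; the metric keeps rows where the mask has at least one True
df_mask = pd.DataFrame(True, index=df1.index, columns=df1.columns)

assert abs(kl_divergence_gaussian(df1, df1, df_mask)) < 1e-8
assert kl_divergence_gaussian(df1, df2, df_mask) > 0
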
@@ -1017,6 +1047,7 @@ def pattern_based_weighted_mean_metric(
         scores.append(metric(df1_pattern, df2_pattern, df_mask_pattern, **kwargs))
     if len(scores) == 0:
         raise NotEnoughSamples(max_num_row, min_n_rows)
+    # Aggregate per-pattern scores into a single weighted mean.
     return pd.Series(sum([s * w for s, w in zip(scores, weights)]), index=["All"])
 
 
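The return line is a plain convex combination of the per-pattern scores. A toy illustration with made-up numbers (the real weights, computed earlier in the function and not visible in this hunk, are presumably the shares of rows falling into each missingness pattern):

scores = [0.2, 0.5]     # metric value computed on each missingness pattern
weights = [0.75, 0.25]  # share of rows matching each pattern (sums to 1)
aggregate = sum(s * w for s, w in zip(scores, weights))  # 0.2*0.75 + 0.5*0.25 = 0.275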