documentation improved, history written

vm-aifluence-jro · vm-aifluence-jro · commit a3d0c5f540c6 · 2023-06-14T14:00:57.000Z
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -1,6 +1,15 @@
 =======
 History
 =======
+0.0.14 (2023-06-14)
+-------------------
+
+* Documentation improved, with the API information
+* Bugs patched, in particular for logo display and RPCA imputation
+* The PRSA online dataset has been modified, the benchmark now loads the new version with a single station
+* More tests have been implemented
+* Tests for compliance with the sklearn standards have been implemented (check_estimator). Some arguments are mutable, and the corresponding tests are for now ignored
+
 0.0.13 (2023-06-07)
 -------------------
 
diff --git a/docs/Hole_Generator.rst b/docs/Hole_Generator.rst
diff --git a/docs/api.rst b/docs/api.rst
@@ -60,14 +60,12 @@ Metrics
     benchmark.metrics.root_mean_squared_error
     benchmark.metrics.weighted_mean_absolute_percentage_error
     benchmark.metrics.wasserstein_distance
-    benchmark.metrics.density_from_rf
-    benchmark.metrics.kl_divergence_1D
     benchmark.metrics.kl_divergence
     benchmark.metrics.kolmogorov_smirnov_test
     benchmark.metrics.total_variance_distance
     benchmark.metrics.mean_difference_correlation_matrix_numerical_features
     benchmark.metrics.mean_difference_correlation_matrix_categorical_features
-    benchmark.metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features
+    benchmark.metrics.mean_diff_corr_matrix_categorical_vs_numerical_features
     benchmark.metrics.sum_energy_distances
     benchmark.metrics.frechet_distance
 
@@ -79,7 +77,6 @@ RPCA engine
     :toctree: generated/
     :template: class.rst    
     
-    imputations.rpca.rpca.RPCA
     imputations.rpca.rpca_noisy.RPCANoisy
     imputations.rpca.rpca_pcp.RPCAPCP
 
@@ -91,7 +88,6 @@ EM engine
     :toctree: generated/
     :template: class.rst    
     
-    imputations.em_sampler.EM
     imputations.em_sampler.MultiNormalEM
     imputations.em_sampler.VAR1EM
 
diff --git a/docs/hole_generator.rst b/docs/hole_generator.rst
@@ -0,0 +1,5 @@
+
+Focus on holes generation
+==========================
+
+Qolmat allows generating new missing values on an existing dataset, mainly for the purpose of comparing imputations. All features are available in `benchmark.missing_patterns`. This section will be completed very soon...
diff --git a/docs/index.rst b/docs/index.rst
@@ -8,7 +8,7 @@
    explanation
    RPCA
    EM_sampler
-   Hole_Generator
+   hole_generator
 
 .. toctree::
    :maxdepth: 2 
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -5,6 +5,7 @@
 import scipy
 import sklearn
 from sklearn import metrics as skm
+from sklearn.ensemble import BaseEnsemble
 from sklearn.preprocessing import StandardScaler
 
 EPS = np.finfo(float).eps
@@ -155,7 +156,28 @@ def wasserstein_distance(
         )
 
 
-def density_from_rf(df, estimator, df_est=None):
+def density_from_rf(
+    df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
+):
+    """Estimates the density of the empirical distribution given by df at the sample points given by
+    df_est. The estimation uses a random forest estimator and relies on the average number of
+    samples in the leaf corresponding to each estimation point.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Empirical distribution whose density should be estimated
+    estimator : BaseEnsemble
+        Estimator defining the forest upon which the density counting is based.
+    df_est : pd.DataFrame, optional
+        Sample points of the estimation, by default None
+        If None, the density is estimated at the points given by `df`.
+
+    Returns
+    -------
+    pd.Series
+        Series of floats providing the normalized density
+    """
     if df_est is None:
         df_est = df.copy()
     counts = pd.Series(0, index=df_est.index)
@@ -172,7 +194,23 @@ def density_from_rf(df, estimator, df_est=None):
     return counts
 
 
-def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
+def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
+    """Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
+    given by `df1` and `df2`. The samples are binned using a uniform spacing with 20 bins from
+    the smallest to the largest value. Note that this may be a coarse estimation.
+
+    Parameters
+    ----------
+    df1 : pd.Series
+        First empirical distribution
+    df2 : pd.Series
+        Second empirical distribution
+
+    Returns
+    -------
+    float
+        Kullback-Leibler divergence between the two empirical distributions.
+    """
     min_val = min(df1.min(), df2.min())
     max_val = max(df1.max(), df2.max())
     bins = np.linspace(min_val, max_val, 20)
@@ -572,7 +610,7 @@ def _get_correlation_f_oneway_matrix(
     return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
 
 
-def mean_difference_correlation_matrix_categorical_vs_numerical_features(
+def mean_diff_corr_matrix_categorical_vs_numerical_features(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
     df_mask: pd.DataFrame,
diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py
@@ -268,16 +268,16 @@ def test_mean_difference_correlation_matrix_categorical_features(
 @pytest.mark.parametrize("df1", [df_incomplete_cat_num])
 @pytest.mark.parametrize("df2", [df_imputed_cat_num])
 @pytest.mark.parametrize("df_mask", [df_mask_cat_num])
-def test_mean_difference_correlation_matrix_categorical_vs_numerical_features(
+def test_mean_diff_corr_matrix_categorical_vs_numerical_features(
     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
 ) -> None:
-    assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
+    assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
         df1, df1, df_mask
     ).equals(pd.Series([0.0], index=["col1"]))
-    assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
+    assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
         df1, df1, df_mask, False
     ).equals(pd.Series([0.0], index=["col1"]))
-    assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
+    assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
         df1, df2, df_mask
     ).equals(pd.Series([0.07009774198932273], index=["col1"]))
 
@@ -330,6 +330,6 @@ def test_exception_raise_no_categorical_column_found(
 def test_value_error_get_correlation_f_oneway_matrix(
     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
 ) -> None:
-    assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
+    assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
         df1, df2, df_mask
     ).equals(pd.Series([np.nan], index=["col1"]))