Skip to content

Commit e28cae4

Browse files
Merge pull request #40 from Quantmetry/dev_doc_supp
Dev doc supp
2 parents eaf50eb + a3511d8 commit e28cae4

File tree

9 files changed

+182
-22
lines changed

9 files changed

+182
-22
lines changed

HISTORY.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
=======
22
History
33
=======
4+
0.0.14 (2023-06-14)
5+
-------------------
6+
7+
* Documentation improved, with the API information
8+
* Bug patched, in particular for some logo display and RPCA imputation
9+
* The PRSA online dataset has been modified, the benchmark now loads the new version with a single station
10+
* More tests have been implemented
11+
* Tests for compliance with the sklearn standards have been implemented (check_estimator). Some arguments are mutable, and the corresponding tests are for now ignored
12+
413
0.0.13 (2023-06-07)
514
-------------------
615

docs/_templates/__init__.py

Whitespace-only changes.

docs/_templates/class.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
:mod:`{{module}}`.{{objname}}
2+
{{ underline }}==============
3+
4+
.. currentmodule:: {{ module }}
5+
6+
.. autoclass:: {{ objname }}
7+
:members:
8+
9+
{% block methods %}
10+
.. automethod:: __init__
11+
{% endblock %}
12+
13+
.. include:: {{module}}.{{objname}}.examples
14+
15+
.. raw:: html
16+
17+
<div style='clear:both'></div>

docs/_templates/function.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
:mod:`{{module}}`.{{objname}}
2+
{{ underline }}====================
3+
4+
.. currentmodule:: {{ module }}
5+
6+
.. autofunction:: {{ objname }}
7+
8+
.. include:: {{module}}.{{objname}}.examples
9+
10+
.. raw:: html
11+
12+
<div style='clear:both'></div>

docs/api.rst

Lines changed: 80 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,94 @@
1-
##########
1+
###########
22
Qolmat API
3-
##########
3+
###########
44

55
.. currentmodule:: qolmat
66

7-
Classes
8-
=======
7+
Imputers
8+
=========
99

1010
.. autosummary::
1111
:toctree: generated/
12-
:template: class.rst
12+
:template: class.rst
13+
14+
imputations.imputers.ImputerEM
15+
imputations.imputers.ImputerKNN
16+
imputations.imputers.ImputerInterpolation
17+
imputations.imputers.ImputerLOCF
18+
imputations.imputers.ImputerMedian
19+
imputations.imputers.ImputerMean
20+
imputations.imputers.ImputerMICE
21+
imputations.imputers.ImputerMode
22+
imputations.imputers.ImputerNOCB
23+
imputations.imputers.ImputerOracle
24+
imputations.imputers.ImputerRegressor
25+
imputations.imputers.ImputerResiduals
26+
imputations.imputers.ImputerRPCA
27+
imputations.imputers.ImputerShuffle
28+
29+
Comparator
30+
===========
1331

14-
imputations.rpca.rpca.RPCA
32+
.. autosummary::
33+
:toctree: generated/
34+
:template: class.rst
35+
36+
benchmark.comparator.Comparator
1537

16-
Utils
17-
=====
38+
Missing Patterns
39+
================
40+
41+
.. autosummary::
42+
:toctree: generated/
43+
:template: class.rst
44+
45+
benchmark.missing_patterns.UniformHoleGenerator
46+
benchmark.missing_patterns.GeometricHoleGenerator
47+
benchmark.missing_patterns.EmpiricalHoleGenerator
48+
benchmark.missing_patterns.MultiMarkovHoleGenerator
49+
benchmark.missing_patterns.GroupedHoleGenerator
50+
51+
52+
Metrics
53+
=======
1854

1955
.. autosummary::
2056
:toctree: generated/
2157
:template: function.rst
2258

59+
benchmark.metrics.mean_squared_error
60+
benchmark.metrics.root_mean_squared_error
61+
benchmark.metrics.weighted_mean_absolute_percentage_error
62+
benchmark.metrics.wasserstein_distance
63+
benchmark.metrics.kl_divergence
64+
benchmark.metrics.kolmogorov_smirnov_test
65+
benchmark.metrics.total_variance_distance
66+
benchmark.metrics.mean_difference_correlation_matrix_numerical_features
67+
benchmark.metrics.mean_difference_correlation_matrix_categorical_features
68+
benchmark.metrics.mean_diff_corr_matrix_categorical_vs_numerical_features
69+
benchmark.metrics.sum_energy_distances
70+
benchmark.metrics.frechet_distance
71+
72+
73+
RPCA engine
74+
================
75+
76+
.. autosummary::
77+
:toctree: generated/
78+
:template: class.rst
79+
80+
imputations.rpca.rpca_noisy.RPCANoisy
81+
imputations.rpca.rpca_pcp.RPCAPCP
82+
83+
84+
EM engine
85+
================
86+
87+
.. autosummary::
88+
:toctree: generated/
89+
:template: class.rst
90+
91+
imputations.em_sampler.MultiNormalEM
92+
imputations.em_sampler.VAR1EM
93+
94+

docs/hole_generator.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
Focus on holes generation
3+
==========================
4+
5+
Qolmat allows generating new missing values on an existing dataset, mainly for the purpose of comparing imputations. All features are available in `benchmark.missing_patterns`. This section will be completed very soon...

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
explanation
99
RPCA
1010
EM_sampler
11+
hole_generator
1112

1213
.. toctree::
1314
:maxdepth: 2

qolmat/benchmark/metrics.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import scipy
66
import sklearn
77
from sklearn import metrics as skm
8+
from sklearn.ensemble import BaseEnsemble
89
from sklearn.preprocessing import StandardScaler
910

1011
EPS = np.finfo(float).eps
@@ -155,7 +156,28 @@ def wasserstein_distance(
155156
)
156157

157158

158-
def density_from_rf(df, estimator, df_est=None):
159+
def density_from_rf(
160+
df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
161+
):
162+
"""Estimates the density of the empirical distribution given by df at the sample points given
163+
by df_est. The estimation uses a random forest estimator and relies on the average number of
164+
samples in the leaf corresponding to each estimation point.
165+
166+
Parameters
167+
----------
168+
df : pd.DataFrame
169+
Empirical distribution which density should be estimated
170+
estimator : BaseEnsemble
171+
Estimator defining the forest upon which is based the density counting.
172+
df_est : pd.DataFrame, optional
173+
Sample points of the estimation, by default None
174+
If None, the density is estimated at the points given by `df`.
175+
176+
Returns
177+
-------
178+
pd.Series
179+
Series of floats providing the normalized density
180+
"""
159181
if df_est is None:
160182
df_est = df.copy()
161183
counts = pd.Series(0, index=df_est.index)
@@ -172,7 +194,23 @@ def density_from_rf(df, estimator, df_est=None):
172194
return counts
173195

174196

175-
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
197+
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
198+
"""Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
199+
given by `df1` and `df2`. The samples are binarized using a uniform spacing with 20 bins from
200+
the smallest to the largest value. Note that this may be a coarse estimation.
201+
202+
Parameters
203+
----------
204+
df1 : pd.Series
205+
First empirical distribution
206+
df2 : pd.Series
207+
Second empirical distribution
208+
209+
Returns
210+
-------
211+
float
212+
Kullback-Leibler divergence between the two empirical distributions.
213+
"""
176214
min_val = min(df1.min(), df2.min())
177215
max_val = max(df1.max(), df2.max())
178216
bins = np.linspace(min_val, max_val, 20)
@@ -184,17 +222,23 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
184222
def kl_divergence(
185223
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
186224
) -> pd.Series:
187-
"""TODO Documentation
188-
Kullback-Leibler divergence between distributions
189-
If multivariate normal distributions:
190-
https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
225+
"""
226+
Estimation of the Kullback-Leibler divergence between two empirical distributions. Three
227+
methods are implemented:
228+
- columnwise, relying on a uniform binarization and only taking marginals into account
229+
(https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
230+
- gaussian, relying on a Gaussian approximation,
231+
- random_forest, experimental
191232
192233
Parameters
193234
----------
194235
df1 : pd.DataFrame
236+
First empirical distribution
195237
df2 : pd.DataFrame
196-
columnwise_evaluation: Optional[bool]
197-
if the evalutation is computed column-wise. By default, is set to False
238+
Second empirical distribution
239+
df_mask: pd.DataFrame
240+
Mask indicating on what values the divergence should be computed
241+
method:
198242
199243
Returns
200244
-------
@@ -572,7 +616,7 @@ def _get_correlation_f_oneway_matrix(
572616
return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
573617

574618

575-
def mean_difference_correlation_matrix_categorical_vs_numerical_features(
619+
def mean_diff_corr_matrix_categorical_vs_numerical_features(
576620
df1: pd.DataFrame,
577621
df2: pd.DataFrame,
578622
df_mask: pd.DataFrame,

tests/benchmark/test_metrics.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,16 +268,16 @@ def test_mean_difference_correlation_matrix_categorical_features(
268268
@pytest.mark.parametrize("df1", [df_incomplete_cat_num])
269269
@pytest.mark.parametrize("df2", [df_imputed_cat_num])
270270
@pytest.mark.parametrize("df_mask", [df_mask_cat_num])
271-
def test_mean_difference_correlation_matrix_categorical_vs_numerical_features(
271+
def test_mean_diff_corr_matrix_categorical_vs_numerical_features(
272272
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
273273
) -> None:
274-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
274+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
275275
df1, df1, df_mask
276276
).equals(pd.Series([0.0], index=["col1"]))
277-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
277+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
278278
df1, df1, df_mask, False
279279
).equals(pd.Series([0.0], index=["col1"]))
280-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
280+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
281281
df1, df2, df_mask
282282
).equals(pd.Series([0.07009774198932273], index=["col1"]))
283283

@@ -330,6 +330,6 @@ def test_exception_raise_no_categorical_column_found(
330330
def test_value_error_get_correlation_f_oneway_matrix(
331331
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
332332
) -> None:
333-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
333+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
334334
df1, df2, df_mask
335335
).equals(pd.Series([np.nan], index=["col1"]))

0 commit comments

Comments
 (0)