Skip to content

Commit a3d0c5f

Browse files
author
vm-aifluence-jro
committed
documentation improved, history written
1 parent df54984 commit a3d0c5f

File tree

7 files changed

+62
-19
lines changed

7 files changed

+62
-19
lines changed

HISTORY.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
=======
22
History
33
=======
4+
0.0.14 (2023-06-14)
5+
-------------------
6+
7+
* Documentation improved, with the API information
8+
* Bugs patched, in particular for some logo display and RPCA imputation
9+
* The PRSA online dataset has been modified, the benchmark now loads the new version with a single station
10+
* More tests have been implemented
11+
* Tests for compliance with the sklearn standards have been implemented (check_estimator). Some arguments are mutable, and the corresponding tests are for now ignored
12+
413
0.0.13 (2023-06-07)
514
-------------------
615

docs/Hole_Generator.rst

Lines changed: 0 additions & 5 deletions
This file was deleted.

docs/api.rst

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,12 @@ Metrics
6060
benchmark.metrics.root_mean_squared_error
6161
benchmark.metrics.weighted_mean_absolute_percentage_error
6262
benchmark.metrics.wasserstein_distance
63-
benchmark.metrics.density_from_rf
64-
benchmark.metrics.kl_divergence_1D
6563
benchmark.metrics.kl_divergence
6664
benchmark.metrics.kolmogorov_smirnov_test
6765
benchmark.metrics.total_variance_distance
6866
benchmark.metrics.mean_difference_correlation_matrix_numerical_features
6967
benchmark.metrics.mean_difference_correlation_matrix_categorical_features
70-
benchmark.metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features
68+
benchmark.metrics.mean_diff_corr_matrix_categorical_vs_numerical_features
7169
benchmark.metrics.sum_energy_distances
7270
benchmark.metrics.frechet_distance
7371

@@ -79,7 +77,6 @@ RPCA engine
7977
:toctree: generated/
8078
:template: class.rst
8179

82-
imputations.rpca.rpca.RPCA
8380
imputations.rpca.rpca_noisy.RPCANoisy
8481
imputations.rpca.rpca_pcp.RPCAPCP
8582

@@ -91,7 +88,6 @@ EM engine
9188
:toctree: generated/
9289
:template: class.rst
9390

94-
imputations.em_sampler.EM
9591
imputations.em_sampler.MultiNormalEM
9692
imputations.em_sampler.VAR1EM
9793

docs/hole_generator.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
Focus on holes generation
3+
==========================
4+
5+
Qolmat allows generating new missing values on an existing dataset, mainly for the purpose of comparing imputations. All features are available in `benchmark.missing_patterns`. This section will be completed very soon...

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
explanation
99
RPCA
1010
EM_sampler
11-
Hole_Generator
11+
hole_generator
1212

1313
.. toctree::
1414
:maxdepth: 2

qolmat/benchmark/metrics.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import scipy
66
import sklearn
77
from sklearn import metrics as skm
8+
from sklearn.ensemble import BaseEnsemble
89
from sklearn.preprocessing import StandardScaler
910

1011
EPS = np.finfo(float).eps
@@ -155,7 +156,28 @@ def wasserstein_distance(
155156
)
156157

157158

158-
def density_from_rf(df, estimator, df_est=None):
159+
def density_from_rf(
160+
df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
161+
):
162+
"""Estimates the density of the empirical distribution given by df at the sample points given by
163+
df_est. The estimation uses a random forest estimator and relies on the average number of
164+
samples in the leaf corresponding to each estimation point.
165+
166+
Parameters
167+
----------
168+
df : pd.DataFrame
169+
Empirical distribution whose density should be estimated
170+
estimator : BaseEnsemble
171+
Estimator defining the forest on which the density counting is based.
172+
df_est : pd.DataFrame, optional
173+
Sample points of the estimation, by default None
174+
If None, the density is estimated at the points given by `df`.
175+
176+
Returns
177+
-------
178+
pd.Series
179+
Series of floats providing the normalized density
180+
"""
159181
if df_est is None:
160182
df_est = df.copy()
161183
counts = pd.Series(0, index=df_est.index)
@@ -172,7 +194,23 @@ def density_from_rf(df, estimator, df_est=None):
172194
return counts
173195

174196

175-
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
197+
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
198+
"""Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
199+
given by `df1` and `df2`. The samples are binned using a uniform spacing with 20 bins from
200+
the smallest to the largest value. Note that this may be a coarse estimation.
201+
202+
Parameters
203+
----------
204+
df1 : pd.Series
205+
First empirical distribution
206+
df2 : pd.Series
207+
Second empirical distribution
208+
209+
Returns
210+
-------
211+
float
212+
Kullback-Leibler divergence between the two empirical distributions.
213+
"""
176214
min_val = min(df1.min(), df2.min())
177215
max_val = max(df1.max(), df2.max())
178216
bins = np.linspace(min_val, max_val, 20)
@@ -572,7 +610,7 @@ def _get_correlation_f_oneway_matrix(
572610
return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
573611

574612

575-
def mean_difference_correlation_matrix_categorical_vs_numerical_features(
613+
def mean_diff_corr_matrix_categorical_vs_numerical_features(
576614
df1: pd.DataFrame,
577615
df2: pd.DataFrame,
578616
df_mask: pd.DataFrame,

tests/benchmark/test_metrics.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,16 +268,16 @@ def test_mean_difference_correlation_matrix_categorical_features(
268268
@pytest.mark.parametrize("df1", [df_incomplete_cat_num])
269269
@pytest.mark.parametrize("df2", [df_imputed_cat_num])
270270
@pytest.mark.parametrize("df_mask", [df_mask_cat_num])
271-
def test_mean_difference_correlation_matrix_categorical_vs_numerical_features(
271+
def test_mean_diff_corr_matrix_categorical_vs_numerical_features(
272272
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
273273
) -> None:
274-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
274+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
275275
df1, df1, df_mask
276276
).equals(pd.Series([0.0], index=["col1"]))
277-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
277+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
278278
df1, df1, df_mask, False
279279
).equals(pd.Series([0.0], index=["col1"]))
280-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
280+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
281281
df1, df2, df_mask
282282
).equals(pd.Series([0.07009774198932273], index=["col1"]))
283283

@@ -330,6 +330,6 @@ def test_exception_raise_no_categorical_column_found(
330330
def test_value_error_get_correlation_f_oneway_matrix(
331331
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
332332
) -> None:
333-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
333+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
334334
df1, df2, df_mask
335335
).equals(pd.Series([np.nan], index=["col1"]))

0 commit comments

Comments
 (0)