Skip to content

Commit e28cae4

Browse files
Merge pull request #40 from Quantmetry/dev_doc_supp
Dev doc supp
2 parents eaf50eb + a3511d8 commit e28cae4

File tree

9 files changed

+182
-22
lines changed

9 files changed

+182
-22
lines changed

HISTORY.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
=======
22
History
33
=======
4+
0.0.14 (2023-06-14)
5+
-------------------
6+
7+
* Documentation improved, with the API information
8+
* Bug patched, in particular for some logo display and RPCA imputation
9+
* The PRSA online dataset has been modified, the benchmark now loads the new version with a single station
10+
* More tests have been implemented
11+
* Tests for compliance with the sklearn standards have been implemented (check_estimator). Some arguments are mutable, and the corresponding tests are for now ignored
12+
413
0.0.13 (2023-06-07)
514
-------------------
615

docs/_templates/__init__.py

Whitespace-only changes.

docs/_templates/class.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
:mod:`{{module}}`.{{objname}}
2+
{{ underline }}==============
3+
4+
.. currentmodule:: {{ module }}
5+
6+
.. autoclass:: {{ objname }}
7+
:members:
8+
9+
{% block methods %}
10+
.. automethod:: __init__
11+
{% endblock %}
12+
13+
.. include:: {{module}}.{{objname}}.examples
14+
15+
.. raw:: html
16+
17+
<div style='clear:both'></div>

docs/_templates/function.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
:mod:`{{module}}`.{{objname}}
2+
{{ underline }}====================
3+
4+
.. currentmodule:: {{ module }}
5+
6+
.. autofunction:: {{ objname }}
7+
8+
.. include:: {{module}}.{{objname}}.examples
9+
10+
.. raw:: html
11+
12+
<div style='clear:both'></div>

docs/api.rst

Lines changed: 80 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,94 @@
1-
##########
1+
###########
22
Qolmat API
3-
##########
3+
###########
44

55
.. currentmodule:: qolmat
66

7-
Classes
8-
=======
7+
Imputers
8+
=========
99

1010
.. autosummary::
1111
:toctree: generated/
12-
:template: class.rst
12+
:template: class.rst
13+
14+
imputations.imputers.ImputerEM
15+
imputations.imputers.ImputerKNN
16+
imputations.imputers.ImputerInterpolation
17+
imputations.imputers.ImputerLOCF
18+
imputations.imputers.ImputerMedian
19+
imputations.imputers.ImputerMean
20+
imputations.imputers.ImputerMICE
21+
imputations.imputers.ImputerMode
22+
imputations.imputers.ImputerNOCB
23+
imputations.imputers.ImputerOracle
24+
imputations.imputers.ImputerRegressor
25+
imputations.imputers.ImputerResiduals
26+
imputations.imputers.ImputerRPCA
27+
imputations.imputers.ImputerShuffle
28+
29+
Comparator
30+
===========
1331

14-
imputations.rpca.rpca.RPCA
32+
.. autosummary::
33+
:toctree: generated/
34+
:template: class.rst
35+
36+
benchmark.comparator.Comparator
1537

16-
Utils
17-
=====
38+
Missing Patterns
39+
================
40+
41+
.. autosummary::
42+
:toctree: generated/
43+
:template: class.rst
44+
45+
benchmark.missing_patterns.UniformHoleGenerator
46+
benchmark.missing_patterns.GeometricHoleGenerator
47+
benchmark.missing_patterns.EmpiricalHoleGenerator
48+
benchmark.missing_patterns.MultiMarkovHoleGenerator
49+
benchmark.missing_patterns.GroupedHoleGenerator
50+
51+
52+
Metrics
53+
=======
1854

1955
.. autosummary::
2056
:toctree: generated/
2157
:template: function.rst
2258

59+
benchmark.metrics.mean_squared_error
60+
benchmark.metrics.root_mean_squared_error
61+
benchmark.metrics.weighted_mean_absolute_percentage_error
62+
benchmark.metrics.wasserstein_distance
63+
benchmark.metrics.kl_divergence
64+
benchmark.metrics.kolmogorov_smirnov_test
65+
benchmark.metrics.total_variance_distance
66+
benchmark.metrics.mean_difference_correlation_matrix_numerical_features
67+
benchmark.metrics.mean_difference_correlation_matrix_categorical_features
68+
benchmark.metrics.mean_diff_corr_matrix_categorical_vs_numerical_features
69+
benchmark.metrics.sum_energy_distances
70+
benchmark.metrics.frechet_distance
71+
72+
73+
RPCA engine
74+
================
75+
76+
.. autosummary::
77+
:toctree: generated/
78+
:template: class.rst
79+
80+
imputations.rpca.rpca_noisy.RPCANoisy
81+
imputations.rpca.rpca_pcp.RPCAPCP
82+
83+
84+
EM engine
85+
================
86+
87+
.. autosummary::
88+
:toctree: generated/
89+
:template: class.rst
90+
91+
imputations.em_sampler.MultiNormalEM
92+
imputations.em_sampler.VAR1EM
93+
94+

docs/hole_generator.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
Focus on holes generation
3+
==========================
4+
5+
Qolmat allows generating new missing values on an existing dataset, mainly for the purpose of comparing imputations. All features are available in `benchmark.missing_patterns`. This section will be completed very soon...

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
explanation
99
RPCA
1010
EM_sampler
11+
hole_generator
1112

1213
.. toctree::
1314
:maxdepth: 2

qolmat/benchmark/metrics.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import scipy
66
import sklearn
77
from sklearn import metrics as skm
8+
from sklearn.ensemble import BaseEnsemble
89
from sklearn.preprocessing import StandardScaler
910

1011
EPS = np.finfo(float).eps
@@ -155,7 +156,28 @@ def wasserstein_distance(
155156
)
156157

157158

158-
def density_from_rf(df, estimator, df_est=None):
159+
def density_from_rf(
160+
df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
161+
):
162+
"""Estimates the density of the empirical distribution given by df at the sample points given
163+
by df_est. The estimation uses a random forest estimator and relies on the average number of
164+
samples in the leaf corresponding to each estimation point.
165+
166+
Parameters
167+
----------
168+
df : pd.DataFrame
169+
Empirical distribution which density should be estimated
170+
estimator : BaseEnsemble
171+
Estimator defining the forest upon which is based the density counting.
172+
df_est : pd.DataFrame, optional
173+
Sample points of the estimation, by default None
174+
If None, the density is estimated at the points given by `df`.
175+
176+
Returns
177+
-------
178+
pd.Series
179+
Series of floats providing the normalized density
180+
"""
159181
if df_est is None:
160182
df_est = df.copy()
161183
counts = pd.Series(0, index=df_est.index)
@@ -172,7 +194,23 @@ def density_from_rf(df, estimator, df_est=None):
172194
return counts
173195

174196

175-
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
197+
def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
198+
"""Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
199+
given by `df1` and `df2`. The samples are binarized using a uniform spacing with 20 bins from
200+
the smallest to the largest value. Note that this may be a coarse estimation.
201+
202+
Parameters
203+
----------
204+
df1 : pd.Series
205+
First empirical distribution
206+
df2 : pd.Series
207+
Second empirical distribution
208+
209+
Returns
210+
-------
211+
float
212+
Kullback-Leibler divergence between the two empirical distributions.
213+
"""
176214
min_val = min(df1.min(), df2.min())
177215
max_val = max(df1.max(), df2.max())
178216
bins = np.linspace(min_val, max_val, 20)
@@ -184,17 +222,23 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
184222
def kl_divergence(
185223
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
186224
) -> pd.Series:
187-
"""TODO Documentation
188-
Kullback-Leibler divergence between distributions
189-
If multivariate normal distributions:
190-
https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
225+
"""
226+
Estimation of the Kullback-Leibler divergence between two empirical distributions. Three
227+
methods are implemented:
228+
- columnwise, relying on a uniform binarization and only taking marginals into account
229+
(https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
230+
- gaussian, relying on a Gaussian approximation,
231+
- random_forest, experimental
191232
192233
Parameters
193234
----------
194235
df1 : pd.DataFrame
236+
First empirical distribution
195237
df2 : pd.DataFrame
196-
columnwise_evaluation: Optional[bool]
197-
if the evalutation is computed column-wise. By default, is set to False
238+
Second empirical distribution
239+
df_mask: pd.DataFrame
240+
Mask indicating on what values the divergence should be computed
241+
method:
198242
199243
Returns
200244
-------
@@ -572,7 +616,7 @@ def _get_correlation_f_oneway_matrix(
572616
return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
573617

574618

575-
def mean_difference_correlation_matrix_categorical_vs_numerical_features(
619+
def mean_diff_corr_matrix_categorical_vs_numerical_features(
576620
df1: pd.DataFrame,
577621
df2: pd.DataFrame,
578622
df_mask: pd.DataFrame,

tests/benchmark/test_metrics.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,16 +268,16 @@ def test_mean_difference_correlation_matrix_categorical_features(
268268
@pytest.mark.parametrize("df1", [df_incomplete_cat_num])
269269
@pytest.mark.parametrize("df2", [df_imputed_cat_num])
270270
@pytest.mark.parametrize("df_mask", [df_mask_cat_num])
271-
def test_mean_difference_correlation_matrix_categorical_vs_numerical_features(
271+
def test_mean_diff_corr_matrix_categorical_vs_numerical_features(
272272
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
273273
) -> None:
274-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
274+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
275275
df1, df1, df_mask
276276
).equals(pd.Series([0.0], index=["col1"]))
277-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
277+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
278278
df1, df1, df_mask, False
279279
).equals(pd.Series([0.0], index=["col1"]))
280-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
280+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
281281
df1, df2, df_mask
282282
).equals(pd.Series([0.07009774198932273], index=["col1"]))
283283

@@ -330,6 +330,6 @@ def test_exception_raise_no_categorical_column_found(
330330
def test_value_error_get_correlation_f_oneway_matrix(
331331
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
332332
) -> None:
333-
assert metrics.mean_difference_correlation_matrix_categorical_vs_numerical_features(
333+
assert metrics.mean_diff_corr_matrix_categorical_vs_numerical_features(
334334
df1, df2, df_mask
335335
).equals(pd.Series([np.nan], index=["col1"]))

0 commit comments

Comments
 (0)