
Commit eccb46d

Julien Roussel authored and committed
history updated
1 parent 7fae887 commit eccb46d

18 files changed: +726 -332 lines

HISTORY.rst

Lines changed: 11 additions & 0 deletions
@@ -2,6 +2,17 @@
 History
 =======
 
+0.1.3 (2024-03-07)
+------------------
+
+* RPCA algorithms now start with a normalizing scaler
+* The EM algorithms now include a gradient projection step to be more robust to collinearity
+* The EM algorithm based on the Gaussian model is now initialized using a robust estimation of the covariance matrix
+* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling bias
+* Speed-up of the EM algorithm likelihood maximization, using the conjugate gradient method
+* The ImputeRegressor class now handles NaNs by `row` by default
+* The metric `frechet` was not correctly called and has been patched
+
 0.1.2 (2024-02-28)
 ------------------
 
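To make the EM-related items above concrete, here is a minimal usage sketch. It reuses the `ImputerEM` call that appears in the benchmark notebook later in this commit; the toy DataFrame, column names and 10% masking ratio are illustrative assumptions, not part of the commit.

```python
# Illustrative sketch only: the parameters mirror the benchmark notebook below;
# the synthetic data and masking ratio are made up for the example.
import numpy as np
import pandas as pd
from qolmat.imputations import imputers

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=["x1", "x2", "x3"])
df = df.mask(rng.random(df.shape) < 0.1)  # punch random holes

# VAR(1) model fitted by EM; method="mle" uses the likelihood maximization
# that this release speeds up with the conjugate gradient method.
imputer = imputers.ImputerEM(model="VAR", method="mle", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
df_imputed = imputer.fit_transform(df)
```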

docs/imputers.rst

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement
 The class :class:`RPCANoisy` implements a recommended improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by :math:`\eta_k` and :math:`H_k`. Defining :math:`\Vert \mathbf{MH_k} \Vert_p` as either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimization problem is the following
 
 .. math::
-   \text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
+   \text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
 
 with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
 See the :class:`~qolmat.imputations.imputers.ImputerRpcaNoisy` class for implementation details.
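As a usage illustration of the penalized problem above, a minimal sketch with `ImputerRpcaNoisy`: `tau` weights the nuclear-norm term and `lam` the sparse term. The parameter values are copied from the benchmark notebook changed in this commit; the synthetic low-rank DataFrame is an assumption for the example.

```python
# Minimal sketch, not the documented example: tau/lam/rank values come from
# the benchmark notebook in this commit; the data below is synthetic.
import numpy as np
import pandas as pd
from qolmat.imputations import imputers

rng = np.random.default_rng(42)
low_rank = np.outer(rng.normal(size=100), rng.normal(size=4))  # rank-1 signal
df = pd.DataFrame(low_rank + 0.1 * rng.normal(size=(100, 4)))
df = df.mask(rng.random(df.shape) < 0.15)  # add missing entries

imputer = imputers.ImputerRpcaNoisy(tau=0.01, lam=5, rank=1, max_iterations=500)
df_imputed = imputer.fit_transform(df)
```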

examples/benchmark.md

Lines changed: 89 additions & 64 deletions
@@ -8,9 +8,9 @@ jupyter:
     format_version: '1.3'
     jupytext_version: 1.14.4
   kernelspec:
-    display_name: env_qolmat
+    display_name: env_qolmat_dev
     language: python
-    name: env_qolmat
+    name: env_qolmat_dev
 ---
 
 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -28,6 +28,8 @@ import warnings
 %reload_ext autoreload
 %autoreload 2
 
+from IPython.display import Image
+
 import pandas as pd
 from datetime import datetime
 import numpy as np
@@ -82,12 +84,12 @@ n_cols = len(cols_to_impute)
 ```
 
 ```python
-fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
+fig = plt.figure(figsize=(20 * n_stations, 6 * n_cols))
 for i_station, (station, df) in enumerate(df_data.groupby("station")):
     df_station = df_data.loc[station]
     for i_col, col in enumerate(cols_to_impute):
         fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
-        plt.plot(df_station[col], '.', label=station)
+        plt.plot(df_station[col], label=station)
         # break
         plt.ylabel(col)
         plt.xticks(rotation=15)
@@ -127,7 +129,7 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
 imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
 imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
 
-imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
+imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=.01, lam=5, rank=1)
 imputer_rpca_opti = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=256)
 dict_config_opti["RPCA_opti"] = {
     "tau": ho.hp.uniform("tau", low=.5, high=5),
@@ -141,9 +143,9 @@ dict_config_opti["RPCA_opticw"] = {
     "lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
 }
 
-imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
-imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3, p=1)
-imputer_tsmle = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=100, n_iter_ou=15, dt=1e-3, p=1)
+imputer_normal_sample = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2)
+imputer_var_sample = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
+imputer_var_max = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
 
 imputer_knn = imputers.ImputerKNN(groups=("station",), n_neighbors=10)
 imputer_mice = imputers.ImputerMICE(groups=("station",), estimator=LinearRegression(), sample_posterior=False, max_iter=100)
@@ -163,25 +165,25 @@ dict_imputers = {
     # "spline": imputer_spline,
     # "shuffle": imputer_shuffle,
     "residuals": imputer_residuals,
-    # "OU": imputer_ou,
-    "TSOU": imputer_tsou,
-    "TSMLE": imputer_tsmle,
+    "Normal_sample": imputer_normal_sample,
+    "VAR_sample": imputer_var_sample,
+    "VAR_max": imputer_var_max,
     "RPCA": imputer_rpca,
     # "RPCA_opti": imputer_rpca,
     # "RPCA_opticw": imputer_rpca_opti2,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
-    "ols": imputer_regressor,
-    "mice_ols": imputer_mice,
+    "OLS": imputer_regressor,
+    "MICE_OLS": imputer_mice,
 }
 n_imputers = len(dict_imputers)
 ```
 
 In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.
 
 <p align="center">
-    <img src="../../docs/images/comparator.png" width=50% height=50%>
+    <img src="https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/schema_qolmat.png" width=50% height=50%>
 </p>
 

@@ -190,14 +192,14 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 
 Note these metrics compute reconstruction errors; they tell nothing about the distances between the "true" and "imputed" distributions.
 
-```python
-metrics = ["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"]
+```python tags=[]
+metrics = ["mae", "wmape", "KL_columnwise", "frechet"]
 comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes = generator_holes,
     metrics=metrics,
-    max_evals=10,
+    max_evals=2,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
@@ -220,9 +222,9 @@ plt.show()
 ### **III. Comparison of methods**
 
 
-We now run just one time each algorithm on the initial corrupted dataframe and compare the different performances through multiple analysis.
+We now run each algorithm once on the initial corrupted dataframe and visualize the different imputations.
 
-```python
+```python tags=[]
 df_plot = df_data[cols_to_impute]
 ```

@@ -233,12 +235,17 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
 ```python
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
+# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```
 
 Let's look at the imputations.
 When the data is missing at random, imputation is easier. Missing blocks are more challenging.
 
+```python
+dfs_imputed_station["VAR_max"]
+```
+
 ```python
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
270277
for i_col, col in enumerate(cols_to_impute):
271278
for name_imputer, df_imp in dfs_imputed_station.items():
272279

273-
fig.add_subplot(n_columns, n_imputers, i_plot)
280+
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
274281
values_orig = df_station[col]
275282

276283
values_imp = df_imp[col].copy()
277284
values_imp[values_orig.notna()] = np.nan
278-
plt.plot(values_imp, marker="o", color=tab10(0), label=name_imputer, alpha=1)
285+
plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
279286
plt.plot(values_orig, color='black', marker="o", label="original")
280287
plt.ylabel(col, fontsize=16)
281-
if i_plot % n_columns == 1:
282-
plt.legend(loc=[1, 0], fontsize=18)
288+
if i_plot % n_imputers == 0:
289+
plt.legend(loc="lower right", fontsize=18)
283290
plt.xticks(rotation=15)
284291
if i_col == 0:
285292
plt.title(name_imputer)
286293
if i_col != n_columns - 1:
287-
plt.xticks([], [])
294+
ax.set_xticklabels([])
288295
loc = plticker.MultipleLocator(base=2*365)
289296
ax.xaxis.set_major_locator(loc)
290297
ax.tick_params(axis='both', which='major')
@@ -297,7 +304,7 @@ plt.show()
297304
## (Optional) Deep Learning Model
298305

299306

300-
In this section, we present an MLP model of data imputation using Keras, which can be installed using a "pip install pytorch".
307+
In this section, we present an MLP model of data imputation using PyTorch, which can be installed using a "pip install qolmat[pytorch]".
301308

302309
```python
303310
from qolmat.imputations import imputers_pytorch
@@ -308,17 +315,6 @@ except ModuleNotFoundError:
308315
raise PyTorchExtraNotInstalled
309316
```
310317

311-
For the MLP model, we work on a dataset that corresponds to weather data with missing values. We add missing MCAR values on the features "TEMP", "PRES" and other features with NaN values. The goal is impute the missing values for the features "TEMP" and "PRES" by a Deep Learning method. We add features to take into account the seasonality of the data set and a feature for the station name
312-
313-
```python
314-
df = data.get_data("Beijing")
315-
cols_to_impute = ["TEMP", "PRES"]
316-
cols_with_nans = list(df.columns[df.isna().any()])
317-
df_data = data.add_datetime_features(df)
318-
df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
319-
df_data
320-
```
321-
322318
For the example, we use a simple MLP model with 3 layers of neurons.
323319
Then we train the model without taking a group on the stations
324320

@@ -340,49 +336,75 @@ plt.show()
340336
```
341337

342338
```python
343-
# estimator = nn.Sequential(
344-
# nn.Linear(np.sum(df_data.isna().sum()==0), 256),
345-
# nn.ReLU(),
346-
# nn.Linear(256, 128),
347-
# nn.ReLU(),
348-
# nn.Linear(128, 64),
349-
# nn.ReLU(),
350-
# nn.Linear(64, 1)
351-
# )
352-
estimator = imputers_pytorch.build_mlp(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256,128,64])
353-
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=df_data.values.shape[1],latent_dim=4, output_dim=df_data.values.shape[1], list_num_neurons=[4*4, 2*4])
339+
n_variables = len(cols_to_impute)
340+
341+
estimator = imputers_pytorch.build_mlp(input_dim=n_variables-1, list_num_neurons=[256,128,64])
342+
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=n_variables,latent_dim=4, output_dim=n_variables, list_num_neurons=[4*4, 2*4])
354343
```
355344

356345
```python
357-
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), handler_nan = "column", epochs=500)
346+
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), epochs=500)
358347
dict_imputers["Autoencoder"] = imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
359348
dict_imputers["Diffusion"] = imputer_diffusion = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
360349
```
361350

362351
We can re-run the imputation model benchmark as before.
352+
```python
353+
comparison = comparator.Comparator(
354+
dict_imputers,
355+
cols_to_impute,
356+
generator_holes = generator_holes,
357+
metrics=metrics,
358+
max_evals=2,
359+
dict_config_opti=dict_config_opti,
360+
)
361+
```
362+
363363
```python tags=[]
364364
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)
365365

366366
comparison = comparator.Comparator(
367367
dict_imputers,
368-
selected_columns = df_data.columns,
368+
cols_to_impute,
369369
generator_holes = generator_holes,
370-
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
371-
max_evals=10,
370+
metrics=metrics,
371+
max_evals=2,
372372
dict_config_opti=dict_config_opti,
373373
)
374374
results = comparison.compare(df_data)
375375
results.style.highlight_min(color="green", axis=1)
376376
```
377+
```python
378+
n_metrics = len(metrics)
379+
fig = plt.figure(figsize=(24, 4 * n_metrics))
380+
for i, metric in enumerate(metrics):
381+
fig.add_subplot(n_metrics, 1, i + 1)
382+
df = results.loc[metric]
383+
plot.multibar(df, decimals=2)
384+
plt.ylabel(metric)
385+
386+
#plt.savefig("figures/imputations_benchmark_errors.png")
387+
plt.show()
388+
```
389+
377390
```python tags=[]
378-
df_plot = df_data
391+
df_plot = df_data[cols_to_impute]
392+
```
393+
394+
```python
379395
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
396+
```
397+
398+
```python
380399
station = df_plot.index.get_level_values("station")[0]
381400
df_station = df_plot.loc[station]
382401
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
383402
```
384403

385-
```python tags=[]
404+
Let's look at the imputations.
405+
When the data is missing at random, imputation is easier. Missing block are more challenging.
406+
407+
```python
386408
for col in cols_to_impute:
387409
fig, ax = plt.subplots(figsize=(10, 3))
388410
values_orig = df_station[col]
@@ -399,39 +421,42 @@ for col in cols_to_impute:
     ax.xaxis.set_major_locator(loc)
     ax.tick_params(axis='both', which='major', labelsize=17)
     plt.show()
+
 ```
 
 ```python
-n_columns = len(df_plot.columns)
+# plot.plot_imputations(df_station, dfs_imputed_station)
+
+n_columns = len(cols_to_impute)
 n_imputers = len(dict_imputers)
 
-fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
+fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
 i_plot = 1
-for i_col, col in enumerate(df_plot):
+for i_col, col in enumerate(cols_to_impute):
     for name_imputer, df_imp in dfs_imputed_station.items():
 
-        fig.add_subplot(n_columns, n_imputers, i_plot)
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]
 
-        plt.plot(values_orig, ".", color='black', label="original")
-
         values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
-        plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
+        plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
+        plt.plot(values_orig, color='black', marker="o", label="original")
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 1:
-            plt.legend(loc=[1, 0], fontsize=18)
+        if i_plot % n_imputers == 0:
+            plt.legend(loc="lower right", fontsize=18)
         plt.xticks(rotation=15)
        if i_col == 0:
             plt.title(name_imputer)
         if i_col != n_columns - 1:
-            plt.xticks([], [])
+            ax.set_xticklabels([])
         loc = plticker.MultipleLocator(base=2*365)
         ax.xaxis.set_major_locator(loc)
         ax.tick_params(axis='both', which='major')
         i_plot += 1
-plt.savefig("figures/imputations_benchmark.png")
+
 plt.show()
+
 ```
 
 ## Covariance
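Stepping back from the line-by-line changes, the benchmark in this notebook boils down to the loop sketched below. It is a condensed, hedged summary of the code shown in the diff above: it assumes `df_data`, `dict_imputers` and `cols_to_impute` are defined as in the notebook, that `comparator` and `missing_patterns` live under `qolmat.benchmark` (the notebook only uses the module names, so the import path is an assumption), and picks `ratio_masked=0.1` in place of the variable defined earlier in the notebook.

```python
# Condensed sketch of the benchmark loop from the notebook diff above.
from qolmat.benchmark import comparator, missing_patterns  # assumed import path

generator_holes = missing_patterns.EmpiricalHoleGenerator(
    n_splits=3, groups=("station",), subset=cols_to_impute, ratio_masked=0.1
)
comparison = comparator.Comparator(
    dict_imputers,
    cols_to_impute,
    generator_holes=generator_holes,
    metrics=["mae", "wmape", "KL_columnwise", "frechet"],
    max_evals=2,
)
results = comparison.compare(df_data)  # scores aggregated over the hole-generation splits
results.style.highlight_min(color="green", axis=1)
```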

qolmat/benchmark/metrics.py

Lines changed: 1 addition & 1 deletion
@@ -1011,7 +1011,7 @@ def get_metric(name: str) -> Callable:
         "ks_test": kolmogorov_smirnov_test,
         "correlation_diff": mean_difference_correlation_matrix_numerical_features,
         "energy": sum_energy_distances,
-        "frechet": frechet_distance,
+        "frechet": frechet_distance_pattern,
         "dist_corr_pattern": partial(
             pattern_based_weighted_mean_metric,
             metric=distance_anticorr,
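The one-line fix above rewires the `frechet` key to `frechet_distance_pattern`, matching the changelog entry in HISTORY.rst. A hedged sketch of how the lookup might be exercised; the three-DataFrame call signature (original, imputed, mask) is inferred from how metrics are used in the benchmark and is an assumption, not something this diff shows.

```python
# Illustrative only: the (df_original, df_imputed, df_mask) signature is assumed.
import pandas as pd
from qolmat.benchmark.metrics import get_metric

df_original = pd.DataFrame({"TEMP": [1.0, 2.0, 3.0, 4.0], "PRES": [10.0, 11.0, 12.0, 13.0]})
df_imputed = pd.DataFrame({"TEMP": [1.0, 2.1, 3.0, 3.9], "PRES": [10.0, 11.2, 12.0, 12.8]})
df_mask = pd.DataFrame({"TEMP": [False, True, False, True], "PRES": [False, True, False, True]})

frechet = get_metric("frechet")  # now resolves to frechet_distance_pattern
print(frechet(df_original, df_imputed, df_mask))
```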
