Commit 07d98a9

Julien Roussel authored and committed
varp naive freeze updated
1 parent 4600641 commit 07d98a9

File tree: 3 files changed, +23 -49 lines

examples/benchmark.md (7 additions, 27 deletions)

@@ -240,9 +240,6 @@ df_station = df_plot.loc[station]
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```
 
-Let's look at the imputations.
-When the data is missing at random, imputation is easier. Missing block are more challenging.
-
 ```python
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
@@ -263,19 +260,6 @@ for col in cols_to_impute:
 
 ```
 
-```python
-dfs_imputed_station
-```
-
-```python
-X = dfs_imputed_station["VAR_max"]
-model = dict_imputers["VAR_max"]._dict_fitting["__all__"][0]
-```
-
-```python
-model.B
-```
-
 ```python
 # plot.plot_imputations(df_station, dfs_imputed_station)
 
@@ -370,7 +354,7 @@ comparison = comparator.Comparator(
 )
 ```
 
-```python tags=[]
+```python tags=[] jupyter={"outputs_hidden": true}
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
@@ -401,7 +385,7 @@ plt.show()
 df_plot = df_data[cols_to_impute]
 ```
 
-```python
+```python jupyter={"outputs_hidden": true}
 dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
 ```
 
@@ -482,7 +466,7 @@ for i, col in enumerate(cols_to_impute[:-1]):
     for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed.items()):
         ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         plot.compare_covariances(df_plot, df_imp, col, cols_to_impute[i+1], ax, color=tab10(i_imputer), label=name_imputer)
-        ax.set_title(f"imputation method: {name_imputer}", fontsize=20)
+        ax.set_title(f"{name_imputer}", fontsize=20)
         i_plot += 1
         ax.legend()
 plt.show()
@@ -499,19 +483,14 @@ dfs_imputed["VAR_max"].groupby("station").min()
 ## Auto-correlation
 
 
-We are now interested in th eauto-correlation function (ACF). As seen before, time series display seaonal patterns.
-[Autocorrelation](https://en.wikipedia.org/wiki/Autocorrelation) is the correlation of a signal with a delayed copy of itself as a function of delay. Informally, it is the similarity between observations of a random variable as a function of the time lag between them.
-
-The idea is the AFC to be similar between the original dataset and the imputed one.
-Fot the TEMP variable, one sees the good reconstruction for all the algorithms.
-On th econtrary, for the PRES variable, all methods overestimates the autocorrelation of the variables, especially the RPCA one.
-Finally, for the DEWP variable, the methods cannot impute to obtain a behavior close to the original: the autocorrelation decreases to linearly.
+We are now interested in the auto-correlation function (ACF). As seen before, time series display seasonal patterns.
+[Autocorrelation](https://en.wikipedia.org/wiki/Autocorrelation) is the correlation of a signal with a delayed copy of itself as a function of delay. It measures the similarity between observations of a random variable as a function of the time lag between them. The objective is for the ACF to be similar between the original dataset and the imputed one.
 
 ```python
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)
 
-fig = plt.figure(figsize=(6 * n_columns, 6))
+fig = plt.figure(figsize=(9 * n_columns, 6))
 for i_col, col in enumerate(df_plot):
     ax = fig.add_subplot(1, n_columns, i_col + 1)
     for name_imputer, df_imp in dfs_imputed_station.items():
@@ -521,6 +500,7 @@ for i_col, col in enumerate(df_plot):
     values_orig = df_station[col]
     acf = utils.acf(values_orig)
     plt.plot(acf, color="black", lw=2, ls="--", label="original")
+    ax.set_title(f"{col}", fontsize=20)
     plt.legend()
 
 plt.savefig("figures/acf.png")
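
For intuition about what is being plotted: lag-k autocorrelation is the Pearson correlation between a series and its k-lagged copy, and `utils.acf` is qolmat's helper for it. A minimal standalone sketch, assuming nothing from the library (the name `acf_naive` and the toy seasonal series are illustrative only):

```python
import numpy as np
import pandas as pd


def acf_naive(values: pd.Series, lag_max: int = 30) -> pd.Series:
    """Lag-k Pearson autocorrelations, k = 1..lag_max."""
    lags = range(1, lag_max + 1)
    return pd.Series([values.autocorr(lag=k) for k in lags], index=lags)


# A noisy signal with a 30-step season: the ACF dips, then peaks again near lag 30.
rng = np.random.default_rng(0)
t = np.arange(365)
values = pd.Series(np.sin(2 * np.pi * t / 30) + 0.3 * rng.standard_normal(365))
print(acf_naive(values, lag_max=35).round(2))
```

An imputation that respects the temporal structure should leave these coefficients close to those of the original series, which is what the figure saved to figures/acf.png compares.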

qolmat/imputations/em_sampler.py (7 additions, 13 deletions)

@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import Dict, List, Literal, Union
+from typing import Dict, List, Literal, Tuple, Union
 import warnings
 
 import numpy as np
@@ -431,7 +431,7 @@ def transform(self, X: NDArray) -> NDArray:
 
         return X
 
-    def pretreatment(self, X, mask_na) -> NDArray:
+    def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]:
         """
         Pretreats the data before imputation by EM, making it more robust.
 
@@ -1055,11 +1055,11 @@ def init_imputation(self, X: NDArray) -> NDArray:
         """
         return utils.linear_interpolation(X)
 
-    def pretreatment(self, X, mask_na) -> NDArray:
+    def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]:
         """
         Pretreats the data before imputation by EM, making it more robust. In the case of the
-        VAR(p) model we carry the first observation backward on each variable to avoid explosive
-        imputations.
+        VAR(p) model we freeze the naive imputation on the first observations if all variables
+        are missing, to avoid explosive imputations.
 
         Parameters
         ----------
@@ -1077,15 +1077,9 @@ def pretreatment(self, X, mask_na) -> NDArray:
         """
         if self.p == 0:
             return X, mask_na
-        X = X.copy()
         mask_na = mask_na.copy()
-        n_rows, n_cols = X.shape
-        for col in range(n_cols):
-            n_holes_left = np.sum(np.cumsum(~mask_na[:, col]) == 0)
-            if n_holes_left == n_rows:
-                continue
-            X[:n_holes_left, col] = X[n_holes_left, col]
-            mask_na[:n_holes_left, col] = False
+        n_holes_left = np.sum(~np.cumsum(~mask_na, axis=0).any(axis=1))
+        mask_na[:n_holes_left] = False
         return X, mask_na
 
     def _check_convergence(self) -> bool:
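
The rewritten `pretreatment` replaces the per-column backward carry with a single vectorized rule: count the leading rows in which every variable is missing, then unmask them so the naive initial imputation is kept frozen there instead of being re-estimated, since a VAR(p) model has no past observations to condition on in that region. A minimal sketch of the same mask logic on a toy array (shapes and values are illustrative only):

```python
import numpy as np

# True = missing. The first two rows are entirely missing.
mask_na = np.array(
    [
        [True, True],
        [True, True],
        [False, True],
        [True, False],
    ]
)

# Cumulative count of observed entries per column, top to bottom;
# a row is "reached" once any column was observed at or before it.
any_seen = np.cumsum(~mask_na, axis=0).any(axis=1)

# Number of leading rows where all variables are still missing.
n_holes_left = int(np.sum(~any_seen))  # 2 here

# Clearing the mask on those rows freezes the naive imputation: the EM
# loop treats them as observed and no longer resamples them, which is
# what prevents the explosive imputations mentioned in the docstring.
mask_frozen = mask_na.copy()
mask_frozen[:n_holes_left] = False
print(n_holes_left)   # 2
print(mask_frozen)
```

This matches the updated test below, which masks the whole first row and expects `pretreatment` to return X unchanged with the first row of the mask cleared.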

tests/imputations/test_em_sampler.py (9 additions, 9 deletions)

@@ -431,20 +431,20 @@ def test_gradient_X_loglik(em: em_sampler.EM, p: int):
 )
 def test_pretreatment_temporal(em):
     mask2 = mask.copy()
-    mask2[0, 0] = True
-    mask2[:, 2] = True
+    mask2[0, :] = True
     X_result, mask_result = em.pretreatment(X_missing, mask2)
-    X_expected = np.array(
-        [[2, 4, 1], [2, 4, 3], [1, 4, np.nan], [-1, 2, 1], [1, 1, np.nan]],
-        dtype=float,
-    )
     mask_expected = mask.copy()
-    mask_expected[:2, 1] = False
-    mask_expected[:, 2] = True
-    np.testing.assert_allclose(X_result, X_expected)
+    mask_expected[0, :] = False
+    np.testing.assert_allclose(X_result, X_missing)
     np.testing.assert_allclose(mask_result, mask_expected)
 
 
+# X_missing = np.array(
+#     [[1, np.nan, 1], [2, np.nan, 3], [1, 4, np.nan], [-1, 2, 1], [1, 1, np.nan]],
+#     dtype=float,
+# )
+
+
 @pytest.mark.parametrize(
     "em",
     [

0 commit comments