Skip to content

Commit 4600641

Browse files
Julien RousselJulien Roussel
authored and committed
pretreatment in varp
1 parent eccb46d commit 4600641

File tree

5 files changed

+278
-37
lines changed

5 files changed

+278
-37
lines changed

HISTORY.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ History
1212
* Speed up of the EM algorithm likelihood maximization, using the conjugate gradient method
1313
* The ImputeRegressor class now handles the nans by `row` by default
1414
* The metric `frechet` was not correctly called and has been patched
15+
* The EM algorithm with VAR(p) now fills initial holes in order to avoid exponential explosions
1516

1617
0.1.2 (2024-02-28)
1718
------------------

examples/RPCA.md

Lines changed: 163 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ jupyter:
88
format_version: '1.3'
99
jupytext_version: 1.14.4
1010
kernelspec:
11-
display_name: Python 3 (ipykernel)
11+
display_name: env_qolmat_dev
1212
language: python
13-
name: python3
13+
name: env_qolmat_dev
1414
---
1515

16-
```python
16+
```python tags=[]
1717
%reload_ext autoreload
1818
%autoreload 2
1919

@@ -26,17 +26,18 @@ import sys
2626

2727
from math import pi
2828

29-
from qolmat.utils import plot, data
30-
from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
31-
from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
29+
from qolmat.utils import utils, plot, data
30+
from qolmat.imputations.rpca.rpca_pcp import RpcaPcp
31+
from qolmat.imputations.rpca.rpca_noisy import RpcaNoisy
32+
from qolmat.imputations.softimpute import SoftImpute
3233
from qolmat.imputations.rpca import rpca_utils
3334
from qolmat.utils.data import generate_artificial_ts
3435
```
3536

3637
**Generate synthetic data**
3738

38-
```python
39-
n_samples = 1000
39+
```python tags=[]
40+
n_samples = 10000
4041
periods = [100, 20]
4142
amp_anomalies = 0.5
4243
ratio_anomalies = 0.05
@@ -47,13 +48,15 @@ X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalie
4748
signal = X_true + A_true + E_true
4849

4950
# Adding missing data
50-
#signal[5:20] = np.nan
51-
mask = np.random.choice(len(signal), round(len(signal) / 20))
52-
signal[mask] = np.nan
51+
signal[120:180] = np.nan
52+
signal[:20] = np.nan
53+
# signal[80:220] = np.nan
54+
# mask = np.random.choice(len(signal), round(len(signal) / 20))
55+
# signal[mask] = np.nan
5356

5457
```
5558

56-
```python
59+
```python tags=[]
5760
fig = plt.figure(figsize=(15, 8))
5861
ax = fig.add_subplot(4, 1, 1)
5962
ax.title.set_text("Low-rank signal")
@@ -74,40 +77,172 @@ plt.plot(signal)
7477
plt.show()
7578
```
7679

80+
<!-- #region tags=[] -->
81+
# Fit RPCA Noisy
82+
<!-- #endregion -->
83+
84+
```python tags=[]
85+
rpca_noisy = RpcaNoisy(tau=1, lam=.4, rank=1, norm="L2")
86+
```
87+
88+
```python tags=[]
89+
period = 100
90+
D = utils.prepare_data(signal, period)
91+
Omega = ~np.isnan(D)
92+
D = utils.linear_interpolation(D)
93+
```
94+
95+
```python tags=[]
96+
M, A, L, Q = rpca_noisy.decompose_with_basis(D, Omega)
97+
M2, A2 = rpca_noisy.decompose_on_basis(D, Omega, Q)
98+
```
99+
100+
```python tags=[]
101+
M_final = utils.get_shape_original(M, signal.shape)
102+
A_final = utils.get_shape_original(A, signal.shape)
103+
D_final = utils.get_shape_original(D, signal.shape)
104+
signal_imputed = M_final + A_final
105+
```
106+
107+
```python tags=[]
108+
fig = plt.figure(figsize=(12, 4))
109+
110+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
111+
plt.plot(M_final, label="Imputed signal without anomalies")
112+
plt.plot(A_final, label="Anomalies")
113+
# plt.plot(D_final, label="D")
114+
plt.plot(signal, color="black", label="Original signal")
115+
plt.xlim(0, 400)
116+
plt.legend()
117+
plt.show()
118+
```
119+
77120
## PCP RPCA
78121

122+
```python tags=[]
123+
rpca_pcp = RpcaPcp(max_iterations=1000, lam=.1)
124+
```
125+
126+
```python tags=[]
127+
period = 100
128+
D = utils.prepare_data(signal, period)
129+
Omega = ~np.isnan(D)
130+
D = utils.linear_interpolation(D)
131+
```
132+
133+
```python tags=[]
134+
M, A = rpca_pcp.decompose(D, Omega)
135+
```
136+
137+
```python tags=[]
138+
M_final = utils.get_shape_original(M, signal.shape)
139+
A_final = utils.get_shape_original(A, signal.shape)
140+
D_final = utils.get_shape_original(D, signal.shape)
141+
# Y_final = utils.get_shape_original(Y, signal.shape)
142+
signal_imputed = M_final + A_final
143+
```
144+
145+
```python tags=[]
146+
fig = plt.figure(figsize=(12, 4))
147+
148+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
149+
plt.plot(M_final, label="Imputed signal without anomalies")
150+
plt.plot(A_final, label="Anomalies")
151+
152+
plt.plot(signal, color="black", label="Original signal")
153+
plt.xlim(0, 400)
154+
# plt.gca().twinx()
155+
# plt.plot(Y_final, label="Y")
156+
plt.legend()
157+
plt.show()
158+
```
159+
160+
## Soft Impute
161+
162+
```python tags=[]
163+
imputer = SoftImpute(max_iterations=1000, tau=.1)
164+
```
165+
166+
```python tags=[]
167+
period = 100
168+
D = utils.prepare_data(signal, period)
169+
Omega = ~np.isnan(D)
170+
D = utils.linear_interpolation(D)
171+
```
172+
173+
```python tags=[]
174+
M, A = imputer.decompose(D, Omega)
175+
```
176+
177+
```python tags=[]
178+
M_final = utils.get_shape_original(M, signal.shape)
179+
A_final = utils.get_shape_original(A, signal.shape)
180+
D_final = utils.get_shape_original(D, signal.shape)
181+
# Y_final = utils.get_shape_original(Y, signal.shape)
182+
signal_imputed = M_final + A_final
183+
```
184+
185+
```python tags=[]
186+
fig = plt.figure(figsize=(12, 4))
187+
188+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
189+
plt.plot(M_final, label="Imputed signal without anomalies")
190+
plt.plot(A_final, label="Anomalies")
191+
192+
plt.plot(signal, color="black", label="Original signal")
193+
plt.xlim(0, 400)
194+
plt.legend()
195+
plt.show()
196+
```
197+
198+
## Temporal RPCA
199+
79200
```python
80201
%%time
81-
rpca_pcp = RPCAPCP(period=100, max_iterations=100, mu=.5, lam=0.1)
82-
X, A = rpca_pcp.decompose_rpca_signal(signal)
83-
imputed = signal - A
202+
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
203+
rpca_noisy = RpcaNoisy(tau=1, lam=0.4, rank=2, norm="L2")
204+
M, A = rpca_noisy.decompose(D, Omega)
205+
# imputed = X
84206
```
85207

86-
```python
208+
```python tags=[]
87209
fig = plt.figure(figsize=(12, 4))
88-
plt.plot(X, color="black")
89-
plt.plot(imputed)
210+
211+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
212+
plt.plot(M_final, label="Imputed signal without anomalies")
213+
plt.plot(A_final, label="Anomalies")
214+
215+
plt.plot(signal, color="black", label="Original signal")
216+
plt.xlim(0, 400)
217+
# plt.gca().twinx()
218+
# plt.plot(Y_final, label="Y")
219+
plt.legend()
220+
plt.show()
90221
```
91222

92-
## Temporal RPCA
223+
# EM VAR(p)
93224

94225
```python
95-
signal.shape
226+
from qolmat.imputations import em_sampler
96227
```
97228

98229
```python
99-
%%time
100-
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
101-
rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, norm="L2")
102-
X, A = rpca_noisy.decompose_rpca_signal(signal)
103-
imputed =
230+
p = 1
231+
model = em_sampler.VARpEM(method="mle", max_iter_em=10, n_iter_ou=512, dt=1e-1, p=p)
232+
```
233+
234+
```python
235+
D = signal.reshape(-1, 1)
236+
M_final = model.fit_transform(D)
104237
```
105238

106239
```python
107240
fig = plt.figure(figsize=(12, 4))
108-
plt.plot(signal, color="black")
109-
plt.plot(X_true)
110-
plt.plot(X)
241+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
242+
plt.plot(M_final, label="Imputed signal without anomalies")
243+
plt.xlim(0, 400)
244+
plt.legend()
245+
plt.show()
111246
```
112247

113248
```python

examples/benchmark.md

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
233233
```
234234

235235
```python
236-
station = df_plot.index.get_level_values("station")[0]
236+
# station = df_plot.index.get_level_values("station")[0]
237+
station = "Huairou"
237238
df_station = df_plot.loc[station]
238239
# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
239240
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
@@ -242,10 +243,6 @@ dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imput
242243
Let's look at the imputations.
243244
When the data is missing at random, imputation is easier. Missing block are more challenging.
244245

245-
```python
246-
dfs_imputed_station["VAR_max"]
247-
```
248-
249246
```python
250247
for col in cols_to_impute:
251248
fig, ax = plt.subplots(figsize=(10, 3))
@@ -266,6 +263,19 @@ for col in cols_to_impute:
266263

267264
```
268265

266+
```python
267+
dfs_imputed_station
268+
```
269+
270+
```python
271+
X = dfs_imputed_station["VAR_max"]
272+
model = dict_imputers["VAR_max"]._dict_fitting["__all__"][0]
273+
```
274+
275+
```python
276+
model.B
277+
```
278+
269279
```python
270280
# plot.plot_imputations(df_station, dfs_imputed_station)
271281

@@ -478,6 +488,14 @@ for i, col in enumerate(cols_to_impute[:-1]):
478488
plt.show()
479489
```
480490

491+
```python
492+
493+
```
494+
495+
```python
496+
dfs_imputed["VAR_max"].groupby("station").min()
497+
```
498+
481499
## Auto-correlation
482500

483501

qolmat/imputations/em_sampler.py

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212

1313
from qolmat.utils import utils
1414

15-
from matplotlib import pyplot as plt
16-
17-
from qolmat.utils.exceptions import IllConditioned
18-
1915

2016
def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray:
2117
"""
@@ -423,6 +419,8 @@ def transform(self, X: NDArray) -> NDArray:
423419
X = self.init_imputation(X)
424420
warm_start = False
425421

422+
X, mask_na = self.pretreatment(X, mask_na)
423+
426424
if (self.method == "mle") or not warm_start:
427425
X = self._maximize_likelihood(X, mask_na)
428426
if self.method == "sample":
@@ -433,6 +431,26 @@ def transform(self, X: NDArray) -> NDArray:
433431

434432
return X
435433

434+
def pretreatment(self, X, mask_na) -> NDArray:
435+
"""
436+
Pretreats the data before imputation by EM, making it more robust.
437+
438+
Parameters
439+
----------
440+
X : NDArray
441+
Data matrix without nans
442+
mask_na : NDArray
443+
Boolean matrix indicating which entries are to be imputed
444+
445+
Returns
446+
-------
447+
Tuple[NDArray, NDArray]
448+
A tuple containing:
449+
- X the pretreated data matrix
450+
- mask_na the updated mask
451+
"""
452+
return X, mask_na
453+
436454
def _check_conditionning(self, X: NDArray):
437455
"""
438456
Check that the data matrix X is not ill-conditioned. Running the EM algorithm on data with
@@ -1037,6 +1055,39 @@ def init_imputation(self, X: NDArray) -> NDArray:
10371055
"""
10381056
return utils.linear_interpolation(X)
10391057

1058+
def pretreatment(self, X, mask_na) -> NDArray:
1059+
"""
1060+
Pretreats the data before imputation by EM, making it more robust. In the case of the
1061+
VAR(p) model we carry the first observation backward on each variable to avoid explosive
1062+
imputations.
1063+
1064+
Parameters
1065+
----------
1066+
X : NDArray
1067+
Data matrix without nans
1068+
mask_na : NDArray
1069+
Boolean matrix indicating which entries are to be imputed
1070+
1071+
Returns
1072+
-------
1073+
Tuple[NDArray, NDArray]
1074+
A tuple containing:
1075+
- X the pretreated data matrix
1076+
- mask_na the updated mask
1077+
"""
1078+
if self.p == 0:
1079+
return X, mask_na
1080+
X = X.copy()
1081+
mask_na = mask_na.copy()
1082+
n_rows, n_cols = X.shape
1083+
for col in range(n_cols):
1084+
n_holes_left = np.sum(np.cumsum(~mask_na[:, col]) == 0)
1085+
if n_holes_left == n_rows:
1086+
continue
1087+
X[:n_holes_left, col] = X[n_holes_left, col]
1088+
mask_na[:n_holes_left, col] = False
1089+
return X, mask_na
1090+
10401091
def _check_convergence(self) -> bool:
10411092
"""
10421093
Check if the EM algorithm has converged. Three criteria:

0 commit comments

Comments
 (0)