Skip to content

Commit 4600641

Browse files
Julien RousselJulien Roussel
authored and committed
pretreatment in varp
1 parent eccb46d commit 4600641

File tree

5 files changed

+278
-37
lines changed

5 files changed

+278
-37
lines changed

HISTORY.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ History
1212
* Speed up of the EM algorithm likelihood maximization, using the conjugate gradient method
1313
* The ImputeRegressor class now handles the nans by `row` by default
1414
* The metric `frechet` was not correctly called and has been patched
15+
* The EM algorithm with VAR(p) now fills initial holes in order to avoid exponential explosions
1516

1617
0.1.2 (2024-02-28)
1718
------------------

examples/RPCA.md

Lines changed: 163 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ jupyter:
88
format_version: '1.3'
99
jupytext_version: 1.14.4
1010
kernelspec:
11-
display_name: Python 3 (ipykernel)
11+
display_name: env_qolmat_dev
1212
language: python
13-
name: python3
13+
name: env_qolmat_dev
1414
---
1515

16-
```python
16+
```python tags=[]
1717
%reload_ext autoreload
1818
%autoreload 2
1919

@@ -26,17 +26,18 @@ import sys
2626

2727
from math import pi
2828

29-
from qolmat.utils import plot, data
30-
from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
31-
from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
29+
from qolmat.utils import utils, plot, data
30+
from qolmat.imputations.rpca.rpca_pcp import RpcaPcp
31+
from qolmat.imputations.rpca.rpca_noisy import RpcaNoisy
32+
from qolmat.imputations.softimpute import SoftImpute
3233
from qolmat.imputations.rpca import rpca_utils
3334
from qolmat.utils.data import generate_artificial_ts
3435
```
3536

3637
**Generate synthetic data**
3738

38-
```python
39-
n_samples = 1000
39+
```python tags=[]
40+
n_samples = 10000
4041
periods = [100, 20]
4142
amp_anomalies = 0.5
4243
ratio_anomalies = 0.05
@@ -47,13 +48,15 @@ X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalie
4748
signal = X_true + A_true + E_true
4849

4950
# Adding missing data
50-
#signal[5:20] = np.nan
51-
mask = np.random.choice(len(signal), round(len(signal) / 20))
52-
signal[mask] = np.nan
51+
signal[120:180] = np.nan
52+
signal[:20] = np.nan
53+
# signal[80:220] = np.nan
54+
# mask = np.random.choice(len(signal), round(len(signal) / 20))
55+
# signal[mask] = np.nan
5356

5457
```
5558

56-
```python
59+
```python tags=[]
5760
fig = plt.figure(figsize=(15, 8))
5861
ax = fig.add_subplot(4, 1, 1)
5962
ax.title.set_text("Low-rank signal")
@@ -74,40 +77,172 @@ plt.plot(signal)
7477
plt.show()
7578
```
7679

80+
<!-- #region tags=[] -->
81+
# Fit RPCA Noisy
82+
<!-- #endregion -->
83+
84+
```python tags=[]
85+
rpca_noisy = RpcaNoisy(tau=1, lam=.4, rank=1, norm="L2")
86+
```
87+
88+
```python tags=[]
89+
period = 100
90+
D = utils.prepare_data(signal, period)
91+
Omega = ~np.isnan(D)
92+
D = utils.linear_interpolation(D)
93+
```
94+
95+
```python tags=[]
96+
M, A, L, Q = rpca_noisy.decompose_with_basis(D, Omega)
97+
M2, A2 = rpca_noisy.decompose_on_basis(D, Omega, Q)
98+
```
99+
100+
```python tags=[]
101+
M_final = utils.get_shape_original(M, signal.shape)
102+
A_final = utils.get_shape_original(A, signal.shape)
103+
D_final = utils.get_shape_original(D, signal.shape)
104+
signal_imputed = M_final + A_final
105+
```
106+
107+
```python tags=[]
108+
fig = plt.figure(figsize=(12, 4))
109+
110+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
111+
plt.plot(M_final, label="Imputed signal without anomalies")
112+
plt.plot(A_final, label="Anomalies")
113+
# plt.plot(D_final, label="D")
114+
plt.plot(signal, color="black", label="Original signal")
115+
plt.xlim(0, 400)
116+
plt.legend()
117+
plt.show()
118+
```
119+
77120
## PCP RPCA
78121

122+
```python tags=[]
123+
rpca_pcp = RpcaPcp(max_iterations=1000, lam=.1)
124+
```
125+
126+
```python tags=[]
127+
period = 100
128+
D = utils.prepare_data(signal, period)
129+
Omega = ~np.isnan(D)
130+
D = utils.linear_interpolation(D)
131+
```
132+
133+
```python tags=[]
134+
M, A = rpca_pcp.decompose(D, Omega)
135+
```
136+
137+
```python tags=[]
138+
M_final = utils.get_shape_original(M, signal.shape)
139+
A_final = utils.get_shape_original(A, signal.shape)
140+
D_final = utils.get_shape_original(D, signal.shape)
141+
# Y_final = utils.get_shape_original(Y, signal.shape)
142+
signal_imputed = M_final + A_final
143+
```
144+
145+
```python tags=[]
146+
fig = plt.figure(figsize=(12, 4))
147+
148+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
149+
plt.plot(M_final, label="Imputed signal without anomalies")
150+
plt.plot(A_final, label="Anomalies")
151+
152+
plt.plot(signal, color="black", label="Original signal")
153+
plt.xlim(0, 400)
154+
# plt.gca().twinx()
155+
# plt.plot(Y_final, label="Y")
156+
plt.legend()
157+
plt.show()
158+
```
159+
160+
## Soft Impute
161+
162+
```python tags=[]
163+
imputer = SoftImpute(max_iterations=1000, tau=.1)
164+
```
165+
166+
```python tags=[]
167+
period = 100
168+
D = utils.prepare_data(signal, period)
169+
Omega = ~np.isnan(D)
170+
D = utils.linear_interpolation(D)
171+
```
172+
173+
```python tags=[]
174+
M, A = imputer.decompose(D, Omega)
175+
```
176+
177+
```python tags=[]
178+
M_final = utils.get_shape_original(M, signal.shape)
179+
A_final = utils.get_shape_original(A, signal.shape)
180+
D_final = utils.get_shape_original(D, signal.shape)
181+
# Y_final = utils.get_shape_original(Y, signal.shape)
182+
signal_imputed = M_final + A_final
183+
```
184+
185+
```python tags=[]
186+
fig = plt.figure(figsize=(12, 4))
187+
188+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
189+
plt.plot(M_final, label="Imputed signal without anomalies")
190+
plt.plot(A_final, label="Anomalies")
191+
192+
plt.plot(signal, color="black", label="Original signal")
193+
plt.xlim(0, 400)
194+
plt.legend()
195+
plt.show()
196+
```
197+
198+
## Temporal RPCA
199+
79200
```python
80201
%%time
81-
rpca_pcp = RPCAPCP(period=100, max_iterations=100, mu=.5, lam=0.1)
82-
X, A = rpca_pcp.decompose_rpca_signal(signal)
83-
imputed = signal - A
202+
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
203+
rpca_noisy = RpcaNoisy(tau=1, lam=0.4, rank=2, norm="L2")
204+
M, A = rpca_noisy.decompose(D, Omega)
205+
# imputed = X
84206
```
85207

86-
```python
208+
```python tags=[]
87209
fig = plt.figure(figsize=(12, 4))
88-
plt.plot(X, color="black")
89-
plt.plot(imputed)
210+
211+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
212+
plt.plot(M_final, label="Imputed signal without anomalies")
213+
plt.plot(A_final, label="Anomalies")
214+
215+
plt.plot(signal, color="black", label="Original signal")
216+
plt.xlim(0, 400)
217+
# plt.gca().twinx()
218+
# plt.plot(Y_final, label="Y")
219+
plt.legend()
220+
plt.show()
90221
```
91222

92-
## Temporal RPCA
223+
# EM VAR(p)
93224

94225
```python
95-
signal.shape
226+
from qolmat.imputations import em_sampler
96227
```
97228

98229
```python
99-
%%time
100-
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
101-
rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, norm="L2")
102-
X, A = rpca_noisy.decompose_rpca_signal(signal)
103-
imputed =
230+
p = 1
231+
model = em_sampler.VARpEM(method="mle", max_iter_em=10, n_iter_ou=512, dt=1e-1, p=p)
232+
```
233+
234+
```python
235+
D = signal.reshape(-1, 1)
236+
M_final = model.fit_transform(D)
104237
```
105238

106239
```python
107240
fig = plt.figure(figsize=(12, 4))
108-
plt.plot(signal, color="black")
109-
plt.plot(X_true)
110-
plt.plot(X)
241+
plt.plot(signal_imputed, label="Imputed signal with anomalies")
242+
plt.plot(M_final, label="Imputed signal without anomalies")
243+
plt.xlim(0, 400)
244+
plt.legend()
245+
plt.show()
111246
```
112247

113248
```python

examples/benchmark.md

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
233233
```
234234

235235
```python
236-
station = df_plot.index.get_level_values("station")[0]
236+
# station = df_plot.index.get_level_values("station")[0]
237+
station = "Huairou"
237238
df_station = df_plot.loc[station]
238239
# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
239240
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
@@ -242,10 +243,6 @@ dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imput
242243
Let's look at the imputations.
243244
When the data is missing at random, imputation is easier. Missing block are more challenging.
244245

245-
```python
246-
dfs_imputed_station["VAR_max"]
247-
```
248-
249246
```python
250247
for col in cols_to_impute:
251248
fig, ax = plt.subplots(figsize=(10, 3))
@@ -266,6 +263,19 @@ for col in cols_to_impute:
266263

267264
```
268265

266+
```python
267+
dfs_imputed_station
268+
```
269+
270+
```python
271+
X = dfs_imputed_station["VAR_max"]
272+
model = dict_imputers["VAR_max"]._dict_fitting["__all__"][0]
273+
```
274+
275+
```python
276+
model.B
277+
```
278+
269279
```python
270280
# plot.plot_imputations(df_station, dfs_imputed_station)
271281

@@ -478,6 +488,14 @@ for i, col in enumerate(cols_to_impute[:-1]):
478488
plt.show()
479489
```
480490

491+
```python
492+
493+
```
494+
495+
```python
496+
dfs_imputed["VAR_max"].groupby("station").min()
497+
```
498+
481499
## Auto-correlation
482500

483501

qolmat/imputations/em_sampler.py

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212

1313
from qolmat.utils import utils
1414

15-
from matplotlib import pyplot as plt
16-
17-
from qolmat.utils.exceptions import IllConditioned
18-
1915

2016
def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray:
2117
"""
@@ -423,6 +419,8 @@ def transform(self, X: NDArray) -> NDArray:
423419
X = self.init_imputation(X)
424420
warm_start = False
425421

422+
X, mask_na = self.pretreatment(X, mask_na)
423+
426424
if (self.method == "mle") or not warm_start:
427425
X = self._maximize_likelihood(X, mask_na)
428426
if self.method == "sample":
@@ -433,6 +431,26 @@ def transform(self, X: NDArray) -> NDArray:
433431

434432
return X
435433

434+
def pretreatment(self, X, mask_na) -> NDArray:
435+
"""
436+
Pretreats the data before imputation by EM, making it more robust.
437+
438+
Parameters
439+
----------
440+
X : NDArray
441+
Data matrix without nans
442+
mask_na : NDArray
443+
Boolean matrix indicating which entries are to be imputed
444+
445+
Returns
446+
-------
447+
Tuple[NDArray, NDArray]
448+
A tuple containing:
449+
- X the pretreated data matrix
450+
- mask_na the updated mask
451+
"""
452+
return X, mask_na
453+
436454
def _check_conditionning(self, X: NDArray):
437455
"""
438456
Check that the data matrix X is not ill-conditioned. Running the EM algorithm on data with
@@ -1037,6 +1055,39 @@ def init_imputation(self, X: NDArray) -> NDArray:
10371055
"""
10381056
return utils.linear_interpolation(X)
10391057

1058+
def pretreatment(self, X, mask_na) -> NDArray:
1059+
"""
1060+
Pretreats the data before imputation by EM, making it more robust. In the case of the
1061+
VAR(p) model we carry the first observation backward on each variable to avoid explosive
1062+
imputations.
1063+
1064+
Parameters
1065+
----------
1066+
X : NDArray
1067+
Data matrix without nans
1068+
mask_na : NDArray
1069+
Boolean matrix indicating which entries are to be imputed
1070+
1071+
Returns
1072+
-------
1073+
Tuple[NDArray, NDArray]
1074+
A tuple containing:
1075+
- X the pretreated data matrix
1076+
- mask_na the updated mask
1077+
"""
1078+
if self.p == 0:
1079+
return X, mask_na
1080+
X = X.copy()
1081+
mask_na = mask_na.copy()
1082+
n_rows, n_cols = X.shape
1083+
for col in range(n_cols):
1084+
n_holes_left = np.sum(np.cumsum(~mask_na[:, col]) == 0)
1085+
if n_holes_left == n_rows:
1086+
continue
1087+
X[:n_holes_left, col] = X[n_holes_left, col]
1088+
mask_na[:n_holes_left, col] = False
1089+
return X, mask_na
1090+
10401091
def _check_convergence(self) -> bool:
10411092
"""
10421093
Check if the EM algorithm has converged. Three criteria:

0 commit comments

Comments
 (0)