uniformization em

Julien Roussel · Julien Roussel · commit c47076793799 · 2023-02-28T11:00:27.000+01:00
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -8,13 +8,13 @@
 import pandas as pd
 import scipy
 from numpy.typing import ArrayLike
-from sklearn.impute._base import _BaseImputer
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import StandardScaler
 
 logger = logging.getLogger(__name__)
 
 
-def _gradient_conjugue(A: ArrayLike, X: ArrayLike, tol: float = 1e-6) -> ArrayLike:
+def _gradient_conjugue(A: ArrayLike, X: ArrayLike) -> ArrayLike:
     """
     Minimize Tr(X.T AX) by imputing missing values.
     To this aim, we compute in parallel a gradient algorithm for each data.
@@ -25,8 +25,6 @@ def _gradient_conjugue(A: ArrayLike, X: ArrayLike, tol: float = 1e-6) -> ArrayLi
         A array
     X : ArrayLike
         X array
-    tol : float, optional
-        Tolerance, by default 1e-6
 
     Returns
     -------
@@ -79,7 +77,7 @@ def invert_robust(M, epsilon=1e-2):
     return scipy.linalg.inv(Meps)
 
 
-class ImputeEM(BaseEstimator, TransformerMixin):
+class EM(BaseEstimator, TransformerMixin):
     """
     Imputation of missing values using a multivariate Gaussian model through EM optimization and
     using a projected Ornstein-Uhlenbeck process.
@@ -131,7 +129,7 @@ def __init__(
         tolerance: Optional[float] = 1e-4,
         stagnation_threshold: Optional[float] = 5e-3,
         stagnation_loglik: Optional[float] = 2,
-    ) -> None:
+    ):
 
         if strategy not in ["mle", "ou"]:
             raise Exception("strategy has to be 'mle' or 'ou'")
@@ -276,7 +274,7 @@ def transform(self, X: np.array) -> np.array:
         return X_transformed
 
 
-class ImputeMultiNormalEM(ImputeEM):
+class MultiNormalEM(EM):
     """
     Imputation of missing values using a multivariate Gaussian model through EM optimization and
     using a projected Ornstein-Uhlenbeck process.
@@ -488,7 +486,7 @@ def _check_convergence(self) -> bool:
         return min_diff_reached or min_diff_stable or max_loglik
 
 
-class ImputeVAR1EM(ImputeEM):
+class VAR1EM(EM):
     """
     Imputation of missing values using a vector autoregressive model through EM optimization and
     using a projected Ornstein-Uhlenbeck process.
diff --git a/qolmat/imputations/models.py b/qolmat/imputations/models.py
@@ -482,118 +482,9 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         )
         results = imputer.fit_transform(df)
         return pd.DataFrame(data=results, columns=df.columns, index=df.index)
-    
-
-class ImputerRPCA(Imputer):
-    """
-    This class implements the RPCA imputation
-
-    Parameters
-    ----------
-    method : str
-        name of the RPCA method:
-            "PCP" for basic RPCA
-            "temporal" for temporal RPCA, with regularisations
-            "online" for online RPCA
-    columnwise : bool
-        for RPCA method to be applied columnwise (with reshaping of each column into an array)
-        or to be applied directly on the dataframe. By default, the value is set to False.
-    """
-
-    def __init__(
-        self,
-        method: str = "temporal",
-        groups: List[str] = [],
-        columnwise: bool = False,
-        **hyperparams
-        ) -> None:
-        super().__init__(groups=groups, columnwise=columnwise, hyperparams=hyperparams)
-        
-        self.method = method
-        
-    def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
-        """
-        Fit/transform to impute with RPCA methods
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe to impute
-
-        Returns
-        -------
-        pd.DataFrame
-            imputed dataframe
-        """
-        if not isinstance(df, pd.DataFrame):
-            raise ValueError("Input has to be a pandas.DataFrame.")
-
-        if self.method == "PCP":
-            rpca = RPCA(**self.hyperparams_element)
-        elif self.method == "temporal":
-            rpca = TemporalRPCA(**self.hyperparams_element)
-        elif self.method == "onlinetemporal":
-            rpca = OnlineTemporalRPCA(**self.hyperparams_element)
-            
-        df_imputed = pd.DataFrame(rpca.fit_transform(X=df.values), index=df.index, columns=df.columns)
-
-        return df_imputed
-
 
-class ImputeEM(_BaseImputer):
-    def __init__(
-        self,
-        strategy: Optional[str] = "mle",
-        method: Optional[str] = "multinormal",
-        max_iter_em: Optional[int] = 200,
-        n_iter_ou: Optional[int] = 50,
-        ampli: Optional[int] = 1,
-        random_state: Optional[int] = 123,
-        dt: Optional[float] = 2e-2,
-        tolerance: Optional[float] = 1e-4,
-        stagnation_threshold: Optional[float] = 5e-3,
-        stagnation_loglik: Optional[float] = 2,
-    ):
-        if method == "multinormal":
-            self.model = em_sampler.ImputeMultiNormalEM(
-                strategy=strategy,
-                max_iter_em=max_iter_em,
-                n_iter_ou=n_iter_ou,
-                ampli=ampli,
-                random_state=random_state,
-                dt=dt,
-                tolerance=tolerance,
-                stagnation_threshold=stagnation_threshold,
-                stagnation_loglik=stagnation_loglik,
-            )
-        elif method == "VAR1":
-            self.model = em_sampler.ImputeVAR1EM(
-                strategy=strategy,
-                max_iter_em=max_iter_em,
-                n_iter_ou=n_iter_ou,
-                ampli=ampli,
-                random_state=random_state,
-                dt=dt,
-                tolerance=tolerance,
-                stagnation_threshold=stagnation_threshold,
-                stagnation_loglik=stagnation_loglik,
-            )
-        else:
-            raise ValueError("Strategy '{strategy}' is not handled by ImputeEM!")
-
-    def fit(self, df):
-        X = df.values
-        self.model.fit(X)
-        return self
 
-    def transform(self, df):
-        X = df.values
-        X_transformed = self.model.transform(X)
-        df_transformed = pd.DataFrame(X_transformed, columns=df.columns, index=df.index)
-        return df_transformed
-
-
-class ImputeMICE(Imputer):
+class ImputerMICE(Imputer):
     """
     This class implements an iterative imputer in the multivariate case.
     It imputes each Series within a DataFrame multiple times using an iteration of fits
@@ -728,7 +619,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
                 hyperparams[hyperparam] = value
 
             model = self.type_model(**hyperparams)
-            
+
             if self.fit_on_nan:
                 X = df.drop(columns=col)
             else:
@@ -802,3 +693,111 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
             df_imp.loc[is_na, col] = random_pred[is_na]
 
         return df_imp
+
+
+class ImputerRPCA(Imputer):
+    """
+    This class implements the RPCA imputation
+
+    Parameters
+    ----------
+    method : str
+        name of the RPCA method, by default "temporal":
+            "PCP" for basic RPCA
+            "temporal" for temporal RPCA, with regularisations
+            "onlinetemporal" for online temporal RPCA
+    groups : List[str]
+        forwarded unchanged to the parent ``Imputer``
+    columnwise : bool
+        for RPCA method to be applied columnwise (with reshaping of each column into an array)
+        or to be applied directly on the dataframe. By default, the value is set to False.
+    **hyperparams
+        extra keyword arguments, forwarded to the parent ``Imputer`` as ``hyperparams``
+    """
+
+    def __init__(
+        self,
+        method: str = "temporal",
+        groups: List[str] = [],
+        columnwise: bool = False,
+        **hyperparams
+    ) -> None:
+        super().__init__(groups=groups, columnwise=columnwise, hyperparams=hyperparams)
+        self.method = method
+
+    def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Fit/transform to impute with RPCA methods
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe to impute
+
+        Returns
+        -------
+        pd.DataFrame
+            imputed dataframe, with the same index and columns as ``df``
+
+        Raises
+        ------
+        ValueError
+            if ``df`` is not a pandas.DataFrame, or if ``method`` is not one of
+            "PCP", "temporal", "onlinetemporal"
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise ValueError("Input has to be a pandas.DataFrame.")
+
+        if self.method == "PCP":
+            rpca = RPCA(**self.hyperparams_element)
+        elif self.method == "temporal":
+            rpca = TemporalRPCA(**self.hyperparams_element)
+        elif self.method == "onlinetemporal":
+            rpca = OnlineTemporalRPCA(**self.hyperparams_element)
+        else:
+            # Without this branch an unknown method left ``rpca`` unbound and the
+            # failure surfaced as an UnboundLocalError on the next statement.
+            raise ValueError(f"Method '{self.method}' is not handled by ImputerRPCA!")
+
+        X_imputed = rpca.fit_transform(X=df.values)
+        return pd.DataFrame(X_imputed, index=df.index, columns=df.columns)
+
+
+class ImputeEM(Imputer):
+    """
+    This class implements the imputation by the EM samplers of ``em_sampler``
+
+    Parameters
+    ----------
+    groups : List[str]
+        forwarded unchanged to the parent ``Imputer``
+    method : str
+        name of the EM model, by default "multinormal":
+            "multinormal" for ``em_sampler.MultiNormalEM``
+            "VAR1" for ``em_sampler.VAR1EM``
+    columnwise : bool
+        forwarded unchanged to the parent ``Imputer``. By default, the value is set to False.
+    **hyperparams
+        extra keyword arguments, forwarded to the parent ``Imputer`` as ``hyperparams``
+    """
+
+    def __init__(
+        self,
+        groups: List[str] = [],
+        method: Optional[str] = "multinormal",
+        columnwise: bool = False,
+        **hyperparams
+    ):
+        super().__init__(groups=groups, columnwise=columnwise, hyperparams=hyperparams)
+        # The model itself is built lazily in ``fit_transform_element``.
+        self.method = method
+
+    def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Fit the selected EM model on the dataframe values and impute them
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe to impute
+
+        Returns
+        -------
+        pd.DataFrame
+            imputed dataframe, with the same index and columns as ``df``
+
+        Raises
+        ------
+        ValueError
+            if ``method`` is neither "multinormal" nor "VAR1"
+        """
+        if self.method == "multinormal":
+            model = em_sampler.MultiNormalEM(**self.hyperparams_element)
+        elif self.method == "VAR1":
+            model = em_sampler.VAR1EM(**self.hyperparams_element)
+        else:
+            # The previous message was a plain string referencing an undefined
+            # ``strategy`` name, so the offending value was never reported.
+            raise ValueError(f"Method '{self.method}' is not handled by ImputeEM!")
+
+        X = df.values
+        model.fit(X)
+        X_transformed = model.transform(X)
+        return pd.DataFrame(X_transformed, columns=df.columns, index=df.index)
diff --git a/qolmat/notebooks/benchmark.md b/qolmat/notebooks/benchmark.md
@@ -219,22 +219,29 @@ results = comparison.compare(df_data)
 results
 ```
 
-### **IV. Comparison of methods**
-
 ```python
-df
+fig = plt.figure(figsize=(24, 4))
+plot.multibar(results.loc["mae"])
+plt.show()
 ```
 
+### **IV. Comparison of methods**
+
+
 We now run just one time each algorithm on the initial corrupted dataframe and compare the different performances through multiple analysis.
 
 ```python
-dfs_imputed = {name: imp.fit_transform(df_data) for name, imp in dict_models.items()}
+df_plot = df_data[["TEMP", "PRES"]]
+```
+
+```python
+dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_models.items()}
 ```
 
 ```python
 station = "Aotizhongxin"
-df_station = df_data.loc[station]
-dfs_imputed_station = {name: df.loc[station] for name, df in dfs_imputed.items()}
+df_station = df_plot.loc[station]
+dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```
 
 Let's look at the imputations.
@@ -243,12 +250,37 @@ Note here we didn't fit the hyperparams of the RPCA... results might be of poor
 
 ```python
 # palette = sns.color_palette("icefire", n_colors=len(dict_models))
-#palette = sns.color_palette("husl", 8)
+# palette = sns.color_palette("husl", 8)
 # sns.set_palette(palette)
-markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
-colors = ["tab:red", "tab:blue", "tab:blue"]
+# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
 
+for col in cols_to_impute:
+    fig, ax = plt.subplots(figsize=(10, 3))
+    values_orig = df_station[col]
+
+    plt.plot(values_orig, ".", color='black', label="original")
+    #plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-' , ms=3)
 
+    for ind, (name, model) in enumerate(list(dict_models.items())):
+        values_imp = dfs_imputed_station[name][col].copy()
+        values_imp[values_orig.notna()] = np.nan
+        plt.plot(values_imp, ".", color=tab10(ind), label=name, alpha=1)
+    plt.ylabel(col, fontsize=16)
+    plt.legend(loc=[1, 0], fontsize=18)
+    loc = plticker.MultipleLocator(base=2*365)
+    ax.xaxis.set_major_locator(loc)
+    ax.tick_params(axis='both', which='major', labelsize=17)
+    plt.show()
+
+```
+
+```python
+# palette = sns.color_palette("icefire", n_colors=len(dict_models))
+# palette = sns.color_palette("husl", 8)
+# sns.set_palette(palette)
+# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
+
+# Each column gets its own figure from plt.subplots inside the loop below.
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
     values_orig = df_station[col]
@@ -265,8 +297,8 @@ for col in cols_to_impute:
     loc = plticker.MultipleLocator(base=2*365)
     ax.xaxis.set_major_locator(loc)
     ax.tick_params(axis='both', which='major', labelsize=17)
-    sns.despine()
     plt.show()
+
 ```
 
 **IV.a. Covariance**
diff --git a/qolmat/utils/plot.py b/qolmat/utils/plot.py