ImputeEM implemented in models.py, and TS MLE version added

vm-aifluence-jro · vm-aifluence-jro · commit 17ebe83c8549 · 2023-02-23T15:47:36.000Z
diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py
@@ -44,7 +44,8 @@ def get_sizes_max(values_isna: pd.Series) -> pd.Series:
 
 class _HoleGenerator:
     """
-    This abstract class implements the generic method to generate masks according to law of missing values.
+    This abstract class implements the generic method to generate masks according to law of missing
+    values.
 
     Parameters
     ----------
@@ -192,7 +193,8 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
 
 
 class _SamplerHoleGenerator(_HoleGenerator):
-    """This abstract class implements a generic way to generate holes in a dataframe by sampling 1D hole size distributions.
+    """This abstract class implements a generic way to generate holes in a dataframe by sampling 1D
+    hole size distributions.
 
     Parameters
     ----------
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -148,6 +148,7 @@ def __init__(
         self.convergence_threshold = tolerance
         self.stagnation_threshold = stagnation_threshold
         self.stagnation_loglik = stagnation_loglik
+        self.scaler = StandardScaler()
 
         self.dict_criteria_stop = {}
 
@@ -200,109 +201,79 @@ def _convert_numpy(self, X: ArrayLike) -> np.ndarray:
     def _check_convergence(self) -> bool:
         return False
 
-    def _maximize_likelihood(self, X: ArrayLike) -> ArrayLike:
+    def fit(self, X: np.array):
         """
-        Get the argmax of a posterior distribution.
+        Fit the statistical distribution with the input X array.

         Parameters
         ----------
-        X : ArrayLike
-            Input DataFrame.
-
-        Returns
-        -------
-        ArrayLike
-            DataFrame with imputed values.
+        X : np.array
+            Array of shape (n, m) to fit on; NaN entries are the values to impute.
         """
-        X_center = X - self.means[:, None]
-        X_imputed = _gradient_conjugue(self.cov_inv, X_center)
-        X_imputed = self.means[:, None] + X_imputed
-        return X_imputed
-
-    def impute_em(self, X: ArrayLike) -> ArrayLike:
-        """Imputation via EM algorithm
-
-        Parameters
-        ----------
-        X : ArrayLike
-            array with missing values
+        if not isinstance(X, np.ndarray):  # must run before .copy()/.tobytes() are called
+            raise AssertionError("Invalid type. X must be a np.ndarray.")
+        X = X.copy()
+        self.hash_fit = hash(X.tobytes())
 
-        Returns
-        -------
-        X_transformed
-            imputed array
-        """
+        if X.ndim != 2 or min(X.shape) < 2:
+            raise AssertionError("Invalid dimensions: X must be of shape (n,m) with n>1 and m>1.")
 
-        X_ = self._convert_numpy(X)
-        if np.nansum(X_) == 0:
-            return X_
+        X = self.scaler.fit_transform(X)
+        X = X.T
 
         mask_na = np.isnan(X)
 
         # first imputation
-        X_transformed = self._linear_interpolation(X_)
+        X_sample_last = self._linear_interpolation(X)
 
-        self.fit_distribution(X_transformed)
+        self.fit_distribution(X_sample_last)
 
         for iter_em in range(self.max_iter_em):
 
-            X_transformed = self._sample_ou(X_transformed, mask_na)
+            X_sample_last = self._sample_ou(X_sample_last, mask_na)
 
             if self._check_convergence():
                 logger.info(f"EM converged after {iter_em} iterations.")
                 break
 
-        if self.strategy == "mle":
-            X_transformed = self._maximize_likelihood(X_)
-        elif self.strategy == "ou":
-            X_transformed = self._sample_ou(X_transformed, mask_na)
-
         self.dict_criteria_stop = {key: [] for key in self.dict_criteria_stop}
+        self.X_sample_last = X_sample_last
+        return self
 
-        if np.all(np.isnan(X_transformed)):
-            raise WarningMessage("Result contains NaN. This is a bug.")
-
-        return X_transformed
-
-    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
+    def transform(self, X: np.array) -> np.array:
         """
-        Fit and impute input X array.
+        Transform the input X array by imputing the missing values.
 
         Parameters
         ----------
-        X : pd.DataFrame
-            DataFrame to be imputed
+        X : np.array
+            Array to impute; if its bytes hash equals the fitted one, the last fit sample is reused.
 
         Returns
         -------
         ArrayLike
             Final array after EM sampling.
         """
-        if not ((isinstance(df, np.ndarray)) or (isinstance(df, pd.DataFrame))):
-            raise AssertionError("Invalid type. X must be either pd.DataFrame or np.ndarray.")
-
-        if df.shape[1] < 2:
-            raise AssertionError("Invalid dimensions: X must be of dimension (n,m) with m>1.")
 
-        X = df.values
+        if hash(X.tobytes()) == self.hash_fit:
+            X = self.X_sample_last
+        else:
+            X = self.scaler.transform(X)
+            X = X.T
+            X = self._linear_interpolation(X)
 
-        scaler = StandardScaler()
-        X = scaler.fit_transform(X)
-        X = X.T
-        X = self.impute_em(X)
-        X = X.T
-        X = scaler.inverse_transform(X)
+        if self.strategy == "mle":
+            X_transformed = self._maximize_likelihood(X)
+        elif self.strategy == "ou":
+            mask_na = np.isnan(X)  # NOTE(review): computed after interpolation; confirm intent
+            X_transformed = self._sample_ou(X, mask_na)
 
-        if np.isnan(np.sum(X)):
+        if np.any(np.isnan(X_transformed)):
             raise WarningMessage("Result contains NaN. This is a bug.")
 
-        if isinstance(df, np.ndarray):
-            return X
-        elif isinstance(df, pd.DataFrame):
-            return pd.DataFrame(X, index=df.index, columns=df.columns)
-
-        else:
-            raise AssertionError("Invalid type. X must be either pd.DataFrame or np.ndarray.")
+        X_transformed = X_transformed.T
+        X_transformed = self.scaler.inverse_transform(X_transformed)
+        return X_transformed
 
 
 class ImputeMultiNormalEM(ImputeEM):  # type: ignore
@@ -372,18 +343,32 @@ def __init__(
         )
         self.tolerance = tolerance
 
-        # self.list_logliks = []
-        # self.list_means = []
-        # self.list_covs = []
         self.dict_criteria_stop = {"logliks": [], "means": [], "covs": []}
 
     def fit_distribution(self, X):
-        # first estimation of params
         self.means = np.mean(X, axis=1)
         self.cov = np.cov(X)
-
         self.cov_inv = invert_robust(self.cov, epsilon=1e-2)
 
+    def _maximize_likelihood(self, X: ArrayLike) -> ArrayLike:
+        """
+        Get the argmax of a posterior distribution.
+
+        Parameters
+        ----------
+        X : ArrayLike
+            Input array; `self.means` is subtracted from it row-wise before solving.
+
+        Returns
+        -------
+        ArrayLike
+            Output of `_gradient_conjugue` on the centered data, with `self.means` added back.
+        """
+        X_center = X - self.means[:, None]
+        X_imputed = _gradient_conjugue(self.cov_inv, X_center)
+        X_imputed = self.means[:, None] + X_imputed
+        return X_imputed
+
     def _sample_ou(
         self,
         X: ArrayLike,
@@ -465,10 +450,6 @@ def _check_convergence(self) -> bool:
             True/False if the algorithm has converged
         """
 
-        # self.list_means.append(self.means)
-        # self.list_covs.append(self.cov)
-        # self.list_logliks.append(self.loglik)
-
         list_means = self.dict_criteria_stop["means"]
         list_covs = self.dict_criteria_stop["covs"]
         list_logliks = self.dict_criteria_stop["logliks"]
@@ -602,11 +583,6 @@ def fit_distribution(self, X):
             self.fit_parameter_A(X)
         self.fit_parameter_omega(X)
 
-        # print("distribution fitted :")
-        # print(self.A)
-        # print(self.B)
-        # print(self.omega)
-
     def gradient_X_centered_loglik(self, Xc):
         Xc_back = np.roll(Xc, 1, axis=1)
         Xc_back[:, 0] = 0
@@ -616,6 +592,25 @@ def gradient_X_centered_loglik(self, Xc):
         Z_fore = Xc_fore - self.A @ Xc
         return -self.omega_inv @ Z_back + self.A.T @ self.omega_inv @ Z_fore
 
+    def _maximize_likelihood(self, X: ArrayLike, dt=1e-2) -> ArrayLike:
+        """
+        Approximate the posterior argmax by 1000 gradient steps of size `dt` from X.
+
+        Parameters
+        ----------
+        X : ArrayLike
+            Input numpy array; `self.B` is subtracted row-wise before the updates.
+
+        Returns
+        -------
+        ArrayLike
+            Array after the updates, with `self.B` added back.
+        """
+        Xc = X - self.B[:, None]
+        for n_optim in range(1000):
+            Xc += dt * self.gradient_X_centered_loglik(Xc)
+        return Xc + self.B[:, None]
+
     def _sample_ou(
         self,
         X: ArrayLike,
diff --git a/qolmat/imputations/models.py b/qolmat/imputations/models.py
@@ -19,6 +19,7 @@
 import missingpy
 
 from qolmat.benchmark import utils
+from qolmat.imputations import em_sampler
 from qolmat.imputations.rpca.pcp_rpca import RPCA
 from qolmat.imputations.rpca.temporal_rpca import OnlineTemporalRPCA, TemporalRPCA
 
@@ -561,6 +562,59 @@ def get_hyperparams(self) -> Dict[str, Union[str, float, int]]:
         }
 
 
+class ImputeEM(_BaseImputer):
+    """
+    Impute missing values with one of the EM samplers of `qolmat.imputations.em_sampler`.
+
+    Parameters
+    ----------
+    method : Optional[str]
+        "multinormal" wraps `ImputeMultiNormalEM` and "VAR1" wraps `ImputeVAR1EM`; any other
+        value raises a ValueError. All other arguments are forwarded to the wrapped model.
+    """
+
+    def __init__(
+        self,
+        strategy: Optional[str] = "mle",
+        method: Optional[str] = "multinormal",
+        max_iter_em: Optional[int] = 200,
+        n_iter_ou: Optional[int] = 50,
+        ampli: Optional[int] = 1,
+        random_state: Optional[int] = 123,
+        dt: Optional[float] = 2e-2,
+        tolerance: Optional[float] = 1e-4,
+        stagnation_threshold: Optional[float] = 5e-3,
+        stagnation_loglik: Optional[float] = 2,
+    ):
+        models_em = {
+            "multinormal": em_sampler.ImputeMultiNormalEM,
+            "VAR1": em_sampler.ImputeVAR1EM,
+        }
+        if method not in models_em:
+            raise ValueError(f"Method '{method}' is not handled by ImputeEM!")
+        self.model = models_em[method](
+            strategy=strategy,
+            max_iter_em=max_iter_em,
+            n_iter_ou=n_iter_ou,
+            ampli=ampli,
+            random_state=random_state,
+            dt=dt,
+            tolerance=tolerance,
+            stagnation_threshold=stagnation_threshold,
+            stagnation_loglik=stagnation_loglik,
+        )
+
+    def fit(self, df):
+        """Fit the wrapped EM model on `df.values` and return self."""
+        self.model.fit(df.values)
+        return self
+
+    def transform(self, df):
+        """Impute `df.values` with the wrapped model, keeping the index and columns of `df`."""
+        X_transformed = self.model.transform(df.values)
+        return pd.DataFrame(X_transformed, columns=df.columns, index=df.index)
+
+
 class ImputeMICE(_BaseImputer):
     """
     This class implements an iterative imputer in the multivariate case.
diff --git a/qolmat/notebooks/benchmark.md b/qolmat/notebooks/benchmark.md
@@ -53,7 +53,6 @@ from qolmat.benchmark import comparator, missing_patterns
 from qolmat.benchmark.utils import kl_divergence
 from qolmat.imputations import models
 from qolmat.utils import data, utils, plot
-from qolmat.imputations.em_sampler import ImputeMultiNormalEM, ImputeVAR1EM
 # from qolmat.drawing import display_bar_table
 
 ```
@@ -133,8 +132,9 @@ imputer_residuals = models.ImputeOnResiduals("additive", 7, "freq", "linear")
 imputer_rpca = models.ImputeRPCA(
   method="temporal", multivariate=False, **{"n_rows":7*4, "maxIter":1000, "tau":1, "lam":0.7}
   )
-imputer_ou = ImputeMultiNormalEM(max_iter_em=34, n_iter_ou=15, verbose=0, strategy="ou")
-imputer_tsou = ImputeVAR1EM(max_iter_em=34, n_iter_ou=15, verbose=0, strategy="ou")
+imputer_ou = models.ImputeEM(method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
+imputer_tsou = models.ImputeEM(method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
+imputer_tsmle = models.ImputeEM(method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
 imputer_locf = models.ImputeLOCF()
 imputer_nocb = models.ImputeNOCB()
 imputer_knn = models.ImputeKNN(k=10)
@@ -157,6 +157,7 @@ dict_models = {
     #"iterative": imputer_iterative,
     "OU": imputer_ou,
     "TSOU": imputer_tsou,
+    "TSMLE": imputer_tsmle,
     #"RPCA": imputer_rpca,
 }
 n_models = len(dict_models)
@@ -227,10 +228,6 @@ Let's look at the imputations.
 When the data is missing at random, imputation is easier. Missing block are more challenging.
 Note here we didn't fit the hyperparams of the RPCA... results might be of poor quality...
 
-```python
-plt.scatter(df_station["TEMP"], df_station["PRES"])
-```
-
 ```python
 palette = sns.color_palette("icefire", n_colors=len(dict_models))
 #palette = sns.color_palette("husl", 8)