Merge pull request #8 from Quantmetry/fix_rpca

JulienRoussel77 · web-flow · commit e22e3afb80ac · 2023-03-07T12:31:22.000+01:00
regressor patched
diff --git a/examples/benchmark.md b/examples/benchmark.md
@@ -126,16 +126,14 @@ imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, mode
 imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=200, tau=2, lam=.3)
 imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)
 
-imputer_ou = imputers.ImputeEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
-imputer_tsou = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
-imputer_tsmle = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
+imputer_ou = imputers.ImputerEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
+imputer_tsou = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
+imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
 
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
-impute_regressor = imputers.ImputerRegressor(
-  HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
-)
+impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
 impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
   HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
 )
@@ -151,12 +149,13 @@ dict_imputers = {
     "OU": imputer_ou,
     "TSOU": imputer_tsou,
     "TSMLE": imputer_tsmle,
-    "RPCA": imputer_rpca,
+    # "RPCA": imputer_rpca,
     # "RPCA_opti": imputer_rpca_opti,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
-    # "iterative": imputer_iterative,
+    "iterative": impute_regressor,
+    "regressor": imputer_iterative,
 }
 n_imputers = len(dict_imputers)
 
@@ -183,7 +182,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note that these metrics compute reconstruction errors; they tell nothing about the distances between the "true" and "imputed" distributions.
 
 ```python tags=[]
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
diff --git a/examples/figures/imputations_benchmark.png b/examples/figures/imputations_benchmark.png
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -1,5 +1,5 @@
 import sys
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import sklearn.neighbors._base
 
@@ -112,7 +112,7 @@ class ImputerMean(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
         self.fit_transform_element = pd.DataFrame.mean
@@ -137,7 +137,7 @@ class ImputerMedian(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
         self.fit_transform_element = pd.DataFrame.median
@@ -162,7 +162,7 @@ class ImputerMode(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
         self.fit_transform_element = lambda df: df.mode().iloc[0]
@@ -187,7 +187,7 @@ class ImputerShuffle(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
 
@@ -225,7 +225,7 @@ class ImputerLOCF(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
 
@@ -259,7 +259,7 @@ class ImputerNOCB(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
     ) -> None:
         super().__init__(groups=groups, columnwise=True)
 
@@ -305,7 +305,7 @@ class ImputerInterpolation(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
         method: str = "linear",
         order: int = None,
         col_time: Optional[str] = None,
@@ -373,7 +373,7 @@ class ImputerResiduals(Imputer):
 
     def __init__(
         self,
-        groups: Optional[List[str]] = [],
+        groups: List[str] = [],
         period: int = None,
         model_tsa: Optional[str] = "additive",
         extrapolate_trend: Optional[Union[int, str]] = "freq",
@@ -437,11 +437,12 @@ class ImputerKNN(Imputer):
 
     def __init__(
         self,
+        groups: List[str] = [],
         n_neighbors: int = 5,
         weights: str = "distance",
         **hyperparams,
     ) -> None:
-        super().__init__(columnwise=False, hyperparams=hyperparams)
+        super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
         self.n_neighbors = n_neighbors
         self.weights = weights
 
@@ -562,8 +563,10 @@ class ImputerRegressor(Imputer):
     >>> imputor.fit_transform(df)
     """
 
-    def __init__(self, type_model, fit_on_nan: bool = False, **hyperparams):
-        super().__init__(hyperparams=hyperparams)
+    def __init__(
+        self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
+    ):
+        super().__init__(groups=groups, hyperparams=hyperparams)
         self.columnwise = False
         self.type_model = type_model
         self.fit_on_nan = fit_on_nan
@@ -590,7 +593,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
 
         for col in cols_with_nans:
             hyperparams = {}
-            for hyperparam, value in self.hyperparams.items():
+            for hyperparam, value in self.hyperparams_element.items():
                 if isinstance(value, dict):
                     value = value[col]
                 hyperparams[hyperparam] = value
@@ -600,11 +603,15 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
             if self.fit_on_nan:
                 X = df.drop(columns=col)
             else:
-                X = df[cols_without_nans].drop(columns=col)
+                X = df[cols_without_nans].drop(columns=col, errors="ignore")
             y = df[col]
             is_na = y.isna()
-            model.fit(X[~is_na], y[~is_na])
-            df_imputed.loc[is_na, col] = model.predict(X[is_na])
+            if X.empty:
+                y_imputed = pd.Series(y.mean(), index=y.index)
+            else:
+                model.fit(X[~is_na], y[~is_na])
+                y_imputed = model.predict(X[is_na])
+            df_imputed.loc[is_na, col] = y_imputed
 
         return df_imputed
 
@@ -633,8 +640,8 @@ class ImputerStochasticRegressor(Imputer):
     >>> imputor.fit_transform(df)
     """
 
-    def __init__(self, type_model, **hyperparams) -> None:
-        super().__init__(hyperparams=hyperparams)
+    def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
+        super().__init__(groups=groups, hyperparams=hyperparams)
         self.type_model = type_model
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
@@ -728,7 +735,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         return df_imputed
 
 
-class ImputeEM(Imputer):
+class ImputerEM(Imputer):
     def __init__(
         self,
         groups: List[str] = [],