estimator based models debugged

Julien Roussel · Julien Roussel · commit a07c82fd51d4 · 2023-03-07T19:13:48.000+01:00
diff --git a/examples/benchmark.md b/examples/benchmark.md
@@ -64,10 +64,11 @@ This dataset only contains numerical vairables.
 
 ```python
 df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
+df_data["cat"] = [i % 3 for i in range(len(df_data))]
 
 # cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
 # cols_to_impute = df_data.columns[df_data.isna().any()]
-cols_to_impute = ["TEMP", "PRES"]
+cols_to_impute = ["TEMP", "PRES", "cat"]
 
 ```
 
@@ -112,9 +113,13 @@ All presented methods are group-wise: here each station is imputed independently
 Some methods require hyperparameters. The user can directly specify them, or rather determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
 In pratice, we rely on a cross validation to find the best hyperparams values minimizing an error reconstruction.
 
+```python tags=[]
+hasattr(imputers.ImputerMean(), "groups")
+```
+
 ```python
 imputer_mean = imputers.ImputerMean(groups=["station"])
-imputer_median = imputers.ImputerMedian(groups=["station"])
+imputer_median = imputers.ImputerMedian(groups=["station", "cat"])
 imputer_mode = imputers.ImputerMode(groups=["station"])
 imputer_locf = imputers.ImputerLOCF(groups=["station"])
 imputer_nocb = imputers.ImputerNOCB(groups=["station"])
@@ -133,14 +138,12 @@ imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
-impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
-impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
-  HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
-)
+impute_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
+impute_stochastic_regressor = imputers.ImputerStochasticRegressor(groups=["station"], estimator=LinearRegression())
 
 dict_imputers = {
     "mean": imputer_mean,
-    # "median": imputer_median,
+    "median": imputer_median,
     # "mode": imputer_mode,
     "interpolation": imputer_interpol,
     # "spline": imputer_spline,
@@ -182,7 +185,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
 
 ```python tags=[]
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
diff --git a/examples/figures/imputations_benchmark.png b/examples/figures/imputations_benchmark.png
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -1,7 +1,10 @@
+import abc
+import copy
 import sys
 from typing import Any, Dict, List, Optional, Union
 
 import sklearn.neighbors._base
+from sklearn.base import BaseEstimator
 
 sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
 
@@ -20,12 +23,19 @@
 
 
 class Imputer(_BaseImputer):
-    def __init__(self, groups: List[str] = [], columnwise: bool = False, hyperparams: Dict = {}):
+    def __init__(
+        self,
+        groups: List[str] = [],
+        columnwise: bool = False,
+        shrink: bool = False,
+        hyperparams: Dict = {},
+    ):
         self.hyperparams_user = hyperparams
         self.hyperparams_optim = {}
         self.hyperparams_local = {}
         self.groups = groups
         self.columnwise = columnwise
+        self.shrink = shrink
 
     def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -47,6 +57,12 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         hyperparams = self.hyperparams_user.copy()
         hyperparams.update(self.hyperparams_optim)
         cols_with_nans = df.columns[df.isna().any()]
+
+        if self.groups == []:
+            self.ngroups = pd.Series(0, index=df.index).rename("_ngroup")
+        else:
+            self.ngroups = df.groupby(self.groups).ngroup().rename("_ngroup")
+
         if self.columnwise:
 
             # imputed = pd.DataFrame(index=df.index, columns=df.columns)
@@ -79,8 +95,12 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df.copy()
         if self.groups:
 
-            groupby = utils.custom_groupby(df, self.groups)
-            imputation_values = groupby.apply(self.fit_transform_element)
+            # groupby = utils.custom_groupby(df, self.groups)
+            groupby = df.groupby(self.ngroups, group_keys=False)
+            if self.shrink:
+                imputation_values = groupby.transform(self.fit_transform_element)
+            else:
+                imputation_values = groupby.apply(self.fit_transform_element)
         else:
             imputation_values = self.fit_transform_element(df)
 
@@ -114,7 +134,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.mean
 
 
@@ -139,7 +159,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.median
 
 
@@ -164,7 +184,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = lambda df: df.mode().iloc[0]
 
 
@@ -509,9 +529,11 @@ class ImputerMICE(Imputer):
     def __init__(
         self,
         groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
         **hyperparams,
     ) -> None:
         super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -530,7 +552,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         if not isinstance(df, pd.DataFrame):
             raise ValueError("Input has to be a pandas.DataFrame.")
 
-        iterative_imputer = IterativeImputer(**self.hyperparams_element)
+        iterative_imputer = IterativeImputer(estimator=self.estimator, **self.hyperparams_element)
         res = iterative_imputer.fit_transform(df.values)
         imputed = pd.DataFrame(columns=df.columns)
         for ind, col in enumerate(imputed.columns):
@@ -564,11 +586,15 @@ class ImputerRegressor(Imputer):
     """
 
     def __init__(
-        self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
+        self,
+        groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
+        fit_on_nan: bool = False,
+        **hyperparams,
     ):
         super().__init__(groups=groups, hyperparams=hyperparams)
         self.columnwise = False
-        self.type_model = type_model
+        self.estimator = estimator
         self.fit_on_nan = fit_on_nan
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -598,7 +624,9 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
                     value = value[col]
                 hyperparams[hyperparam] = value
 
-            model = self.type_model(**hyperparams)
+            # model = copy.deepcopy(self.estimator)
+            # for hyperparam, value in hyperparams.items():
+            #     setattr(model, hyperparam, value)
 
             if self.fit_on_nan:
                 X = df.drop(columns=col)
@@ -609,8 +637,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
             if X.empty:
                 y_imputed = pd.Series(y.mean(), index=y.index)
             else:
-                model.fit(X[~is_na], y[~is_na])
-                y_imputed = model.predict(X[is_na])
+                self.estimator.fit(X[~is_na], y[~is_na])
+                y_imputed = self.estimator.predict(X[is_na])
             df_imputed.loc[is_na, col] = y_imputed
 
         return df_imputed
@@ -632,17 +660,19 @@ class ImputerStochasticRegressor(Imputer):
     >>> import pandas as pd
     >>> from qolmat.imputations.models import ImputeStochasticRegressor
     >>> from sklearn.ensemble import ExtraTreesRegressor
-    >>> imputor = ImputeStochasticRegressor(model=ExtraTreesRegressor())
+    >>> imputer = ImputeStochasticRegressor(estimator=ExtraTreesRegressor())
     >>> df = pd.DataFrame(data=[[1, 1, 1, 1],
     >>>                        [np.nan, np.nan, 2, 3],
     >>>                        [1, 2, 2, 5], [2, 2, 2, 2]],
     >>>                        columns=["var1", "var2", "var3", "var4"])
-    >>> imputor.fit_transform(df)
+    >>> imputer.fit_transform(df)
     """
 
-    def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
+    def __init__(
+        self, groups: List[str] = [], estimator: Optional[BaseEstimator] = None, **hyperparams
+    ) -> None:
         super().__init__(groups=groups, hyperparams=hyperparams)
-        self.type_model = type_model
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
         """
@@ -659,7 +689,6 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
             imputed dataframe
         """
         df_imp = df.copy()
-        model = self.type_model(**self.hyperparams)
         cols_with_nans = df.columns[df.isna().any()]
         cols_without_nans = df.columns[df.notna().all()]
 
@@ -670,8 +699,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
             X = df[cols_without_nans]
             y = df[col]
             is_na = y.isna()
-            model.fit(X[~is_na], y[~is_na])
-            y_pred = model.predict(X)
+            self.estimator.fit(X[~is_na], y[~is_na])
+            y_pred = self.estimator.predict(X)
             std_error = (y_pred[~is_na] - y[~is_na]).std()
             random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
             df_imp.loc[is_na, col] = random_pred[is_na]
@@ -696,8 +725,8 @@ class ImputerRPCA(Imputer):
 
     def __init__(
         self,
-        method: str = "noisy",
         groups: List[str] = [],
+        method: str = "noisy",
         columnwise: bool = False,
         **hyperparams,
     ) -> None:
diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py
@@ -82,7 +82,7 @@ def get_data(name_data="Beijing", datapath: str = "data/", download: Optional[bo
 
 
 def preprocess_data(df: pd.DataFrame):
-    """Put data into dataframe
+    """Preprocess data from the "Beijing" dataset
 
     Parameters
     ----------
@@ -106,14 +106,14 @@ def preprocess_data(df: pd.DataFrame):
     return df
 
 
-def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
+def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int):
     """
-    Creates holes in a dataset with no missing value. Only used in the documentation to design
+    Creates holes in a dataset with no missing values, starting from `df`. Only used in the documentation to design
     examples.
 
     Parameters
     ----------
-    X : pd.DataFrame
+    df : pd.DataFrame
         dataframe no missing values
 
     mean_size : int
@@ -130,18 +130,18 @@ def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
     pd.DataFrame
         dataframe with missing values
     """
-    groups = X.index.names.difference(["datetime", "date", "index"])
+    groups = df.index.names.difference(["datetime", "date", "index"])
     generator = missing_patterns.GeometricHoleGenerator(
-        1, ratio_masked=ratio_masked, subset=X.columns, groups=groups
+        1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
     )
 
-    generator.dict_probas_out = {column: 1 / mean_size for column in X.columns}
-    generator.dict_ratios = {column: 1 / len(X.columns) for column in X.columns}
+    generator.dict_probas_out = {column: 1 / mean_size for column in df.columns}
+    generator.dict_ratios = {column: 1 / len(df.columns) for column in df.columns}
     if generator.groups == []:
-        mask = generator.generate_mask(X)
+        mask = generator.generate_mask(df)
     else:
-        mask = X.groupby(groups, group_keys=False).apply(generator.generate_mask)
-    X_with_nans = X.copy()
+        mask = df.groupby(groups, group_keys=False).apply(generator.generate_mask)
+    X_with_nans = df.copy()
     X_with_nans[mask] = np.nan
     return X_with_nans
 
@@ -151,6 +151,22 @@ def get_data_corrupted(
     mean_size: int = 90,
     ratio_masked: float = 0.2,
 ):
+    """
+    Returns a dataframe with controlled corruption obtained from the source `name_data`
+
+    Parameters
+    ----------
+    name_data : str
+        Name of the data source, can be "Beijing" or "Artificial"
+    mean_size: int
+        Mean size of the holes to be generated using a geometric law
+    ratio_masked: float
+        Proportion of missing data in each column in the output dataframe
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with missing values
+    """
     df = get_data(name_data)
     df = add_holes(df, mean_size=mean_size, ratio_masked=ratio_masked)
     return df