Commit 3349241

Merge pull request #9 from Quantmetry/fix_rpca
Fix rpca
2 parents e22e3af + 7107cba commit 3349241

File tree: 6 files changed, +117 / -45 lines changed

examples/benchmark.md

Lines changed: 6 additions & 7 deletions

@@ -133,14 +133,12 @@ imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
-impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
-impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
-    HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
-)
+impute_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
+impute_stochastic_regressor = imputers.ImputerStochasticRegressor(groups=["station"], estimator=LinearRegression())
 
 dict_imputers = {
     "mean": imputer_mean,
-    # "median": imputer_median,
+    "median": imputer_median,
     # "mode": imputer_mode,
     "interpolation": imputer_interpol,
     # "spline": imputer_spline,
@@ -182,7 +180,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
 
 ```python tags=[]
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
@@ -245,6 +243,8 @@ for col in cols_to_impute:
 ```
 
 ```python
+# plot.plot_imputations(df_station, dfs_imputed_station)
+
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)
 
@@ -269,7 +269,6 @@ for name_imputer in dict_imputers:
         ax.xaxis.set_major_locator(loc)
         ax.tick_params(axis='both', which='major', labelsize=17)
         i_plot += 1
-plt.xlim(0, 100)
 plt.savefig("figures/imputations_benchmark.png")
 plt.show()
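For readers updating their own notebooks, the API change above boils down to passing an estimator instance through the `estimator` keyword rather than an estimator class positionally. A minimal sketch of the new calls (imports as in the benchmark; `LinearRegression` stands in for any scikit-learn regressor):

```python
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

# Old style (removed in this commit):
#   imputers.ImputerRegressor(LinearRegression, groups=["station"])
# New style: pass an instantiated estimator via the `estimator` keyword.
impute_regressor = imputers.ImputerRegressor(
    groups=["station"], estimator=LinearRegression()
)
impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
    groups=["station"], estimator=LinearRegression()
)
```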

Binary file changed (636 KB); preview not shown.

qolmat/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
+from . import benchmark, imputations, utils
 from ._version import __version__
 
-from . import utils
-
 __all__ = ["utils", "__version__"]
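With the consolidated import line, the subpackages are reachable straight from the top-level package; a quick sanity check, assuming qolmat is installed:

```python
from qolmat import benchmark, imputations, utils
from qolmat import __version__

print(__version__)
```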

qolmat/imputations/imputers.py

Lines changed: 53 additions & 24 deletions

@@ -1,7 +1,10 @@
+import abc
+import copy
 import sys
 from typing import Any, Dict, List, Optional, Union
 
 import sklearn.neighbors._base
+from sklearn.base import BaseEstimator
 
 sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
 
@@ -20,12 +23,19 @@
 
 
 class Imputer(_BaseImputer):
-    def __init__(self, groups: List[str] = [], columnwise: bool = False, hyperparams: Dict = {}):
+    def __init__(
+        self,
+        groups: List[str] = [],
+        columnwise: bool = False,
+        shrink: bool = False,
+        hyperparams: Dict = {},
+    ):
         self.hyperparams_user = hyperparams
         self.hyperparams_optim = {}
         self.hyperparams_local = {}
         self.groups = groups
         self.columnwise = columnwise
+        self.shrink = shrink
 
     def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -47,6 +57,12 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         hyperparams = self.hyperparams_user.copy()
         hyperparams.update(self.hyperparams_optim)
         cols_with_nans = df.columns[df.isna().any()]
+
+        if self.groups == []:
+            self.ngroups = pd.Series(0, index=df.index).rename("_ngroup")
+        else:
+            self.ngroups = df.groupby(self.groups).ngroup().rename("_ngroup")
+
         if self.columnwise:
 
             # imputed = pd.DataFrame(index=df.index, columns=df.columns)
@@ -79,16 +95,20 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df.copy()
         if self.groups:
 
-            groupby = utils.custom_groupby(df, self.groups)
-            imputation_values = groupby.apply(self.fit_transform_element)
+            # groupby = utils.custom_groupby(df, self.groups)
+            groupby = df.groupby(self.ngroups, group_keys=False)
+            if self.shrink:
+                imputation_values = groupby.transform(self.fit_transform_element)
+            else:
+                imputation_values = groupby.apply(self.fit_transform_element)
         else:
             imputation_values = self.fit_transform_element(df)
 
         df = df.fillna(imputation_values)
-        # # fill na by applying imputation method without groups
-        # if df.isna().any().any():
-        #     imputation_values = self.fit_transform_fallback(df)
-        #     df = df.fillna(imputation_values)
+        # fill na by applying imputation method without groups
+        if df.isna().any().any():
+            imputation_values = self.fit_transform_fallback(df)
+            df = df.fillna(imputation_values)
 
         return df
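The new `shrink` flag selects how group-wise imputation is dispatched: with `shrink=True` the element method returns one statistic per group and `transform` broadcasts it back to the group's shape, while with `shrink=False` `apply` lets the method return a frame of the same shape; any values still missing afterwards are now filled by the fallback, which was previously commented out. A small pandas-only sketch of the two code paths, with toy data and illustrative names:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"station": ["a", "a", "b", "b"], "TEMP": [1.0, np.nan, 3.0, np.nan]})

# Equivalent of self.ngroups: one integer label per row, identifying its group.
ngroups = df.groupby("station").ngroup().rename("_ngroup")
values = df[["TEMP"]]
groupby = values.groupby(ngroups, group_keys=False)

# shrink=True path: the per-group statistic is broadcast back to the group's rows.
filled_shrink = values.fillna(groupby.transform("mean"))

# shrink=False path: the element method returns a frame of the group's shape.
filled_apply = values.fillna(groupby.apply(lambda group: group.fillna(group.mean())))

print(filled_shrink)
print(filled_apply)
```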

@@ -114,7 +134,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.mean
 
 
@@ -139,7 +159,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.median
 
 
@@ -164,7 +184,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = lambda df: df.mode().iloc[0]
 
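These three statistics imputers now take the `transform` branch, so each group contributes a single value per column. Usage is unchanged; a sketch with the class and group names used in the benchmark notebook (`df_with_nans` stands for any DataFrame with missing values and a `station` index level):

```python
from qolmat.imputations import imputers

imputer_mean = imputers.ImputerMean(groups=["station"])
imputer_median = imputers.ImputerMedian(groups=["station"])
imputer_mode = imputers.ImputerMode(groups=["station"])

# Each group's mean/median/mode is broadcast onto that group's missing cells.
df_imputed = imputer_median.fit_transform(df_with_nans)
```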

@@ -509,9 +529,11 @@ class ImputerMICE(Imputer):
     def __init__(
         self,
         groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
         **hyperparams,
     ) -> None:
         super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -530,7 +552,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         if not isinstance(df, pd.DataFrame):
             raise ValueError("Input has to be a pandas.DataFrame.")
 
-        iterative_imputer = IterativeImputer(**self.hyperparams_element)
+        iterative_imputer = IterativeImputer(estimator=self.estimator, **self.hyperparams_element)
         res = iterative_imputer.fit_transform(df.values)
         imputed = pd.DataFrame(columns=df.columns)
         for ind, col in enumerate(imputed.columns):
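The MICE wrapper now forwards an explicit `estimator` to scikit-learn's `IterativeImputer` instead of passing it through the hyperparameter dict; the benchmark constructs it like this (sketch mirroring the notebook line shown above):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

imputer_mice = imputers.ImputerMICE(
    groups=["station"],
    estimator=LinearRegression(),   # forwarded to IterativeImputer(estimator=...)
    sample_posterior=False,
    max_iter=100,
    missing_values=np.nan,
)
```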
@@ -564,11 +586,15 @@ class ImputerRegressor(Imputer):
     """
 
     def __init__(
-        self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
+        self,
+        groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
+        fit_on_nan: bool = False,
+        **hyperparams,
     ):
         super().__init__(groups=groups, hyperparams=hyperparams)
         self.columnwise = False
-        self.type_model = type_model
+        self.estimator = estimator
         self.fit_on_nan = fit_on_nan
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -598,7 +624,9 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
                     value = value[col]
                 hyperparams[hyperparam] = value
 
-            model = self.type_model(**hyperparams)
+            # model = copy.deepcopy(self.estimator)
+            # for hyperparam, value in hyperparams.items():
+            #     setattr(model, hyperparam, value)
 
             if self.fit_on_nan:
                 X = df.drop(columns=col)
@@ -609,8 +637,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
             if X.empty:
                 y_imputed = pd.Series(y.mean(), index=y.index)
            else:
-                model.fit(X[~is_na], y[~is_na])
-                y_imputed = model.predict(X[is_na])
+                self.estimator.fit(X[~is_na], y[~is_na])
+                y_imputed = self.estimator.predict(X[is_na])
             df_imputed.loc[is_na, col] = y_imputed
 
         return df_imputed
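The regressor imputer fits the supplied estimator on rows where the target column is observed and predicts the rows where it is missing, using the other columns as features. A self-contained sketch of the intended behaviour (toy data, not a test from the repository, default `fit_on_nan=False` assumed):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

df = pd.DataFrame({"var1": [1.0, 2.0, np.nan, 4.0], "var2": [2.0, 4.0, 6.0, 8.0]})

imputer = imputers.ImputerRegressor(estimator=LinearRegression())
# var2 is fully observed, so it serves as the feature; the missing var1 value
# is predicted from the fitted linear relation (here var1 is roughly var2 / 2).
print(imputer.fit_transform(df))
```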
@@ -632,17 +660,19 @@ class ImputerStochasticRegressor(Imputer):
     >>> import pandas as pd
     >>> from qolmat.imputations.models import ImputeStochasticRegressor
     >>> from sklearn.ensemble import ExtraTreesRegressor
-    >>> imputor = ImputeStochasticRegressor(model=ExtraTreesRegressor())
+    >>> imputer = ImputeStochasticRegressor(estimator=ExtraTreesRegressor)
     >>> df = pd.DataFrame(data=[[1, 1, 1, 1],
     >>>                         [np.nan, np.nan, 2, 3],
     >>>                         [1, 2, 2, 5], [2, 2, 2, 2]],
     >>>                         columns=["var1", "var2", "var3", "var4"])
-    >>> imputor.fit_transform(df)
+    >>> imputer.fit_transform(df)
     """
 
-    def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
+    def __init__(
+        self, groups: List[str] = [], estimator: Optional[BaseEstimator] = None, **hyperparams
+    ) -> None:
         super().__init__(groups=groups, hyperparams=hyperparams)
-        self.type_model = type_model
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
         """
@@ -659,7 +689,6 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
            imputed dataframe
         """
         df_imp = df.copy()
-        model = self.type_model(**self.hyperparams)
         cols_with_nans = df.columns[df.isna().any()]
         cols_without_nans = df.columns[df.notna().all()]
 
@@ -670,8 +699,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
             X = df[cols_without_nans]
             y = df[col]
             is_na = y.isna()
-            model.fit(X[~is_na], y[~is_na])
-            y_pred = model.predict(X)
+            self.estimator.fit(X[~is_na], y[~is_na])
+            y_pred = self.estimator.predict(X)
             std_error = (y_pred[~is_na] - y[~is_na]).std()
             random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
             df_imp.loc[is_na, col] = random_pred[is_na]
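The stochastic variant reuses the same fit/predict loop but perturbs the predictions with Gaussian noise whose scale is the residual standard deviation on the observed rows, so imputed values keep some of the spread of the data. A standalone sketch of that logic (illustrative helper, not the qolmat API itself):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


def stochastic_impute_column(X: pd.DataFrame, y: pd.Series, estimator=None) -> pd.Series:
    """Fit on observed rows, predict everywhere, then add residual-scaled noise."""
    estimator = estimator if estimator is not None else LinearRegression()
    is_na = y.isna()
    estimator.fit(X[~is_na], y[~is_na])
    y_pred = estimator.predict(X)
    # Spread of the errors on the observed rows drives the noise scale.
    std_error = (y_pred[~is_na.values] - y[~is_na]).std()
    random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
    y_out = y.copy()
    y_out[is_na] = random_pred[is_na.values]
    return y_out
```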
@@ -696,8 +725,8 @@ class ImputerRPCA(Imputer):
 
     def __init__(
         self,
-        method: str = "noisy",
         groups: List[str] = [],
+        method: str = "noisy",
         columnwise: bool = False,
         **hyperparams,
     ) -> None:
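Only the parameter order changes here, so that `groups` comes first as in the other imputers; calls that use keyword arguments are unaffected (construction sketch based on the signature above):

```python
from qolmat.imputations import imputers

imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="noisy", columnwise=False)
```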

qolmat/utils/data.py

Lines changed: 27 additions & 11 deletions

@@ -82,7 +82,7 @@ def get_data(name_data="Beijing", datapath: str = "data/", download: Optional[bo
 
 
 def preprocess_data(df: pd.DataFrame):
-    """Put data into dataframe
+    """Preprocess data from the "Beijing" dataset
 
     Parameters
     ----------
@@ -106,14 +106,14 @@ def preprocess_data(df: pd.DataFrame):
     return df
 
 
-def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
+def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int):
     """
-    Creates holes in a dataset with no missing value. Only used in the documentation to design
+    Creates holes in a dataset with no missing value, starting from `df`. Only used in the documentation to design
     examples.
 
     Parameters
     ----------
-    X : pd.DataFrame
+    df : pd.DataFrame
         dataframe no missing values
 
     mean_size : int
@@ -130,18 +130,18 @@ def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
     pd.DataFrame
         dataframe with missing values
     """
-    groups = X.index.names.difference(["datetime", "date", "index"])
+    groups = df.index.names.difference(["datetime", "date", "index"])
     generator = missing_patterns.GeometricHoleGenerator(
-        1, ratio_masked=ratio_masked, subset=X.columns, groups=groups
+        1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
     )
 
-    generator.dict_probas_out = {column: 1 / mean_size for column in X.columns}
-    generator.dict_ratios = {column: 1 / len(X.columns) for column in X.columns}
+    generator.dict_probas_out = {column: 1 / mean_size for column in df.columns}
+    generator.dict_ratios = {column: 1 / len(df.columns) for column in df.columns}
     if generator.groups == []:
-        mask = generator.generate_mask(X)
+        mask = generator.generate_mask(df)
     else:
-        mask = X.groupby(groups, group_keys=False).apply(generator.generate_mask)
-    X_with_nans = X.copy()
+        mask = df.groupby(groups, group_keys=False).apply(generator.generate_mask)
+    X_with_nans = df.copy()
     X_with_nans[mask] = np.nan
     return X_with_nans
 
@@ -151,6 +151,22 @@ def get_data_corrupted(
     mean_size: int = 90,
     ratio_masked: float = 0.2,
 ):
+    """
+    Returns a dataframe with controlled corruption obtained from the source `name_data`
+
+    Parameters
+    ----------
+    name_data : str
+        Name of the data source, can be "Beijing" or "Artificial"
+    mean_size: int
+        Mean size of the holes to be generated using a geometric law
+    ratio_masked: float
+        Percent of missing data in each column in the output dataframe
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with missing values
+    """
     df = get_data(name_data)
     df = add_holes(df, mean_size=mean_size, ratio_masked=ratio_masked)
     return df
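Per the new docstring, the helper simply chains `get_data` and `add_holes`; typical use, with the defaults shown in the signature (sketch; downloading the Beijing data is a side effect of `get_data`):

```python
from qolmat.utils import data

df_corrupted = data.get_data_corrupted("Beijing", mean_size=90, ratio_masked=0.2)
# Roughly 20% of each column should now be masked, in holes of mean length 90.
print(df_corrupted.isna().mean())
```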

qolmat/utils/plot.py

Lines changed: 30 additions & 1 deletion

@@ -4,10 +4,11 @@
 
 from __future__ import annotations
 
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
+import matplotlib.ticker as plticker
 import numpy as np
 import pandas as pd
 import scipy
@@ -256,3 +257,31 @@ def multibar(df, ax=None, orientation="vertical", colors=None, decimals=0):
     # ax.bar_label(rects2, padding=3)
 
     # plt.tight_layout()
+
+
+def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]):
+    n_columns = len(df.columns)
+    n_imputers = len(dict_df_imputed)
+
+    fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+    i_plot = 1
+    for name_imputer, df_imputed in dict_df_imputed.items():
+        for col in df:
+
+            ax = fig.add_subplot(n_imputers, n_columns, i_plot)
+            values_orig = df[col]
+
+            plt.plot(values_orig, ".", color="black", label="original")
+            # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-' , ms=3)
+
+            values_imp = df_imputed[col].copy()
+            values_imp[values_orig.notna()] = np.nan
+            plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
+            plt.ylabel(col, fontsize=16)
+            if i_plot % n_columns == 0:
+                plt.legend(loc=[1, 0], fontsize=18)
+            loc = plticker.MultipleLocator(base=2 * 365)
+            ax.xaxis.set_major_locator(loc)
+            ax.tick_params(axis="both", which="major", labelsize=17)
+            i_plot += 1
+    plt.show()
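The commented-out call in the benchmark above shows the intended use of the new helper: pass the original frame plus a dict mapping imputer names to imputed frames (sketch; `df_station` and `dict_imputers` come from the benchmark notebook):

```python
from qolmat.utils import plot

dfs_imputed_station = {
    name: imputer.fit_transform(df_station) for name, imputer in dict_imputers.items()
}
plot.plot_imputations(df_station, dfs_imputed_station)
```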
