
Commit 65ff5fb

Julien Roussel authored and committed
RPCA uniformized, online not functional
1 parent c470767 commit 65ff5fb

File tree

14 files changed: +605 -515 lines changed


examples/1_timeSeries.ipynb

Lines changed: 132 additions & 127 deletions
Large diffs are not rendered by default.

qolmat/benchmark/comparator.py

Lines changed: 47 additions & 36 deletions
@@ -1,11 +1,17 @@
+import logging
 from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 
+from qolmat import logging as qlog
 from qolmat.benchmark import cross_validation, utils
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
+qlog.log_setup()
+logger = logging.getLogger(__name__)
+# logger.setLevel(logging.DEBUG)
+
 
 class Comparator:
     """
@@ -22,8 +28,8 @@ class Comparator:
     search_params: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
         dictionary of search space for each implementation method. By default, the value is set to
         {}.
-    n_cv_calls: Optional[int] = 10
-        number of calls of the hyperparameters cross-validation. By default, the value is set to
+    n_calls_opt: Optional[int] = 10
+        number of calls of the optimization algorithm
         10.
     """
 
@@ -33,18 +39,18 @@ def __init__(
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
         search_params: Optional[Dict[str, Dict[str, Union[float, int, str]]]] = {},
-        n_cv_calls: Optional[int] = 10,
+        n_calls_opt: Optional[int] = 10,
     ):
 
         self.dict_models = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.search_params = search_params
-        self.n_cv_calls = n_cv_calls
+        self.n_calls_opt = n_calls_opt
 
     def get_errors(
         self, df_origin: pd.DataFrame, df_imputed: pd.DataFrame, df_mask: pd.DataFrame
-    ) -> float:
+    ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
@@ -73,6 +79,7 @@ def get_errors(
             df_origin[df_mask],
             df_imputed[df_mask],
         )
+
         dict_errors["kl"] = utils.kl_divergence(
             df_origin[df_mask],
             df_imputed[df_mask],
@@ -82,8 +89,8 @@ def get_errors(
         return errors
 
     def evaluate_errors_sample(
-        self, tested_model: any, df: pd.DataFrame, search_space: Optional[dict] = None
-    ) -> Dict:
+        self, imputer: any, df: pd.DataFrame, list_spaces: List[Dict] = {}
+    ) -> pd.Series:
         """Evaluate the errors in the cross-validation
 
@@ -92,8 +99,8 @@ def evaluate_errors_sample(
             imputation model
         df : pd.DataFrame
             dataframe to impute
-        search_space : Optional[dict], optional
-            search space for tested_model's hyperparameters, by default None
+        search_space : Dict
+            search space for tested_model's hyperparameters
 
         Returns
         -------
@@ -102,19 +109,24 @@ def evaluate_errors_sample(
         """
        list_errors = []
         df_origin = df[self.selected_columns].copy()
+        if list_spaces:
+            print("Hyperparameter optimization")
+            print(list_spaces)
+        else:
+            print("No hyperparameter optimization")
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
-            if search_space is None:
-                df_imputed = tested_model.fit_transform(X=df_corrupted)
-            else:
+            if list_spaces:
                 cv = cross_validation.CrossValidation(
-                    tested_model,
-                    search_space=search_space,
+                    imputer,
+                    list_spaces=list_spaces,
                     hole_generator=self.generator_holes,
-                    n_calls=self.n_cv_calls,
+                    n_calls=self.n_calls_opt,
                 )
-                df_imputed = cv.fit_transform(X=df_corrupted)
+                df_imputed = cv.fit_transform(df_corrupted)
+            else:
+                df_imputed = imputer.fit_transform(df_corrupted)
 
             subset = self.generator_holes.subset
             errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
@@ -140,30 +152,29 @@ def compare(self, df: pd.DataFrame, verbose: bool = True):
 
         dict_errors = {}
 
-        for name, tested_model in self.dict_models.items():
-            if verbose:
-                print("Tested model:", type(tested_model).__name__)
+        for name, imputer in self.dict_models.items():
+            logger.setLevel(logging.DEBUG)
+            print(f"Tested model: {type(imputer).__name__}")
+
+            search_params = self.search_params.get(name, {})
 
-            if str(type(tested_model).__name__) in self.search_params.keys():
-                if hasattr(tested_model, "columnwise") and tested_model.columnwise:
-                    if len(self.selected_columns) > 0:
-                        search_params = {}
-                        for col in self.selected_columns:
-                            for key, value in self.search_params[type(tested_model).__name__].items():
-                                search_params[f"('{col}', '{key}')"] = value
-                    else:
-                        search_params = self.search_params[type(tested_model).__name__]
-                else:
-                    search_params = self.search_params[type(tested_model).__name__]
-
-                search_space = utils.get_search_space(tested_model, search_params)
+            # if imputer.columnwise:
+            #     if len(self.selected_columns) > 0:
+            #         search_params = {}
+            #         for col in self.selected_columns:
+            #             for key, value in self.search_params[type(imputer).__name__].items():
+            #                 search_params[f"('{col}', '{key}')"] = value
+            #     else:
+            #         search_params = self.search_params[type(imputer).__name__]
+            # else:
+            #     search_params = self.search_params[type(imputer).__name__]
+
+            list_spaces = utils.get_search_space(search_params)
 
-            else:
-                search_space = None
             try:
-                dict_errors[name] = self.evaluate_errors_sample(tested_model, df, search_space)
+                dict_errors[name] = self.evaluate_errors_sample(imputer, df, list_spaces)
             except Exception as excp:
-                print("Error while testing ", type(tested_model).__name__)
+                print("Error while testing ", type(imputer).__name__)
                 raise excp
 
         df_errors = pd.DataFrame(dict_errors)
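
Note on the change in compare above: search spaces are now looked up by the entry name in dict_models (self.search_params.get(name, {})) rather than by the imputer class name, so two entries using the same imputer class can receive different search spaces. A minimal sketch of that lookup; the entry names, the placeholder imputer objects and the value format of the search spaces are illustrative, not taken from this commit:

dict_models = {"rpca_fast": "imputer_a", "rpca_slow": "imputer_b"}  # two configs of one class
search_params = {"rpca_slow": {"lam": {"min": 0.1, "max": 1.0, "type": "Real"}}}  # assumed format

for name, imputer in dict_models.items():
    # New behaviour: lookup by dictionary key; a missing key disables optimization for that entry.
    params = search_params.get(name, {})
    print(name, params)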

qolmat/benchmark/cross_validation.py

Lines changed: 49 additions & 27 deletions
@@ -1,11 +1,16 @@
+import logging
 from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 import skopt
+from skopt.space import Dimension
 
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
 
 class CrossValidation:
     """
@@ -37,16 +42,16 @@ class CrossValidation:
 
     def __init__(
         self,
-        model: any,
-        search_space: List[skopt.Space],
+        imputer: any,
+        list_spaces: List[Dimension],
         hole_generator: _HoleGenerator,
         n_calls: int = 10,
         n_jobs: int = -1,
         loss_norm: int = 1,
         verbose: bool = True,
     ):
-        self.model = model
-        self.search_space = search_space
+        self.imputer = imputer
+        self.list_spaces = list_spaces
         self.hole_generator = hole_generator
         self.n_calls = n_calls
         self.n_jobs = n_jobs
@@ -85,17 +90,26 @@ def loss_function(
         else:
             raise ValueError("loss_norm has to be 0 or 1 (int)")
 
-    def _set_params(self, all_params: Dict[str, Union[float, int, str]]):
+    def deflat_hyperparams(self, hyperparams_flat: Dict[str, Union[float, int, str]]) -> Dict:
         """
         Set the hyperparameters to the model
 
         Parameters
         ----------
-        all_params : Dict[str, Union[int, float, str]]
+        hyperparams_flat : Dict[str, Union[int, float, str]]
             dictionary containing the hyperparameters and their value
         """
-        self.model.set_params(**all_params)
-        return self
+        hyperparams = {}
+        for name_dimension, hyperparam in hyperparams_flat.items():
+            if "/" not in name_dimension:
+                hyperparams[name_dimension] = hyperparam
+            else:
+                name_hyperparam, col = name_dimension.split("/")
+                if name_hyperparam in hyperparams:
+                    hyperparams[name_hyperparam][col] = hyperparam
+                else:
+                    hyperparams[name_hyperparam] = {col: hyperparam}
+        return hyperparams
 
     def objective(self, X):
         """
@@ -106,11 +120,9 @@ def objective(self, X):
         _type_
             objective function
         """
-        @skopt.utils.use_named_args(self.search_space)
-        def obj_func(**all_params):
-            self._set_params(all_params=all_params)
-            if self.verbose:
-                print(all_params)
+        @skopt.utils.use_named_args(self.list_spaces)
+        def obj_func(**hyperparams_flat):
+            self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
 
             errors = []
 
@@ -119,7 +131,7 @@ def obj_func(**all_params):
                 df_corrupted = df_origin.copy()
                 df_corrupted[df_mask] = np.nan
                 cols_with_nans = X.columns[X.isna().any(axis=0)].tolist()
-                imputed = self.model.fit_transform(df_corrupted)
+                imputed = self.imputer.fit_transform(df_corrupted)
 
                 error = self.loss_function(
                     df_origin.loc[:, cols_with_nans],
@@ -129,14 +141,12 @@ def obj_func(**all_params):
                 errors.append(error)
 
             mean_errors = np.mean(errors)
-            if self.verbose:
-                print(mean_errors)
             return mean_errors
 
         return obj_func
 
     def fit_transform(
-        self, X: pd.DataFrame, return_hyper_params: Optional[bool] = False
+        self, df: pd.DataFrame, return_hyper_params: Optional[bool] = False
     ) -> pd.DataFrame:
         """
         Fit and transform estimator and impute the missing values.
@@ -154,24 +164,36 @@
             imputed dataframe
         """
 
-        n0 = self.n_calls//5 if (self.n_calls//5) >= 1 else self.n_calls
+        n0 = max(5, self.n_calls // 5)
+        print("---")
+        print(self.n_calls)
+        print(n0)
+
+        # res = skopt.gp_minimize(
+        #     self.objective(X=df),
+        #     dimensions=self.list_spaces,
+        #     n_calls=self.n_calls,
+        #     n_initial_points=n0,
+        #     random_state=42,
+        #     n_jobs=self.n_jobs,
+        # )
 
         res = skopt.gp_minimize(
-            self.objective(X=X),
-            dimensions=self.search_space,
+            self.objective(X=df),
+            dimensions=self.list_spaces,
             n_calls=self.n_calls,
-            n_initial_points= n0,
+            n_initial_points=n0,
             random_state=42,
             n_jobs=self.n_jobs,
        )
 
-        best_params = {
-            self.search_space[param].name: res["x"][param] for param in range(len(res["x"]))
-        }
+        hyperparams_flat = {space.name: val for space, val in zip(self.list_spaces, res["x"])}
+        print(f"Optimal hyperparameters : {hyperparams_flat}")
+        print(f"Results: {res}")
 
-        self._set_params(all_params=best_params)
-        df_imputed = self.model.fit_transform(X=X)
+        self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
+        df_imputed = self.imputer.fit_transform(df)
 
         if return_hyper_params:
-            return df_imputed, best_params
+            return df_imputed, hyperparams_flat
         return df_imputed
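
For reference, the optimization in fit_transform relies only on standard skopt machinery: dimension names (possibly containing "/") are forwarded by skopt.utils.use_named_args as keyword arguments, and the best values are read back from res["x"]. A self-contained sketch of that round trip, independent of qolmat, using an illustrative quadratic objective and made-up dimension names:

import skopt
from skopt.space import Real

# Dimension names with "/" encode per-column hyperparameters; **kwargs accepts such keys.
list_spaces = [Real(0.0, 1.0, name="lam/a"), Real(0.0, 1.0, name="lam/b")]


@skopt.utils.use_named_args(list_spaces)
def objective(**hyperparams_flat):
    # hyperparams_flat looks like {"lam/a": 0.3, "lam/b": 0.7}
    return (hyperparams_flat["lam/a"] - 0.5) ** 2 + (hyperparams_flat["lam/b"] - 0.5) ** 2


res = skopt.gp_minimize(
    objective,
    dimensions=list_spaces,
    n_calls=12,
    n_initial_points=5,
    random_state=42,
)
print({space.name: val for space, val in zip(list_spaces, res["x"])})  # expected to be near 0.5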

qolmat/benchmark/missing_patterns.py

Lines changed: 4 additions & 3 deletions
@@ -89,7 +89,7 @@ def fit(self, X: pd.DataFrame) -> _HoleGenerator:
         if self.groups == []:
             self.ngroups = None
         else:
-            self.ngroups = X.groupby(self.groups).ngroup()
+            self.ngroups = X.groupby(self.groups).ngroup().rename("_ngroup")
 
         return self
 
@@ -395,10 +395,11 @@ def __init__(
             groups=groups,
         )
 
-    def compute_distribution_holes(self, states):
+    def compute_distribution_holes(self, states: pd.Series) -> pd.Series:
         series_id = (states.diff() != 0).cumsum()
         series_id = series_id[states]
         distribution_holes = series_id.value_counts().value_counts()
+        distribution_holes.index.name = "_size_hole"
         # distribution_holes /= distribution_holes.sum()
         return distribution_holes
 
@@ -428,7 +429,7 @@ def fit(self, X: pd.DataFrame) -> EmpiricalHoleGenerator:
             distributions_holes = states.groupby(self.ngroups).apply(
                 self.compute_distribution_holes
             )
-            distributions_holes = distributions_holes.groupby(level=0).sum()
+            distributions_holes = distributions_holes.groupby(by="_size_hole").sum()
             self.dict_distributions_holes[column] = distributions_holes
 
     def sample_sizes(self, column, n_masked):
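
The renamed index matters because EmpiricalHoleGenerator.fit then aggregates the per-group distributions with groupby(by="_size_hole").sum(). A small sketch reproducing the compute_distribution_holes lines above on a toy missing-value mask:

import pandas as pd

# Toy boolean mask for one column (True = missing value).
states = pd.Series([True, True, False, True, False, False, True, True, True])

series_id = (states.diff() != 0).cumsum()  # label each contiguous run of identical values
series_id = series_id[states]              # keep only the runs that are holes
distribution_holes = series_id.value_counts().value_counts()
distribution_holes.index.name = "_size_hole"
print(distribution_holes)  # one hole each of sizes 3, 2 and 1, indexed by "_size_hole"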

0 commit comments
