Skip to content

Commit 561ddbe

Browse files
Julien RousselJulien Roussel
authored and committed
warnings removed when optimizing with verbose parameter
1 parent 3701fb0 commit 561ddbe

File tree

11 files changed

+133
-93
lines changed

11 files changed

+133
-93
lines changed

examples/benchmark.md

Lines changed: 17 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ ratio_masked = 0.1
116116
```
117117

118118
```python
119+
dict_config_opti = {}
120+
119121
imputer_mean = imputers.ImputerMean(groups=("station",))
120122
imputer_median = imputers.ImputerMedian(groups=("station",))
121123
imputer_mode = imputers.ImputerMode(groups=("station",))
@@ -127,6 +129,18 @@ imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
127129
imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
128130

129131
imputer_rpca = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
132+
imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
133+
dict_config_opti["RPCA_opti"] = {
134+
"tau": ho.hp.uniform("tau", low=.5, high=5),
135+
"lam": ho.hp.uniform("lam", low=.1, high=1),
136+
}
137+
imputer_rpca_opticw = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
138+
dict_config_opti["RPCA_opticw"] = {
139+
"tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
140+
"tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
141+
"lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
142+
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
143+
}
130144

131145
imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
132146
imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
@@ -142,40 +156,6 @@ imputer_regressor = imputers.ImputerRegressor(groups=("station",), estimator=Lin
142156
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked)
143157
```
144158

145-
```python
146-
dict_config_opti = {
147-
"tau": ho.hp.uniform("tau", low=.5, high=5),
148-
"lam": ho.hp.uniform("lam", low=.1, high=1),
149-
}
150-
imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
151-
imputer_rpca_opti = hyperparameters.optimize(
152-
imputer_rpca_opti,
153-
df_data,
154-
generator = generator_holes,
155-
metric="mae",
156-
max_evals=10,
157-
dict_spaces=dict_config_opti
158-
)
159-
```
160-
161-
```python jupyter={"source_hidden": true}
162-
dict_config_opti2 = {
163-
"tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
164-
"tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
165-
"lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
166-
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
167-
}
168-
imputer_rpca_opti2 = imputers.ImputerRPCA(groups=("station",), columnwise=True, max_iterations=256)
169-
imputer_rpca_opti2 = hyperparameters.optimize(
170-
imputer_rpca_opti2,
171-
df_data,
172-
generator = generator_holes,
173-
metric="mae",
174-
max_evals=10,
175-
dict_spaces=dict_config_opti2
176-
)
177-
```
178-
179159
```python
180160
dict_imputers = {
181161
"mean": imputer_mean,
@@ -190,7 +170,7 @@ dict_imputers = {
190170
"TSMLE": imputer_tsmle,
191171
"RPCA": imputer_rpca,
192172
"RPCA_opti": imputer_rpca_opti,
193-
# "RPCA_opti2": imputer_rpca_opti2,
173+
# "RPCA_opticw": imputer_rpca_opti2,
194174
# "locf": imputer_locf,
195175
# "nocb": imputer_nocb,
196176
# "knn": imputer_knn,
@@ -225,7 +205,7 @@ results = comparison.compare(df_data)
225205
results
226206
```
227207

228-
```python jupyter={"source_hidden": true}
208+
```python
229209
df_plot = results.loc["KL_columnwise",'TEMP']
230210
plt.barh(df_plot.index, df_plot, color=tab10(0))
231211
plt.title('TEMP')
@@ -246,7 +226,7 @@ plot.multibar(results.loc["mae"], decimals=1)
246226
plt.ylabel("mae")
247227

248228
fig.add_subplot(2, 1, 2)
249-
plot.multibar(results.loc["dist_corr_pattern"], decimals=1)
229+
plot.multibar(results.loc["dist_corr_pattern"], decimals=2)
250230
plt.ylabel("dist_corr_pattern")
251231

252232
plt.savefig("figures/imputations_benchmark_errors.png")

qolmat/benchmark/comparator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,15 @@ def __init__(
3535
metrics: List = ["mae", "wmape", "KL_columnwise"],
3636
dict_config_opti: Optional[Dict[str, Any]] = {},
3737
max_evals: int = 10,
38+
verbose: bool = False,
3839
):
3940
self.dict_imputers = dict_models
4041
self.selected_columns = selected_columns
4142
self.generator_holes = generator_holes
4243
self.metrics = metrics
4344
self.dict_config_opti = dict_config_opti
4445
self.max_evals = max_evals
46+
self.verbose = verbose
4547

4648
def get_errors(
4749
self,
@@ -106,6 +108,7 @@ def evaluate_errors_sample(
106108
metric_optim,
107109
dict_config_opti_imputer,
108110
max_evals=self.max_evals,
111+
verbose=self.verbose,
109112
)
110113
df_imputed = imputer_opti.fit_transform(df_corrupted)
111114
subset = self.generator_holes.subset

qolmat/benchmark/hyperparameters.py

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,38 @@
1111
from qolmat.benchmark import metrics
1212

1313
from qolmat.benchmark.missing_patterns import _HoleGenerator
14+
from qolmat.imputations.imputers import _Imputer
15+
from qolmat.utils.utils import HyperValue
1416

15-
HyperValue = Union[int, float, str]
1617

17-
18-
def get_objective(imputer, df, generator, metric, names_hyperparams) -> Callable:
18+
def get_objective(
19+
imputer: _Imputer,
20+
df: pd.DataFrame,
21+
generator: _HoleGenerator,
22+
metric: str,
23+
names_hyperparams: List[str],
24+
) -> Callable:
1925
"""
20-
Define the objective function for the cross-validation
26+
Define the objective function, which is the average metric computed over the folds provided by
27+
the hole generator, using a cross-validation.
28+
29+
Parameters
30+
----------
31+
imputer: _Imputer
32+
Imputer that should be optimized, it should at least have a fit_transform method and an
33+
imputer_params attribute
34+
generator: _HoleGenerator
35+
Generator creating the masked values in the nested cross validation allowing to measure the
36+
imputer performance
37+
metric: str
38+
Metric used as performance indicator, common values are `mse` and `mae`
39+
names_hyperparams: List[str]
40+
List of the names of the hyperparameters which are being optimized
2141
2242
Returns
2343
-------
24-
_type_
25-
objective function
44+
Callable[List[HyperValue], float]
45+
Objective function
2646
"""
2747

2848
def fun_obf(args: List[HyperValue]) -> float:
@@ -47,32 +67,55 @@ def fun_obf(args: List[HyperValue]) -> float:
4767
return fun_obf
4868

4969

50-
def optimize(imputer, df, generator, metric, dict_spaces, max_evals=100):
51-
"""Optimize hyperparameters
70+
def optimize(
71+
imputer: _Imputer,
72+
df: pd.DataFrame,
73+
generator: _HoleGenerator,
74+
metric: str,
75+
dict_config: Dict[str, HyperValue],
76+
max_evals: int = 100,
77+
verbose: bool = False,
78+
):
79+
"""Return the provided imputer with hyperparameters optimized in the provided range in order to
80+
minimize the provided metric.
5281
5382
Parameters
5483
----------
55-
df : pd.DataFrame
56-
DataFrame masked
84+
imputer: _Imputer
85+
Imputer that should be optimized, it should at least have a fit_transform method and an
86+
imputer_params attribute
87+
generator: _HoleGenerator
88+
Generator creating the masked values in the nested cross validation allowing to measure the
89+
imputer performance
90+
metric: str
91+
Metric used as performance indicator, common values are `mse` and `mae`
92+
dict_config: Dict[str, HyperValue]
93+
Search space for the tested hyperparameters
94+
max_evals: int
95+
Maximum number of evaluation of the performance of the algorithm. Each estimation involves
96+
one call to fit_transform per fold returned by the generator. See the n_fold attribute.
97+
verbose: bool
98+
Verbosity switch, useful for imputers that can have unstable behavior for some
99+
hyperparameters values
57100
58101
Returns
59102
-------
60-
Dict[str, Any]
61-
hyperparameters optimize flat
103+
_Imputer
104+
Optimized imputer
62105
"""
63106
imputer = copy.deepcopy(imputer)
64-
if dict_spaces == {}:
107+
if dict_config == {}:
65108
return imputer
66-
names_hyperparams = list(dict_spaces.keys())
67-
values_hyperparams = list(dict_spaces.values())
68-
imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_spaces.keys()))
109+
names_hyperparams = list(dict_config.keys())
110+
values_hyperparams = list(dict_config.values())
111+
imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_config.keys()))
112+
if verbose and hasattr(imputer, "verbose"):
113+
setattr(imputer, "verbose", False)
69114
fun_obj = get_objective(imputer, df, generator, metric, names_hyperparams)
70115
hyperparams = ho.fmin(
71116
fn=fun_obj, space=values_hyperparams, algo=ho.tpe.suggest, max_evals=max_evals
72117
)
73118

74-
# hyperparams = deflat_hyperparams(hyperparams_flat)
75119
for key, value in hyperparams.items():
76120
setattr(imputer, key, value)
77-
# imputer.hyperparams = hyperparams
78121
return imputer

qolmat/benchmark/metrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,7 @@ def get_metric(name: str) -> Callable:
919919
"wasserstein_columnwise": partial(wasserstein_distance, method="columnwise"),
920920
"KL_columnwise": partial(kl_divergence, method="columnwise"),
921921
"KL_gaussian": partial(kl_divergence, method="gaussian"),
922+
"KL_forest": partial(kl_divergence, method="random_forest"),
922923
"ks_test": kolmogorov_smirnov_test,
923924
"correlation_diff": mean_difference_correlation_matrix_numerical_features,
924925
"pairwise_dist": sum_pairwise_distances,

qolmat/imputations/em_sampler.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def __init__(
113113
stagnation_threshold: float = 5e-3,
114114
stagnation_loglik: float = 2,
115115
period: int = 1,
116+
verbose: bool = False,
116117
):
117118
if method not in ["mle", "sample"]:
118119
raise ValueError(f"`method` must be 'mle' or 'sample', provided value is '{method}'")
@@ -131,6 +132,7 @@ def __init__(
131132

132133
self.dict_criteria_stop: Dict[str, List] = {}
133134
self.period = period
135+
self.verbose = verbose
134136

135137
def _convert_numpy(self, X: NDArray) -> NDArray:
136138
"""
@@ -248,6 +250,8 @@ class MultiNormalEM(EM):
248250
dt : float
249251
Process integration time step, a large value increases the sample bias and can make
250252
the algorithm unstable, but compensates for a smaller n_iter_ou. By default, 2e-2.
253+
verbose: bool
254+
default `False`
251255
252256
Attributes
253257
----------
@@ -280,6 +284,7 @@ def __init__(
280284
stagnation_threshold: float = 5e-3,
281285
stagnation_loglik: float = 2,
282286
period: int = 1,
287+
verbose: bool = False,
283288
) -> None:
284289
super().__init__(
285290
method=method,
@@ -292,6 +297,7 @@ def __init__(
292297
stagnation_threshold=stagnation_threshold,
293298
stagnation_loglik=stagnation_loglik,
294299
period=period,
300+
verbose=verbose,
295301
)
296302
self.dict_criteria_stop = {"logliks": [], "means": [], "covs": []}
297303

@@ -473,6 +479,8 @@ class VAR1EM(EM):
473479
dt : float
474480
Process integration time step, a large value increases the sample bias and can make
475481
the algorithm unstable, but compensates for a smaller n_iter_ou. By default, 2e-2.
482+
verbose: bool
483+
default `False`
476484
477485
Attributes
478486
----------
@@ -505,6 +513,7 @@ def __init__(
505513
stagnation_threshold: float = 5e-3,
506514
stagnation_loglik: float = 2,
507515
period: int = 1,
516+
verbose: bool = False,
508517
) -> None:
509518
super().__init__(
510519
method=method,
@@ -517,6 +526,7 @@ def __init__(
517526
stagnation_threshold=stagnation_threshold,
518527
stagnation_loglik=stagnation_loglik,
519528
period=period,
529+
verbose=verbose,
520530
)
521531

522532
def fit_parameter_A(self, X):

0 commit comments

Comments
 (0)