scikit-learn-contrib
diff --git a/‎examples/RPCA.md‎
Lines changed: 6 additions & 3 deletions b/‎examples/RPCA.md‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎examples/benchmark.md‎
Lines changed: 23 additions & 46 deletions b/‎examples/benchmark.md‎
Lines changed: 23 additions & 46 deletions
diff --git a/‎qolmat/benchmark/comparator.py‎
Lines changed: 3 additions & 0 deletions b/‎qolmat/benchmark/comparator.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎qolmat/benchmark/hyperparameters.py‎
Lines changed: 61 additions & 23 deletions b/‎qolmat/benchmark/hyperparameters.py‎
Lines changed: 61 additions & 23 deletions
diff --git a/‎qolmat/benchmark/metrics.py‎
Lines changed: 1 addition & 0 deletions b/‎qolmat/benchmark/metrics.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎qolmat/benchmark/missing_patterns.py‎
Lines changed: 2 additions & 4 deletions b/‎qolmat/benchmark/missing_patterns.py‎
Lines changed: 2 additions & 4 deletions
@@ -46,7 +46,10 @@ X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalie
 signal = X_true + A_true + E_true
 
 # Adding missing data
-signal[5:20] = np.nan
+#signal[5:20] = np.nan
+mask = np.random.choice(len(signal), round(len(signal) / 20))
+signal[mask] = np.nan
+
 ```
 
 ```python
@@ -74,7 +77,7 @@ plt.show()
 
 ```python
 %%time
-rpca_pcp = RPCAPCP(period=100, max_iterations=5, mu=.5, lam=1)
+rpca_pcp = RPCAPCP(period=100, max_iterations=100, mu=.5, lam=0.1)
 X, A = rpca_pcp.decompose_rpca_signal(signal)
 imputed = signal - A
 ```
@@ -89,7 +92,7 @@ plt.plot(imputed)
 
 ```python
 %%time
-rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[10], list_etas=[0.01], norm="L2")
+rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, list_periods=[10], list_etas=[0.01], norm="L2")
 X, A = rpca_noisy.decompose_rpca_signal(signal)
 ```
 
 
@@ -116,6 +116,8 @@ ratio_masked = 0.1
 ```
 
 ```python
+dict_config_opti = {}
+
 imputer_mean = imputers.ImputerMean(groups=("station",))
 imputer_median = imputers.ImputerMedian(groups=("station",))
 imputer_mode = imputers.ImputerMode(groups=("station",))
@@ -126,7 +128,19 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
 imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
 imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
 
-imputer_rpca = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256, tau=2, lam=1)
+imputer_rpca = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
+imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
+dict_config_opti["RPCA_opti"] = {
+    "tau": ho.hp.uniform("tau", low=.5, high=5),
+    "lam": ho.hp.uniform("lam", low=.1, high=1),
+}
+imputer_rpca_opticw = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
+dict_config_opti["RPCA_opticw"] = {
+    "tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
+    "tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
+    "lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
+    "lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
+}
 
 imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
 imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
@@ -142,41 +156,6 @@ imputer_regressor = imputers.ImputerRegressor(groups=("station",), estimator=Lin
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked)
 ```
 
-```python
-dict_config_opti = {
-    "tau": ho.hp.uniform("tau", low=.5, high=5),
-    "lam": ho.hp.uniform("lam", low=.1, high=1),
-}
-imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
-imputer_rpca_opti = hyperparameters.optimize(
-    imputer_rpca_opti,
-    df_data,
-    generator = generator_holes,
-    metric="mae",
-    max_evals=10,
-    dict_spaces=dict_config_opti
-)
-# imputer_rpca_opti.params_optim = hyperparams_opti
-```
-
-```python
-dict_config_opti2 = {
-    "tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
-    "tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
-    "lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
-    "lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
-}
-imputer_rpca_opti2 = imputers.ImputerRPCA(groups=("station",), columnwise=True, max_iterations=256)
-imputer_rpca_opti2 = hyperparameters.optimize(
-    imputer_rpca_opti2,
-    df_data,
-    generator = generator_holes,
-    metric="mae",
-    max_evals=10,
-    dict_spaces=dict_config_opti2
-)
-```
-
 ```python
 dict_imputers = {
     "mean": imputer_mean,
@@ -189,9 +168,9 @@ dict_imputers = {
     # "OU": imputer_ou,
     "TSOU": imputer_tsou,
     "TSMLE": imputer_tsmle,
-    # "RPCA": imputer_rpca,
-    # "RPCA_opti": imputer_rpca_opti,
-    # "RPCA_opti2": imputer_rpca_opti2,
+    "RPCA": imputer_rpca,
+    "RPCA_opti": imputer_rpca_opti,
+    # "RPCA_opticw": imputer_rpca_opticw,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
@@ -218,7 +197,7 @@ comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes = generator_holes,
-    metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
+    metrics=["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"],
     max_evals=10,
     dict_config_opti=dict_config_opti,
 )
@@ -230,11 +209,13 @@ results
 df_plot = results.loc["KL_columnwise",'TEMP']
 plt.barh(df_plot.index, df_plot, color=tab10(0))
 plt.title('TEMP')
+plt.xlabel("KL")
 plt.show()
 
 df_plot = results.loc["KL_columnwise",'PRES']
 plt.barh(df_plot.index, df_plot, color=tab10(0))
 plt.title('PRES')
+plt.xlabel("KL")
 plt.show()
 ```
 
@@ -245,8 +226,8 @@ plot.multibar(results.loc["mae"], decimals=1)
 plt.ylabel("mae")
 
 fig.add_subplot(2, 1, 2)
-plot.multibar(results.loc["KL_columnwise"], decimals=1)
-plt.ylabel("KL")
+plot.multibar(results.loc["dist_corr_pattern"], decimals=2)
+plt.ylabel("dist_corr_pattern")
 
 plt.savefig("figures/imputations_benchmark_errors.png")
 plt.show()
@@ -294,10 +275,6 @@ for col in cols_to_impute:
 
 ```
 
-```python
-dfs_imputed
-```
-
 ```python
 # plot.plot_imputations(df_station, dfs_imputed_station)
 
 
@@ -35,13 +35,15 @@ def __init__(
         metrics: List = ["mae", "wmape", "KL_columnwise"],
         dict_config_opti: Optional[Dict[str, Any]] = {},
         max_evals: int = 10,
+        verbose: bool = False,
     ):
         self.dict_imputers = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.metrics = metrics
         self.dict_config_opti = dict_config_opti
         self.max_evals = max_evals
+        self.verbose = verbose
 
     def get_errors(
         self,
@@ -106,6 +108,7 @@ def evaluate_errors_sample(
                 metric_optim,
                 dict_config_opti_imputer,
                 max_evals=self.max_evals,
+                verbose=self.verbose,
             )
             df_imputed = imputer_opti.fit_transform(df_corrupted)
             subset = self.generator_holes.subset
 
@@ -1,5 +1,4 @@
 import copy
-import logging
 from typing import Any, Callable, Dict, List, Union
 
 import numpy as np
@@ -12,21 +11,38 @@
 from qolmat.benchmark import metrics
 
 from qolmat.benchmark.missing_patterns import _HoleGenerator
+from qolmat.imputations.imputers import _Imputer
+from qolmat.utils.utils import HyperValue
 
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
 
-HyperValue = Union[int, float, str]
-
-
-def get_objective(imputer, df, generator, metric, names_hyperparams) -> Callable:
+def get_objective(
+    imputer: _Imputer,
+    df: pd.DataFrame,
+    generator: _HoleGenerator,
+    metric: str,
+    names_hyperparams: List[str],
+) -> Callable:
     """
-    Define the objective function for the cross-validation
+    Define the objective function, which is the average metric computed over the folds provided by
+    the hole generator, using cross-validation.
+
+    Parameters
+    ----------
+    imputer: _Imputer
+        Imputer that should be optimized, it should at least have a fit_transform method and an
+        imputer_params attribute
+    generator: _HoleGenerator
+        Generator creating the masked values in the nested cross-validation, used to measure the
+        imputer performance
+    metric: str
+        Metric used as performance indicator, common values are `mse` and `mae`
+    names_hyperparams: List[str]
+        List of the names of the hyperparameters which are being optimized
 
     Returns
     -------
-    _type_
-        objective function
+    Callable[[List[HyperValue]], float]
+        Objective function
     """
 
     def fun_obf(args: List[HyperValue]) -> float:
@@ -39,7 +55,6 @@ def fun_obf(args: List[HyperValue]) -> float:
             df_origin = df.copy()
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
-
             df_imputed = imputer.fit_transform(df_corrupted)
             subset = generator.subset
             fun_metric = metrics.get_metric(metric)
@@ -52,32 +67,55 @@ def fun_obf(args: List[HyperValue]) -> float:
     return fun_obf
 
 
-def optimize(imputer, df, generator, metric, dict_spaces, max_evals=100):
-    """Optimize hyperparamaters
+def optimize(
+    imputer: _Imputer,
+    df: pd.DataFrame,
+    generator: _HoleGenerator,
+    metric: str,
+    dict_config: Dict[str, HyperValue],
+    max_evals: int = 100,
+    verbose: bool = False,
+):
+    """Return the provided imputer with hyperparameters optimized in the provided range in order to
+     minimize the provided metric.
 
     Parameters
     ----------
-    df : pd.DataFrame
-        DataFrame masked
+    imputer: _Imputer
+        Imputer that should be optimized, it should at least have a fit_transform method and an
+        imputer_params attribute
+    generator: _HoleGenerator
+        Generator creating the masked values in the nested cross-validation, used to measure the
+        imputer performance
+    metric: str
+        Metric used as performance indicator, common values are `mse` and `mae`
+    dict_config: Dict[str, HyperValue]
+        Search space for the tested hyperparameters
+    max_evals: int
+        Maximum number of evaluations of the performance of the algorithm. Each estimation involves
+        one call to fit_transform per fold returned by the generator. See the n_fold attribute.
+    verbose: bool
+        Verbosity switch, useful for imputers that can have unstable behavior for some
+        hyperparameter values
 
     Returns
     -------
-    Dict[str, Any]
-        hyperparameters optimize flat
+    _Imputer
+        Optimized imputer
     """
     imputer = copy.deepcopy(imputer)
-    if dict_spaces == {}:
+    if dict_config == {}:
         return imputer
-    names_hyperparams = list(dict_spaces.keys())
-    values_hyperparams = list(dict_spaces.values())
-    imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_spaces.keys()))
+    names_hyperparams = list(dict_config.keys())
+    values_hyperparams = list(dict_config.values())
+    imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_config.keys()))
+    if verbose and hasattr(imputer, "verbose"):
+        setattr(imputer, "verbose", False)
     fun_obj = get_objective(imputer, df, generator, metric, names_hyperparams)
     hyperparams = ho.fmin(
         fn=fun_obj, space=values_hyperparams, algo=ho.tpe.suggest, max_evals=max_evals
     )
 
-    # hyperparams = deflat_hyperparams(hyperparams_flat)
     for key, value in hyperparams.items():
         setattr(imputer, key, value)
-    # imputer.hyperparams = hyperparams
     return imputer
@@ -919,6 +919,7 @@ def get_metric(name: str) -> Callable:
         "wasserstein_columnwise": partial(wasserstein_distance, method="columnwise"),
         "KL_columnwise": partial(kl_divergence, method="columnwise"),
         "KL_gaussian": partial(kl_divergence, method="gaussian"),
+        "KL_forest": partial(kl_divergence, method="random_forest"),
         "ks_test": kolmogorov_smirnov_test,
         "correlation_diff": mean_difference_correlation_matrix_numerical_features,
         "pairwise_dist": sum_pairwise_distances,
 
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import functools
-import logging
 from typing import Callable, List, Optional, Tuple, Union
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -11,8 +11,6 @@
 
 from qolmat.utils.exceptions import NoMissingValue, SubsetIsAString
 
-logger = logging.getLogger(__name__)
-
 
 def compute_transition_counts_matrix(states: pd.Series):
     if isinstance(states.iloc[0], tuple):
@@ -305,7 +303,7 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
                     break
 
         if list_failed:
-            logger.warning(f"No place to introduce sampled holes of size {list_failed}!")
+            warnings.warn(f"No place to introduce sampled holes of size {list_failed}!")
         return mask