scikit-learn-contrib
diff --git a/‎environment.ci.yml‎
Lines changed: 1 addition & 1 deletion b/‎environment.ci.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎environment.dev.yml‎
Lines changed: 2 additions & 2 deletions b/‎environment.dev.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/benchmark.md‎
Lines changed: 35 additions & 17 deletions b/‎examples/benchmark.md‎
Lines changed: 35 additions & 17 deletions
diff --git a/‎qolmat/benchmark/comparator.py‎
Lines changed: 16 additions & 33 deletions b/‎qolmat/benchmark/comparator.py‎
Lines changed: 16 additions & 33 deletions
@@ -9,7 +9,7 @@ dependencies:
           - flake8
           - matplotlib
           - mypy
-          - numpy==1.19
+          - numpy
           - numpydoc
           - pytest
           - pytest-cov
 
@@ -9,14 +9,14 @@ dependencies:
     - jupyter=1.0.0
     - jupyterlab=1.2.6
     - jupytext=1.14.4
-    - numpy=1.21
+    - hyperopt=0.2.7
+    - numpy=1.24.4
     - packaging=23.1
     - pandas=2.0.1
     - python=3.8
     - pip=23.0.1
     - scipy=1.10.1
     - scikit-learn=1.2.2
-    - scikit-optimize=0.9
     - sphinx=6.2.1
     - sphinx-gallery=0.13.0
     - sphinx_rtd_theme=1.2.0
 
@@ -6,7 +6,7 @@ jupyter:
       extension: .md
       format_name: markdown
       format_version: '1.3'
-      jupytext_version: 1.14.5
+      jupytext_version: 1.14.4
   kernelspec:
     display_name: env_qolmat_dev
     language: python
@@ -32,6 +32,8 @@ import pandas as pd
 from datetime import datetime
 import numpy as np
 import scipy
+import hyperopt as ho
+from hyperopt.pyll.base import Apply as hoApply
 np.random.seed(1234)
 import pprint
 from matplotlib import pyplot as plt
@@ -48,7 +50,7 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra
 
 
 import sys
-from qolmat.benchmark import comparator, missing_patterns
+from qolmat.benchmark import comparator, missing_patterns, hyperparameters
 from qolmat.benchmark.metrics import kl_divergence
 from qolmat.imputations import imputers
 from qolmat.utils import data, utils, plot
@@ -62,7 +64,8 @@ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consist
 This dataset only contains numerical variables.
 
 ```python
-df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
+df_data = data.get_data_corrupted("Beijing_offline", ratio_masked=.2, mean_size=20)
+df_data = df_data.iloc[:256]
 
 # cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
 # cols_to_impute = df_data.columns[df_data.isna().any()]
@@ -123,6 +126,10 @@ All presented methods are group-wise: here each station is imputed independently
 Some methods require hyperparameters. The user can directly specify them, or rather determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
 In practice, we rely on cross-validation to find the hyperparameter values that minimize the reconstruction error.
 
+```python
+ratio_masked = 0.1
+```
+
 ```python
 imputer_mean = imputers.ImputerMean(groups=["station"])
 imputer_median = imputers.ImputerMedian(groups=["station"])
@@ -135,7 +142,6 @@ imputer_shuffle = imputers.ImputerShuffle(groups=["station"])
 imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
 
 imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=False, max_iter=256, tau=2, lam=1)
-# imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=7, max_iter=100)
 
 imputer_ou = imputers.ImputerEM(groups=["station"], model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
 imputer_tsou = imputers.ImputerEM(groups=["station"], model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
@@ -145,7 +151,30 @@ imputer_tsmle = imputers.ImputerEM(groups=["station"], model="VAR1", method="mle
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 imputer_mice = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
 imputer_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
+```
+
+```python
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], subset=cols_to_impute, ratio_masked=ratio_masked)
+```
+
+```python
+dict_config_opti = {
+    "tau": ho.hp.uniform("tau", low=.5, high=5),
+    "lam": ho.hp.uniform("lam", low=.1, high=1),
+}
+imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=False, max_iter=256)
+imputer_rpca_opti = hyperparameters.optimize(
+    imputer_rpca_opti,
+    df_data,
+    generator = generator_holes,
+    metric="mae",
+    max_evals=10,
+    dict_config_opti=dict_config_opti
+)
+# imputer_rpca_opti.params_optim = hyperparams_opti
+```
 
+```python
 dict_imputers = {
     # "mean": imputer_mean,
     # "median": imputer_median,
@@ -158,23 +187,14 @@ dict_imputers = {
     "TSOU": imputer_tsou,
     "TSMLE": imputer_tsmle,
     "RPCA": imputer_rpca,
-    # "RPCA_opti": imputer_rpca_opti,
+    "RPCA_opti": imputer_rpca_opti,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
     # "ols": imputer_regressor,
     # "mice_ols": imputer_mice,
 }
 n_imputers = len(dict_imputers)
-
-dict_config_opti = {
-    "RPCA_opti": {
-        "tau": {"min": .5, "max": 5, "type":"Real"},
-        "lam": {"min": .1, "max": 1, "type":"Real"},
-    }
-}
-
-ratio_masked = 0.1
 ```
 
 In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.
@@ -190,14 +210,12 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note that these metrics compute reconstruction errors; they tell nothing about the distances between the "true" and "imputed" distributions.
 
 ```python
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
-
 comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes = generator_holes,
     metrics=["mae", "wmape", "KL_columnwise", "ks_test", "energy"],
-    n_calls_opt=10,
+    max_evals=10,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
 
@@ -1,10 +1,9 @@
-from functools import partial
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 import pandas as pd
 
-from qolmat.benchmark import cross_validation, metrics
+from qolmat.benchmark import hyperparameters, metrics
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
 
@@ -23,41 +22,26 @@ class Comparator:
     dict_config_opti: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
         dictionary of search space for each implementation method. By default, the value is set to
         {}.
-    n_calls_opt: int = 10
+    max_evals: int = 10
         number of calls of the optimization algorithm. By default, the value is set to
         10.
     """
 
-    dict_metrics: Dict[str, Callable] = {
-        "mse": metrics.mean_squared_error,
-        "rmse": metrics.root_mean_squared_error,
-        "mae": metrics.mean_absolute_error,
-        "wmape": metrics.weighted_mean_absolute_percentage_error,
-        "wasserstein_columnwise": partial(metrics.wasserstein_distance, method="columnwise"),
-        "KL_columnwise": partial(metrics.kl_divergence, method="columnwise"),
-        "KL_gaussian": partial(metrics.kl_divergence, method="gaussian"),
-        "ks_test": metrics.kolmogorov_smirnov_test,
-        "correlation_diff": metrics.mean_difference_correlation_matrix_numerical_features,
-        "pairwise_dist": metrics.sum_pairwise_distances,
-        "energy": metrics.sum_energy_distances,
-        "frechet": metrics.frechet_distance,
-    }
-
     def __init__(
         self,
         dict_models: Dict[str, Any],
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
         metrics: List = ["mae", "wmape", "KL_columnwise"],
         dict_config_opti: Optional[Dict[str, Any]] = {},
-        n_calls_opt: int = 10,
+        max_evals: int = 10,
     ):
         self.dict_imputers = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.metrics = metrics
         self.dict_config_opti = dict_config_opti
-        self.n_calls_opt = n_calls_opt
+        self.max_evals = max_evals
 
     def get_errors(
         self,
@@ -81,7 +65,7 @@ def get_errors(
         """
         dict_errors = {}
         for name_metric in self.metrics:
-            dict_errors[name_metric] = Comparator.dict_metrics[name_metric](
+            dict_errors[name_metric] = metrics.get_metric(name_metric)(
                 df_origin, df_imputed, df_mask
             )
         errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
@@ -114,17 +98,16 @@ def evaluate_errors_sample(
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
-            if dict_config_opti_imputer:
-                cv = cross_validation.CrossValidation(
-                    imputer,
-                    dict_config_opti_imputer=dict_config_opti_imputer,
-                    hole_generator=self.generator_holes,
-                    n_calls=self.n_calls_opt,
-                )
-                imputer.hyperparams_optim = cv.optimize_hyperparams(df_corrupted)
-            else:
-                imputer.hyperparams_optim = {}
-            df_imputed = imputer.fit_transform(df_corrupted)
+            metric_optim = "mae"
+            imputer_opti = hyperparameters.optimize(
+                imputer,
+                df,
+                self.generator_holes,
+                metric_optim,
+                dict_config_opti_imputer,
+                max_evals=self.max_evals,
+            )
+            df_imputed = imputer_opti.fit_transform(df_corrupted)
             subset = self.generator_holes.subset
             errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
             list_errors.append(errors)