scikit-learn-contrib
diff --git a/‎.flake8‎
Lines changed: 1 addition & 1 deletion b/‎.flake8‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 23 additions & 24 deletions b/‎.github/workflows/test.yml‎
Lines changed: 23 additions & 24 deletions
diff --git a/‎docs/examples/imputation_example.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/examples/imputation_example.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/benchmark.md‎
Lines changed: 7 additions & 6 deletions b/‎examples/benchmark.md‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎examples/metrics_usage.md‎
Lines changed: 3 additions & 3 deletions b/‎examples/metrics_usage.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎qolmat/benchmark/comparator.py‎
Lines changed: 23 additions & 55 deletions b/‎qolmat/benchmark/comparator.py‎
Lines changed: 23 additions & 55 deletions
@@ -1,7 +1,7 @@
 [flake8]
 exclude = .git,__pycache__,.vscode,tests
 max-line-length=99
-ignore=E302,E305,W503,E203,E731,E402,E501,E266,E712,F401,F821
+ignore=E302,E305,W503,E203,E731,E402,E266,E712,F401,F821
 indent-size = 4
 per-file-ignores=
     qolmat/imputations/imputers.py:F401
 
@@ -1,37 +1,36 @@
 name: Unit test Qolmat
 
-on: [push, pull_request,workflow_dispatch]
-
+on: [push, pull_request, workflow_dispatch]
 
 jobs:
   build-linux:
     runs-on: ${{matrix.os}}
     strategy:
       matrix:
-        os: [ubuntu-latest,windows-latest]
-        python-version: [3.8,3.9]
+        os: [ubuntu-latest, windows-latest]
+        python-version: [3.8, 3.9]
     defaults:
       run:
         shell: bash -l {0}
 
     steps:
-    - name: Git clone
-      uses: actions/checkout@v3
-    - name: Set up venv for ci
-      uses: conda-incubator/setup-miniconda@v2
-      with:
-        python-version: ${{matrix.python-version}}
-        channels: default, conda-forge
-    - name: Lint with flake8
-      run: |
-        conda install flake8
-        flake8
-    - name: Test with pytest
-      run: |
-        conda install pytest
-        #pytest
-        echo you should uncomment pytest and delete this line
-    - name: typing with mypy
-      run: |
-        #mypy qolmat
-        echo you should uncomment mypy qolmat and delete this line
+      - name: Git clone
+        uses: actions/checkout@v3
+      - name: Set up venv for ci
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: ${{matrix.python-version}}
+          channels: default, conda-forge
+      - name: Lint with flake8
+        run: |
+          conda install flake8
+          flake8
+      - name: Test with pytest
+        run: |
+          conda install pytest
+          #pytest
+          echo you should uncomment pytest and delete this line
+      - name: typing with mypy
+        run: |
+          #mypy qolmat
+          echo you should uncomment mypy qolmat and delete this line
@@ -53,7 +53,7 @@ Some methods take arguments. For instance, if we want to impute by the mean, we
 * Here, in the :class:`ImputerMean` , we specify :class:`groups=["datetime.dt.month", "datetime.dt.dayofweek"]`, which means  the method will first use a groupby operation (via :class:`pd.DataFrame.groupby`) and then impute missing values with the mean of their corresponding group. 
 * For the :class:`ImputeInterpolation`, the method can be anything supported by :class:`pd.Series.interpolate`; hence for :class:`spline` and :class:`polynomial`, we have to provide an :class:`order`. 
 * For the :class:`ImputerRPCA`, we first need to specify the :class:`method`, i.e. :class:`PCP`, :class:`Temporal` or :class:`Online`. It is also mandatory to mention if we deal with multivariate or not. Finally, there is a set of hyperparameters that can be specify.  See the doc "Focus on RPCA" for more information. 
-* For the :class:`ImputerEM`, we can specify the maximum number of iterations or the strategy used, i.e. "sample" or "argmax" (By default, "sample"). See the doc "Focus on EM Sampler" for more information. 
+* For the :class:`ImputerEM`, we can specify the maximum number of iterations or the method used, i.e. "sample" or "mle" (By default, "sample"). See the doc "Focus on EM Sampler" for more information. 
 * For the :class:`ImputerIterative`, we can specify the regression model to use, with its own hyperparameters. 
 * For the :class:`ImputerRegressor`, we can specify the regression model to use, with its own hyperparameters as well as the name of the columns to impute. 
 
 
@@ -48,7 +48,7 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra
 
 import sys
 from qolmat.benchmark import comparator, missing_patterns
-from qolmat.benchmark.utils import kl_divergence
+from qolmat.benchmark.metrics import kl_divergence
 from qolmat.imputations import imputers
 from qolmat.utils import data, utils, plot
 # from qolmat.drawing import display_bar_table
@@ -132,9 +132,9 @@ imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, mode
 imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=200, tau=2, lam=.3)
 imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)
 
-imputer_ou = imputers.ImputerEM(groups=["station"], method="multinormal", strategy="ou", max_iter_em=34, n_iter_ou=15, dt=1e-3)
-imputer_tsou = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15, dt=1e-3)
-imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15, dt=1e-3)
+imputer_ou = imputers.ImputerEM(groups=["station"], model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
+imputer_tsou = imputers.ImputerEM(groups=["station"], model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
+imputer_tsmle = imputers.ImputerEM(groups=["station"], model="VAR1", method="mle", max_iter_em=34, n_iter_ou=15, dt=1e-3)
 
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
@@ -191,6 +191,7 @@ comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes = generator_holes,
+    metrics=["mae", "wmape", "KL"],
     n_calls_opt=10,
     search_params=search_params,
 )
@@ -205,8 +206,8 @@ plot.multibar(results.loc["mae"], decimals=1)
 plt.ylabel("mae")
 
 fig.add_subplot(2, 1, 2)
-plot.multibar(results.loc["energy"], decimals=1)
-plt.ylabel("energy")
+plot.multibar(results.loc["KL"], decimals=1)
+plt.ylabel("KL")
 plt.show()
 ```
 
 
@@ -62,9 +62,9 @@ imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, mode
 imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=200, tau=2, lam=.3)
 imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)
 
-imputer_ou = imputers.ImputerEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
-imputer_tsou = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
-imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
+imputer_ou = imputers.ImputerEM(groups=["station"], model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15)
+imputer_tsou = imputers.ImputerEM(groups=["station"], model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15)
+imputer_tsmle = imputers.ImputerEM(groups=["station"], model="VAR1", method="mle", max_iter_em=34, n_iter_ou=15)
 
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 
@@ -4,8 +4,7 @@
 import numpy as np
 import pandas as pd
 
-from qolmat.benchmark import cross_validation, utils
-from qolmat.benchmark import metrics as mtr
+from qolmat.benchmark import cross_validation, metrics, utils
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
 
@@ -30,30 +29,32 @@ class Comparator:
     """
 
     dict_metrics: Dict[str, Any] = {
-        "mse": mtr.mean_squared_error,
-        "rmse": mtr.root_mean_squared_error,
-        "mae": mtr.mean_absolute_error,
-        "wmape": mtr.weighted_mean_absolute_percentage_error,
-        "wasser": mtr.wasser_distance,
-        "KL": mtr.kl_divergence,
-        "ks_test": mtr.kolmogorov_smirnov_test,
-        "correlation_diff": mtr.mean_difference_correlation_matrix_numerical_features,
-        "pairwise_dist": mtr.sum_pairwise_distances,
-        "energy": mtr.sum_energy_distances,
-        "frechet": mtr.frechet_distance,
+        "mse": metrics.mean_squared_error,
+        "rmse": metrics.root_mean_squared_error,
+        "mae": metrics.mean_absolute_error,
+        "wmape": metrics.weighted_mean_absolute_percentage_error,
+        "wasser": metrics.wasser_distance,
+        "KL": metrics.kl_divergence_columnwise,
+        "ks_test": metrics.kolmogorov_smirnov_test,
+        "correlation_diff": metrics.mean_difference_correlation_matrix_numerical_features,
+        "pairwise_dist": metrics.sum_pairwise_distances,
+        "energy": metrics.sum_energy_distances,
+        "frechet": metrics.frechet_distance,
     }
 
     def __init__(
         self,
         dict_models: Dict[str, Any],
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
+        metrics: List = ["mae", "wmape", "KL"],
         search_params: Optional[Dict[str, Dict[str, Union[float, int, str]]]] = {},
         n_calls_opt: int = 10,
     ):
         self.dict_imputers = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
+        self.metrics = metrics
         self.search_params = search_params
         self.n_calls_opt = n_calls_opt
 
@@ -62,8 +63,6 @@ def get_errors(
         df_origin: pd.DataFrame,
         df_imputed: pd.DataFrame,
         df_mask: pd.DataFrame,
-        metrics: List = ["mae", "wmape", "kl"],
-        on_mask=True,
     ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
@@ -79,14 +78,11 @@ def get_errors(
         dictionary
             dictionay of results obtained via different metrics
         """
-
-        # TODO comment comparer la distribution initiale et la distribution générée, pas la même taille,
-        # ne fonctionne pas avec les métriques actuelles
-
         dict_errors = {}
-        for name_metric in metrics:
-            dict_errors[name_metric] = Comparator.dict_metrics[name_metric](df_origin, df_imputed)
-
+        for name_metric in self.metrics:
+            dict_errors[name_metric] = Comparator.dict_metrics[name_metric](
+                df_origin, df_imputed, df_mask
+            )
         errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
         return errors
 
@@ -95,8 +91,6 @@ def evaluate_errors_sample(
         imputer: Any,
         df: pd.DataFrame,
         list_spaces: List[Dict] = [],
-        metrics: List = ["mae", "wmape", "kl"],
-        on_mask=True,
     ) -> pd.Series:
         """Evaluate the errors in the cross-validation
 
@@ -114,7 +108,6 @@ def evaluate_errors_sample(
         pd.DataFrame
             DataFrame with the errors for each metric (in column) and at each fold (in index)
         """
-
         list_errors = []
         df_origin = df[self.selected_columns].copy()
         for df_mask in self.generator_holes.split(df_origin):
@@ -130,11 +123,8 @@ def evaluate_errors_sample(
                 df_imputed = cv.fit_transform(df_corrupted)
             else:
                 df_imputed = imputer.fit_transform(df_corrupted)
-
             subset = self.generator_holes.subset
-            errors = self.get_errors(
-                df_origin[subset], df_imputed[subset], df_mask[subset], metrics, on_mask
-            )
+            errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
             list_errors.append(errors)
         df_errors = pd.DataFrame(list_errors)
         errors_mean = df_errors.mean(axis=0)
@@ -144,9 +134,6 @@ def evaluate_errors_sample(
     def compare(
         self,
         df: pd.DataFrame,
-        verbose: bool = True,
-        metrics: List = ["mae", "wmape", "KL"],
-        on_mask=True,
     ):
         """Function to compare different imputation methods on dataframe df
 
@@ -164,15 +151,12 @@ def compare(
         dict_errors = {}
 
         for name, imputer in self.dict_imputers.items():
-
             search_params = self.search_params.get(name, {})
 
             list_spaces = utils.get_search_space(search_params)
 
             try:
-                dict_errors[name] = self.evaluate_errors_sample(
-                    imputer, df, list_spaces, metrics, on_mask
-                )
+                dict_errors[name] = self.evaluate_errors_sample(imputer, df, list_spaces)
                 print(f"Tested model: {type(imputer).__name__}")
             except Exception as excp:
                 print("Error while testing ", type(imputer).__name__)
@@ -185,25 +169,12 @@ def compare(
 
 class ComparatorBasedPattern(Comparator):
 
-    dict_metrics: Dict[str, Any] = {
-        "mse": mtr.mean_squared_error,
-        "rmse": mtr.root_mean_squared_error,
-        "mae": mtr.mean_absolute_error,
-        "wmape": mtr.weighted_mean_absolute_percentage_error,
-        "wasser": mtr.wasser_distance,
-        "KL": mtr.kl_divergence,
-        "ks_test": mtr.kolmogorov_smirnov_test,
-        "correlation_diff": mtr.mean_difference_correlation_matrix_numerical_features,
-        "pairwise_dist": mtr.sum_pairwise_distances,
-        "energy": mtr.sum_energy_distances,
-        "frechet": mtr.frechet_distance,
-    }
-
     def __init__(
         self,
         dict_models: Dict[str, Any],
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
+         metrics: List = ["mae", "wmape", "KL"],
         search_params: Optional[Dict[str, Dict[str, Union[float, int, str]]]] = {},
         n_calls_opt: int = 10,
         num_patterns: int = 5,
@@ -212,6 +183,7 @@ def __init__(
             dict_models=dict_models,
             selected_columns=selected_columns,
             generator_holes=generator_holes,
+            metrics=metrics,
             search_params=search_params,
             n_calls_opt=n_calls_opt,
         )
@@ -223,8 +195,6 @@ def evaluate_errors_sample(
         imputer: Any,
         df: pd.DataFrame,
         list_spaces: List[Dict] = [],
-        metrics: List = ["mae", "wmape", "KL"],
-        on_mask=True,
     ) -> pd.Series:
         """Evaluate the errors in the cross-validation
 
@@ -270,9 +240,7 @@ def evaluate_errors_sample(
 
                 subset = self.generator_holes.subset  # columns selected
                 subset = [col for col in subset if col in cols_pattern]
-                errors = self.get_errors(
-                    df_pattern[subset], df_imputed[subset], df_mask[subset], metrics, on_mask
-                )
+                errors = self.get_errors(df_pattern[subset], df_imputed[subset], df_mask[subset])
                 list_errors.append(errors)
 
         df_errors = pd.DataFrame(list_errors)