scikit-learn-contrib
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/1_timeSeries.ipynb‎
Lines changed: 277 additions & 0 deletions b/‎examples/1_timeSeries.ipynb‎
Lines changed: 277 additions & 0 deletions
diff --git a/‎examples/test.py‎
Lines changed: 55 additions & 0 deletions b/‎examples/test.py‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎qolmat/benchmark/comparator.py‎
Lines changed: 50 additions & 38 deletions b/‎qolmat/benchmark/comparator.py‎
Lines changed: 50 additions & 38 deletions
@@ -6,6 +6,7 @@ __pycache__/
 /figures
 qolmat/notebooks/figures
 qolmat/notebooks/*.ipynb
+qolmat/examples/*.ipynb
 *.egg-info
 /dist
 /build
 
@@ -0,0 +1,55 @@
+"""Example: online temporal RPCA on the sum of two corrupted synthetic sinusoids."""
+
+import numpy as np
+import timesynth as ts # package for generating time series
+
+import matplotlib.pyplot as plt
+
+from qolmat.utils import plot
+from qolmat.imputations.rpca.pcp_rpca import PcpRPCA
+from qolmat.imputations.rpca.temporal_rpca import TemporalRPCA, OnlineTemporalRPCA
+np.random.seed(402)
+
+# Fraction of points overwritten with anomalies, and again with missing values.
+ANOMALY_RATIO = 0.02
+
+
+def make_corrupted_sinusoid(frequency, noise_std, ratio=ANOMALY_RATIO):
+    """Sample a noisy sinusoid, then write anomalies and NaNs into its samples.
+
+    Returns the ``(samples, signals, errors)`` triple produced by timesynth,
+    after anomalies and NaNs have been written into ``samples``.
+    """
+    time_sampler = ts.TimeSampler(stop_time=20)
+    times = time_sampler.sample_irregular_time(num_points=5_000, keep_percentage=100)
+    sinusoid = ts.signals.Sinusoidal(frequency=frequency)
+    white_noise = ts.noise.GaussianNoise(std=noise_std)
+    timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)
+    samples, signals, errors = timeseries.sample(times)
+
+    # Every draw is sized from this series' own length, so index and value
+    # counts always agree.
+    n_samples = len(samples)
+    n_corrupted = int(n_samples * ratio)
+    # Indices are drawn with replacement, so a few may coincide.
+    indices_anomalies = np.random.choice(n_samples, n_corrupted)
+    low, high = 2 * np.min(samples), 2 * np.max(samples)
+    samples[indices_anomalies] = np.random.uniform(low=low, high=high, size=n_corrupted)
+    indices_missing = np.random.choice(n_samples, n_corrupted)
+    samples[indices_missing] = np.nan
+    return samples, signals, errors
+
+
+samples, signals, errors = make_corrupted_sinusoid(frequency=2, noise_std=0.1)
+samples2, signals2, errors2 = make_corrupted_sinusoid(frequency=3, noise_std=0)
+
+# The studied series is the element-wise sum of the two components.
+samples += samples2
+signals += signals2
+errors += errors2
+
+# Run the online temporal RPCA on the summed series and plot the input with both outputs.
+online_temp_rpca = OnlineTemporalRPCA(n_rows=25, tau=1, lam=0.3, list_periods=[20], list_etas=[0.01],
+                       burnin=0.2, online_list_etas=[0.3], nwin=20)
+X, A = online_temp_rpca.fit_transform(X=samples)
+plot.plot_signal([samples, X, A], style="matplotlib")
@@ -1,4 +1,5 @@
-from typing import Dict, List, Optional
+import logging
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -22,31 +23,29 @@ class Comparator:
     search_params: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
         dictionary of search space for each implementation method. By default, the value is set to
         {}.
-    n_cv_calls: Optional[int] = 10
-        number of calls of the hyperparameters cross-validation. By default, the value is set to
+    n_calls_opt: Optional[int] = 10
+        number of calls of the optimization algorithm. By default, the value is set to
         10.
     """
 
     def __init__(
         self,
-        dict_models: Dict,
+        dict_models: Dict[str, Any],
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
-        columnwise_evaluation: Optional[bool] = True,
-        search_params: Optional[Dict] = {},
-        n_cv_calls: Optional[int] = 10,
+        search_params: Optional[Dict[str, Dict[str, Union[float, int, str]]]] = None,
+        n_calls_opt: Optional[int] = 10,
     ):
 
         self.dict_models = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
-        self.columnwise_evaluation = columnwise_evaluation
-        self.search_params = search_params
-        self.n_cv_calls = n_cv_calls
+        self.search_params = {} if search_params is None else search_params  # None -> new dict
+        self.n_calls_opt = n_calls_opt
 
     def get_errors(
         self, df_origin: pd.DataFrame, df_imputed: pd.DataFrame, df_mask: pd.DataFrame
-    ) -> float:
+    ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
         Parameters
@@ -75,27 +74,18 @@ def get_errors(
             df_origin[df_mask],
             df_imputed[df_mask],
         )
+
         dict_errors["kl"] = utils.kl_divergence(
             df_origin[df_mask],
             df_imputed[df_mask],
         )
-        # if self.columnwise_evaluation:
-        #     wd = utils.wasser_distance(
-        #         df_origin,
-        #         df_imputed,
-        #     )
-        # if not self.columnwise_evaluation and df_origin.shape[1] > 1:
-        #     frechet = utils.frechet_distance(
-        #         df_origin,
-        #         df_imputed,
-        #         normalized=False,
-        #     )
+
         errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
         return errors
 
     def evaluate_errors_sample(
-        self, tested_model: any, df: pd.DataFrame, search_space: Optional[dict] = None
-    ) -> Dict:
+        self, imputer: any, df: pd.DataFrame, list_spaces: List[Dict] = {}
+    ) -> pd.Series:
         """Evaluate the errors in the cross-validation
 
         Parameters
@@ -104,8 +94,8 @@ def evaluate_errors_sample(
             imputation model
         df : pd.DataFrame
             dataframe to impute
-        search_space : Optional[dict], optional
-            search space for tested_model's hyperparameters, by default None
+        list_spaces : List[Dict]
+            search spaces for the imputer's hyperparameters (empty: no optimization)
 
         Returns
         -------
@@ -114,25 +104,30 @@ def evaluate_errors_sample(
         """
         list_errors = []
         df_origin = df[self.selected_columns].copy()
+        if list_spaces:
+            print("Hyperparameter optimization")
+            print(list_spaces)
+        else:
+            print("No hyperparameter optimization")
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
-            if search_space is None:
-                df_imputed = tested_model.fit_transform(df_corrupted)
-            else:
+            if list_spaces:
                 cv = cross_validation.CrossValidation(
-                    tested_model,
-                    search_space=search_space,
+                    imputer,
+                    list_spaces=list_spaces,
                     hole_generator=self.generator_holes,
-                    n_calls=self.n_cv_calls,
+                    n_calls=self.n_calls_opt,
                 )
                 df_imputed = cv.fit_transform(df_corrupted)
+            else:
+                df_imputed = imputer.fit_transform(df_corrupted)
 
             subset = self.generator_holes.subset
             errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
             list_errors.append(errors)
         df_errors = pd.DataFrame(list_errors)
-        errors_mean = df_errors.mean()
+        errors_mean = df_errors.mean(axis=0)
 
         return errors_mean
 
@@ -151,13 +146,30 @@ def compare(self, df: pd.DataFrame, verbose: bool = True):
         """
 
         dict_errors = {}
-        for name, tested_model in self.dict_models.items():
-            if verbose:
-                print(type(tested_model).__name__)
-
-            search_space = utils.get_search_space(tested_model, self.search_params)
 
-            dict_errors[name] = self.evaluate_errors_sample(tested_model, df, search_space)
+        for name, imputer in self.dict_models.items():
+            print(f"Tested model: {type(imputer).__name__}")
+
+            search_params = self.search_params.get(name, {})
+            
+            # if imputer.columnwise:
+            #     if len(self.selected_columns) > 0:
+            #         search_params = {}
+            #         for col in self.selected_columns:
+            #             for key, value in self.search_params[type(imputer).__name__].items():
+            #                 search_params[f"('{col}', '{key}')"] = value
+            #     else:
+            #         search_params = self.search_params[type(imputer).__name__]
+            # else:
+            #     search_params = self.search_params[type(imputer).__name__]
+
+            list_spaces = utils.get_search_space(search_params)
+
+            try:
+                dict_errors[name] = self.evaluate_errors_sample(imputer, df, list_spaces)
+            except Exception as excp:
+                print("Error while testing ", type(imputer).__name__)
+                raise excp
 
         df_errors = pd.DataFrame(dict_errors)