Commit ac1460a

Julien Roussel authored and committed

rpca types renamed

1 parent 018b675 commit ac1460a

File tree

9 files changed (+167, -396 lines)

examples/RPCA.md

Lines changed: 73 additions & 66 deletions
@@ -5,98 +5,105 @@ jupyter:
       extension: .md
       format_name: markdown
       format_version: '1.3'
-      jupytext_version: 1.14.0
+      jupytext_version: 1.14.4
   kernelspec:
-    display_name: Python 3.9.6 64-bit
+    display_name: env_qolmat
     language: python
-    name: python3
+    name: env_qolmat
 ---

 ```python
 %reload_ext autoreload
 %autoreload 2

 import numpy as np
-import timesynth as ts # package for generating time series
+# import timesynth as ts # package for generating time series

 import matplotlib.pyplot as plt
+
 import sys
-sys.path.append("../")
-from qolmat.imputations.rpca.utils import drawing, utils
-from qolmat.imputations.rpca.pcp_rpca import PcpRPCA
-from qolmat.imputations.rpca.temporal_rpca import TemporalRPCA, OnlineTemporalRPCA
+
+from math import pi
+
+from qolmat.utils import plot, data
+from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
+from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
 ```

 **Generate synthetic data**

 ```python
-np.random.seed(402)
-
-################################################################################
-
-time_sampler = ts.TimeSampler(stop_time=20)
-irregular_time_samples = time_sampler.sample_irregular_time(num_points=5_000, keep_percentage=100)
-sinusoid = ts.signals.Sinusoidal(frequency=2)
-white_noise = ts.noise.GaussianNoise(std=0.1)
-timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)
-samples, signals, errors = timeseries.sample(irregular_time_samples)
-
-n = len(samples)
-pc = 0.02
-indices_ano1 = np.random.choice(n, int(n*pc))
-samples[indices_ano1] = [np.random.uniform(low=2*np.min(samples), high=2*np.max(samples)) for i in range(int(n*pc))]
-indices = np.random.choice(n, int(n*pc))
-samples[indices] = np.nan
-
-
-################################################################################
-
-time_sampler = ts.TimeSampler(stop_time=20)
-irregular_time_samples = time_sampler.sample_irregular_time(num_points=5_000, keep_percentage=100)
-sinusoid = ts.signals.Sinusoidal(frequency=3)
-white_noise = ts.noise.GaussianNoise(std=0)
-timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)
-samples2, signals2, errors2 = timeseries.sample(irregular_time_samples)
-
-n2 = len(samples2)
-indices_ano2 = np.random.choice(n2, int(n*pc))
-samples2[indices_ano2] = [np.random.uniform(low=2*np.min(samples2), high=2*np.max(samples2)) for i in range(int(n2*pc))]
-indices = np.random.choice(n2, int(n*pc))
-samples2[indices] = np.nan
-
-samples += samples2
-signals += signals2
-errors += errors2
-
-################################################################################
-
-fig, ax = plt.subplots(4, 1, sharex=True, figsize=(12,6))
-ax[0].plot(range(n), signals, c="darkblue")
-ax[0].set_title("Low-rank signal", fontsize=15)
-ax[1].plot(range(n), errors, c="darkgreen")
-ax[1].set_title("Noise", fontsize=15)
-ax[2].plot(range(n), samples-signals-errors, c="tab:red")
-ax[2].set_title("Corruptions", fontsize=15)
-ax[3].plot(range(n), samples, c="k")
-ax[3].set_title("Corrupted signal", fontsize=15)
-ax[3].set_xlabel("Time", fontsize=16)
-plt.tight_layout()
+n_samples = 1000
+
+mesh = np.arange(n_samples)
+X_true = np.zeros(n_samples)
+A_true = np.zeros(n_samples)
+E_true = np.zeros(n_samples)
+p1 = 100
+p2 = 20
+X_true = 1 + np.sin(2 * pi * mesh / p1) + np.sin(2 * pi * mesh / p2)
+noise = np.random.uniform(size=n_samples)
+amplitude_A = .5
+freq_A = .05
+A_true = amplitude_A * np.where(noise < freq_A, -np.log(noise), 0) * (2 * (np.random.uniform(size=n_samples) > .5) - 1)
+amplitude_E = .1
+E_true = amplitude_E * np.random.normal(size=n_samples)
+
+signal = X_true + E_true
+signal[A_true != 0] = A_true[A_true != 0]
+signal = signal.reshape(-1, 1)
+
+# Adding missing data
+signal[5:20, 0] = np.nan
+```
+
+```python
+fig = plt.figure(figsize=(15, 8))
+ax = fig.add_subplot(4, 1, 1)
+ax.title.set_text("Low-rank signal")
+plt.plot(X_true)
+
+ax = fig.add_subplot(4, 1, 2)
+ax.title.set_text("Corruption signal")
+plt.plot(A_true)
+
+ax = fig.add_subplot(4, 1, 3)
+ax.title.set_text("Noise")
+plt.plot(E_true)
+
+ax = fig.add_subplot(4, 1, 4)
+ax.title.set_text("Corrupted signal")
+plt.plot(signal[:, 0])

 plt.show()
 ```

-**RPCA**
+## PCP RPCA

 ```python
 %%time

-pcp_rpca = PcpRPCA(n_rows=25)
-X, A, errors = pcp_rpca.fit_transform(signal=samples)
-drawing.plot_signal([samples, X, A], style="matplotlib")
+rpca_pcp = RPCAPCP(period=100, max_iter=5, mu=.5, lam=1)
+X = rpca_pcp.fit_transform(signal)
+corruptions = signal - X
+```
+
+## Temporal RPCA
+
+```python
+rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[10], list_etas=[0.01], norm="L2")
+X = rpca_noisy.fit_transform(signal)
+corruptions = signal - X
+plot.plot_signal([signal[:,0], X[:,0], corruptions[:, 0]])
 ```

 ```python
-temporal_rpca = TemporalRPCA(n_rows=25, tau=2, lam=0.3, list_periods=[20], list_etas=[0.01], norm="L2")
-X, A, errors = temporal_rpca.fit_transform(signal=samples)
-drawing.plot_signal([samples, X, A], style="matplotlib")
+rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[], list_etas=[], norm="L2")
+X = rpca_noisy.fit_transform(signal)
+corruptions = signal - X
+plot.plot_signal([signal[:,0], X[:,0], corruptions[:, 0]])
 ```

+```python
+
+```
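The RPCA.md changes above boil down to the class renames this commit performs, plus a change in what `fit_transform` returns. As a migration reference, here is a minimal before/after sketch inferred purely from this diff (`signal` is the corrupted 2D array built in the notebook; the parameter values are copied from the diff, not tuned recommendations):

```python
# Old API (removed in this commit):
# from qolmat.imputations.rpca.pcp_rpca import PcpRPCA
# from qolmat.imputations.rpca.temporal_rpca import TemporalRPCA
# rpca = PcpRPCA(n_rows=25)
# X, A, errors = rpca.fit_transform(signal=samples)

# New API (added in this commit):
from qolmat.imputations.rpca.rpca_pcp import RPCAPCP

rpca = RPCAPCP(period=100, max_iter=5, mu=.5, lam=1)
X = rpca.fit_transform(signal)  # now returns only the low-rank estimate
A = signal - X                  # sparse corruptions recovered by subtraction
```

In short: PcpRPCA becomes RPCAPCP, TemporalRPCA becomes RPCANoisy, `n_rows` is replaced by `period`, and `fit_transform` returns a single array instead of the former `(X, A, errors)` triple.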

examples/benchmark.md

Lines changed: 31 additions & 66 deletions
@@ -63,27 +63,41 @@ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consist
 This dataset only contains numerical variables.

 ```python
-df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
+# df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)

 # cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
 # cols_to_impute = df_data.columns[df_data.isna().any()]
-cols_to_impute = ["TEMP", "PRES"]
+# cols_to_impute = ["TEMP", "PRES"]

 ```

 The dataset `Artificial` is designed to be the sum of a periodical signal, a white noise and some outliers.

 ```python
-# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
-# cols_to_impute = ["signal"]
+df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
+cols_to_impute = ["signal"]
 ```

 Let's take a look at the variables to impute. We only consider one station, Aotizhongxin.
 Time series display seasonalities (roughly 12 months).

-```python
-imputer = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
-df = imputer.fit_transform(df_data)
+```python tags=[]
+n_stations = len(df_data.groupby("station").size())
+n_cols = len(cols_to_impute)
+```
+
+```python tags=[]
+fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+for i_station, (station, df) in enumerate(df_data.groupby("station")):
+    df_station = df_data.loc[station]
+    for i_col, col in enumerate(cols_to_impute):
+        fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
+        plt.plot(df_station[col], '.', label=station)
+        # break
+        plt.ylabel(col, fontsize=12)
+        if i_col == 0:
+            plt.title(station)
+plt.show()
 ```

 ### **II. Imputation methods**
@@ -92,17 +106,11 @@ df = imputer.fit_transform(df_data)
 This part is devoted to the imputation methods. The idea is to try different algorithms and compare them.

 <u>**Methods**</u>:
-There are two kinds of methods. The first is not specific to multivariate time series, for instance ImputeByMean: columns with missing values are imputed separately, i.e. possible correlations are not taken into account. The second is specific to multivariate time series, where other columns are needed to impute another.
-
-For ImputeByMean or ImputeByMedian, the user is allowed to specify a list of variables indicating how the groupby will be made to impute the data. More precisely, data are first grouped and the mean or median of each group is computed. The missing values are then imputed by the corresponding mean or median. If nothing is passed, the mean or median of each column is used as the imputation value.
-
-The ImputeOnResiduals method proceeds in 3 steps. First, time series are decomposed (seasonality, trend and residuals). Then the residuals are imputed thanks to an interpolation method. Finally, time series are recomposed.
-
-For more information about the methods, we invite you to read the docs.
+All presented methods are group-wise: here each station is imputed independently. For example, ImputerMean computes the mean of each variable in each station and uses the result for imputation; ImputerInterpolation interpolates temporal signals corresponding to each variable in each station.

 <u>**Hyperparameters' search**</u>:
-Some methods require hyperparameters. The user can directly specify them, or proceed to a search. To do so, one defines a search_params dictionary, where the keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real or Category) to search.
-In practice, we rely on a cross validation to find the best hyperparameter values minimizing a reconstruction error.
+Some methods require hyperparameters. The user can directly specify them, or determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
+In practice, we rely on a cross validation to find the best hyperparameter values minimizing a reconstruction error.

 ```python
 imputer_mean = imputers.ImputerMean(groups=["station"])
@@ -115,10 +123,8 @@ imputer_spline = imputers.ImputerInterpolation(groups=["station"], method="splin
 imputer_shuffle = imputers.ImputerShuffle(groups=["station"])
 imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")

-dict_tau = {"TEMP": 2, "PRES": 1.1}
-dict_lam = {"TEMP": 0.3, "PRES": 0.8}
-imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000, tau=2, lam=.3)
-imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
+imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=100, max_iter=100, tau=2, lam=.3)
+imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)

 imputer_ou = imputers.ImputeEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
 imputer_tsou = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
@@ -133,7 +139,6 @@ impute_regressor = imputers.ImputerRegressor(
 impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
     HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
 )
-# impute_mfe = imputers.ImputeMissForest()

 dict_imputers = {
     "mean": imputer_mean,
@@ -146,29 +151,15 @@ dict_imputers = {
     "OU": imputer_ou,
     "TSOU": imputer_tsou,
     "TSMLE": imputer_tsmle,
-    "RPCA": imputer_rpca_opti,
-    "RPCA_opti": imputer_rpca_opti,
+    "RPCA": imputer_rpca,
+    # "RPCA_opti": imputer_rpca_opti,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
     # "iterative": imputer_iterative,
 }
 n_imputers = len(dict_imputers)

-
-# search_params = {
-#     "RPCA_opti": {
-#         "lam": {
-#             "TEMP": {"min": .1, "max": 10, "type":"Real"},
-#             "PRES": {"min": .1, "max": 10, "type":"Real"}
-#         },
-#         "tau": {
-#             "TEMP": {"min": .1, "max": 10, "type":"Real"},
-#             "PRES": {"min": .1, "max": 10, "type":"Real"}
-#         }
-#     }
-# }
-
 search_params = {
     "RPCA_opti": {
         "tau": {"min": .5, "max": 5, "type":"Real"},
@@ -179,29 +170,20 @@ search_params = {
 ratio_masked = 0.1
 ```

-In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a cv number of times. For each method, we calculate the average error and compare the final errors.
+In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.

 <p align="center">
     <img src="../../docs/images/comparator.png" width=50% height=50%>
 </p>



-Concretely, the comparator takes as input a dataframe to impute, a proportion of nan to create, a dictionary of imputers (those previously mentioned), a list with the columns names to impute, the number of artificially corrupted dataframes to create (n_samples), the search dictionary search_params, and possibly a threshold to filter the nan values.
-
-Then, it suffices to use the compare function to obtain the results: a dataframe with different metrics.
-This allows an easy comparison of the different imputations.
+Concretely, the comparator takes as input a dataframe to impute, a proportion of nan to create, a dictionary of imputers (those previously mentioned), a list with the columns names to impute, a generator of holes specifying the type of holes to create and the search dictionary search_params for hyperparameter optimization.

 Note these metrics compute reconstruction errors; they tell us nothing about the distances between the "true" and "imputed" distributions.

-```python
-# doy = pd.Series(df_data.reset_index().datetime.dt.isocalendar().week.values, index=df_data.index)
-
+```python tags=[]
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.GeometricHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.UniformHoleGenerator(n_splits=2, ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.GroupedHoleGenerator(n_splits=2, groups=["station", doy], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.MultiMarkovHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)

 comparison = comparator.Comparator(
     dict_imputers,
@@ -235,32 +217,20 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
 ```

 ```python
-df_station
-```
-
-```python
-# station = "Aotizhongxin"
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```

 Let's look at the imputations.
 When the data is missing at random, imputation is easier. Missing blocks are more challenging.
-Note here we didn't fit the hyperparams of the RPCA... results might be of poor quality...

 ```python
-# palette = sns.color_palette("icefire", n_colors=len(dict_imputers))
-# palette = sns.color_palette("husl", 8)
-# sns.set_palette(palette)
-# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
-
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
     values_orig = df_station[col]

     plt.plot(values_orig, ".", color='black', label="original")
-    # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-', ms=3)

     for ind, (name, model) in enumerate(list(dict_imputers.items())):
         values_imp = dfs_imputed_station[name][col].copy()
@@ -276,11 +246,6 @@ for col in cols_to_impute:
 ```

 ```python
-# palette = sns.color_palette("icefire", n_colors=len(dict_imputers))
-# palette = sns.color_palette("husl", 8)
-# sns.set_palette(palette)
-# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
-
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)

Binary file changed (image preview not shown): -45.9 KB
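The benchmark notebook above describes every imputer as group-wise: each station is imputed independently. Here is a toy pandas sketch of that idea, with made-up data (it mimics what ImputerMean does per group; it is not qolmat's implementation):

```python
import numpy as np
import pandas as pd

# Hypothetical toy frame: two stations, one missing temperature in each.
df = pd.DataFrame({
    "station": ["Aotizhongxin"] * 3 + ["Changping"] * 3,
    "TEMP": [10.0, np.nan, 14.0, 20.0, 22.0, np.nan],
})

# Group-wise mean imputation: each station's holes are filled with that
# station's own mean (12.0 and 21.0 here), never with the global mean.
df["TEMP"] = df.groupby("station")["TEMP"].transform(lambda s: s.fillna(s.mean()))
print(df)
```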

qolmat/benchmark/comparator.py

Lines changed: 0 additions & 5 deletions
@@ -104,11 +104,6 @@ def evaluate_errors_sample(
         """
         list_errors = []
        df_origin = df[self.selected_columns].copy()
-        if list_spaces:
-            print("Hyperparameter optimization")
-            print(list_spaces)
-        else:
-            print("No hyperparameter optimization")
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
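The loop kept in this hunk corrupts data by assigning through a boolean mask dataframe. Below is a self-contained sketch of that masking pattern with toy data (in qolmat, the `df_mask` frames are produced by the hole generators seen in benchmark.md):

```python
import numpy as np
import pandas as pd

df_origin = pd.DataFrame({"TEMP": [1.0, 2.0, 3.0], "PRES": [10.0, 11.0, 12.0]})

# A boolean frame of the same shape marks the entries to blank out,
# mirroring `df_corrupted[df_mask] = np.nan` above.
df_mask = pd.DataFrame({"TEMP": [False, True, False], "PRES": [True, False, False]})

df_corrupted = df_origin.copy()
df_corrupted[df_mask] = np.nan  # NaN wherever the mask is True
print(df_corrupted)
```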
