Commit ac1460a

Julien Roussel authored and committed

rpca types renamed

1 parent 018b675 commit ac1460a

File tree

9 files changed (+167, -396 lines)

examples/RPCA.md

Lines changed: 73 additions & 66 deletions
@@ -5,98 +5,105 @@ jupyter:
       extension: .md
       format_name: markdown
       format_version: '1.3'
-      jupytext_version: 1.14.0
+      jupytext_version: 1.14.4
   kernelspec:
-    display_name: Python 3.9.6 64-bit
+    display_name: env_qolmat
     language: python
-    name: python3
+    name: env_qolmat
 ---

 ```python
 %reload_ext autoreload
 %autoreload 2

 import numpy as np
-import timesynth as ts # package for generating time series
+# import timesynth as ts # package for generating time series

 import matplotlib.pyplot as plt
+
 import sys
-sys.path.append("../")
-from qolmat.imputations.rpca.utils import drawing, utils
-from qolmat.imputations.rpca.pcp_rpca import PcpRPCA
-from qolmat.imputations.rpca.temporal_rpca import TemporalRPCA, OnlineTemporalRPCA
+
+from math import pi
+
+from qolmat.utils import plot, data
+from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
+from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
 ```

 **Generate synthetic data**

 ```python
-np.random.seed(402)
-
-################################################################################
-
-time_sampler = ts.TimeSampler(stop_time=20)
-irregular_time_samples = time_sampler.sample_irregular_time(num_points=5_000, keep_percentage=100)
-sinusoid = ts.signals.Sinusoidal(frequency=2)
-white_noise = ts.noise.GaussianNoise(std=0.1)
-timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)
-samples, signals, errors = timeseries.sample(irregular_time_samples)
-
-n = len(samples)
-pc = 0.02
-indices_ano1 = np.random.choice(n, int(n*pc))
-samples[indices_ano1] = [np.random.uniform(low=2*np.min(samples), high=2*np.max(samples)) for i in range(int(n*pc))]
-indices = np.random.choice(n, int(n*pc))
-samples[indices] = np.nan
-
-
-################################################################################
-
-time_sampler = ts.TimeSampler(stop_time=20)
-irregular_time_samples = time_sampler.sample_irregular_time(num_points=5_000, keep_percentage=100)
-sinusoid = ts.signals.Sinusoidal(frequency=3)
-white_noise = ts.noise.GaussianNoise(std=0)
-timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)
-samples2, signals2, errors2 = timeseries.sample(irregular_time_samples)
-
-n2 = len(samples2)
-indices_ano2 = np.random.choice(n2, int(n*pc))
-samples2[indices_ano2] = [np.random.uniform(low=2*np.min(samples2), high=2*np.max(samples2)) for i in range(int(n2*pc))]
-indices = np.random.choice(n2, int(n*pc))
-samples2[indices] = np.nan
-
-samples += samples2
-signals += signals2
-errors += errors2
-
-################################################################################
-
-fig, ax = plt.subplots(4, 1, sharex=True, figsize=(12,6))
-ax[0].plot(range(n), signals, c="darkblue")
-ax[0].set_title("Low-rank signal", fontsize=15)
-ax[1].plot(range(n), errors, c="darkgreen")
-ax[1].set_title("Noise", fontsize=15)
-ax[2].plot(range(n), samples-signals-errors, c="tab:red")
-ax[2].set_title("Corruptions", fontsize=15)
-ax[3].plot(range(n), samples, c="k")
-ax[3].set_title("Corrupted signal", fontsize=15)
-ax[3].set_xlabel("Time", fontsize=16)
-plt.tight_layout()
+n_samples = 1000
+
+mesh = np.arange(n_samples)
+X_true = np.zeros(n_samples)
+A_true = np.zeros(n_samples)
+E_true = np.zeros(n_samples)
+p1 = 100
+p2 = 20
+X_true = 1 + np.sin(2 * pi * mesh / p1) + np.sin(2 * pi * mesh / p2)
+noise = np.random.uniform(size=n_samples)
+amplitude_A = .5
+freq_A = .05
+A_true = amplitude_A * np.where(noise < freq_A, -np.log(noise), 0) * (2 * (np.random.uniform(size=n_samples) > .5) - 1)
+amplitude_E = .1
+E_true = amplitude_E * np.random.normal(size=n_samples)
+
+signal = X_true + E_true
+signal[A_true != 0] = A_true[A_true != 0]
+signal = signal.reshape(-1, 1)
+
+# Adding missing data
+signal[5:20, 0] = np.nan
+```
+
+```python
+fig = plt.figure(figsize=(15, 8))
+ax = fig.add_subplot(4, 1, 1)
+ax.title.set_text("Low-rank signal")
+plt.plot(X_true)
+
+ax = fig.add_subplot(4, 1, 2)
+ax.title.set_text("Corruption signal")
+plt.plot(A_true)
+
+ax = fig.add_subplot(4, 1, 3)
+ax.title.set_text("Noise")
+plt.plot(E_true)
+
+ax = fig.add_subplot(4, 1, 4)
+ax.title.set_text("Corrupted signal")
+plt.plot(signal[:, 0])

 plt.show()
 ```

-**RPCA**
+## PCP RPCA

 ```python
 %%time

-pcp_rpca = PcpRPCA(n_rows=25)
-X, A, errors = pcp_rpca.fit_transform(signal=samples)
-drawing.plot_signal([samples, X, A], style="matplotlib")
+rpca_pcp = RPCAPCP(period=100, max_iter=5, mu=.5, lam=1)
+X = rpca_pcp.fit_transform(signal)
+corruptions = signal - X
+```
+
+## Temporal RPCA
+
+```python
+rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[10], list_etas=[0.01], norm="L2")
+X = rpca_noisy.fit_transform(signal)
+corruptions = signal - X
+plot.plot_signal([signal[:,0], X[:,0], corruptions[:, 0]])
 ```

 ```python
-temporal_rpca = TemporalRPCA(n_rows=25, tau=2, lam=0.3, list_periods=[20], list_etas=[0.01], norm="L2")
-X, A, errors = temporal_rpca.fit_transform(signal=samples)
-drawing.plot_signal([samples, X, A], style="matplotlib")
+rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[], list_etas=[], norm="L2")
+X = rpca_noisy.fit_transform(signal)
+corruptions = signal - X
+plot.plot_signal([signal[:,0], X[:,0], corruptions[:, 0]])
 ```

+```python
+
+```
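The RPCA.md changes above boil down to the class renames this commit performs, plus a change in what `fit_transform` returns. As a migration reference, here is a minimal before/after sketch inferred purely from this diff (`signal` is the corrupted 2D array built in the notebook; the parameter values are copied from the diff, not tuned recommendations):

```python
# Old API (removed in this commit):
# from qolmat.imputations.rpca.pcp_rpca import PcpRPCA
# from qolmat.imputations.rpca.temporal_rpca import TemporalRPCA
# rpca = PcpRPCA(n_rows=25)
# X, A, errors = rpca.fit_transform(signal=samples)

# New API (added in this commit):
from qolmat.imputations.rpca.rpca_pcp import RPCAPCP

rpca = RPCAPCP(period=100, max_iter=5, mu=.5, lam=1)
X = rpca.fit_transform(signal)  # now returns only the low-rank estimate
A = signal - X                  # sparse corruptions recovered by subtraction
```

In short: PcpRPCA becomes RPCAPCP, TemporalRPCA becomes RPCANoisy, `n_rows` is replaced by `period`, and `fit_transform` returns a single array instead of the former `(X, A, errors)` triple.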

examples/benchmark.md

Lines changed: 31 additions & 66 deletions
@@ -63,27 +63,41 @@ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consist
 This dataset only contains numerical variables.

 ```python
-df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
+# df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)

 # cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
 # cols_to_impute = df_data.columns[df_data.isna().any()]
-cols_to_impute = ["TEMP", "PRES"]
+# cols_to_impute = ["TEMP", "PRES"]

 ```

 The dataset `Artificial` is designed to be the sum of a periodical signal, a white noise and some outliers.

 ```python
-# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
-# cols_to_impute = ["signal"]
+df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
+cols_to_impute = ["signal"]
 ```

 Let's take a look at the variables to impute. We only consider one station, Aotizhongxin.
 Time series display seasonalities (roughly 12 months).

-```python
-imputer = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
-df = imputer.fit_transform(df_data)
+```python tags=[]
+n_stations = len(df_data.groupby("station").size())
+n_cols = len(cols_to_impute)
+```
+
+```python tags=[]
+fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+for i_station, (station, df) in enumerate(df_data.groupby("station")):
+    df_station = df_data.loc[station]
+    for i_col, col in enumerate(cols_to_impute):
+        fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
+        plt.plot(df_station[col], '.', label=station)
+        # break
+        plt.ylabel(col, fontsize=12)
+        if i_col == 0:
+            plt.title(station)
+plt.show()
 ```

 ### **II. Imputation methods**
@@ -92,17 +106,11 @@ df = imputer.fit_transform(df_data)
 This part is devoted to the imputation methods. The idea is to try different algorithms and compare them.

 <u>**Methods**</u>:
-There are two kinds of methods. The first is not specific to multivariate time series, for instance ImputeByMean: columns with missing values are imputed separately, i.e. possible correlations are not taken into account. The second is specific to multivariate time series, where other columns are needed to impute another.
-
-For ImputeByMean or ImputeByMedian, the user is allowed to specify a list of variables indicating how the groupby will be made to impute the data. More precisely, data are first grouped and the mean or median of each group is computed. The missing values are then imputed by the corresponding mean or median. If nothing is passed, the mean or median of each column is used as the imputation value.
-
-The ImputeOnResiduals method proceeds in 3 steps. First, time series are decomposed (seasonality, trend and residuals). Then the residuals are imputed thanks to an interpolation method. Finally, time series are recomposed.
-
-For more information about the methods, we invite you to read the docs.
+All presented methods are group-wise: here each station is imputed independently. For example, ImputerMean computes the mean of each variable in each station and uses the result for imputation; ImputerInterpolation interpolates temporal signals corresponding to each variable in each station.

 <u>**Hyperparameters' search**</u>:
-Some methods require hyperparameters. The user can directly specify them, or proceed to a search. To do so, one defines a search_params dictionary, where the keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real or Category) to search.
-In practice, we rely on a cross validation to find the best hyperparameter values minimizing a reconstruction error.
+Some methods require hyperparameters. The user can directly specify them, or determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
+In practice, we rely on a cross validation to find the best hyperparameter values minimizing a reconstruction error.

 ```python
 imputer_mean = imputers.ImputerMean(groups=["station"])
@@ -115,10 +123,8 @@ imputer_spline = imputers.ImputerInterpolation(groups=["station"], method="splin
 imputer_shuffle = imputers.ImputerShuffle(groups=["station"])
 imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")

-dict_tau = {"TEMP": 2, "PRES": 1.1}
-dict_lam = {"TEMP": 0.3, "PRES": 0.8}
-imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000, tau=2, lam=.3)
-imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
+imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=100, max_iter=100, tau=2, lam=.3)
+imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)

 imputer_ou = imputers.ImputeEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
 imputer_tsou = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
@@ -133,7 +139,6 @@ impute_regressor = imputers.ImputerRegressor(
 impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
     HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
 )
-# impute_mfe = imputers.ImputeMissForest()

 dict_imputers = {
     "mean": imputer_mean,
@@ -146,29 +151,15 @@ dict_imputers = {
     "OU": imputer_ou,
     "TSOU": imputer_tsou,
     "TSMLE": imputer_tsmle,
-    "RPCA": imputer_rpca_opti,
-    "RPCA_opti": imputer_rpca_opti,
+    "RPCA": imputer_rpca,
+    # "RPCA_opti": imputer_rpca_opti,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
     # "iterative": imputer_iterative,
 }
 n_imputers = len(dict_imputers)

-
-# search_params = {
-#     "RPCA_opti": {
-#         "lam": {
-#             "TEMP": {"min": .1, "max": 10, "type":"Real"},
-#             "PRES": {"min": .1, "max": 10, "type":"Real"}
-#         },
-#         "tau": {
-#             "TEMP": {"min": .1, "max": 10, "type":"Real"},
-#             "PRES": {"min": .1, "max": 10, "type":"Real"}
-#         }
-#     }
-# }
-
 search_params = {
     "RPCA_opti": {
         "tau": {"min": .5, "max": 5, "type":"Real"},
@@ -179,29 +170,20 @@ search_params = {
 ratio_masked = 0.1
 ```

-In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a cv number of times. For each method, we calculate the average error and compare the final errors.
+In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.

 <p align="center">
     <img src="../../docs/images/comparator.png" width=50% height=50%>
 </p>



-Concretely, the comparator takes as input a dataframe to impute, a proportion of nan to create, a dictionary of imputers (those previously mentioned), a list with the columns names to impute, the number of artificially corrupted dataframes to create (n_samples), the search dictionary search_params, and possibly a threshold to filter the nan values.
-
-Then, it suffices to use the compare function to obtain the results: a dataframe with different metrics.
-This allows an easy comparison of the different imputations.
+Concretely, the comparator takes as input a dataframe to impute, a proportion of nan to create, a dictionary of imputers (those previously mentioned), a list with the columns names to impute, a generator of holes specifying the type of holes to create and the search dictionary search_params for hyperparameter optimization.

 Note these metrics compute reconstruction errors; they tell us nothing about the distances between the "true" and "imputed" distributions.

-```python
-# doy = pd.Series(df_data.reset_index().datetime.dt.isocalendar().week.values, index=df_data.index)
-
+```python tags=[]
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.GeometricHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.UniformHoleGenerator(n_splits=2, ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.GroupedHoleGenerator(n_splits=2, groups=["station", doy], ratio_masked=ratio_masked)
-# generator_holes = missing_patterns.MultiMarkovHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)

 comparison = comparator.Comparator(
     dict_imputers,
@@ -235,32 +217,20 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
 ```

 ```python
-df_station
-```
-
-```python
-# station = "Aotizhongxin"
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```

 Let's look at the imputations.
 When the data is missing at random, imputation is easier. Missing blocks are more challenging.
-Note here we didn't fit the hyperparams of the RPCA... results might be of poor quality...

 ```python
-# palette = sns.color_palette("icefire", n_colors=len(dict_imputers))
-# palette = sns.color_palette("husl", 8)
-# sns.set_palette(palette)
-# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
-
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
     values_orig = df_station[col]

     plt.plot(values_orig, ".", color='black', label="original")
-    # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-', ms=3)

     for ind, (name, model) in enumerate(list(dict_imputers.items())):
         values_imp = dfs_imputed_station[name][col].copy()
@@ -276,11 +246,6 @@ for col in cols_to_impute:
 ```

 ```python
-# palette = sns.color_palette("icefire", n_colors=len(dict_imputers))
-# palette = sns.color_palette("husl", 8)
-# sns.set_palette(palette)
-# markers = ["o", "s", "D", "+", "P", ">", "^", "d"]
-
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)

Binary file changed (image preview not shown): -45.9 KB
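The benchmark notebook above describes every imputer as group-wise: each station is imputed independently. Here is a toy pandas sketch of that idea, with made-up data (it mimics what ImputerMean does per group; it is not qolmat's implementation):

```python
import numpy as np
import pandas as pd

# Hypothetical toy frame: two stations, one missing temperature in each.
df = pd.DataFrame({
    "station": ["Aotizhongxin"] * 3 + ["Changping"] * 3,
    "TEMP": [10.0, np.nan, 14.0, 20.0, 22.0, np.nan],
})

# Group-wise mean imputation: each station's holes are filled with that
# station's own mean (12.0 and 21.0 here), never with the global mean.
df["TEMP"] = df.groupby("station")["TEMP"].transform(lambda s: s.fillna(s.mean()))
print(df)
```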

qolmat/benchmark/comparator.py

Lines changed: 0 additions & 5 deletions
@@ -104,11 +104,6 @@ def evaluate_errors_sample(
         """
         list_errors = []
        df_origin = df[self.selected_columns].copy()
-        if list_spaces:
-            print("Hyperparameter optimization")
-            print(list_spaces)
-        else:
-            print("No hyperparameter optimization")
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
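The loop kept in this hunk corrupts data by assigning through a boolean mask dataframe. Below is a self-contained sketch of that masking pattern with toy data (in qolmat, the `df_mask` frames are produced by the hole generators seen in benchmark.md):

```python
import numpy as np
import pandas as pd

df_origin = pd.DataFrame({"TEMP": [1.0, 2.0, 3.0], "PRES": [10.0, 11.0, 12.0]})

# A boolean frame of the same shape marks the entries to blank out,
# mirroring `df_corrupted[df_mask] = np.nan` above.
df_mask = pd.DataFrame({"TEMP": [False, True, False], "PRES": [True, False, False]})

df_corrupted = df_origin.copy()
df_corrupted[df_mask] = np.nan  # NaN wherever the mask is True
print(df_corrupted)
```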
