Commit 6fe7926

Merge pull request #10 from Quantmetry/fix_rpca
Fix rpca
2 parents 8166181 + e445f05 commit 6fe7926

26 files changed: +69 −420880 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -60,8 +60,11 @@ qolmat/.converge
 # Notebooks
 **.ipynb_checkpoints
 examples/*.ipynb
+examples/figures/*
+examples/data/*
+examples/*.ipynb


 # VSCode
 .vscode
-examples/benchmark.ipynb
+# examples/benchmark.ipynb

environment.dev.yml

Lines changed: 1 addition & 3 deletions
@@ -6,16 +6,14 @@ dependencies:
 - bump2version
 - ipykernel
 - jupyter
-- seaborn
 - sphinx
 - sphinx-gallery
 - sphinx_rtd_theme
 - twine
 - wheel
 - jupyterlab
 - jupytext
-- s3fs
-- pyarrow
+- statsmodels
 - pip
 - pip:
   - flake8==6.0.0

examples/benchmark.md

Lines changed: 51 additions & 73 deletions
@@ -8,9 +8,9 @@ jupyter:
       format_version: '1.3'
       jupytext_version: 1.14.4
   kernelspec:
-    display_name: env_qolmat
+    display_name: env_qolmat_dev
     language: python
-    name: env_qolmat
+    name: env_qolmat_dev
 ---

 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -38,6 +38,7 @@ import matplotlib.image as mpimg
 import matplotlib.ticker as plticker

 tab10 = plt.get_cmap("tab10")
+plt.rcParams.update({'font.size': 18})

 from typing import Optional

@@ -46,7 +47,6 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra


 import sys
-# sys.path.append("../../")
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.benchmark.utils import kl_divergence
 from qolmat.imputations import imputers
@@ -73,6 +73,10 @@ cols_to_impute = ["TEMP", "PRES"]

 The dataset `Artificial` is designed to have a sum of a periodical signal, a white noise and some outliers.

+```python tags=[]
+df_data
+```
+
 ```python
 # df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
 # cols_to_impute = ["signal"]
@@ -87,16 +91,19 @@ n_cols = len(cols_to_impute)
 ```

 ```python tags=[]
-fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
 for i_station, (station, df) in enumerate(df_data.groupby("station")):
     df_station = df_data.loc[station]
     for i_col, col in enumerate(cols_to_impute):
         fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
         plt.plot(df_station[col], '.', label=station)
         # break
-        plt.ylabel(col, fontsize=12)
+        plt.ylabel(col)
+        plt.xticks(rotation=15)
         if i_col == 0:
             plt.title(station)
+        if i_col != n_cols - 1:
+            plt.xticks([], [])
 plt.show()
 ```

@@ -200,7 +207,7 @@ plt.ylabel("mae")
 plt.show()
 ```

-### **IV. Comparison of methods**
+### **III. Comparison of methods**


 We now run just one time each algorithm on the initial corrupted dataframe and compare the different performances through multiple analysis.
@@ -248,49 +255,57 @@ for col in cols_to_impute:
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)

-fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
 i_plot = 1
-for name_imputer in dict_imputers:
-    for col in df_plot:
+for i_col, col in enumerate(df_plot):
+    for name_imputer, df_imp in dfs_imputed_station.items():

-        fig.add_subplot(n_imputers, n_columns, i_plot)
+        fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]

         plt.plot(values_orig, ".", color='black', label="original")
         #plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-' , ms=3)

-        values_imp = dfs_imputed_station[name_imputer][col].copy()
+        values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
         plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 0:
+        if i_plot % n_columns == 1:
             plt.legend(loc=[1, 0], fontsize=18)
+        plt.xticks(rotation=15)
+        if i_col == 0:
+            plt.title(name_imputer)
+        if i_col != n_columns - 1:
+            plt.xticks([], [])
         loc = plticker.MultipleLocator(base=2*365)
         ax.xaxis.set_major_locator(loc)
-        ax.tick_params(axis='both', which='major', labelsize=17)
+        ax.tick_params(axis='both', which='major')
         i_plot += 1
 plt.savefig("figures/imputations_benchmark.png")
 plt.show()

 ```

-**IV.a. Covariance**
+## Covariance


 We first check the covariance. We simply plot one variable versus one another.
 One observes the methods provide similar visual results: it's difficult to compare them based on this criterion.

 ```python
-for i_model, model in enumerate(dict_imputers.keys()):
-    fig, axs = plt.subplots(1, len(cols_to_impute)-1, figsize=(4 * (len(cols_to_impute)-1), 4))
-    df_imp = dfs_imputed_station[model]
-    for i in range(len(cols_to_impute)-1):
-        plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i+1], axs, color=tab10(i_model))
-        axs.set_title(f"imputation method: {model}", fontsize=20)
-    plt.show()
+fig = plt.figure(figsize=(6 * n_imputers, 6 * n_columns))
+i_plot = 1
+for i, col in enumerate(cols_to_impute[:-1]):
+    for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed.items()):
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
+        plot.compare_covariances(df_plot, df_imp, col, cols_to_impute[i+1], ax, color=tab10(i_imputer), label=name_imputer)
+        ax.set_title(f"imputation method: {name_imputer}", fontsize=20)
+        i_plot += 1
+        ax.legend()
+plt.show()
 ```

-**IV.b. Auto-correlation**
+## Auto-correlation


 We are now interested in the auto-correlation function (ACF). As seen before, time series display seasonal patterns.
@@ -302,63 +317,26 @@ On th econtrary, for the PRES variable, all methods overestimates the autocorrel
 Finally, for the DEWP variable, the methods cannot impute to obtain a behavior close to the original: the autocorrelation decreases too linearly.

 ```python
-from statsmodels.tsa.stattools import acf
-
-palette = sns.dark_palette("b", n_colors=len(dict_i), reverse=False)
-sns.set_palette(palette)
-markers = ["o", "s", "*", "D", "P", ">", "^", "d"]
-
-fig, axs = plt.subplots(1, len(cols_to_impute), figsize=(16, 2))
-for i, col in enumerate(cols_to_impute):
-    axs[i].plot(acf(df_station[col].dropna()), color="k", marker=markers[0], lw=0.8)
-    for j, (name, df) in enumerate(dfs_imputed_station.items()):
-        axs[i].plot(acf(df[col]), marker=markers[j+1], lw=0.8)
-    axs[i].set_xlabel("Lags [days]", fontsize=15)
-    axs[i].set_ylabel("Correlation", fontsize=15)
-    axs[i].set_ylim([0.5, 1])
-    axs[i].set_title(col, fontsize=15)
-axs[-1].legend(["Original dataset"] + list(dfs_imputed.keys()), loc=[1, 0])
-sns.despine()
-```
-
-**IV.b. Distances between distributions**
-
-
-We are now interested in a way for quantifying the distance between two distributions.
-Until now, we look at the reconstruction error, whatever the distributions.
-
-There is a plethora of methods to quantify the distance between distributions $P$ and $Q$.
-For instance, those based on the information theory as for instance, the well-known [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). A simple interpretation of the KL divergence of $P$ from $Q$ is the expected excess surprise from using $Q$ as a model when the actual distribution is $P$.
-
-A drawback with this divergence is it ignores the underlying geometry of the space (the KL divergence is somewhat difficult to intuitively interpret).
-As a remedy, we consider a second metric, the [Wasserstein distance](https://en.wikipedia.org/wiki/Wasserstein_metric), a distance function defined between probability distributions on a given metric space $M$.
-
-To understand one of the differences between these two quantities, let us look at this simple example.
-The KL between the 2 distributions on the left is the same as that of the 2 distributions on the right: the KL divergence does not take into account the underlying metric space. Conversely, the Wasserstein metric is larger for those on the left since the "transport" is greater than for those on the right.
+n_columns = len(df_plot.columns)
+n_imputers = len(dict_imputers)

-<p align="center">
-    <img src="../../docs/images/KL_wasser.png" width=50% height=50%>
-</p>
+fig = plt.figure(figsize=(6 * n_columns, 6))
+for i_col, col in enumerate(df_plot):
+    ax = fig.add_subplot(1, n_columns, i_col + 1)
+    values_orig = df_station[col]

+    acf = utils.acf(values_orig)

-```python
-df_kl = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        kl = kl_divergence(df_station[[col]].dropna(how="all"), df_imputed[[col]]).iloc[0]
-        df_kl.loc[model, col] = kl
+    plt.plot(acf, color="black")
+    for name_imputer, df_imp in dfs_imputed_station.items():

-plot.display_bar_table(df_kl, ylabel="KL divergence")
-```
+        acf = utils.acf(df_imp[col])
+        plt.plot(acf, label=name_imputer)
+    plt.legend()

-```python
-df_wasserstein = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        wasserstein = scipy.stats.wasserstein_distance(df_station[col].dropna(how="all"), df_imputed[col])
-        df_wasserstein.loc[model, col] = wasserstein
+plt.savefig("figures/acf.png")
+plt.show()

-plot.display_bar_table(df_wasserstein, ylabel="Wasserstein distance")
 ```

 ```python
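
The refactored ACF cell above calls a `utils.acf` helper in place of `statsmodels.tsa.stattools.acf`. For readers following along without that helper, a minimal stand-in built on `pandas.Series.autocorr` could look like the sketch below; the `lag_max` parameter and the synthetic temperature series are illustrative assumptions, not part of this commit.

```python
# Hypothetical stand-in for an ACF helper (not the qolmat API): autocorrelation
# of a series for lags 0..lag_max, NaN-aware thanks to pandas' pairwise handling.
import numpy as np
import pandas as pd


def acf(series: pd.Series, lag_max: int = 30) -> pd.Series:
    return pd.Series(
        [series.autocorr(lag) for lag in range(lag_max + 1)],
        index=range(lag_max + 1),
        name=series.name,
    )


# Illustrative check on a noisy seasonal signal standing in for a station's TEMP column.
t = np.arange(2 * 365)
temp = pd.Series(10 * np.sin(2 * np.pi * t / 365) + np.random.default_rng(0).normal(size=t.size))
print(acf(temp, lag_max=5))
```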
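
The section removed in the last hunk contrasted the Kullback-Leibler divergence with the Wasserstein distance: KL ignores the geometry of the underlying space, while Wasserstein reflects how far mass has to be transported. A minimal sketch of that argument, assuming only numpy and scipy and using a crude histogram-based KL estimate (the bin grid, sample sizes, and `kl_hist` helper are illustrative choices, not the notebook's `kl_divergence` helper):

```python
import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
p = rng.uniform(0.0, 1.0, size=10_000)        # reference sample
q_near = rng.uniform(2.0, 3.0, size=10_000)   # same shape, shifted by 2
q_far = rng.uniform(8.0, 9.0, size=10_000)    # same shape, shifted by 8


def kl_hist(a, b, bins, eps=1e-12):
    # Histogram-based KL(a || b) with additive smoothing; for illustration only.
    pa, _ = np.histogram(a, bins=bins, density=True)
    pb, _ = np.histogram(b, bins=bins, density=True)
    pa, pb = pa + eps, pb + eps
    pa, pb = pa / pa.sum(), pb / pb.sum()
    return float(np.sum(pa * np.log(pa / pb)))


bins = np.linspace(0.0, 10.0, 201)
# Once the supports stop overlapping, KL saturates at the same value for both shifts.
print(kl_hist(p, q_near, bins), kl_hist(p, q_far, bins))
# The Wasserstein distance keeps growing with the shift (~2 vs ~8), the transport cost.
print(scipy.stats.wasserstein_distance(p, q_near), scipy.stats.wasserstein_distance(p, q_far))
```

This mirrors the figure the removed section referenced: equal KL for both pairs of distributions, but a larger Wasserstein distance for the pair that is farther apart.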
Binary file not shown (-7.59 MB).
