
Commit e445f05

Julien Roussel authored and committed
data and png removed from repo
1 parent 2339d41 commit e445f05

20 files changed: +63 −420,856 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -60,8 +60,11 @@ qolmat/.converge
 # Notebooks
 **.ipynb_checkpoints
 examples/*.ipynb
+examples/figures/*
+examples/data/*
+examples/*.ipynb


 # VSCode
 .vscode
-examples/benchmark.ipynb
+# examples/benchmark.ipynb

environment.dev.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ dependencies:
 - wheel
 - jupyterlab
 - jupytext
+- statsmodels
 - pip
 - pip:
   - flake8==6.0.0
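The newly added statsmodels dependency can be smoke-tested with a short snippet like the following; this is only an illustrative check (the data is synthetic, not the notebook's):

```python
# Minimal check that the statsmodels dependency resolves and can compute an ACF.
import numpy as np
from statsmodels.tsa.stattools import acf

rng = np.random.default_rng(0)
x = np.sin(np.linspace(0, 20 * np.pi, 1000)) + rng.normal(scale=0.1, size=1000)
print(acf(x, nlags=10))  # autocorrelation coefficients at lags 0..10
```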

examples/benchmark.md

Lines changed: 47 additions & 73 deletions
@@ -8,9 +8,9 @@ jupyter:
       format_version: '1.3'
       jupytext_version: 1.14.4
   kernelspec:
-    display_name: env_qolmat
+    display_name: env_qolmat_dev
     language: python
-    name: env_qolmat
+    name: env_qolmat_dev
 ---

 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -38,6 +38,7 @@ import matplotlib.image as mpimg
 import matplotlib.ticker as plticker

 tab10 = plt.get_cmap("tab10")
+plt.rcParams.update({'font.size': 18})

 from typing import Optional

@@ -46,7 +47,6 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra


 import sys
-# sys.path.append("../../")
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.benchmark.utils import kl_divergence
 from qolmat.imputations import imputers
@@ -91,16 +91,19 @@ n_cols = len(cols_to_impute)
 ```

 ```python tags=[]
-fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
 for i_station, (station, df) in enumerate(df_data.groupby("station")):
     df_station = df_data.loc[station]
     for i_col, col in enumerate(cols_to_impute):
         fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
         plt.plot(df_station[col], '.', label=station)
         # break
-        plt.ylabel(col, fontsize=12)
+        plt.ylabel(col)
+        plt.xticks(rotation=15)
         if i_col == 0:
             plt.title(station)
+        if i_col != n_cols - 1:
+            plt.xticks([], [])
 plt.show()
 ```

@@ -204,7 +207,7 @@ plt.ylabel("mae")
 plt.show()
 ```

-### **IV. Comparison of methods**
+### **III. Comparison of methods**


 We now run each algorithm once on the initial corrupted dataframe and compare their performances through several analyses.
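Concretely, the comparison boils down to something like the following self-contained sketch, with synthetic data and scikit-learn imputers standing in for the notebook's dict_imputers of Qolmat imputers (column names, hole fraction, and the MAE metric restricted to masked entries are illustrative assumptions):

```python
# Sketch: impute a corrupted dataframe once per method and score each result
# on the artificially masked entries (stand-in imputers, synthetic data).
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

rng = np.random.default_rng(0)
df_true = pd.DataFrame(rng.normal(size=(200, 3)), columns=["TEMP", "PRES", "DEWP"])
mask = rng.random(df_true.shape) < 0.2          # artificially punched holes
df_corrupted = df_true.mask(mask)               # NaN where mask is True

imputers = {"mean": SimpleImputer(strategy="mean"), "knn": KNNImputer(n_neighbors=5)}
for name, imp in imputers.items():
    df_imp = pd.DataFrame(imp.fit_transform(df_corrupted),
                          columns=df_true.columns, index=df_true.index)
    # mean absolute error computed only on the masked entries
    mae = (df_true.where(mask) - df_imp.where(mask)).abs().mean().mean()
    print(f"{name}: MAE on masked entries = {mae:.3f}")
```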
@@ -252,49 +255,57 @@ for col in cols_to_impute:
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)

-fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
 i_plot = 1
-for name_imputer in dict_imputers:
-    for col in df_plot:
+for i_col, col in enumerate(df_plot):
+    for name_imputer, df_imp in dfs_imputed_station.items():

-        fig.add_subplot(n_imputers, n_columns, i_plot)
+        fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]

         plt.plot(values_orig, ".", color='black', label="original")
         #plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-' , ms=3)

-        values_imp = dfs_imputed_station[name_imputer][col].copy()
+        values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
         plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 0:
+        if i_plot % n_columns == 1:
             plt.legend(loc=[1, 0], fontsize=18)
+        plt.xticks(rotation=15)
+        if i_col == 0:
+            plt.title(name_imputer)
+        if i_col != n_columns - 1:
+            plt.xticks([], [])
         loc = plticker.MultipleLocator(base=2*365)
         ax.xaxis.set_major_locator(loc)
-        ax.tick_params(axis='both', which='major', labelsize=17)
+        ax.tick_params(axis='both', which='major')
         i_plot += 1
 plt.savefig("figures/imputations_benchmark.png")
 plt.show()

 ```

-**IV.a. Covariance**
+## Covariance


 We first check the covariance. We simply plot one variable versus another.
 One observes that the methods provide similar visual results: it is difficult to compare them based on this criterion.

 ```python
-for i_model, model in enumerate(dict_imputers.keys()):
-    fig, axs = plt.subplots(1, len(cols_to_impute)-1, figsize=(4 * (len(cols_to_impute)-1), 4))
-    df_imp = dfs_imputed_station[model]
-    for i in range(len(cols_to_impute)-1):
-        plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i+1], axs, color=tab10(i_model))
-        axs.set_title(f"imputation method: {model}", fontsize=20)
-    plt.show()
+fig = plt.figure(figsize=(6 * n_imputers, 6 * n_columns))
+i_plot = 1
+for i, col in enumerate(cols_to_impute[:-1]):
+    for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed.items()):
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
+        plot.compare_covariances(df_plot, df_imp, col, cols_to_impute[i+1], ax, color=tab10(i_imputer), label=name_imputer)
+        ax.set_title(f"imputation method: {name_imputer}", fontsize=20)
+        i_plot += 1
+        ax.legend()
+plt.show()
 ```

-**IV.b. Auto-correlation**
+## Auto-correlation


 We are now interested in the auto-correlation function (ACF). As seen before, the time series display seasonal patterns.
@@ -306,63 +317,26 @@ On the contrary, for the PRES variable, all methods overestimate the autocorrel
 Finally, for the DEWP variable, the methods cannot impute a behavior close to the original: the autocorrelation decreases too linearly.

 ```python
-from statsmodels.tsa.stattools import acf
-
-palette = sns.dark_palette("b", n_colors=len(dict_i), reverse=False)
-sns.set_palette(palette)
-markers = ["o", "s", "*", "D", "P", ">", "^", "d"]
-
-fig, axs = plt.subplots(1, len(cols_to_impute), figsize=(16, 2))
-for i, col in enumerate(cols_to_impute):
-    axs[i].plot(acf(df_station[col].dropna()), color="k", marker=markers[0], lw=0.8)
-    for j, (name, df) in enumerate(dfs_imputed_station.items()):
-        axs[i].plot(acf(df[col]), marker=markers[j+1], lw=0.8)
-    axs[i].set_xlabel("Lags [days]", fontsize=15)
-    axs[i].set_ylabel("Correlation", fontsize=15)
-    axs[i].set_ylim([0.5, 1])
-    axs[i].set_title(col, fontsize=15)
-axs[-1].legend(["Original dataset"] + list(dfs_imputed.keys()), loc=[1, 0])
-sns.despine()
-```
-
-**IV.b. Distances between distributions**
-
-
-We are now interested in a way of quantifying the distance between two distributions.
-Until now, we only looked at the reconstruction error, regardless of the distributions.
-
-There is a plethora of methods to quantify the distance between two distributions $P$ and $Q$.
-For instance, some are based on information theory, such as the well-known [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). A simple interpretation of the KL divergence of $P$ from $Q$ is the expected excess surprise from using $Q$ as a model when the actual distribution is $P$.
-
-A drawback of this divergence is that it ignores the underlying geometry of the space (the KL divergence is somewhat difficult to interpret intuitively).
-As a remedy, we consider a second metric, the [Wasserstein distance](https://en.wikipedia.org/wiki/Wasserstein_metric), a distance function defined between probability distributions on a given metric space $M$.
-
-To understand one of the differences between these two quantities, let us look at this simple example.
-The KL divergence between the two distributions on the left is the same as that between the two distributions on the right: the KL divergence does not take into account the underlying metric space. Conversely, the Wasserstein metric is larger for those on the left since the "transport" is greater than for those on the right.
+n_columns = len(df_plot.columns)
+n_imputers = len(dict_imputers)

-<p align="center">
-    <img src="../../docs/images/KL_wasser.png" width=50% height=50%>
-</p>
+fig = plt.figure(figsize=(6 * n_columns, 6))
+for i_col, col in enumerate(df_plot):
+    ax = fig.add_subplot(1, n_columns, i_col + 1)
+    values_orig = df_station[col]

+    acf = utils.acf(values_orig)

-```python
-df_kl = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        kl = kl_divergence(df_station[[col]].dropna(how="all"), df_imputed[[col]]).iloc[0]
-        df_kl.loc[model, col] = kl
+    plt.plot(acf, color="black")
+    for name_imputer, df_imp in dfs_imputed_station.items():

-plot.display_bar_table(df_kl, ylabel="KL divergence")
-```
+        acf = utils.acf(df_imp[col])
+        plt.plot(acf, label=name_imputer)
+    plt.legend()

-```python
-df_wasserstein = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        wasserstein = scipy.stats.wasserstein_distance(df_station[col].dropna(how="all"), df_imputed[col])
-        df_wasserstein.loc[model, col] = wasserstein
+plt.savefig("figures/acf.png")
+plt.show()

-plot.display_bar_table(df_wasserstein, ylabel="Wasserstein distance")
 ```

 ```python
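Although the KL_wasser.png illustration is dropped by this commit, the point made in the removed paragraphs can be reproduced with a small self-contained example (values and shifts are purely illustrative): on disjoint supports the KL divergence is infinite whatever the displacement, so it cannot tell a small shift from a large one, while the Wasserstein distance grows with the shift.

```python
# Illustrative stand-in for the removed KL-vs-Wasserstein figure.
import numpy as np
from scipy.stats import entropy, wasserstein_distance

support = np.arange(30)
p = np.zeros(30);  p[0] = 1.0    # point mass at 0
q1 = np.zeros(30); q1[1] = 1.0   # same shape, shifted by 1
q2 = np.zeros(30); q2[10] = 1.0  # same shape, shifted by 10

print(entropy(p, q1), entropy(p, q2))                 # inf inf: KL ignores the geometry
print(wasserstein_distance(support, support, p, q1))  # 1.0
print(wasserstein_distance(support, support, p, q2))  # 10.0: Wasserstein tracks the shift
```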
Binary file removed (−7.59 MB); not shown.
