     format_version: '1.3'
     jupytext_version: 1.14.4
 kernelspec:
-  display_name: env_qolmat
+  display_name: env_qolmat_dev
   language: python
-  name: env_qolmat
+  name: env_qolmat_dev
 ---
 
 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -38,6 +38,7 @@ import matplotlib.image as mpimg
 import matplotlib.ticker as plticker
 
 tab10 = plt.get_cmap("tab10")
+plt.rcParams.update({'font.size': 18})
 
 from typing import Optional
 
@@ -46,7 +47,6 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra
 
 
 import sys
-# sys.path.append("../../")
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.benchmark.utils import kl_divergence
 from qolmat.imputations import imputers
@@ -73,6 +73,10 @@ cols_to_impute = ["TEMP", "PRES"]
 
 The dataset `Artificial` is designed as the sum of a periodic signal, white noise and some outliers.
 
+```python tags=[]
+df_data
+```
+
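For intuition, that construction can be sketched as follows. This is a minimal illustration, not the actual `get_data_corrupted` implementation; the period, noise scale and outlier count are made-up values:

```python
import numpy as np
import pandas as pd

# Hypothetical reconstruction of an "Artificial"-style series:
# periodic signal + white noise + sparse outliers.
rng = np.random.default_rng(42)
n = 1000
t = np.arange(n)
signal = np.sin(2 * np.pi * t / 100)         # periodic component (period assumed)
noise = rng.normal(scale=0.1, size=n)        # white noise (scale assumed)
outliers = np.zeros(n)
idx = rng.choice(n, size=10, replace=False)  # 10 outlier positions (count assumed)
outliers[idx] = rng.normal(scale=5.0, size=10)
df_artificial = pd.DataFrame({"signal": signal + noise + outliers})
```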
 ```python
 # df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
 # cols_to_impute = ["signal"]
@@ -87,16 +91,19 @@ n_cols = len(cols_to_impute)
 ```
 
 ```python tags=[]
-fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
 for i_station, (station, df) in enumerate(df_data.groupby("station")):
     df_station = df_data.loc[station]
     for i_col, col in enumerate(cols_to_impute):
         fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
         plt.plot(df_station[col], '.', label=station)
         # break
-        plt.ylabel(col, fontsize=12)
+        plt.ylabel(col)
+        plt.xticks(rotation=15)
         if i_col == 0:
             plt.title(station)
+        if i_col != n_cols - 1:
+            plt.xticks([], [])
 plt.show()
 ```
 
@@ -200,7 +207,7 @@ plt.ylabel("mae")
 plt.show()
 ```
 
-### **IV. Comparison of methods**
+### **III. Comparison of methods**
 
 
 We now run each algorithm once on the initial corrupted dataframe and compare their performances through several analyses.
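Concretely, the comparison step can be set up as below. This is a sketch based on qolmat's documented `Comparator` API; the hole generator, its parameters and the metric names are assumptions that may differ between qolmat versions:

```python
# Sketch of the benchmark step (names and signatures assumed from qolmat docs).
generator_holes = missing_patterns.EmpiricalHoleGenerator(
    n_splits=4, groups=["station"], subset=cols_to_impute, ratio_masked=0.1
)
comparison = comparator.Comparator(
    dict_imputers,            # the imputers defined above
    cols_to_impute,
    generator_holes=generator_holes,
    metrics=["mae", "wmape", "KL_columnwise"],
)
results = comparison.compare(df_data)  # one row of scores per imputer
```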
@@ -248,49 +255,57 @@ for col in cols_to_impute:
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)
 
-fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
 i_plot = 1
-for name_imputer in dict_imputers:
-    for col in df_plot:
+for i_col, col in enumerate(df_plot):
+    for name_imputer, df_imp in dfs_imputed_station.items():
 
-        fig.add_subplot(n_imputers, n_columns, i_plot)
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]
 
         plt.plot(values_orig, ".", color='black', label="original")
         # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-', ms=3)
 
-        values_imp = dfs_imputed_station[name_imputer][col].copy()
+        values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
         plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 0:
+        if i_plot % n_columns == 1:
             plt.legend(loc=[1, 0], fontsize=18)
+        plt.xticks(rotation=15)
+        if i_col == 0:
+            plt.title(name_imputer)
+        if i_col != n_columns - 1:
+            plt.xticks([], [])
         loc = plticker.MultipleLocator(base=2 * 365)
         ax.xaxis.set_major_locator(loc)
-        ax.tick_params(axis='both', which='major', labelsize=17)
+        ax.tick_params(axis='both', which='major')
         i_plot += 1
 plt.savefig("figures/imputations_benchmark.png")
 plt.show()
 
 
 ```
-**IV.a. Covariance**
+## Covariance
 
 
 We first check the covariance: we simply plot one variable against another.
 One observes that the methods provide similar visual results: it is difficult to compare them on this criterion alone.
 
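As a quick numerical counterpart to the scatter plots below, one can also compare the empirical covariance matrices directly. A small sketch reusing the `df_station` and `dfs_imputed_station` objects defined in earlier cells:

```python
# Largest absolute deviation between the covariance of each imputed
# dataframe and that of the (incomplete) original, per imputation method.
for name_imputer, df_imp in dfs_imputed_station.items():
    cov_diff = (df_imp[cols_to_impute].cov() - df_station[cols_to_impute].cov()).abs()
    print(f"{name_imputer}: {cov_diff.to_numpy().max():.4f}")
```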
 ```python
-for i_model, model in enumerate(dict_imputers.keys()):
-    fig, axs = plt.subplots(1, len(cols_to_impute)-1, figsize=(4 * (len(cols_to_impute)-1), 4))
-    df_imp = dfs_imputed_station[model]
-    for i in range(len(cols_to_impute)-1):
-        plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i+1], axs, color=tab10(i_model))
-        axs.set_title(f"imputation method: {model}", fontsize=20)
-    plt.show()
+fig = plt.figure(figsize=(6 * n_imputers, 6 * n_columns))
+i_plot = 1
+for i, col in enumerate(cols_to_impute[:-1]):
+    for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed.items()):
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
+        plot.compare_covariances(df_plot, df_imp, col, cols_to_impute[i+1], ax, color=tab10(i_imputer), label=name_imputer)
+        ax.set_title(f"imputation method: {name_imputer}", fontsize=20)
+        i_plot += 1
+        ax.legend()
+plt.show()
 ```
 
-**IV.b. Auto-correlation**
+## Auto-correlation
 
 
 We are now interested in the auto-correlation function (ACF). As seen before, the time series display seasonal patterns.
@@ -302,63 +317,26 @@ On the contrary, for the PRES variable, all methods overestimate the autocorrel
 Finally, for the DEWP variable, no method manages an imputation whose behavior is close to the original: the autocorrelation decreases too linearly.
 
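For reference, assuming `utils.acf` follows the usual convention, the sample autocorrelation at lag $k$ is

$$
\hat{\rho}(k) = \frac{\sum_{t=1}^{n-k} \left(x_t - \bar{x}\right)\left(x_{t+k} - \bar{x}\right)}{\sum_{t=1}^{n} \left(x_t - \bar{x}\right)^2},
$$

so a good imputation should preserve the peaks of $\hat{\rho}$ at the seasonal lags rather than flatten them.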
 ```python
-from statsmodels.tsa.stattools import acf
-
-palette = sns.dark_palette("b", n_colors=len(dict_imputers), reverse=False)
-sns.set_palette(palette)
-markers = ["o", "s", "*", "D", "P", ">", "^", "d"]
-
-fig, axs = plt.subplots(1, len(cols_to_impute), figsize=(16, 2))
-for i, col in enumerate(cols_to_impute):
-    axs[i].plot(acf(df_station[col].dropna()), color="k", marker=markers[0], lw=0.8)
-    for j, (name, df) in enumerate(dfs_imputed_station.items()):
-        axs[i].plot(acf(df[col]), marker=markers[j+1], lw=0.8)
-    axs[i].set_xlabel("Lags [days]", fontsize=15)
-    axs[i].set_ylabel("Correlation", fontsize=15)
-    axs[i].set_ylim([0.5, 1])
-    axs[i].set_title(col, fontsize=15)
-axs[-1].legend(["Original dataset"] + list(dfs_imputed.keys()), loc=[1, 0])
-sns.despine()
-```
-
-**IV.b. Distances between distributions**
-
-
-We are now interested in a way of quantifying the distance between two distributions.
-Until now, we have looked at the reconstruction error, irrespective of the distributions.
-
-There is a plethora of methods to quantify the distance between two distributions $P$ and $Q$.
-Some are based on information theory, such as the well-known [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). A simple interpretation of the KL divergence of $P$ from $Q$ is the expected excess surprise from using $Q$ as a model when the actual distribution is $P$.
-
-A drawback of this divergence is that it ignores the underlying geometry of the space (the KL divergence is somewhat difficult to interpret intuitively).
-As a remedy, we consider a second metric, the [Wasserstein distance](https://en.wikipedia.org/wiki/Wasserstein_metric), a distance function defined between probability distributions on a given metric space $M$.
-
-To understand one of the differences between these two quantities, let us look at a simple example.
-The KL divergence between the two distributions on the left is the same as that between the two on the right: the KL divergence does not take the underlying metric space into account. Conversely, the Wasserstein distance is larger for the pair on the left, since the "transport" is greater than for the pair on the right.
+n_columns = len(df_plot.columns)
+n_imputers = len(dict_imputers)
 
-<p align="center">
-    <img src="../../docs/images/KL_wasser.png" width=50% height=50%>
-</p>
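That contrast can be checked numerically. A minimal sketch, assuming only numpy and scipy, with two pairs of discrete distributions whose probability values are identical but whose supports are spread differently:

```python
import numpy as np
from scipy.special import rel_entr
from scipy.stats import wasserstein_distance

bins = np.array([0.0, 1.0, 2.0, 3.0])
# Same probability values in both pairs; the second pair puts its mass
# on bins that are three times further apart.
p_close, q_close = np.array([0.9, 0.1, 0.0, 0.0]), np.array([0.1, 0.9, 0.0, 0.0])
p_far, q_far = np.array([0.9, 0.0, 0.0, 0.1]), np.array([0.1, 0.0, 0.0, 0.9])

def kl(p, q):
    # sum of p * log(p / q); rel_entr returns 0 on bins where p == 0
    return rel_entr(p, q).sum()

print(kl(p_close, q_close), kl(p_far, q_far))          # equal: KL ignores geometry
print(wasserstein_distance(bins, bins, p_close, q_close),
      wasserstein_distance(bins, bins, p_far, q_far))  # 0.8 vs 2.4: transport cost grows
```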
+fig = plt.figure(figsize=(6 * n_columns, 6))
+for i_col, col in enumerate(df_plot):
+    ax = fig.add_subplot(1, n_columns, i_col + 1)
+    values_orig = df_station[col]
 
+    acf = utils.acf(values_orig)
 
-```python
-df_kl = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        kl = kl_divergence(df_station[[col]].dropna(how="all"), df_imputed[[col]]).iloc[0]
-        df_kl.loc[model, col] = kl
+    plt.plot(acf, color="black")
+    for name_imputer, df_imp in dfs_imputed_station.items():
 
-plot.display_bar_table(df_kl, ylabel="KL divergence")
-```
+        acf = utils.acf(df_imp[col])
+        plt.plot(acf, label=name_imputer)
+    plt.legend()
 
-```python
-df_wasserstein = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
-for model, df_imputed in dfs_imputed_station.items():
-    for col in cols_to_impute:
-        wasserstein = scipy.stats.wasserstein_distance(df_station[col].dropna(how="all"), df_imputed[col])
-        df_wasserstein.loc[model, col] = wasserstein
+plt.savefig("figures/acf.png")
+plt.show()
 
-plot.display_bar_table(df_wasserstein, ylabel="Wasserstein distance")
 ```
 
 ```python