    format_version: '1.3'
    jupytext_version: 1.14.4
kernelspec:
-   display_name: env_qolmat
+   display_name: env_qolmat_dev
    language: python
-   name: env_qolmat
+   name: env_qolmat_dev
---

**This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -38,6 +38,7 @@ import matplotlib.image as mpimg
import matplotlib.ticker as plticker

tab10 = plt.get_cmap("tab10")
+ plt.rcParams.update({'font.size': 18})

from typing import Optional

@@ -46,7 +47,6 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGra


import sys
- # sys.path.append("../../")
from qolmat.benchmark import comparator, missing_patterns
from qolmat.benchmark.utils import kl_divergence
from qolmat.imputations import imputers
@@ -91,16 +91,19 @@ n_cols = len(cols_to_impute)
```

```python tags=[]
- fig = plt.figure(figsize=(10 * n_stations, 2 * n_cols))
+ fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
for i_station, (station, df) in enumerate(df_data.groupby("station")):
    df_station = df_data.loc[station]
    for i_col, col in enumerate(cols_to_impute):
        fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
        plt.plot(df_station[col], '.', label=station)
        # break
-         plt.ylabel(col, fontsize=12)
+         plt.ylabel(col)
+         plt.xticks(rotation=15)
        if i_col == 0:
            plt.title(station)
+         if i_col != n_cols - 1:
+             plt.xticks([], [])
plt.show()
```

@@ -204,7 +207,7 @@ plt.ylabel("mae")
plt.show()
```

- ### **IV. Comparison of methods**
+ ### **III. Comparison of methods**


We now run each algorithm once on the initial corrupted dataframe and compare their performances through several analyses.
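The figures below read the imputation results from `dfs_imputed` and `dfs_imputed_station`; the cell that builds them is not part of this diff. A minimal sketch of what it could look like is given here, assuming each imputer in `dict_imputers` exposes a scikit-learn-style `fit_transform` and that `df_data` and `station` are defined as earlier in the notebook.

```python
# Hypothetical sketch (not shown in this diff): impute the corrupted dataframe once per method.
# Assumes each imputer follows a fit_transform API and df_data is indexed by (station, datetime).
dfs_imputed = {name: imputer.fit_transform(df_data) for name, imputer in dict_imputers.items()}
# Restrict each imputed dataframe to the station plotted below.
dfs_imputed_station = {name: df_imp.loc[station] for name, df_imp in dfs_imputed.items()}
```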
@@ -252,49 +255,57 @@ for col in cols_to_impute:
n_columns = len(df_plot.columns)
n_imputers = len(dict_imputers)

- fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+ fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
i_plot = 1
- for name_imputer in dict_imputers:
-     for col in df_plot:
+ for i_col, col in enumerate(df_plot):
+     for name_imputer, df_imp in dfs_imputed_station.items():

-         fig.add_subplot(n_imputers, n_columns, i_plot)
+         ax = fig.add_subplot(n_columns, n_imputers, i_plot)
        values_orig = df_station[col]

        plt.plot(values_orig, ".", color='black', label="original")
        # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-', ms=3)

-         values_imp = dfs_imputed_station[name_imputer][col].copy()
+         values_imp = df_imp[col].copy()
        values_imp[values_orig.notna()] = np.nan
        plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
        plt.ylabel(col, fontsize=16)
-         if i_plot % n_columns == 0:
+         if i_plot % n_columns == 1:
            plt.legend(loc=[1, 0], fontsize=18)
+         plt.xticks(rotation=15)
+         if i_col == 0:
+             plt.title(name_imputer)
+         if i_col != n_columns - 1:
+             plt.xticks([], [])
        loc = plticker.MultipleLocator(base=2 * 365)
        ax.xaxis.set_major_locator(loc)
-         ax.tick_params(axis='both', which='major', labelsize=17)
+         ax.tick_params(axis='both', which='major')
        i_plot += 1
plt.savefig("figures/imputations_benchmark.png")
plt.show()

```

- **IV.a. Covariance**
+ ## Covariance


We first check the covariance by simply plotting one variable against another.
The methods produce visually similar results: it is difficult to compare them on this criterion alone.

```python
- for i_model, model in enumerate(dict_imputers.keys()):
-     fig, axs = plt.subplots(1, len(cols_to_impute) - 1, figsize=(4 * (len(cols_to_impute) - 1), 4))
-     df_imp = dfs_imputed_station[model]
-     for i in range(len(cols_to_impute) - 1):
-         plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i + 1], axs, color=tab10(i_model))
-     axs.set_title(f"imputation method: {model}", fontsize=20)
-     plt.show()
+ fig = plt.figure(figsize=(6 * n_imputers, 6 * n_columns))
+ i_plot = 1
+ for i, col in enumerate(cols_to_impute[:-1]):
+     for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed.items()):
+         ax = fig.add_subplot(n_columns, n_imputers, i_plot)
+         plot.compare_covariances(df_plot, df_imp, col, cols_to_impute[i + 1], ax, color=tab10(i_imputer), label=name_imputer)
+         ax.set_title(f"imputation method: {name_imputer}", fontsize=20)
+         i_plot += 1
+         ax.legend()
+ plt.show()
```

- **IV.b. Auto-correlation**
+ ## Auto-correlation


We are now interested in the auto-correlation function (ACF). As seen before, the time series display seasonal patterns.
@@ -306,63 +317,26 @@ On the contrary, for the PRES variable, all methods overestimate the autocorrel
Finally, for the DEWP variable, the methods cannot impute values with a behavior close to the original: the autocorrelation decreases too linearly.

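The next cell relies on an `acf` helper (called as `utils.acf`) whose implementation is not shown in this diff. As a point of reference only, an equivalent autocorrelation function can be sketched with plain pandas; the helper below is hypothetical and purely illustrative.

```python
import pandas as pd

def acf_sketch(values: pd.Series, lag_max: int = 30) -> pd.Series:
    """Illustrative ACF: Pearson autocorrelation of the series at lags 0..lag_max-1."""
    # Series.autocorr(lag) correlates the series with a copy of itself shifted by `lag` steps.
    return pd.Series([values.autocorr(lag) for lag in range(lag_max)])
```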
```python
- from statsmodels.tsa.stattools import acf
-
- palette = sns.dark_palette("b", n_colors=len(dict_i), reverse=False)
- sns.set_palette(palette)
- markers = ["o", "s", "*", "D", "P", ">", "^", "d"]
-
- fig, axs = plt.subplots(1, len(cols_to_impute), figsize=(16, 2))
- for i, col in enumerate(cols_to_impute):
-     axs[i].plot(acf(df_station[col].dropna()), color="k", marker=markers[0], lw=0.8)
-     for j, (name, df) in enumerate(dfs_imputed_station.items()):
-         axs[i].plot(acf(df[col]), marker=markers[j + 1], lw=0.8)
-     axs[i].set_xlabel("Lags [days]", fontsize=15)
-     axs[i].set_ylabel("Correlation", fontsize=15)
-     axs[i].set_ylim([0.5, 1])
-     axs[i].set_title(col, fontsize=15)
- axs[-1].legend(["Original dataset"] + list(dfs_imputed.keys()), loc=[1, 0])
- sns.despine()
- ```
-
- **IV.b. Distances between distributions**
-
-
- We are now interested in a way for quantifying the distance between two distributions.
- Until now, we look at the reconstruction error, whatever the distributions.
-
- There is a plethora of methods to quantify the distance between distributions $P$ and $Q$.
- For instance, those based on the information theory as for instance, the well-known [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). A simple interpretation of the KL divergence of $P$ from $Q$ is the expected excess surprise from using $Q$ as a model when the actual distribution is $P$.
-
- A drawback with this divergence is it ignores the underlying geometry of the space (the KL divergence is somewhat difficult to intuitively interpret).
- As a remedy, we consider a second metric, the [Wasserstein distance](https://en.wikipedia.org/wiki/Wasserstein_metric), a distance function defined between probability distributions on a given metric space $M$.
-
- To understand one of the differences between these two quantities, let us look at this simple example.
- The KL between the 2 distributions on the left is the same as that of the 2 distributions on the right: the KL divergence does not take into account the underlying metric space. Conversely, the Wasserstein metric is larger for those on the left since the "transport" is greater than for those on the right.
+ n_columns = len(df_plot.columns)
+ n_imputers = len(dict_imputers)

- <p align="center">
- <img src="../../docs/images/KL_wasser.png" width=50% height=50%>
- </p>
+ fig = plt.figure(figsize=(6 * n_columns, 6))
+ for i_col, col in enumerate(df_plot):
+     ax = fig.add_subplot(1, n_columns, i_col + 1)
+     values_orig = df_station[col]

+     acf = utils.acf(values_orig)

- ```python
- df_kl = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
- for model, df_imputed in dfs_imputed_station.items():
-     for col in cols_to_impute:
-         kl = kl_divergence(df_station[[col]].dropna(how="all"), df_imputed[[col]]).iloc[0]
-         df_kl.loc[model, col] = kl
+     plt.plot(acf, color="black")
+     for name_imputer, df_imp in dfs_imputed_station.items():

- plot.display_bar_table(df_kl, ylabel="KL divergence")
- ```
+         acf = utils.acf(df_imp[col])
+         plt.plot(acf, label=name_imputer)
+     plt.legend()

- ```python
- df_wasserstein = pd.DataFrame(np.nan, index=dfs_imputed_station.keys(), columns=cols_to_impute)
- for model, df_imputed in dfs_imputed_station.items():
-     for col in cols_to_impute:
-         wasserstein = scipy.stats.wasserstein_distance(df_station[col].dropna(how="all"), df_imputed[col])
-         df_wasserstein.loc[model, col] = wasserstein
+ plt.savefig("figures/acf.png")
+ plt.show()

- plot.display_bar_table(df_wasserstein, ylabel="Wasserstein distance")
```

```python