       format_version: '1.3'
       jupytext_version: 1.14.4
   kernelspec:
-    display_name: env_qolmat
+    display_name: env_qolmat_dev
     language: python
-    name: env_qolmat
+    name: env_qolmat_dev
 ---
 
 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.**
@@ -28,6 +28,8 @@ import warnings
 %reload_ext autoreload
 %autoreload 2
 
+from IPython.display import Image
+
 import pandas as pd
 from datetime import datetime
 import numpy as np
@@ -82,12 +84,12 @@ n_cols = len(cols_to_impute)
 ```
 
 ```python
-fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
+fig = plt.figure(figsize=(20 * n_stations, 6 * n_cols))
 for i_station, (station, df) in enumerate(df_data.groupby("station")):
     df_station = df_data.loc[station]
     for i_col, col in enumerate(cols_to_impute):
         fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
-        plt.plot(df_station[col], '.', label=station)
+        plt.plot(df_station[col], label=station)
         # break
         plt.ylabel(col)
         plt.xticks(rotation=15)
@@ -127,7 +129,7 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
 imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
 imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
 
-imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
+imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=.01, lam=5, rank=1)
 imputer_rpca_opti = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=256)
 dict_config_opti["RPCA_opti"] = {
     "tau": ho.hp.uniform("tau", low=.5, high=5),
@@ -141,9 +143,9 @@ dict_config_opti["RPCA_opticw"] = {
     "lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
 }
 
-imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
-imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3, p=1)
-imputer_tsmle = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=100, n_iter_ou=15, dt=1e-3, p=1)
+imputer_normal_sample = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2)
+imputer_var_sample = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
+imputer_var_max = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
 
 imputer_knn = imputers.ImputerKNN(groups=("station",), n_neighbors=10)
 imputer_mice = imputers.ImputerMICE(groups=("station",), estimator=LinearRegression(), sample_posterior=False, max_iter=100)
@@ -163,25 +165,25 @@ dict_imputers = {
     # "spline": imputer_spline,
     # "shuffle": imputer_shuffle,
     "residuals": imputer_residuals,
-    # "OU": imputer_ou,
-    "TSOU": imputer_tsou,
-    "TSMLE": imputer_tsmle,
+    "Normal_sample": imputer_normal_sample,
+    "VAR_sample": imputer_var_sample,
+    "VAR_max": imputer_var_max,
     "RPCA": imputer_rpca,
     # "RPCA_opti": imputer_rpca,
     # "RPCA_opticw": imputer_rpca_opti2,
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
-    "ols": imputer_regressor,
-    "mice_ols": imputer_mice,
+    "OLS": imputer_regressor,
+    "MICE_OLS": imputer_mice,
 }
 n_imputers = len(dict_imputers)
 ```
 
 In order to compare the methods, we $i)$ artificially create missing data (for missing-data mechanisms, see the docs); $ii)$ impute it using the chosen methods; and $iii)$ compute the reconstruction error. These three steps are repeated `n_splits` times, and the methods are compared on their average errors.
 
 <p align="center">
-    <img src="../../docs/images/comparator.png" width=50% height=50%>
+    <img src="https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/schema_qolmat.png" width=50% height=50%>
 </p>
 
 
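To make the three benchmark steps concrete, here is a minimal sketch of the loop the comparator performs, assuming a simplified hole generator and MAE-only scoring (the `generate_mask` helper is hypothetical; Qolmat's `Comparator` encapsulates all of this):

```python
import numpy as np
import pandas as pd

def benchmark(df, imputers, generate_mask, n_splits=3):
    # i) create artificial holes, ii) impute, iii) score; repeat n_splits times
    errors = {name: [] for name in imputers}
    for _ in range(n_splits):
        mask = generate_mask(df)      # boolean DataFrame marking artificial holes
        df_corrupted = df.mask(mask)  # hide the true values at the masked cells
        for name, imputer in imputers.items():
            df_imputed = imputer.fit_transform(df_corrupted)
            mae = (df_imputed - df)[mask].abs().mean().mean()  # error on masked cells only
            errors[name].append(mae)
    return pd.Series({name: np.mean(vals) for name, vals in errors.items()})
```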
@@ -190,14 +192,14 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 
 Note that these metrics measure reconstruction error; they say nothing about the distance between the "true" and "imputed" distributions.
 
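For intuition, here is a toy computation of one such reconstruction error, assuming `wmape` is the absolute error normalized by the magnitude of the true values (the numbers are made up for illustration):

```python
import numpy as np

y_true = np.array([2.0, 4.0, 8.0])  # ground truth at the masked positions
y_imp = np.array([2.5, 4.0, 6.0])   # values proposed by an imputer
wmape = np.abs(y_true - y_imp).sum() / np.abs(y_true).sum()
print(wmape)  # 0.1786: a pointwise error, blind to the shape of the distribution
```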
-```python
-metrics = ["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"]
+```python tags=[]
+metrics = ["mae", "wmape", "KL_columnwise", "frechet"]
 comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes=generator_holes,
     metrics=metrics,
-    max_evals=10,
+    max_evals=2,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
@@ -220,9 +222,9 @@ plt.show()
 ### **III. Comparison of methods**
 
 
-We now run just one time each algorithm on the initial corrupted dataframe and compare the different performances through multiple analysis.
+We now run each algorithm once on the initial corrupted dataframe and visualize the different imputations.
 
-```python
+```python tags=[]
 df_plot = df_data[cols_to_impute]
 ```
 
@@ -233,12 +235,17 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
 ```python
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
+# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```
 
 Let's look at the imputations.
 When the data is missing at random, imputation is easier. Missing blocks are more challenging.
 
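To see why, here is a small synthetic illustration (not part of the notebook's data pipeline): pointwise MCAR holes keep informative neighbors, while a contiguous block removes all local context:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
signal = pd.Series(np.sin(np.linspace(0, 12, 300)))

s_mcar = signal.mask(rng.random(300) < 0.1)  # scattered holes: neighbors survive
s_block = signal.copy()
s_block.iloc[120:180] = np.nan               # 60-point gap: no nearby observations
print(s_mcar.isna().sum(), s_block.isna().sum())
```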
+```python
+dfs_imputed_station["VAR_max"]
+```
+
 ```python
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
@@ -270,21 +277,21 @@ i_plot = 1
 for i_col, col in enumerate(cols_to_impute):
     for name_imputer, df_imp in dfs_imputed_station.items():
 
-        fig.add_subplot(n_columns, n_imputers, i_plot)
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]
 
         values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
-        plt.plot(values_imp, marker="o", color=tab10(0), label=name_imputer, alpha=1)
+        plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
         plt.plot(values_orig, color='black', marker="o", label="original")
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 1:
-            plt.legend(loc=[1, 0], fontsize=18)
+        if i_plot % n_imputers == 0:
+            plt.legend(loc="lower right", fontsize=18)
         plt.xticks(rotation=15)
         if i_col == 0:
             plt.title(name_imputer)
         if i_col != n_columns - 1:
-            plt.xticks([], [])
+            ax.set_xticklabels([])
         loc = plticker.MultipleLocator(base=2 * 365)
         ax.xaxis.set_major_locator(loc)
         ax.tick_params(axis='both', which='major')
@@ -297,7 +304,7 @@ plt.show()
 ## (Optional) Deep Learning Model
 
 
-In this section, we present an MLP model of data imputation using Keras, which can be installed using a "pip install pytorch".
+In this section, we present an MLP imputation model built with PyTorch, which can be installed with "pip install qolmat[pytorch]".
 
 ```python
 from qolmat.imputations import imputers_pytorch
@@ -308,17 +315,6 @@ except ModuleNotFoundError:
     raise PyTorchExtraNotInstalled
 ```
 
-For the MLP model, we work on a dataset that corresponds to weather data with missing values. We add missing MCAR values on the features "TEMP", "PRES" and other features with NaN values. The goal is impute the missing values for the features "TEMP" and "PRES" by a Deep Learning method. We add features to take into account the seasonality of the data set and a feature for the station name
-
-```python
-df = data.get_data("Beijing")
-cols_to_impute = ["TEMP", "PRES"]
-cols_with_nans = list(df.columns[df.isna().any()])
-df_data = data.add_datetime_features(df)
-df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
-df_data
-```
-
 For the example, we use a simple MLP model with 3 layers of neurons.
 Then we train the model without grouping by station.
 
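The commented-out `nn.Sequential` draft removed in the cell below spells out the architecture that `build_mlp` is expected to produce. As a hedged, standalone sketch (hypothetical batch size, not the notebook's code), the regressor maps the other `n_variables - 1` columns to a single output:

```python
import torch
import torch.nn as nn

n_variables = 2  # e.g. ["TEMP", "PRES"]
mlp = nn.Sequential(                 # mirrors the removed nn.Sequential draft
    nn.Linear(n_variables - 1, 256), nn.ReLU(),
    nn.Linear(256, 128), nn.ReLU(),
    nn.Linear(128, 64), nn.ReLU(),
    nn.Linear(64, 1),                # one output: the column being imputed
)
x = torch.randn(8, n_variables - 1)  # batch of 8 rows, remaining columns as features
print(mlp(x).shape)                  # torch.Size([8, 1])
```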
@@ -340,49 +336,75 @@ plt.show()
 ```
 
 ```python
-# estimator = nn.Sequential(
-#     nn.Linear(np.sum(df_data.isna().sum()==0), 256),
-#     nn.ReLU(),
-#     nn.Linear(256, 128),
-#     nn.ReLU(),
-#     nn.Linear(128, 64),
-#     nn.ReLU(),
-#     nn.Linear(64, 1)
-# )
-estimator = imputers_pytorch.build_mlp(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256, 128, 64])
-encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=df_data.values.shape[1], latent_dim=4, output_dim=df_data.values.shape[1], list_num_neurons=[4*4, 2*4])
+n_variables = len(cols_to_impute)
+
+estimator = imputers_pytorch.build_mlp(input_dim=n_variables - 1, list_num_neurons=[256, 128, 64])
+encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=n_variables, latent_dim=4, output_dim=n_variables, list_num_neurons=[4*4, 2*4])
 ```
 
 ```python
-dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), handler_nan="column", epochs=500)
+dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), epochs=500)
 dict_imputers["Autoencoder"] = imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
 dict_imputers["Diffusion"] = imputer_diffusion = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
 ```
 
 We can re-run the imputation model benchmark as before.
+```python
+comparison = comparator.Comparator(
+    dict_imputers,
+    cols_to_impute,
+    generator_holes=generator_holes,
+    metrics=metrics,
+    max_evals=2,
+    dict_config_opti=dict_config_opti,
+)
+```
+
 ```python tags=[]
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
+    cols_to_impute,
     generator_holes=generator_holes,
-    metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
-    max_evals=10,
+    metrics=metrics,
+    max_evals=2,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
 results.style.highlight_min(color="green", axis=1)
 ```
+```python
+n_metrics = len(metrics)
+fig = plt.figure(figsize=(24, 4 * n_metrics))
+for i, metric in enumerate(metrics):
+    fig.add_subplot(n_metrics, 1, i + 1)
+    df = results.loc[metric]
+    plot.multibar(df, decimals=2)
+    plt.ylabel(metric)
+
+# plt.savefig("figures/imputations_benchmark_errors.png")
+plt.show()
+```
+
 ```python tags=[]
-df_plot = df_data
+df_plot = df_data[cols_to_impute]
+```
+
+```python
 dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
+```
+
+```python
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```
 
-```python tags=[]
+Let's look at the imputations.
+When the data is missing at random, imputation is easier. Missing blocks are more challenging.
+
+```python
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
     values_orig = df_station[col]
@@ -399,39 +421,42 @@ for col in cols_to_impute:
     ax.xaxis.set_major_locator(loc)
     ax.tick_params(axis='both', which='major', labelsize=17)
     plt.show()
+
 ```
 
 ```python
-n_columns = len(df_plot.columns)
+# plot.plot_imputations(df_station, dfs_imputed_station)
+
+n_columns = len(cols_to_impute)
 n_imputers = len(dict_imputers)
 
-fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
+fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
 i_plot = 1
-for i_col, col in enumerate(df_plot):
+for i_col, col in enumerate(cols_to_impute):
     for name_imputer, df_imp in dfs_imputed_station.items():
 
-        fig.add_subplot(n_columns, n_imputers, i_plot)
+        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         values_orig = df_station[col]
 
-        plt.plot(values_orig, ".", color='black', label="original")
-
         values_imp = df_imp[col].copy()
         values_imp[values_orig.notna()] = np.nan
-        plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
+        plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
+        plt.plot(values_orig, color='black', marker="o", label="original")
         plt.ylabel(col, fontsize=16)
-        if i_plot % n_columns == 1:
-            plt.legend(loc=[1, 0], fontsize=18)
+        if i_plot % n_imputers == 0:
+            plt.legend(loc="lower right", fontsize=18)
         plt.xticks(rotation=15)
         if i_col == 0:
             plt.title(name_imputer)
         if i_col != n_columns - 1:
-            plt.xticks([], [])
+            ax.set_xticklabels([])
         loc = plticker.MultipleLocator(base=2 * 365)
         ax.xaxis.set_major_locator(loc)
         ax.tick_params(axis='both', which='major')
         i_plot += 1
-plt.savefig("figures/imputations_benchmark.png")
+
 plt.show()
+
 ```
 
 ## Covariance