@@ -76,26 +76,12 @@ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consist
 This dataset only contains numerical variables.

 ```python
-df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=20)
-
-# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
-# cols_to_impute = df_data.columns[df_data.isna().any()]
+df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
 cols_to_impute = ["TEMP", "PRES"]
-
 ```

 The dataset `Artificial` is designed as the sum of a periodic signal, white noise, and some outliers.

-```python
-# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
-# cols_to_impute = ["signal"]
-```
-
-```python
-# df_data = data.get_data("SNCF", n_groups_max=2)
-# cols_to_impute = ["val_in"]
-```
-
 ```python
 df_data
 ```
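For reference, a sketch of how the data-loading cell reads once this change is applied, assembled from the `+` and context lines above. The `qolmat.utils.data` import path and the final check are assumptions, not part of the commit.

```python
# Sketch only: load the Beijing dataset with 20% of values masked in runs of
# mean length 120 (the new mean_size), then restrict imputation to two columns.
from qolmat.utils import data  # assumed import path for the `data` module used above

df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
cols_to_impute = ["TEMP", "PRES"]

# Rough check that the masking ratio is close to the requested 20% (not in the commit)
print(df_data[cols_to_impute].isna().mean())
```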
@@ -244,7 +230,7 @@ comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes=generator_holes,
-    metrics=["mae", "wmape", "KL_columnwise", "ks_test", "energy"],
+    metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
     max_evals=10,
     dict_config_opti=dict_config_opti,
 )
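Assembled from this hunk, the benchmark call now reads roughly as below (a sketch: `dict_imputers`, `generator_holes` and `dict_config_opti` come from earlier, unchanged cells of the notebook; the only change is dropping the `energy` metric).

```python
# Benchmark call after the patch: the "energy" metric is no longer computed.
comparison = comparator.Comparator(
    dict_imputers,
    cols_to_impute,
    generator_holes=generator_holes,
    metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
    max_evals=10,
    dict_config_opti=dict_config_opti,
)
results = comparison.compare(df_data)
```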
@@ -253,8 +239,14 @@ results
 ```

 ```python
-df_plot = results.loc["energy", "All"]
+df_plot = results.loc["KL_columnwise", 'TEMP']
 plt.barh(df_plot.index, df_plot, color=tab10(0))
+plt.title('TEMP')
+plt.show()
+
+df_plot = results.loc["KL_columnwise", 'PRES']
+plt.barh(df_plot.index, df_plot, color=tab10(0))
+plt.title('PRES')
 plt.show()
 ```

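The two added blocks differ only in the column name; a possible loop over the same calls (not part of the commit, shown only to make the pattern explicit; `results` and `cols_to_impute` come from earlier cells, and `tab10` is assumed to mirror the colormap helper used earlier in the notebook):

```python
import matplotlib.pyplot as plt

tab10 = plt.get_cmap("tab10")  # assumed to match the notebook's tab10 helper

# One horizontal bar chart of columnwise KL divergence per imputed column.
for col in cols_to_impute:  # ["TEMP", "PRES"]
    df_plot = results.loc["KL_columnwise", col]
    plt.barh(df_plot.index, df_plot, color=tab10(0))
    plt.title(col)
    plt.show()
```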
@@ -314,15 +306,19 @@ for col in cols_to_impute:

 ```

+```python
+dfs_imputed
+```
+
 ```python
 # plot.plot_imputations(df_station, dfs_imputed_station)

-n_columns = len(df_plot.columns)
+n_columns = len(cols_to_impute)
 n_imputers = len(dict_imputers)

 fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
 i_plot = 1
-for i_col, col in enumerate(df_plot):
+for i_col, col in enumerate(cols_to_impute):
     for name_imputer, df_imp in dfs_imputed_station.items():

         fig.add_subplot(n_columns, n_imputers, i_plot)
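After the fix, the subplot grid is sized from `cols_to_impute` and `dict_imputers` rather than from `df_plot`. Assembled from the hunk, the cell starts roughly as follows (a sketch; the plotting body and the `i_plot` increment are implied by the loop but not visible in this context window):

```python
# Post-fix sketch: one subplot row per imputed column, one subplot column per imputer.
n_columns = len(cols_to_impute)
n_imputers = len(dict_imputers)

fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
i_plot = 1
for i_col, col in enumerate(cols_to_impute):
    for name_imputer, df_imp in dfs_imputed_station.items():
        fig.add_subplot(n_columns, n_imputers, i_plot)
        # ... plotting of df_imp[col] continues as in the rest of the cell ...
        i_plot += 1  # assumed: needed for the grid indexing above
```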
@@ -367,7 +363,6 @@ df = data.get_data("Beijing")
 cols_to_impute = ["TEMP", "PRES"]
 cols_with_nans = list(df.columns[df.isna().any()])
 df_data = data.add_datetime_features(df)
-df_data = data.add_station_features(df_data)
 df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
 df_data
 ```
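With `add_station_features` removed, the preparation cell reads as below when assembled from the hunk (a sketch; the `pandas` and `qolmat.utils.data` import paths are assumptions based on the calls used in the notebook):

```python
# Sketch of the preparation step after this change: only datetime features are
# added before artificial holes are injected into the columns of interest.
import pandas as pd
from qolmat.utils import data  # assumed import path

df = data.get_data("Beijing")
cols_to_impute = ["TEMP", "PRES"]
cols_with_nans = list(df.columns[df.isna().any()])
df_data = data.add_datetime_features(df)  # station features are no longer added
df_data[cols_with_nans + cols_to_impute] = data.add_holes(
    pd.DataFrame(df_data[cols_with_nans + cols_to_impute]),
    ratio_masked=.1,
    mean_size=120,
)
```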
@@ -377,46 +372,38 @@ Then we train the model without taking a group on the stations

 ```python
 estimator = tf.keras.models.Sequential([
-    tf.keras.layers.Dense(256, activation='sigmoid'),
-    tf.keras.layers.Dense(128, activation='sigmoid'),
-    tf.keras.layers.Dense(64, activation='sigmoid'),
+    tf.keras.layers.Dense(256, activation='relu'),
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dense(64, activation='relu'),
     tf.keras.layers.Dense(1)])
-estimator.compile(optimizer='adam', loss='mse')
-dict_imputers["MLP"] = imputer_mlp = imputers_keras.ImputerRegressorKeras(estimator=estimator, handler_nan="column")
+estimator.compile(optimizer='adam', loss='mae')
+dict_imputers["MLP"] = imputer_mlp = imputers_keras.ImputerRegressorKeras(estimator=estimator, groups=['station'], handler_nan="column")
 ```

 We can re-run the imputation model benchmark as before.
-
-```python
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, subset=cols_to_impute, ratio_masked=ratio_masked)
+```python jupyter={"outputs_hidden": true} tags=[]
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], subset=cols_to_impute, ratio_masked=ratio_masked)

 comparison = comparator.Comparator(
     dict_imputers,
-    df_data.columns,
+    cols_to_impute,
     generator_holes=generator_holes,
-    n_calls_opt=10,
+    metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
+    max_evals=10,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
 results
 ```
-
-```python
-fig = plt.figure(figsize=(24, 4))
-plot.multibar(results.loc["mae"], decimals=1)
-plt.ylabel("mae")
-plt.show()
-```
-
-```python
+```python jupyter={"outputs_hidden": true, "source_hidden": true} tags=[]
 df_plot = df_data
 dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
 station = df_plot.index.get_level_values("station")[0]
 df_station = df_plot.loc[station]
 dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
 ```

-```python
+```python jupyter={"source_hidden": true} tags=[]
 for col in cols_to_impute:
     fig, ax = plt.subplots(figsize=(10, 3))
     values_orig = df_station[col]
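Taken together, the `+` lines of this hunk give roughly the following setup (a sketch; the `tensorflow`, `qolmat.benchmark.missing_patterns` and `qolmat.imputations.imputers_keras` import paths are assumptions based on the module names used in the notebook, and `cols_to_impute`, `ratio_masked` and `dict_imputers` come from earlier cells):

```python
# Sketch of the Keras MLP imputer and hole generator as configured after this patch.
import tensorflow as tf
from qolmat.benchmark import missing_patterns   # assumed import path
from qolmat.imputations import imputers_keras   # assumed import path

# ReLU activations and an MAE loss replace the previous sigmoid/MSE setup.
estimator = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
])
estimator.compile(optimizer='adam', loss='mae')

# Both the imputer and the hole generator now group by station.
dict_imputers["MLP"] = imputer_mlp = imputers_keras.ImputerRegressorKeras(
    estimator=estimator, groups=['station'], handler_nan="column"
)
generator_holes = missing_patterns.EmpiricalHoleGenerator(
    n_splits=2, groups=["station"], subset=cols_to_impute, ratio_masked=ratio_masked
)
```

Grouping by `station` presumably makes both the simulated holes and the regression respect the multi-site structure of the Beijing data, and MAE is generally a more outlier-robust regression loss than MSE.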