66 extension : .md
77 format_name : markdown
88 format_version : ' 1.3'
9- jupytext_version : 1.14.1
9+ jupytext_version : 1.14.4
1010 kernelspec :
1111 display_name : env_qolmat
1212 language : python
@@ -59,31 +59,32 @@ from qolmat.utils import data, utils, plot
5959### ** I. Load data**
6060<!-- #endregion -->
6161
62- The data used in this example is the Beijing Multi-Site Air-Quality Data Set. It consists in hourly air pollutants data from 12 chinese nationally-controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/ .
62+ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consists of hourly air pollutant data from 12 Chinese nationally-controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/ .
6363This dataset only contains numerical variables.
6464
65-
66- df = pd.read_csv("/Users/hlbotterman/Downloads/m5-daily-sales.csv")
67- df.head()
68- df[ "Date"] = pd.to_datetime(df[ "Date"] )
69- df = df.rename(columns={"Date": "datetime"})
70- df.set_index("datetime", inplace = True)
71- df[ "Sales"] = df[ 'Sales'] .astype(float)
72- cols_to_impute = [ "Sales"]
73-
7465``` python
75- download = True
76- df_data = data.get_data_corrupted(download = download, ratio_masked = .2 , mean_size = 120 , groups = [" station" ])
66+ df_data = data.get_data_corrupted(" Beijing" , ratio_masked = .2 , mean_size = 120 )
7767
7868# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
7969# cols_to_impute = df_data.columns[df_data.isna().any()]
8070cols_to_impute = [" TEMP" , " PRES" ]
8171
8272```
8373
74+ The dataset `Artificial` is designed as the sum of a periodic signal, white noise and some outliers.
75+
76+ ``` python
77+ # df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
78+ # cols_to_impute = ["signal"]
79+ ```
80+
8481Let's take a look at the variables to impute. We only consider one station, Aotizhongxin.
8582Time series display seasonalities (roughly 12 months).
8683
84+ ``` python
85+ imputer = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , period = 20 , max_iter = 1000 )
86+ df = imputer.fit_transform(df_data)
87+ ```
8788
8889### ** II. Imputation methods**
8990
@@ -114,11 +115,10 @@ imputer_spline = imputers.ImputerInterpolation(groups=["station"], method="splin
114115imputer_shuffle = imputers.ImputerShuffle(groups = [" station" ])
115116imputer_residuals = imputers.ImputerResiduals(groups = [" station" ], period = 7 , model_tsa = " additive" , extrapolate_trend = " freq" , method_interpolation = " linear" )
116117
117- # imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=False, n_rows=7*4, max_iter=1000, tau=1, lam=0.7)
118- dict_tau = {" TEMP" : 1 , " PRES" : 1.1 }
119- dict_lam = {" TEMP" : 0.7 , " PRES" : 0.8 }
120- imputer_rpca = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , period = 365 , max_iter = 1000 , tau = dict_tau, lam = dict_lam)
121- imputer_rpca_opti = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , n_rows = 365 , max_iter = 1000 )
118+ dict_tau = {" TEMP" : 2 , " PRES" : 1.1 }
119+ dict_lam = {" TEMP" : 0.3 , " PRES" : 0.8 }
120+ imputer_rpca = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , period = 20 , max_iter = 1000 , tau = 2 , lam = .3 )
121+ imputer_rpca_opti = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , period = 20 , max_iter = 1000 )
122122
123123imputer_ou = imputers.ImputeEM(groups = [" station" ], method = " multinormal" , max_iter_em = 34 , n_iter_ou = 15 , strategy = " ou" )
124124imputer_tsou = imputers.ImputeEM(groups = [" station" ], method = " VAR1" , strategy = " ou" , max_iter_em = 34 , n_iter_ou = 15 )
@@ -146,8 +146,8 @@ dict_imputers = {
146146 " OU" : imputer_ou,
147147 " TSOU" : imputer_tsou,
148148 " TSMLE" : imputer_tsmle,
149- " RPCA" : imputer_rpca ,
150- # "RPCA_opti": imputer_rpca_opti,
149+ " RPCA" : imputer_rpca_opti ,
150+ " RPCA_opti" : imputer_rpca_opti,
151151 # "locf": imputer_locf,
152152 # "nocb": imputer_nocb,
153153 # "knn": imputer_knn,
@@ -156,16 +156,23 @@ dict_imputers = {
156156n_imputers = len (dict_imputers)
157157
158158
159+ # search_params = {
160+ # "RPCA_opti": {
161+ # "lam": {
162+ # "TEMP": {"min": .1, "max": 10, "type":"Real"},
163+ # "PRES": {"min": .1, "max": 10, "type":"Real"}
164+ # },
165+ # "tau": {
166+ # "TEMP": {"min": .1, "max": 10, "type":"Real"},
167+ # "PRES": {"min": .1, "max": 10, "type":"Real"}
168+ # }
169+ # }
170+ # }
171+
159172search_params = {
160173 " RPCA_opti" : {
161- " lam" : {
162- " TEMP" : {" min" : .1 , " max" : 10 , " type" :" Real" },
163- " PRES" : {" min" : .1 , " max" : 10 , " type" :" Real" }
164- },
165- " tau" : {
166- " TEMP" : {" min" : .1 , " max" : 10 , " type" :" Real" },
167- " PRES" : {" min" : .1 , " max" : 10 , " type" :" Real" }
168- }
174+ " tau" : {" min" : .5 , " max" : 5 , " type" :" Real" },
175+ " lam" : {" min" : .1 , " max" : 1 , " type" :" Real" },
169176 }
170177}
171178
@@ -209,7 +216,8 @@ results
209216
210217``` python
211218fig = plt.figure(figsize = (24 , 4 ))
212- plot.multibar(results.loc[" mae" ])
219+ plot.multibar(results.loc[" mae" ], decimals = 1 )
220+ plt.ylabel(" mae" )
213221plt.show()
214222```
215223
@@ -219,15 +227,20 @@ plt.show()
219227We now run each algorithm just once on the initial corrupted dataframe and compare their performance through multiple analyses.
220228
221229``` python
222- df_plot = df_data[[ " TEMP " , " PRES " ] ]
230+ df_plot = df_data[cols_to_impute ]
223231```
224232
225233``` python
226234dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
227235```
228236
229237``` python
230- station = " Aotizhongxin"
238+ df_station
239+ ```
240+
241+ ``` python
242+ # station = "Aotizhongxin"
243+ station = df_plot.index.get_level_values(" station" )[0 ]
231244df_station = df_plot.loc[station]
232245dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
233246```
@@ -286,12 +299,13 @@ for name_imputer in dict_imputers:
286299 values_imp[values_orig.notna()] = np.nan
287300 plt.plot(values_imp, " ." , color = tab10(0 ), label = name_imputer, alpha = 1 )
288301 plt.ylabel(col, fontsize = 16 )
289- if i_plot % 2 == 0 :
302+ if i_plot % n_columns == 0 :
290303 plt.legend(loc = [1 , 0 ], fontsize = 18 )
291304 loc = plticker.MultipleLocator(base = 2 * 365 )
292305 ax.xaxis.set_major_locator(loc)
293306 ax.tick_params(axis = ' both' , which = ' major' , labelsize = 17 )
294307 i_plot += 1
308+ plt.xlim(0 , 100 )
295309plt.savefig(" figures/imputations_benchmark.png" )
296310plt.show()
297311
@@ -310,7 +324,6 @@ for i_model, model in enumerate(dict_imputers.keys()):
310324 for i in range (len (cols_to_impute)- 1 ):
311325 plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i+ 1 ], axs, color = tab10(i_model))
312326 axs.set_title(f " imputation method: { model} " , fontsize = 20 )
313- sns.despine()
314327 plt.show()
315328```
316329
0 commit comments