@@ -72,7 +72,7 @@ df["Sales"] = df['Sales'].astype(float)
7272cols_to_impute = [ "Sales"]
7373
7474``` python
75- download = False
75+ download = True
7676df_data = data.get_data_corrupted(download = download, ratio_masked = .2 , mean_size = 120 , groups = [" station" ])
7777
7878# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
@@ -84,67 +84,9 @@ cols_to_impute = ["TEMP", "PRES"]
8484Let's take a look at the variables to impute. We only consider one station, Aotizhongxin.
8585The time series display seasonality (with a period of roughly 12 months).
8686
87- ``` python
88- df_data
89- ```
90-
91- ``` python
92- df0 = df_data
93- ```
94-
95- ``` python
96- # df_data = df0[df0.index.get_level_values("station").isin(["Gucheng"])]
97- # df_data = df0[df0.index.get_level_values("station").isin(["Gucheng", "Aotizhongxin"])]
98- ```
99-
100- ``` python
101- n_stations = len (df_data.groupby(" station" ).size())
102- n_cols = len (cols_to_impute)
103- ```
104-
105- ``` python
106- fig = plt.figure(figsize = (10 * n_stations, 2 * n_cols))
107- for i_station, (station, df) in enumerate (df_data.groupby(" station" )):
108- for i_col, col in enumerate (cols_to_impute):
109- fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1 )
110- plt.plot(df.reset_index().datetime, df[col], ' .' , label = station)
111- # break
112- plt.ylabel(col, fontsize = 12 )
113- if i_col == 0 :
114- plt.title(station)
115- plt.show()
116- ```
11787
11888### **II. Imputation methods**
11989
120- ``` python
121- station = " Gucheng"
122- df_data = df0[df0.index.get_level_values(" station" ).isin([station])]
123- df_data = df_data[[" TEMP" ]]
124- ```
125-
126- ``` python
127- # imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="PCP", columnwise=True, period=365, max_iter=1000)
128- imputer_rpca = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , max_iter = 1000 , period = 10 , tau = 2 , lam = 0.3 , list_periods = [10 ], list_etas = [0.01 ], norm = " L2" )
129-
130- ```
131-
132- ``` python
133- df_data.values.size
134- ```
135-
136- ``` python
137- df_imputed = imputer_rpca.fit_transform(df_data)
138- ```
139-
140- ``` python
141- df_imputed.iloc[:365 * (df_imputed.size // 365 )]
142- ```
143-
144- ``` python
145- plt.plot(df_data.loc[" Wonderland" ], " ." , color = " black" )
146- plt.plot(df_imputed.loc[" Wonderland" ])
147- ```
14890
14991This part is devoted to the imputation methods. The idea is to try different algorithms and compare them.
15092
@@ -175,8 +117,8 @@ imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, mode
175117# imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=False, n_rows=7*4, max_iter=1000, tau=1, lam=0.7)
176118dict_tau = {" TEMP" : 1 , " PRES" : 1.1 }
177119dict_lam = {" TEMP" : 0.7 , " PRES" : 0.8 }
178- imputer_rpca = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , n_rows = 7 * 4 , max_iter = 1000 , tau = dict_tau, lam = dict_lam)
179- imputer_rpca_opti = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , n_rows = 7 * 4 , max_iter = 1000 )
120+ imputer_rpca = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , period = 365 , max_iter = 1000 , tau = dict_tau, lam = dict_lam)
121+ imputer_rpca_opti = imputers.ImputerRPCA(groups = [" station" ], method = " temporal" , columnwise = True , n_rows = 365 , max_iter = 1000 )
180122
181123imputer_ou = imputers.ImputeEM(groups = [" station" ], method = " multinormal" , max_iter_em = 34 , n_iter_ou = 15 , strategy = " ou" )
182124imputer_tsou = imputers.ImputeEM(groups = [" station" ], method = " VAR1" , strategy = " ou" , max_iter_em = 34 , n_iter_ou = 15 )
@@ -197,15 +139,15 @@ dict_imputers = {
197139 " mean" : imputer_mean,
198140 # "median": imputer_median,
199141 # "mode": imputer_mode,
200- # "interpolation": imputer_interpol,
142+ " interpolation" : imputer_interpol,
201143 # "spline": imputer_spline,
202144 # "shuffle": imputer_shuffle,
203145 # "residuals": imputer_residuals,
204- # "OU": imputer_ou,
205- # "TSOU": imputer_tsou,
206- # "TSMLE": imputer_tsmle,
146+ " OU" : imputer_ou,
147+ " TSOU" : imputer_tsou,
148+ " TSMLE" : imputer_tsmle,
207149 " RPCA" : imputer_rpca,
208- " RPCA_opti" : imputer_rpca_opti,
150+ # "RPCA_opti": imputer_rpca_opti,
209151 # "locf": imputer_locf,
210152 # "nocb": imputer_nocb,
211153 # "knn": imputer_knn,
0 commit comments