@@ -148,6 +148,7 @@ def __init__(
148148 self .convergence_threshold = tolerance
149149 self .stagnation_threshold = stagnation_threshold
150150 self .stagnation_loglik = stagnation_loglik
151+ self .scaler = StandardScaler ()
151152
152153 self .dict_criteria_stop = {}
153154
@@ -200,109 +201,79 @@ def _convert_numpy(self, X: ArrayLike) -> np.ndarray:
def _check_convergence(self) -> bool:
    """
    Tell whether the EM iterations have converged.

    This implementation always answers ``False``; another
    ``_check_convergence`` definition appears further down the file.

    Returns
    -------
    bool
        Always False.
    """
    return False
202203
203- def _maximize_likelihood (self , X : ArrayLike ) -> ArrayLike :
def fit(self, X: np.ndarray):
    """
    Fit the statistical distribution with the input X array.

    The array is standardised with ``self.scaler``, transposed, given a first
    imputation with ``self._linear_interpolation`` and used to call
    ``self.fit_distribution``. Then at most ``self.max_iter_em`` rounds of
    ``self._sample_ou`` are run, stopping early when
    ``self._check_convergence()`` returns True.

    Side effects: sets ``self.hash_fit`` (hash of the raw input bytes) and
    ``self.X_sample_last`` (last sampled array, in the transposed and scaled
    space), and empties every list stored in ``self.dict_criteria_stop``.

    Parameters
    ----------
    X : np.ndarray
        Numpy array to be imputed

    Returns
    -------
    self
        The fitted instance.

    Raises
    ------
    AssertionError
        If X is not a np.ndarray or if its first dimension is smaller than 2.
    """
    # Validate before touching ndarray-only methods (copy/tobytes); otherwise a
    # non-array input fails with AttributeError instead of the error below.
    if not isinstance(X, np.ndarray):
        raise AssertionError("Invalid type. X must be a np.ndarray.")

    # NOTE(review): the test is on axis 0 while the message talks about m;
    # condition kept as in the original - confirm which axis is intended.
    if X.shape[0] < 2:
        raise AssertionError("Invalid dimensions: X must be of dimension (n,m) with m>1.")

    X = X.copy()
    # Remembered so that `transform` can recognise the array used for fitting.
    self.hash_fit = hash(X.tobytes())

    X = self.scaler.fit_transform(X)
    X = X.T

    # Mask taken before the first imputation fills the missing entries.
    mask_na = np.isnan(X)

    # first imputation
    X_sample_last = self._linear_interpolation(X)

    self.fit_distribution(X_sample_last)

    for iter_em in range(self.max_iter_em):
        X_sample_last = self._sample_ou(X_sample_last, mask_na)

        if self._check_convergence():
            logger.info(f"EM converged after {iter_em} iterations.")
            break

    # Keep the criteria keys but drop the values accumulated during this fit.
    self.dict_criteria_stop = {key: [] for key in self.dict_criteria_stop}
    self.X_sample_last = X_sample_last
    return self
261242
262- if np .all (np .isnan (X_transformed )):
263- raise WarningMessage ("Result contains NaN. This is a bug." )
264-
265- return X_transformed
266-
267- def fit_transform (self , df : pd .DataFrame ) -> pd .DataFrame :
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input X array by imputing the missing values.

    When X is byte-for-byte the array seen by ``fit`` (same hash), the last
    sample stored by ``fit`` is used as starting point. Otherwise X is scaled
    with the fitted scaler, transposed and given a first imputation with
    ``self._linear_interpolation``. The starting point is then refined
    according to ``self.strategy`` and mapped back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Numpy array to be imputed

    Returns
    -------
    ArrayLike
        Final array after EM sampling.

    Raises
    ------
    ValueError
        If ``self.strategy`` is neither "mle" nor "ou".
    """
    # The mask has to come from the raw input: once the array has been
    # interpolated (or replaced by the stored sample) the NaNs are gone and a
    # mask computed afterwards would be all False.
    # Transposed to match the layout handed to `_sample_ou`.
    # NOTE(review): assumes `_sample_ou` only redraws entries flagged by the
    # mask - confirm against its implementation.
    mask_na = np.isnan(X).T

    if hash(X.tobytes()) == self.hash_fit:
        X_start = self.X_sample_last
    else:
        X_start = self.scaler.transform(X)
        X_start = X_start.T
        X_start = self._linear_interpolation(X_start)

    if self.strategy == "mle":
        X_transformed = self._maximize_likelihood(X_start)
    elif self.strategy == "ou":
        X_transformed = self._sample_ou(X_start, mask_na)
    else:
        # Previously fell through and crashed on an unbound local variable.
        raise ValueError(f"Unknown strategy: {self.strategy!r}. Expected 'mle' or 'ou'.")

    # NOTE(review): only triggers when *every* entry is NaN although the
    # message says "contains"; also confirm WarningMessage is an exception class.
    if np.all(np.isnan(X_transformed)):
        raise WarningMessage("Result contains NaN. This is a bug.")

    X_transformed = X_transformed.T
    X_transformed = self.scaler.inverse_transform(X_transformed)
    return X_transformed
306277
307278
308279class ImputeMultiNormalEM (ImputeEM ): # type: ignore
@@ -372,18 +343,32 @@ def __init__(
372343 )
373344 self .tolerance = tolerance
374345
375- # self.list_logliks = []
376- # self.list_means = []
377- # self.list_covs = []
378346 self .dict_criteria_stop = {"logliks" : [], "means" : [], "covs" : []}
379347
def fit_distribution(self, X):
    """
    Estimate the distribution parameters from X and store them on the instance.

    Sets ``self.means`` (mean along axis 1), ``self.cov`` (``np.cov`` of X)
    and ``self.cov_inv`` (``invert_robust`` applied to that covariance).

    Parameters
    ----------
    X : ArrayLike
        Array the parameters are estimated from.
    """
    self.means = np.mean(X, axis=1)
    covariance = np.cov(X)
    self.cov = covariance
    self.cov_inv = invert_robust(covariance, epsilon=1e-2)
386352
def _maximize_likelihood(self, X: ArrayLike) -> ArrayLike:
    """
    Get the argmax of a posterior distribution.

    X is centred on ``self.means``, handed to ``_gradient_conjugue`` together
    with ``self.cov_inv``, and the means are added back to the result.

    Parameters
    ----------
    X : ArrayLike
        Input array.

    Returns
    -------
    ArrayLike
        Array with imputed values.
    """
    column_means = self.means[:, None]
    correction = _gradient_conjugue(self.cov_inv, X - column_means)
    return column_means + correction
371+
387372 def _sample_ou (
388373 self ,
389374 X : ArrayLike ,
@@ -465,10 +450,6 @@ def _check_convergence(self) -> bool:
465450 True/False if the algorithm has converged
466451 """
467452
468- # self.list_means.append(self.means)
469- # self.list_covs.append(self.cov)
470- # self.list_logliks.append(self.loglik)
471-
472453 list_means = self .dict_criteria_stop ["means" ]
473454 list_covs = self .dict_criteria_stop ["covs" ]
474455 list_logliks = self .dict_criteria_stop ["logliks" ]
@@ -602,11 +583,6 @@ def fit_distribution(self, X):
602583 self .fit_parameter_A (X )
603584 self .fit_parameter_omega (X )
604585
605- # print("distribution fitted :")
606- # print(self.A)
607- # print(self.B)
608- # print(self.omega)
609-
610586 def gradient_X_centered_loglik (self , Xc ):
611587 Xc_back = np .roll (Xc , 1 , axis = 1 )
612588 Xc_back [:, 0 ] = 0
@@ -616,6 +592,25 @@ def gradient_X_centered_loglik(self, Xc):
616592 Z_fore = Xc_fore - self .A @ Xc
617593 return - self .omega_inv @ Z_back + self .A .T @ self .omega_inv @ Z_fore
618594
def _maximize_likelihood(self, X: ArrayLike, dt=1e-2, n_iter: int = 1000) -> ArrayLike:
    """
    Get the argmax of a posterior distribution by gradient ascent.

    X is centred on ``self.B``, then ``n_iter`` steps of size ``dt`` are taken
    along ``self.gradient_X_centered_loglik`` before ``self.B`` is added back.

    Parameters
    ----------
    X : ArrayLike
        Input numpy array.
    dt : float, optional
        Step size of each gradient update, by default 1e-2.
    n_iter : int, optional
        Number of gradient updates, by default 1000.

    Returns
    -------
    ArrayLike
        Array with imputed values.
    """
    offset = self.B[:, None]
    # The subtraction builds a new array, so the in-place updates never touch X.
    Xc = X - offset
    for _ in range(n_iter):
        Xc += dt * self.gradient_X_centered_loglik(Xc)
    return Xc + offset
613+
619614 def _sample_ou (
620615 self ,
621616 X : ArrayLike ,
0 commit comments