11import warnings
22from typing import Dict , List , Optional , Union
3+ from abc import abstractmethod
34
45import numpy as np
56import pandas as pd
@@ -41,19 +42,38 @@ class Imputer(_BaseImputer):
4142
4243 def __init__ (
4344 self ,
44- groups : List [str ] = [],
4545 columnwise : bool = False ,
4646 shrink : bool = False ,
47- hyperparams : Dict = {},
4847 random_state : Union [None , int , np .random .RandomState ] = None ,
48+ missing_values = np .nan ,
49+ groups : List [str ] = [],
50+ hyperparams : Dict = {},
4951 ):
50- self .hyperparams_user = hyperparams
51- self .groups = groups
5252 self .columnwise = columnwise
5353 self .shrink = shrink
5454 self .random_state = random_state
55-
56- def fit_transform (self , df : pd .DataFrame ) -> pd .DataFrame :
55+ self .missing_values = missing_values
56+ self .groups = groups
57+ self .hyperparams = hyperparams
58+
59+ def _more_tags (self ):
60+ """Define tags for scikit-learn"""
61+
62+ return {
63+ "allow_nan" : True ,
64+ "requires_fit" : False ,
65+ "_xfail_checks" : {
66+ "check_parameters_default_constructible" : "The imputer need Dict as a parammeter" ,
67+ "check_no_attributes_set_in_init" : """The imputer can define an attribute
68+ modifiable in init""" ,
69+ },
70+ }
71+
72+ def fit (self , X , y : pd .DataFrame = None ):
73+ X = self ._validate_data (X , force_all_finite = "allow-nan" )
74+ return self
75+
76+ def fit_transform (self , X : pd .DataFrame , y = None ) -> pd .DataFrame :
5777 """
5878 Returns a dataframe with same shape as `df`, unchanged values, where all nans are replaced
5979 by non-nan values.
@@ -70,24 +90,27 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
7090 pd.DataFrame
7191 Imputed dataframe.
7292 """
73- if not isinstance (df , pd .DataFrame ):
74- raise ValueError ("Input has to be a pandas.DataFrame." )
93+ self .fit (X )
94+
95+ if not isinstance (X , (pd .DataFrame , np .ndarray )):
96+ raise ValueError ("Input has to be a pandas.DataFrame or numpy.ndarray." )
97+ df = pd .DataFrame (X )
7598 for column in df :
7699 if df [column ].isnull ().all ():
77100 raise ValueError ("Input contains a column full of NaN" )
78101 self .rng = sku .check_random_state (self .random_state )
79102 if hasattr (self , "estimator" ) and hasattr (self .estimator , "random_state" ):
80103 self .estimator .random_state = self .rng
81104
82- hyperparams = self .hyperparams_user .copy ()
105+ hyperparams = self .hyperparams .copy ()
83106 if hasattr (self , "hyperparams_optim" ):
84107 hyperparams .update (self .hyperparams_optim )
85108 cols_with_nans = df .columns [df .isna ().any ()]
86109
87110 if self .groups == []:
88- self .ngroups = pd .Series (0 , index = df .index ).rename ("_ngroup" )
111+ self .ngroups_ = pd .Series (0 , index = df .index ).rename ("_ngroup" )
89112 else :
90- self .ngroups = df .groupby (self .groups ).ngroup ().rename ("_ngroup" )
113+ self .ngroups_ = df .groupby (self .groups ).ngroup ().rename ("_ngroup" )
91114
92115 if self .columnwise :
93116 df_imputed = df .copy ()
@@ -156,8 +179,8 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
156179 raise ValueError ("Input has to be a pandas.DataFrame." )
157180 df = df .copy ()
158181 if self .groups :
159- # groupby = utils.custom_groupby(df, self. groups)
160- groupby = df .groupby (self .ngroups , group_keys = False )
182+ # groupby = utils.custom_groupby(df, groups)
183+ groupby = df .groupby (self .ngroups_ , group_keys = False )
161184 if self .shrink :
162185 imputation_values = groupby .transform (self .fit_transform_element )
163186 else :
@@ -173,6 +196,10 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
173196
174197 return df
175198
199+ @abstractmethod
200+ def fit_transform_element (self , df : pd .DataFrame ):
201+ return df
202+
176203
177204class ImputerOracle (Imputer ):
178205 """
@@ -195,7 +222,7 @@ def __init__(
195222 super ().__init__ ()
196223 self .df = df
197224
198- def fit_transform (self , df : pd .DataFrame ) -> pd .DataFrame :
225+ def fit_transform (self , X : pd .DataFrame , y = None ) -> pd .DataFrame :
199226 """Impute df with corresponding known values
200227
201228 Parameters
@@ -207,8 +234,10 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
207234 pd.DataFrame
208235 dataframe imputed with premasked values
209236 """
210- if not isinstance (df , pd .DataFrame ):
211- raise ValueError ("Input has to be a pandas.DataFrame." )
237+ self .fit (X )
238+ if not isinstance (X , (pd .DataFrame , np .ndarray )):
239+ raise ValueError ("Input has to be a pandas.DataFrame or numpy.ndarray." )
240+ df = pd .DataFrame (X )
212241 return df .fillna (self .df )
213242
214243
@@ -244,7 +273,12 @@ def __init__(
244273 groups : List [str ] = [],
245274 ) -> None :
246275 super ().__init__ (groups = groups , columnwise = True , shrink = True )
247- self .fit_transform_element = pd .DataFrame .mean
276+
277+ def _more_tags (self ):
278+ return {"allow_nan" : True , "requires_fit" : False }
279+
280+ def fit_transform_element (self , df : pd .DataFrame ):
281+ return pd .DataFrame .mean (df )
248282
249283
250284class ImputerMedian (Imputer ):
@@ -279,7 +313,9 @@ def __init__(
279313 groups : List [str ] = [],
280314 ) -> None :
281315 super ().__init__ (groups = groups , columnwise = True , shrink = True )
282- self .fit_transform_element = pd .DataFrame .median
316+
317+ def fit_transform_element (self , df : pd .DataFrame ):
318+ return pd .DataFrame .median (df )
283319
284320
285321class ImputerMode (Imputer ):
@@ -314,7 +350,9 @@ def __init__(
314350 groups : List [str ] = [],
315351 ) -> None :
316352 super ().__init__ (groups = groups , columnwise = True , shrink = True )
317- self .fit_transform_element = lambda df : df .mode ().iloc [0 ]
353+
354+ def fit_transform_element (self , df : pd .DataFrame ):
355+ return df .mode ().iloc [0 ]
318356
319357
320358class ImputerShuffle (Imputer ):
@@ -647,6 +685,7 @@ def __init__(
647685 super ().__init__ (groups = groups , columnwise = False , hyperparams = hyperparams )
648686 self .n_neighbors = n_neighbors
649687 self .weights = weights
688+ self .hyperparams_optim : Dict = {}
650689
651690 def fit_transform_element (self , df : pd .DataFrame ) -> pd .DataFrame :
652691 imputer = KNNImputer (
@@ -663,7 +702,8 @@ class ImputerMICE(Imputer):
663702 This class implements an iterative imputer in the multivariate case.
664703 It imputes each Series within a DataFrame multiple times using an iteration of fits
665704 and transformations to reach a stable state of imputation each time.
666- It uses sklearn.impute.IterativeImputer, see the docs for more information about the arguments.
705+ It uses sklearn.impute.IterativeImputer; see its docs for more information about the
706+ arguments.
667707
668708 Parameters
669709 ----------
@@ -711,6 +751,7 @@ def __init__(
711751 random_state = random_state ,
712752 )
713753 self .estimator = estimator
754+ self .hyperparams_optim : Dict = {}
714755
715756 def fit_transform_element (self , df : pd .DataFrame ) -> pd .DataFrame :
716757 iterative_imputer = IterativeImputer (estimator = self .estimator , ** self .hyperparams_element )
@@ -769,6 +810,7 @@ def __init__(
769810 self .columnwise = False
770811 self .estimator = estimator
771812 self .handler_nan = handler_nan
813+ self .hyperparams_optim : Dict = {}
772814
773815 def get_params_fit (self ) -> Dict :
774816 return {}
@@ -842,8 +884,8 @@ class ImputerRPCA(Imputer):
842884 """
843885 This class implements the Robust Principal Component Analysis imputation.
844886
845- The imputation minimizes a loss function combining a low-rank criterium on the dataframe and a
846- L1 penalization on the residuals.
887+ The imputation minimizes a loss function combining a low-rank criterion on the dataframe and
888+ an L1 penalization on the residuals.
847889
848890 Parameters
849891 ----------
@@ -852,10 +894,11 @@ class ImputerRPCA(Imputer):
852894 method : str
853895 Name of the RPCA method:
854896 "PCP" for basic RPCA, bad at imputing
855- "noisy" for noisy RPCA, with possible regularisations, wihch is recommended since it is
856- more stable
897+ "noisy" for noisy RPCA, with possible regularisations, which is recommended since
898+ it is more stable
857899 columnwise : bool
858- For the RPCA method to be applied columnwise (with reshaping of each column into an array)
900+ Whether the RPCA method is applied columnwise (with reshaping of
901+ each column into an array)
859902 or directly on the whole dataframe. By default, the value is set to False.
860903 """
861904
@@ -875,6 +918,7 @@ def __init__(
875918 )
876919
877920 self .method = method
921+ self .hyperparams_optim : Dict = {}
878922
879923 def fit_transform_element (self , df : pd .DataFrame ) -> pd .DataFrame :
880924 if not isinstance (df , pd .DataFrame ):
@@ -890,7 +934,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
890934 X = df .values .T
891935 M , A = model .decompose_rpca_signal (X )
892936 df_imputed = pd .DataFrame ((M + A ).T , index = df .index , columns = df .columns )
893- df_imputed = df .where (df .isna (), df_imputed )
937+ df_imputed = df .where (~ df .isna (), df_imputed )
894938
895939 return df_imputed
896940
@@ -933,6 +977,7 @@ def __init__(
933977 random_state = random_state ,
934978 )
935979 self .model = model
980+ self .hyperparams_optim : Dict = {}
936981
937982 def fit_transform_element (self , df : pd .DataFrame ) -> pd .DataFrame :
938983 if self .model == "multinormal" :
0 commit comments