44from sklearn .model_selection import StratifiedKFold
55import category_encoders as encoders
66import pandas as pd
7- from typing import Dict
7+ from typing import Dict , Optional
88
99
1010class PolynomialWrapper (BaseEstimator , TransformerMixin ):
@@ -70,28 +70,32 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin):
7070 None
7171 """
7272
def __init__(self, feature_encoder: utils.BaseEncoder):
    """Keep the encoder template and start with empty fitted state.

    Parameters
    ----------
    feature_encoder : utils.BaseEncoder
        Encoder instance stored as-is; ``fit`` deep-copies it once for
        every target class it trains against.
    """
    # Fitted state: populated by ``fit``, empty until then.
    self.label_encoder: Optional[encoders.OneHotEncoder] = None
    self.feature_encoders: Dict[str, utils.BaseEncoder] = {}

    # Constructor parameter, stored unmodified.
    self.feature_encoder: utils.BaseEncoder = feature_encoder
7777
def fit(self, X, y, **kwargs):
    """Fit one copy of the feature encoder per target class.

    The target is one-hot encoded, the first indicator column is dropped,
    and a deep copy of ``self.feature_encoder`` is fitted against each of
    the remaining indicator columns.

    Parameters
    ----------
    X : array-like
        Training features; normalised together with ``y`` by
        ``utils.convert_inputs``.
    y : array-like
        Target labels.
    **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    self
        The fitted wrapper, so that calls can be chained.
    """
    # unite the input into pandas types
    X, y = utils.convert_inputs(X, y)

    # Build the frame from a dict so the column is always called 'target'.
    # ``pd.DataFrame(y, columns=['target'])`` selects by the Series name, so
    # a target Series named anything else would become an all-NaN column and
    # trip the ``handle_missing='error'`` check below.
    y = pd.DataFrame({'target': y})

    # apply one-hot-encoder on the label
    self.label_encoder = encoders.OneHotEncoder(
        handle_missing='error',
        handle_unknown='error',
        cols=['target'],
        drop_invariant=True,
        use_cat_names=True,
    )
    labels = self.label_encoder.fit_transform(y)
    # strip the 7-character 'target_' prefix so columns carry the bare class name
    labels.columns = [column[7:] for column in labels.columns]
    labels = labels.iloc[:, 1:]  # drop one label

    # train the feature encoders; reset them first so that a refit does not
    # keep encoders for classes seen only in an earlier fit
    self.feature_encoders = {}
    for class_name, label in labels.items():
        self.feature_encoders[class_name] = copy.deepcopy(self.feature_encoder).fit(X, label)

    return self
9397
94- def transform (self , X ):
98+ def transform (self , X , y = None ):
9599 # unite the input into pandas types
96100 X = utils .convert_input (X )
97101
@@ -101,8 +105,14 @@ def transform(self, X):
101105 all_new_features = pd .DataFrame ()
102106
103107 # transform the features
108+ if y is not None :
109+ y = self .label_encoder .transform (pd .DataFrame ({"target" : y }))
104110 for class_name , feature_encoder in self .feature_encoders .items ():
105- encoded = feature_encoder .transform (X )
111+ if y is not None :
112+ y_transform = y [f"target_{ class_name } " ]
113+ else :
114+ y_transform = None
115+ encoded = feature_encoder .transform (X , y_transform )
106116
107117 # decorate the encoded features with the label class suffix
108118 new_features = encoded [feature_encoder .cols ]
@@ -117,42 +127,8 @@ def transform(self, X):
117127 return result
118128
def fit_transform(self, X, y=None, **fit_params):
    """Fit the wrapper on ``X`` and ``y`` and return the encoded ``X``.

    Parameters
    ----------
    X : array-like
        Features to fit on and then encode.
    y : array-like, optional
        Target labels; forwarded to both ``fit`` and ``transform``.
    **fit_params
        Extra keyword arguments forwarded to ``fit``.

    Returns
    -------
    The value returned by ``transform(X, y)``.
    """
    self.fit(X, y, **fit_params)
    # y is handed on so that transform can give each per-class feature
    # encoder its own target column for the training data.
    encoded_training_data = self.transform(X, y)
    return encoded_training_data
156132
157133
158134class NestedCVWrapper (BaseEstimator , TransformerMixin ):
0 commit comments