@@ -40,7 +40,13 @@ def test_encode_categories_in_k_binary_plus_select_vars_automatically(df_enc_big
4040 }
4141
4242 assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
43+ assert encoder .variables_binary_ == []
4344 assert encoder .n_features_in_ == 3
45+ assert encoder .encoder_dict_ == {
46+ "var_A" : ["A" , "B" , "C" , "D" , "E" , "F" , "G" ],
47+ "var_B" : ["A" , "B" , "C" , "D" , "E" , "F" , "G" ],
48+ "var_C" : ["A" , "B" , "C" , "D" , "E" , "F" , "G" ],
49+ }
4450 # test transform output
4551 assert X .sum ().to_dict () == transf
4652 assert "var_A" not in X .columns
@@ -75,7 +81,12 @@ def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big
7581 }
7682
7783 assert encoder .variables_ == ["var_A" , "var_B" ]
84+ assert encoder .variables_binary_ == []
7885 assert encoder .n_features_in_ == 3
86+ assert encoder .encoder_dict_ == {
87+ "var_A" : ["A" , "B" , "C" , "D" , "E" , "F" ],
88+ "var_B" : ["A" , "B" , "C" , "D" , "E" , "F" ],
89+ }
7990 # test transform output
8091 for col in transf .keys ():
8192 assert X [col ].sum () == transf [col ]
@@ -84,30 +95,65 @@ def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big
8495 assert "var_C" in X .columns
8596
8697
87- def test_encode_top_categories (df_enc_big ):
98+ def test_encode_top_categories ():
8899 # test case 3: encode only the most popular categories
100+
101+ df = pd .DataFrame (
102+ {
103+ "var_A" : ["A" ] * 5
104+ + ["B" ] * 11
105+ + ["C" ] * 4
106+ + ["D" ] * 9
107+ + ["E" ] * 2
108+ + ["F" ] * 2
109+ + ["G" ] * 7 ,
110+ "var_B" : ["A" ] * 11
111+ + ["B" ] * 7
112+ + ["C" ] * 4
113+ + ["D" ] * 9
114+ + ["E" ] * 2
115+ + ["F" ] * 2
116+ + ["G" ] * 5 ,
117+ "var_C" : ["A" ] * 4
118+ + ["B" ] * 5
119+ + ["C" ] * 11
120+ + ["D" ] * 9
121+ + ["E" ] * 2
122+ + ["F" ] * 2
123+ + ["G" ] * 7 ,
124+ }
125+ )
126+
89127 encoder = OneHotEncoder (top_categories = 4 , variables = None , drop_last = False )
90- X = encoder .fit_transform (df_enc_big )
128+ X = encoder .fit_transform (df )
91129
92130 # test init params
93131 assert encoder .top_categories == 4
94132 # test fit attr
95133 transf = {
96- "var_A_D" : 10 ,
97- "var_A_B" : 10 ,
98- "var_A_A" : 6 ,
99- "var_A_G" : 6 ,
100- "var_B_A" : 10 ,
101- "var_B_D" : 10 ,
102- "var_B_G" : 6 ,
103- "var_B_B" : 6 ,
104- "var_C_D" : 10 ,
105- "var_C_C" : 10 ,
106- "var_C_G" : 6 ,
107- "var_C_B" : 6 ,
134+ "var_A_D" : 9 ,
135+ "var_A_B" : 11 ,
136+ "var_A_A" : 5 ,
137+ "var_A_G" : 7 ,
138+ "var_B_A" : 11 ,
139+ "var_B_D" : 9 ,
140+ "var_B_G" : 5 ,
141+ "var_B_B" : 7 ,
142+ "var_C_D" : 9 ,
143+ "var_C_C" : 11 ,
144+ "var_C_G" : 7 ,
145+ "var_C_B" : 5 ,
108146 }
109147
148+ # test fit attr
149+ assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
150+ assert encoder .variables_binary_ == []
110151 assert encoder .n_features_in_ == 3
152+ assert encoder .encoder_dict_ == {
153+ "var_A" : ["B" , "D" , "G" , "A" ],
154+ "var_B" : ["A" , "D" , "B" , "G" ],
155+ "var_C" : ["C" , "D" , "G" , "B" ],
156+ }
111157 # test transform output
112158 for col in transf .keys ():
113159 assert X [col ].sum () == transf [col ]
@@ -125,14 +171,12 @@ def test_error_if_drop_last_not_bool():
125171 OneHotEncoder (drop_last = 0.5 )
126172
127173
128- def test_fit_raises_error_if_df_contains_na ( df_enc_big_na ):
174+ def test_raises_error_if_df_contains_na ( df_enc_big , df_enc_big_na ):
129175 # test case 4: when dataset contains na, fit method
130176 with pytest .raises (ValueError ):
131177 encoder = OneHotEncoder ()
132178 encoder .fit (df_enc_big_na )
133179
134-
135- def test_transform_raises_error_if_df_contains_na (df_enc_big , df_enc_big_na ):
136180 # test case 4: when dataset contains na, transform method
137181 with pytest .raises (ValueError ):
138182 encoder = OneHotEncoder ()
@@ -164,7 +208,9 @@ def test_encode_numerical_variables(df_enc_numeric):
164208 X = pd .DataFrame (X ).astype ("int32" )
165209
166210 assert encoder .variables_ == ["var_A" , "var_B" ]
211+ assert encoder .variables_binary_ == []
167212 assert encoder .n_features_in_ == 2
213+ assert encoder .encoder_dict_ == {"var_A" : [1 , 2 , 3 ], "var_B" : [1 , 2 , 3 ]}
168214 # test transform output
169215 pd .testing .assert_frame_equal (X , transf )
170216
@@ -197,6 +243,7 @@ def test_variables_cast_as_category(df_enc_numeric):
197243
198244 assert encoder .variables_ == ["var_A" , "var_B" ]
199245 assert encoder .n_features_in_ == 2
246+ assert encoder .encoder_dict_ == {"var_A" : [1 , 2 , 3 ], "var_B" : [1 , 2 , 3 ]}
200247 # test transform output
201248 pd .testing .assert_frame_equal (X , transf )
202249
@@ -206,89 +253,110 @@ def df_enc_binary():
206253 df = {
207254 "var_A" : ["A" ] * 6 + ["B" ] * 10 + ["C" ] * 4 ,
208255 "var_B" : ["A" ] * 10 + ["B" ] * 6 + ["C" ] * 4 ,
209- "var_C" : ["A" ] * 10 + ["B" ] * 10 ,
210- "target" : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
256+ "var_C" : ["AHA" ] * 12 + ["UHU" ] * 8 ,
257+ "var_D" : ["OHO" ] * 5 + ["EHE" ] * 15 ,
258+ "var_num" : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
211259 }
212260 df = pd .DataFrame (df )
213261
214262 return df
215263
216264
217- def test_encode_into_k_binary_plus_drop_binary (df_enc_binary ):
265+ def test_encode_into_k_dummy_plus_drop_binary (df_enc_binary ):
218266 encoder = OneHotEncoder (
219267 top_categories = None , variables = None , drop_last = False , drop_last_binary = True
220268 )
221269 X = encoder .fit_transform (df_enc_binary )
270+ X = X .astype ("int32" )
222271
223272 # test fit attr
224273 transf = {
225- "target " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
274+ "var_num " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
226275 "var_A_A" : [1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
227276 "var_A_B" : [0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
228277 "var_A_C" : [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ],
229278 "var_B_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
230279 "var_B_B" : [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
231280 "var_B_C" : [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ],
232- "var_C_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
281+ "var_C_AHA" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
282+ "var_D_OHO" : [1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
233283 }
234-
235284 transf = pd .DataFrame (transf ).astype ("int32" )
236- X = pd .DataFrame (X ).astype ("int32" )
237285
238- assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
239- assert encoder .n_features_in_ == 4
286+ assert encoder .variables_ == ["var_A" , "var_B" , "var_C" , "var_D" ]
287+ assert encoder .variables_binary_ == ["var_C" , "var_D" ]
288+ assert encoder .n_features_in_ == 5
289+ assert encoder .encoder_dict_ == {
290+ "var_A" : ["A" , "B" , "C" ],
291+ "var_B" : ["A" , "B" , "C" ],
292+ "var_C" : ["AHA" ],
293+ "var_D" : ["OHO" ],
294+ }
240295 # test transform output
241296 pd .testing .assert_frame_equal (X , transf )
242297 assert "var_C_B" not in X .columns
243298
244299
245- def test_encode_into_kminus1_binary_plus_drop_binary (df_enc_binary ):
300+ def test_encode_into_kminus1_dummy_plus_drop_binary (df_enc_binary ):
246301 encoder = OneHotEncoder (
247302 top_categories = None , variables = None , drop_last = True , drop_last_binary = True
248303 )
249304 X = encoder .fit_transform (df_enc_binary )
305+ X = X .astype ("int32" )
250306
251307 # test fit attr
252308 transf = {
253- "target " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
309+ "var_num " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
254310 "var_A_A" : [1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
255311 "var_A_B" : [0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
256312 "var_B_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
257313 "var_B_B" : [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
258- "var_C_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
314+ "var_C_AHA" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
315+ "var_D_OHO" : [1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
259316 }
260-
261317 transf = pd .DataFrame (transf ).astype ("int32" )
262- X = pd .DataFrame (X ).astype ("int32" )
263318
264- assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
265- assert encoder .n_features_in_ == 4
319+ assert encoder .variables_ == ["var_A" , "var_B" , "var_C" , "var_D" ]
320+ assert encoder .variables_binary_ == ["var_C" , "var_D" ]
321+ assert encoder .n_features_in_ == 5
322+ assert encoder .encoder_dict_ == {
323+ "var_A" : ["A" , "B" ],
324+ "var_B" : ["A" , "B" ],
325+ "var_C" : ["AHA" ],
326+ "var_D" : ["OHO" ],
327+ }
266328 # test transform output
267329 pd .testing .assert_frame_equal (X , transf )
268330 assert "var_C_B" not in X .columns
269331
270332
271333def test_encode_into_top_categories_plus_drop_binary (df_enc_binary ):
272-
273334 # top_categories = 1
274335 encoder = OneHotEncoder (
275336 top_categories = 1 , variables = None , drop_last = False , drop_last_binary = True
276337 )
277338 X = encoder .fit_transform (df_enc_binary )
339+ X = X .astype ("int32" )
278340
279341 # test fit attr
280342 transf = {
281- "target " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
343+ "var_num " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
282344 "var_A_B" : [0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
283345 "var_B_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
284- "var_C_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
346+ "var_C_AHA" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
347+ "var_D_OHO" : [1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
285348 }
286-
287349 transf = pd .DataFrame (transf ).astype ("int32" )
288- X = pd .DataFrame (X ).astype ("int32" )
289350
290- assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
291- assert encoder .n_features_in_ == 4
351+ assert encoder .variables_ == ["var_A" , "var_B" , "var_C" , "var_D" ]
352+ assert encoder .variables_binary_ == ["var_C" , "var_D" ]
353+ assert encoder .n_features_in_ == 5
354+ assert encoder .encoder_dict_ == {
355+ "var_A" : ["B" ],
356+ "var_B" : ["A" ],
357+ "var_C" : ["AHA" ],
358+ "var_D" : ["OHO" ],
359+ }
292360 # test transform output
293361 pd .testing .assert_frame_equal (X , transf )
294362 assert "var_C_B" not in X .columns
@@ -298,22 +366,29 @@ def test_encode_into_top_categories_plus_drop_binary(df_enc_binary):
298366 top_categories = 2 , variables = None , drop_last = False , drop_last_binary = True
299367 )
300368 X = encoder .fit_transform (df_enc_binary )
369+ X = X .astype ("int32" )
301370
302371 # test fit attr
303372 transf = {
304- "target " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
373+ "var_num " : [1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 ],
305374 "var_A_B" : [0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
306375 "var_A_A" : [1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
307376 "var_B_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
308377 "var_B_B" : [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ],
309- "var_C_A" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
378+ "var_C_AHA" : [1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
379+ "var_D_OHO" : [1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
310380 }
311-
312381 transf = pd .DataFrame (transf ).astype ("int32" )
313- X = pd .DataFrame (X ).astype ("int32" )
314382
315- assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
316- assert encoder .n_features_in_ == 4
383+ assert encoder .variables_ == ["var_A" , "var_B" , "var_C" , "var_D" ]
384+ assert encoder .variables_binary_ == ["var_C" , "var_D" ]
385+ assert encoder .n_features_in_ == 5
386+ assert encoder .encoder_dict_ == {
387+ "var_A" : ["B" , "A" ],
388+ "var_B" : ["A" , "B" ],
389+ "var_C" : ["AHA" ],
390+ "var_D" : ["OHO" ],
391+ }
317392 # test transform output
318393 pd .testing .assert_frame_equal (X , transf )
319394 assert "var_C_B" not in X .columns
0 commit comments