@@ -54,6 +54,34 @@ def test_defo_params_plus_automatically_find_variables(df_enc_big):
5454 pd .testing .assert_frame_equal (X , df )
5555
5656
def test_when_varnames_are_numbers(df_enc_big):
    """Fit and transform a frame whose column labels are the integers 1, 2, 3.

    Checks the learned ``variables_`` and ``encoder_dict_`` and the
    transformed frame when the variable names are numbers, not strings.
    """
    # Same data as the fixture, but with integer column labels.
    numbered = df_enc_big.copy()
    numbered.columns = [1, 2, 3]

    encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )
    transformed = encoder.fit_transform(numbered)

    # Every expected column ends with the same 20 labels.
    shared_tail = ["D"] * 10 + ["Rare"] * 4 + ["G"] * 6
    expected = pd.DataFrame(
        {
            1: ["A"] * 6 + ["B"] * 10 + ["C"] * 4 + shared_tail,
            2: ["A"] * 10 + ["B"] * 6 + ["C"] * 4 + shared_tail,
            3: ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + shared_tail,
        }
    )

    # Categories the encoder is expected to keep for each variable.
    expected_frequent = {
        1: ["B", "D", "A", "G", "C"],
        2: ["A", "D", "B", "G", "C"],
        3: ["C", "D", "B", "G", "A"],
    }

    assert encoder.variables_ == [1, 2, 3]
    assert encoder.encoder_dict_ == expected_frequent
    pd.testing.assert_frame_equal(transformed, expected)
83+
84+
5785def test_correctly_ignores_nan_in_transform (df_enc_big ):
5886 encoder = RareLabelEncoder (
5987 tol = 0.06 ,
@@ -102,7 +130,7 @@ def test_correctly_ignores_nan_in_fit(df_enc_big):
102130 n_categories = 3 ,
103131 missing_values = "ignore" ,
104132 )
105- X = encoder .fit_transform (df )
133+ encoder .fit (df )
106134
107135 # expected:
108136 frequenc_cat = {
@@ -134,6 +162,90 @@ def test_correctly_ignores_nan_in_fit(df_enc_big):
134162 pd .testing .assert_frame_equal (X , tt )
135163
136164
def test_correctly_ignores_nan_in_fit_when_var_is_numerical(df_enc_big):
    """Fit with NaN present in a numerical variable and ``ignore_format=True``.

    ``var_C`` is overwritten with numbers followed by missing values; the
    encoder is created with ``missing_values="ignore"``. The test checks the
    learned ``encoder_dict_`` and the output of ``transform`` on a small
    frame that contains NaN and values absent from ``encoder_dict_``.
    """
    train = df_enc_big.copy()
    # 40 entries: 34 numbers (1..6 with varying counts), then 6 NaN.
    numeric_part = [1] * 4 + [2] * 6 + [3] * 10 + [4] * 10 + [5] * 2 + [6] * 2
    train["var_C"] = numeric_part + [np.nan] * 6

    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=3,
        missing_values="ignore",
        ignore_format=True,
    )
    encoder.fit(train)

    # Categories expected to be retained per variable.
    expected_frequent = {
        "var_A": ["B", "D", "A", "G", "C"],
        "var_B": ["A", "D", "B", "G", "C"],
        "var_C": [3, 4, 2, 1],
    }
    assert encoder.encoder_dict_ == expected_frequent

    # Frame to transform.
    to_encode = pd.DataFrame(
        {
            "var_A": ["A", np.nan, "J", "G"],
            "var_B": ["A", np.nan, "J", "G"],
            "var_C": [3, np.nan, 9, 10],
        }
    )
    encoded = encoder.transform(to_encode)

    # Expected result: NaN stays NaN, values not retained become "Rare".
    expected = pd.DataFrame(
        {
            "var_A": ["A", np.nan, "Rare", "G"],
            "var_B": ["A", np.nan, "Rare", "G"],
            "var_C": [3.0, np.nan, "Rare", "Rare"],
        }
    )

    pd.testing.assert_frame_equal(encoded, expected, check_dtype=False)
247+
248+
137249def test_user_provides_grouping_label_name_and_variable_list (df_enc_big ):
138250 # test case 2: user provides alternative grouping value and variable list
139251 encoder = RareLabelEncoder (
@@ -316,12 +428,84 @@ def test_variables_cast_as_category(df_enc_big):
316428 + ["G" ] * 6 ,
317429 }
318430 df = pd .DataFrame (df )
431+ df ["var_B" ] = pd .Categorical (df ["var_B" ])
319432
320433 # test fit attr
321434 assert encoder .variables_ == ["var_A" , "var_B" , "var_C" ]
322435 assert encoder .n_features_in_ == 3
323436 # test transform output
324- pd .testing .assert_frame_equal (X , df )
437+ pd .testing .assert_frame_equal (X , df , check_categorical = False )
438+
439+
def test_variables_cast_as_category_with_na_in_transform(df_enc_big):
    """Transform a frame containing NaN when ``var_B`` is of category dtype.

    The encoder is fitted on a copy of the fixture with ``var_B`` cast to
    category, then applied to a small frame whose ``var_B`` is categorical
    and which holds NaN and the label "J" in every column.
    """
    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=5,
        variables=None,
        replace_with="Rare",
        missing_values="ignore",
    )

    train = df_enc_big.copy()
    train["var_B"] = train["var_B"].astype("category")
    encoder.fit(train)

    columns = ("var_A", "var_B", "var_C")

    # Frame to transform: all three columns hold the same four values.
    to_encode = pd.DataFrame({name: ["A", np.nan, "J", "G"] for name in columns})
    to_encode["var_B"] = pd.Categorical(to_encode["var_B"])

    # Expected result: NaN is kept and "J" is replaced with "Rare".
    expected = pd.DataFrame(
        {name: ["A", np.nan, "Rare", "G"] for name in columns}
    )
    expected["var_B"] = pd.Categorical(expected["var_B"])

    encoded = encoder.transform(to_encode)
    pd.testing.assert_frame_equal(encoded, expected, check_categorical=False)
473+
474+
def test_variables_cast_as_category_with_na_in_fit(df_enc_big):
    """Fit with NaN present in a variable of category dtype.

    Every "G" in ``var_C`` is replaced with NaN before the column is cast to
    category, so "G" is not among the values seen for ``var_C`` during fit.
    The transformed frame is then compared against the expected labels.
    """
    train = df_enc_big.copy()
    was_g = train["var_C"] == "G"
    train.loc[was_g, "var_C"] = np.nan
    train["var_C"] = train["var_C"].astype("category")

    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=3,
        missing_values="ignore",
    )
    encoder.fit(train)

    # Frame to transform; only var_C is categorical.
    to_encode = pd.DataFrame(
        {
            "var_A": ["A", np.nan, "J", "G"],
            "var_B": ["A", np.nan, "J", "G"],
            "var_C": ["C", np.nan, "J", "G"],
        }
    )
    to_encode["var_C"] = pd.Categorical(to_encode["var_C"])

    # Expected result: in var_C both "J" and "G" become "Rare".
    expected = pd.DataFrame(
        {
            "var_A": ["A", np.nan, "Rare", "G"],
            "var_B": ["A", np.nan, "Rare", "G"],
            "var_C": ["C", np.nan, "Rare", "Rare"],
        }
    )
    expected["var_C"] = pd.Categorical(expected["var_C"])

    encoded = encoder.transform(to_encode)
    pd.testing.assert_frame_equal(encoded, expected, check_categorical=False)
325509
326510
327511def test_inverse_transform_raises_not_implemented_error (df_enc_big ):
0 commit comments