@@ -439,20 +439,14 @@ def test_duplicate_index_value(self):
439439 self .assertEqual (5 , len (result ))
440440
441441 def test_string_index (self ):
442- # https://github.com/scikit-learn-contrib/categorical-encoding/issues/131
443-
444- bunch = sklearn .datasets .fetch_openml (name = "house_prices" , as_frame = True )
445- y = (bunch .target > 200000 ).values
446- X = pd .DataFrame (bunch .data , columns = bunch .feature_names )
447- X .index = X .index .values .astype (str )
448-
449- display_cols = ["Id" , "MSSubClass" , "MSZoning" , "YearBuilt" , "Heating" , "CentralAir" ]
450- X = X [display_cols ]
442+ train = pd .DataFrame ({'city' : ['chicago' , 'denver' ]})
443+ target = [0 , 1 ]
444+ train .index = train .index .values .astype (str )
451445
452446 for encoder_name in encoders .__all__ :
453447 with self .subTest (encoder_name = encoder_name ):
454- enc = getattr (encoders , encoder_name )(cols = [ 'CentralAir' , 'Heating' ] )
455- result = enc .fit_transform (X , y )
448+ enc = getattr (encoders , encoder_name )()
449+ result = enc .fit_transform (train , target )
456450 self .assertFalse (result .isnull ().values .any (), 'There should not be any missing value!' )
457451
458452 def test_get_feature_names_out (self ):
@@ -609,8 +603,7 @@ def test_metamorphic(self):
609603 x3 = pd .DataFrame (data = {'x' : ['A' , 'B' , 'B' ]}) # DataFrame
610604 x4 = pd .Series (['A' , 'B' , 'B' ], dtype = 'category' ) # Series with category data type
611605 x5 = np .array (['A' , 'B' , 'B' ]) # Numpy
612- x6 = [np .NaN , 'B' , 'B' ] # Missing value
613- x7 = ['Z' , 'Y' , 'Y' ] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)
606+ x6 = ['Z' , 'Y' , 'Y' ] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)
614607
615608 y = [1 , 1 , 0 ]
616609
@@ -636,19 +629,13 @@ def test_metamorphic(self):
636629 result5 = enc5 .fit_transform (x5 , y )
637630 self .assertTrue ((result1 .values == result5 .values ).all ())
638631
639- # gray encoder and rankhot re-orders inputs so that nan is last, hence the output is changed
632+ # gray encoder actually does re-order inputs
633+ # rankhot encoder respects order, in this example the order is switched
640634 if encoder_name not in ["GrayEncoder" , "RankHotEncoder" ]:
641635 enc6 = getattr (encoders , encoder_name )()
642636 result6 = enc6 .fit_transform (x6 , y )
643637 self .assertTrue ((result1 .values == result6 .values ).all ())
644638
645- # gray encoder actually does re-order inputs
646- # rankhot encoder respects order, in this example the order is switched
647- if encoder_name not in ["GrayEncoder" , "RankHotEncoder" ]:
648- enc7 = getattr (encoders , encoder_name )()
649- result7 = enc7 .fit_transform (x7 , y )
650- self .assertTrue ((result1 .values == result7 .values ).all ())
651-
652639 # Arguments
653640 enc9 = getattr (encoders , encoder_name )(return_df = False )
654641 result9 = enc9 .fit_transform (x1 , y )
0 commit comments