55import numpy as np
66import pandas as pd
77
8- a_encoding = [1 , 1 , 0 ]
9- b_encoding = [1 , 0 , 1 ]
10- c_encoding = [1 , - 1 , - 1 ]
8+ a_encoding = [1 , 0 ]
9+ b_encoding = [0 , 1 ]
10+ c_encoding = [- 1 , - 1 ]
1111
1212
1313class TestSumEncoder (TestCase ):
@@ -20,14 +20,14 @@ def test_unknown_and_missing(self):
2020 encoder = encoders .SumEncoder (handle_unknown = 'value' , handle_missing = 'value' )
2121 encoder .fit (train )
2222 dim_1_test = ['A' , 'D' , 'E' ]
23- dim_1_expected = [a_encoding , [1 , 0 , 0 ], [1 , 0 , 0 ]]
23+ dim_1_expected = [a_encoding , [0 , 0 ], [0 , 0 ]]
2424 dim_2_test = ['B' , 'D' , 'E' ]
25- dim_2_expected = [b_encoding , [1 , 0 , 0 ], [1 , 0 , 0 ]]
25+ dim_2_expected = [b_encoding , [0 , 0 ], [0 , 0 ]]
2626 dim_3_test = ['A' , 'B' , 'C' , None ]
27- dim_3_expected = [a_encoding , b_encoding , c_encoding , [1 , 0 , 0 ]]
27+ dim_3_expected = [a_encoding , b_encoding , c_encoding , [0 , 0 ]]
2828
2929 dim_4_test = ['D' , 'B' , 'C' , None ]
30- dim_4_expected = [[1 , 0 , 0 ], b_encoding , c_encoding , [1 , 0 , 0 ]]
30+ dim_4_expected = [[0 , 0 ], b_encoding , c_encoding , [0 , 0 ]]
3131 cases = {"should preserve dimension 1" : (dim_1_test , dim_1_expected ),
3232 "should preserve dimension 2" : (dim_2_test , dim_2_expected ),
3333 "should preserve dimension 3" : (dim_3_test , dim_3_expected ),
@@ -47,9 +47,9 @@ def test_sum_encoder_2cols(self):
4747 obtained = encoder .transform (train )
4848
4949 expected = [
50- [ 1 , a_encoding [ 1 ], a_encoding [ 2 ], a_encoding [ 1 ], a_encoding [ 2 ]] ,
51- [ 1 , b_encoding [ 1 ], b_encoding [ 2 ], b_encoding [ 1 ], b_encoding [ 2 ]] ,
52- [ 1 , c_encoding [ 1 ], c_encoding [ 2 ], c_encoding [ 1 ], c_encoding [ 2 ]] ,
50+ a_encoding * 2 ,
51+ b_encoding * 2 ,
52+ c_encoding * 2 ,
5353 ]
5454 self .assertEqual (obtained .to_numpy ().tolist (), expected )
5555
@@ -65,7 +65,6 @@ def test_multiple_columns_correct_order(self):
6565 columns = ['col1' , 'col2' , 'col3' , 'col4' ],
6666 )
6767 expected_columns = [
68- 'intercept' ,
6968 'col1' ,
7069 'col2_0' ,
7170 'col2_1' ,
@@ -108,9 +107,8 @@ def test_handle_missing_is_indicator(self):
108107 expected = [a_encoding , b_encoding , c_encoding ]
109108 self .assertEqual (result .to_numpy ().tolist (), expected )
110109
111- # unknown value should be encoded with value strategy,
112- # i.e. indicator 1 and all other columns zeros
110+ # unknown value should be encoded with value strategy, i.e. zeros for all columns
113111 test = ['A' , 'B' , 'C' ]
114112 result = encoder .transform (test )
115- expected = [a_encoding , b_encoding , [1 , 0 , 0 ]]
113+ expected = [a_encoding , b_encoding , [0 , 0 ]]
116114 self .assertEqual (result .to_numpy ().tolist (), expected )
0 commit comments