Skip to content

Commit 343bc24

Browse files
author
florian
committed
Merge remote-tracking branch 'upstream/master' into fix-binary-encoder-for-columntransformer
2 parents b5f32f6 + e6cdd52 commit 343bc24

File tree

11 files changed

+127
-96
lines changed

11 files changed

+127
-96
lines changed

category_encoders/binary.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,16 +71,9 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
7171

7272
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
7373
handle_unknown='value', handle_missing='value'):
74-
self. verbose = verbose
75-
self.cols = cols
76-
self.mapping = mapping
77-
self.drop_invariant = drop_invariant
78-
self.return_df = return_df
79-
self.handle_unknown = handle_unknown
80-
self.handle_missing = handle_missing
81-
self.base_n_encoder = ce.BaseNEncoder(base=2, verbose=self.verbose, cols=self.cols, mapping=self.mapping,
82-
drop_invariant=self.drop_invariant, return_df=self.return_df,
83-
handle_unknown=self.handle_unknown, handle_missing=self.handle_missing)
74+
self.base_n_encoder = ce.BaseNEncoder(base=2, verbose=verbose, cols=cols, mapping=mapping,
75+
drop_invariant=drop_invariant, return_df=return_df,
76+
handle_unknown=handle_unknown, handle_missing=handle_missing)
8477

8578
def fit(self, X, y=None, **kwargs):
8679
"""Fit encoder according to X and y.

category_encoders/ordinal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def fit(self, X, y=None, **kwargs):
144144
self.mapping = categories
145145

146146
X_temp = self.transform(X, override_return_df=True)
147-
self.feature_names = X_temp.columns.values.tolist()
147+
self.feature_names = X_temp.columns.tolist()
148148

149149
# drop all output columns with 0 variance.
150150
if self.drop_invariant:

category_encoders/tests/helpers.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Helper functions that are used exclusively in the tests"""
2+
3+
import numpy as np
4+
import random
5+
import pandas as pd
6+
import math
7+
8+
9+
def verify_numeric(X_test):
10+
"""
11+
Test that all attributes in the DataFrame are numeric.
12+
"""
13+
_NUMERIC_KINDS = set('buifc')
14+
15+
for dt in X_test.dtypes:
16+
assert(dt.kind in _NUMERIC_KINDS)
17+
18+
def create_array(n_rows=1000, extras=False, has_none=True):
19+
"""
20+
Creates a numpy dataset with some categorical variables.
21+
"""
22+
ds = [[
23+
random.random(),
24+
random.random(),
25+
random.choice(['A', 'B', 'C']),
26+
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),
27+
random.choice(['A', 'B', 'C', None, np.nan]) if has_none else random.choice(['A', 'B', 'C']),
28+
random.choice(['A'])
29+
] for _ in range(n_rows)]
30+
31+
return np.array(ds)
32+
33+
34+
def create_dataset(n_rows=1000, extras=False, has_none=True):
35+
"""
36+
Creates a dataset with some categorical variables.
37+
"""
38+
random.seed(2001)
39+
ds = [[
40+
random.random(), # Floats
41+
random.choice([float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi]), # Floats with edge scenarios
42+
row, # Unique integers
43+
str(row), # Unique strings
44+
random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data
45+
random.choice(['A', 'B_b', 'C_c_c']), # Strings with underscores to test reverse_dummies()
46+
random.choice(['A', 'B', 'C', None]) if has_none else random.choice(['A', 'B', 'C']), # None
47+
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), # With a new string value
48+
random.choice([12, 43, -32]), # Number in the column name
49+
random.choice(['A', 'B', 'C']), # What is going to become the categorical column
50+
random.choice(['A', 'B', 'C', np.nan]) # Categorical with missing values
51+
] for row in range(n_rows)]
52+
53+
df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 321, 'categorical', 'categorical_na'])
54+
df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C'])
55+
df['categorical_na'] = pd.Categorical(df['categorical_na'], categories=['A', 'B', 'C'])
56+
return df
57+
58+
59+
def verify_inverse_transform(x, x_inv):
60+
"""
61+
Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
62+
"""
63+
assert x.equals(x_inv)
64+
65+
66+
def deep_round(A, ndigits=5):
67+
"""
68+
Rounds numbers in a list of lists. Useful for approximate equality testing.
69+
"""
70+
return [[round(val, ndigits) for val in sublst] for sublst in A]

category_encoders/tests/test_basen.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,13 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
111111
enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan')
112112
enc.fit(train)
113113
result = enc.transform(test)
114+
115+
message = 'inverse_transform is not supported because transform impute '\
116+
'the unknown category nan when encode city'
114117

115-
with warnings.catch_warnings(record=True) as w:
118+
with self.assertWarns(UserWarning, msg=message) as w:
116119
enc.inverse_transform(result)
117120

118-
self.assertEqual(1, len(w))
119-
self.assertEqual('inverse_transform is not supported because transform impute '
120-
'the unknown category nan when encode city', str(w[0].message))
121-
122121
def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self):
123122
train = pd.DataFrame({'city': ['chicago', np.nan]})
124123
test = pd.DataFrame({'city': ['chicago', 'los angeles']})

category_encoders/tests/test_encoders.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import pandas as pd
88
import sklearn
9-
import category_encoders.tests.test_helpers as th
9+
import category_encoders.tests.helpers as th
1010
from sklearn.utils.estimator_checks import check_transformer_general, check_transformers_unfitted
1111
from sklearn.compose import ColumnTransformer
1212
from unittest2 import TestSuite, TextTestRunner, TestCase # or `from unittest import ...` if on Python 3.4+

category_encoders/tests/test_helpers.py

Lines changed: 29 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,42 @@
1-
"""Helper functions that are used exclusively in the tests"""
2-
31
import numpy as np
4-
import random
52
import pandas as pd
6-
import math
7-
3+
from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+
84

9-
def verify_numeric(X_test):
10-
"""
11-
Test that all attributes in the DataFrame are numeric.
12-
"""
13-
for dt in X_test.dtypes:
14-
numeric = False
15-
if np.issubdtype(dt, np.dtype(int)) or np.issubdtype(dt, np.dtype(float)):
16-
numeric = True
17-
assert numeric
5+
from category_encoders.tests.helpers import verify_numeric
186

197

20-
def create_array(n_rows=1000, extras=False, has_none=True):
21-
"""
22-
Creates a numpy dataset with some categorical variables.
23-
"""
24-
ds = [[
25-
random.random(),
26-
random.random(),
27-
random.choice(['A', 'B', 'C']),
28-
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),
29-
random.choice(['A', 'B', 'C', None, np.nan]) if has_none else random.choice(['A', 'B', 'C']),
30-
random.choice(['A'])
31-
] for _ in range(n_rows)]
8+
class TestHelpers(TestCase):
329

33-
return np.array(ds)
10+
def test_is_numeric_pandas(self):
11+
# Whole numbers, regardless of the byte length, should not raise AssertionError
12+
X = pd.DataFrame(np.ones([5, 5]), dtype='int32')
13+
verify_numeric(pd.DataFrame(X))
3414

15+
X = pd.DataFrame(np.ones([5, 5]), dtype='int64')
16+
verify_numeric(pd.DataFrame(X))
3517

36-
def create_dataset(n_rows=1000, extras=False, has_none=True):
37-
"""
38-
Creates a dataset with some categorical variables.
39-
"""
40-
random.seed(2001)
41-
ds = [[
42-
random.random(), # Floats
43-
random.choice([float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi]), # Floats with edge scenarios
44-
row, # Unique integers
45-
str(row), # Unique strings
46-
random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data
47-
random.choice(['A', 'B_b', 'C_c_c']), # Strings with underscores to test reverse_dummies()
48-
random.choice(['A', 'B', 'C', None]) if has_none else random.choice(['A', 'B', 'C']), # None
49-
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), # With a new string value
50-
random.choice([12, 43, -32]), # Number in the column name
51-
random.choice(['A', 'B', 'C']), # What is going to become the categorical column
52-
] for row in range(n_rows)]
18+
# Strings should raise AssertionError
19+
X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']])
20+
with self.assertRaises(Exception):
21+
verify_numeric(pd.DataFrame(X))
5322

54-
df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 321, 'categorical'])
55-
df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C'])
56-
return df
23+
def test_is_numeric_numpy(self):
24+
# Whole numbers, regardless of the byte length, should not raise AssertionError
25+
X = np.ones([5, 5], dtype='int32')
26+
verify_numeric(pd.DataFrame(X))
5727

28+
X = np.ones([5, 5], dtype='int64')
29+
verify_numeric(pd.DataFrame(X))
5830

59-
def verify_inverse_transform(x, x_inv):
60-
"""
61-
Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
62-
"""
63-
assert x.equals(x_inv)
31+
# Floats
32+
X = np.ones([5, 5], dtype='float32')
33+
verify_numeric(pd.DataFrame(X))
6434

35+
X = np.ones([5, 5], dtype='float64')
36+
verify_numeric(pd.DataFrame(X))
6537

66-
def deep_round(A, ndigits=5):
67-
"""
68-
Rounds numbers in a list of lists. Useful for approximate equality testing.
69-
"""
70-
return [[round(val, ndigits) for val in sublst] for sublst in A]
38+
def test_verify_raises_AssertionError_on_categories(self):
39+
# Categories should raise AssertionError
40+
X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='category')
41+
with self.assertRaises(Exception):
42+
verify_numeric(pd.DataFrame(X))

category_encoders/tests/test_leave_one_out.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pandas as pd
22
from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+
3-
import category_encoders.tests.test_helpers as th
3+
import category_encoders.tests.helpers as th
44
import numpy as np
55

66
import category_encoders as encoders

category_encoders/tests/test_one_hot.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import pandas as pd
2-
from unittest import TestCase # or `from unittest import ...` if on Python 3.4+
2+
from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+
33
import numpy as np
44
import warnings
5-
import category_encoders.tests.test_helpers as th
5+
import category_encoders.tests.helpers as th
66

77
import category_encoders as encoders
88

@@ -227,14 +227,13 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
227227
enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='return_nan')
228228
enc.fit(train)
229229
result = enc.transform(test)
230+
231+
message = 'inverse_transform is not supported because transform impute '\
232+
'the unknown category nan when encode city'
230233

231-
with warnings.catch_warnings(record=True) as w:
234+
with self.assertWarns(UserWarning, msg=message) as w:
232235
enc.inverse_transform(result)
233236

234-
self.assertEqual(1, len(w))
235-
self.assertEqual('inverse_transform is not supported because transform impute '
236-
'the unknown category nan when encode city', str(w[0].message))
237-
238237
def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self):
239238
train = pd.DataFrame({'city': ['chicago', np.nan]})
240239
test = pd.DataFrame({'city': ['chicago', 'los angeles']})

category_encoders/tests/test_ordinal.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pandas as pd
22
from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+
3-
import category_encoders.tests.test_helpers as th
3+
import category_encoders.tests.helpers as th
44
import numpy as np
55
import warnings
66
import category_encoders as encoders
@@ -35,7 +35,7 @@ def test_ordinal(self):
3535
self.assertIn(-1, set(out['extra'].values))
3636
self.assertTrue(len(enc.mapping) > 0)
3737

38-
enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return _nan')
38+
enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return_nan')
3939
enc.fit(X)
4040
out = enc.transform(X_t)
4141
out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)]
@@ -137,13 +137,12 @@ def test_inverse_transform_HaveUnknown_ExpectWarning(self):
137137
enc.fit(train)
138138
result = enc.transform(test)
139139

140-
with warnings.catch_warnings(record=True) as w:
140+
message = 'inverse_transform is not supported because transform impute '\
141+
'the unknown category -1 when encode city'
142+
143+
with self.assertWarns(UserWarning, msg=message) as w:
141144
enc.inverse_transform(result)
142145

143-
self.assertEqual(1, len(w))
144-
self.assertEqual('inverse_transform is not supported because transform impute '
145-
'the unknown category -1 when encode city', str(w[0].message))
146-
147146
def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self):
148147
train = pd.DataFrame({'city': ['chicago', np.nan]})
149148

@@ -169,14 +168,13 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
169168
enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='return_nan')
170169
enc.fit(train)
171170
result = enc.transform(test)
171+
172+
message = 'inverse_transform is not supported because transform impute '\
173+
'the unknown category nan when encode city'
172174

173-
with warnings.catch_warnings(record=True) as w:
175+
with self.assertWarns(UserWarning, msg=message) as w:
174176
enc.inverse_transform(result)
175177

176-
self.assertEqual(1, len(w))
177-
self.assertEqual('inverse_transform is not supported because transform impute '
178-
'the unknown category nan when encode city', str(w[0].message))
179-
180178
def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self):
181179
train = pd.DataFrame({'city': ['chicago', np.nan]})
182180
test = pd.DataFrame({'city': ['chicago', 'los angeles']})

category_encoders/tests/test_target_encoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pandas as pd
22
from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+
3-
import category_encoders.tests.test_helpers as th
3+
import category_encoders.tests.helpers as th
44
import numpy as np
55

66
import category_encoders as encoders

0 commit comments

Comments
 (0)