Skip to content

Commit 5eb7a2d

Browse files
Merge pull request #397 from PaulWestenthanner/docs/gray_rankhot
Docs/gray rankhot
2 parents f669491 + f18dc09 commit 5eb7a2d

File tree

10 files changed

+61
-34
lines changed

10 files changed

+61
-34
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
unreleased
22
==========
3+
4+
* added: ignore option for one-hot-encoding
5+
* fixed: external dependency in unit test
6+
* fixed: gaps in ordinal encoding if nan values are present
7+
* fixed: sklearn compliance: add `feature_names_in_` attribute
8+
* fixed: add RankHotEncoder to the documentation
39
* fixed: return correct mapping in one hot encoder `category_mapping` property (issue #256)
410

511
v2.6.0

category_encoders/ordinal.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,12 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
220220
mapping_out = []
221221
for col in cols:
222222
nan_identity = np.nan
223-
categories = list(X[col].unique())
223+
categories = X[col].unique()
224+
# make nan last category
225+
if pd.isna(categories).any():
226+
categories = [c for c in categories if not pd.isna(c)] + [nan_identity]
227+
else:
228+
categories = categories.tolist()
224229
if util.is_category(X[col].dtype):
225230
# Avoid using pandas category dtype meta-data if possible, see #235, #238.
226231
if X[col].dtype.ordered:

category_encoders/rankhot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def generate_mapping(self):
226226
index = []
227227
new_columns = []
228228

229-
for cat_name, class_ in values.iteritems():
229+
for cat_name, class_ in values.items():
230230
if self.use_cat_names:
231231
n_col_name = f"{col}_{cat_name}"
232232
found_count = found_column_counts.get(n_col_name, 0)

category_encoders/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ def convert_inputs(X, y, columns=None, index=None, deep=False):
8080
# N.B.: If either was already pandas, it keeps its index.
8181

8282
if any(X.index != y.index):
83-
raise ValueError("`X` and `y` both have indexes, but they do not match.")
83+
msg = "`X` and `y` both have indexes, but they do not match. If you are shuffling your input data on " \
84+
"purpose (e.g. via permutation_test_score) use np arrays instead of data frames / series"
85+
raise ValueError(msg)
8486
if X.shape[0] != y.shape[0]:
8587
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
8688
return X, y

docs/source/gray.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Gray
22
====
33

4-
.. autoclass:: category_encoders.gray.GaryEncoder
4+
.. autoclass:: category_encoders.gray.GrayEncoder
55
:members:
66
:inherited-members:

docs/source/index.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@ To use:
5252
encoder = ce.MEstimateEncoder(cols=[...])
5353
encoder = ce.OneHotEncoder(cols=[...])
5454
encoder = ce.OrdinalEncoder(cols=[...])
55-
encoder = ce.SumEncoder(cols=[...])
5655
encoder = ce.PolynomialEncoder(cols=[...])
56+
encoder = ce.QuantileEncoder(cols=[...])
57+
encoder = ce.RankHotEncoder(cols=[...])
58+
encoder = ce.SumEncoder(cols=[...])
5759
encoder = ce.TargetEncoder(cols=[...])
5860
encoder = ce.WOEEncoder(cols=[...])
59-
encoder = ce.QuantileEncoder(cols=[...])
6061
6162
encoder.fit(X, y)
6263
X_cleaned = encoder.transform(X_dirty)
@@ -85,6 +86,7 @@ Contents:
8586
ordinal
8687
polynomial
8788
quantile
89+
rankhot
8890
sum
8991
summary
9092
targetencoder

docs/source/rankhot.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
RankHotEncoder
2+
==============
3+
4+
.. autoclass:: category_encoders.rankhot.RankHotEncoder
5+
:members:
6+
:inherited-members:

tests/test_encoders.py

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -439,20 +439,14 @@ def test_duplicate_index_value(self):
439439
self.assertEqual(5, len(result))
440440

441441
def test_string_index(self):
442-
# https://github.com/scikit-learn-contrib/categorical-encoding/issues/131
443-
444-
bunch = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)
445-
y = (bunch.target > 200000).values
446-
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
447-
X.index = X.index.values.astype(str)
448-
449-
display_cols = ["Id", "MSSubClass", "MSZoning", "YearBuilt", "Heating", "CentralAir"]
450-
X = X[display_cols]
442+
train = pd.DataFrame({'city': ['chicago', 'denver']})
443+
target = [0, 1]
444+
train.index = train.index.values.astype(str)
451445

452446
for encoder_name in encoders.__all__:
453447
with self.subTest(encoder_name=encoder_name):
454-
enc = getattr(encoders, encoder_name)(cols=['CentralAir', 'Heating'])
455-
result = enc.fit_transform(X, y)
448+
enc = getattr(encoders, encoder_name)()
449+
result = enc.fit_transform(train, target)
456450
self.assertFalse(result.isnull().values.any(), 'There should not be any missing value!')
457451

458452
def test_get_feature_names_out(self):
@@ -609,8 +603,7 @@ def test_metamorphic(self):
609603
x3 = pd.DataFrame(data={'x': ['A', 'B', 'B']}) # DataFrame
610604
x4 = pd.Series(['A', 'B', 'B'], dtype='category') # Series with category data type
611605
x5 = np.array(['A', 'B', 'B']) # Numpy
612-
x6 = [np.NaN, 'B', 'B'] # Missing value
613-
x7 = ['Z', 'Y', 'Y'] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)
606+
x6 = ['Z', 'Y', 'Y'] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)
614607

615608
y = [1, 1, 0]
616609

@@ -636,19 +629,13 @@ def test_metamorphic(self):
636629
result5 = enc5.fit_transform(x5, y)
637630
self.assertTrue((result1.values == result5.values).all())
638631

639-
# gray encoder and rankhot re-orders inputs so that nan is last, hence the output is changed
632+
# gray encoder actually does re-order inputs
633+
# rankhot encoder respects order, in this example the order is switched
640634
if encoder_name not in ["GrayEncoder", "RankHotEncoder"]:
641635
enc6 = getattr(encoders, encoder_name)()
642636
result6 = enc6.fit_transform(x6, y)
643637
self.assertTrue((result1.values == result6.values).all())
644638

645-
# gray encoder actually does re-order inputs
646-
# rankhot encoder respects order, in this example the order is switched
647-
if encoder_name not in ["GrayEncoder", "RankHotEncoder"]:
648-
enc7 = getattr(encoders, encoder_name)()
649-
result7 = enc7.fit_transform(x7, y)
650-
self.assertTrue((result1.values == result7.values).all())
651-
652639
# Arguments
653640
enc9 = getattr(encoders, encoder_name)(return_df=False)
654641
result9 = enc9.fit_transform(x1, y)

tests/test_one_hot.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -159,31 +159,38 @@ def test_HandleMissingReturnNan(self):
159159
result,
160160
pd.DataFrame({'x_A': [1, np.nan, 0], 'x_B': [0, np.nan, 1]}),
161161
)
162-
162+
163163
def test_HandleMissingIgnore(self):
164164
train = pd.DataFrame({'x': ['A', 'B', np.nan],
165165
'y': ['A', None, 'A'],
166166
'z': [np.NaN, 'B', 'B']})
167167
train['z'] = train['z'].astype('category')
168-
168+
169169
expected_result = pd.DataFrame({'x_A': [1, 0, 0],
170170
'x_B': [0, 1, 0],
171171
'y_A': [1, 0, 1],
172172
'z_B': [0, 1, 1]})
173173
encoder = encoders.OneHotEncoder(handle_missing='ignore', use_cat_names=True)
174174
result = encoder.fit_transform(train)
175-
175+
176176
pd.testing.assert_frame_equal(result, expected_result)
177-
177+
178178
def test_HandleMissingIgnore_ExpectMappingUsed(self):
179-
train = pd.DataFrame({'city': ['Chicago', np.NaN,'Geneva']})
179+
train = pd.DataFrame({'city': ['Chicago', np.NaN, 'Geneva']})
180180
expected_result = pd.DataFrame({'city_1': [1, 0, 0],
181-
'city_3': [0, 0, 1]})
181+
'city_2': [0, 0, 1]})
182182

183183
encoder = encoders.OneHotEncoder(handle_missing='ignore')
184184
result = encoder.fit(train).transform(train)
185+
expected_mapping = pd.DataFrame([
186+
[1, 0],
187+
[0, 1],
188+
[0, 0],
189+
[0, 0],
190+
], columns=["city_1", "city_2"], index=[1, 2, -2, -1])
185191

186192
pd.testing.assert_frame_equal(expected_result, result)
193+
pd.testing.assert_frame_equal(expected_mapping, encoder.category_mapping[0]["mapping"])
187194

188195
def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
189196
train = ['A', 'B', np.nan]
@@ -271,7 +278,7 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
271278
enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='return_nan')
272279
enc.fit(train)
273280
result = enc.transform(test)
274-
281+
275282
message = 'inverse_transform is not supported because transform impute '\
276283
'the unknown category nan when encode city'
277284

tests/test_ordinal.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,18 @@ def test_HaveNaNInTrain_ExpectCodedAsOne(self):
177177

178178
self.assertEqual(expected, result)
179179

180+
def test_NoGaps(self):
181+
train = pd.DataFrame({"city": ["New York", np.nan, "Rio", None, "Rosenheim"]})
182+
expected_mapping_value = pd.Series([1, 2, 3, 4], index=["New York", "Rio", "Rosenheim", np.nan])
183+
expected_mapping_return_nan = pd.Series([1, 2, 3, -2], index=["New York", "Rio", "Rosenheim", np.nan])
184+
185+
enc_value = encoders.OrdinalEncoder(cols=["city"], handle_missing="value")
186+
enc_value.fit(train)
187+
pd.testing.assert_series_equal(expected_mapping_value, enc_value.mapping[0]["mapping"])
188+
enc_return_nan = encoders.OrdinalEncoder(cols=["city"], handle_missing="return_nan")
189+
enc_return_nan.fit(train)
190+
pd.testing.assert_series_equal(expected_mapping_return_nan, enc_return_nan.mapping[0]["mapping"])
191+
180192
def test_HaveNoneAndNan_ExpectCodesAsOne(self):
181193
train = pd.DataFrame({"city": [np.nan, None]})
182194
expected = [1, 1]

0 commit comments

Comments
 (0)