Skip to content

Commit 0bcb96b

Browse files
Merge pull request #322 from bmreiniger/fix_ohe_nan_col
Fix ohe nan col
2 parents cc0c4b9 + 04ccbdb commit 0bcb96b

File tree

2 files changed

+71
-20
lines changed

2 files changed

+71
-20
lines changed

category_encoders/one_hot.py

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,20 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
2727
if True, category values will be included in the encoded column names. Since this can result in duplicate column names, duplicates are suffixed with a '#' symbol until a unique name is generated.
2828
If False, category indices will be used instead of the category values.
2929
handle_unknown: str
30-
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
31-
an extra column will be added in if the transform matrix has unknown categories. This can cause
32-
unexpected changes in dimension in some cases.
30+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.
31+
32+
'error' will raise a `ValueError` at transform time if there are new categories.
33+
'return_nan' will encode a new value as `np.nan` in every dummy column.
34+
'value' will encode a new value as 0 in every dummy column.
35+
'indicator' will add an additional dummy column (in both training and test data).
3336
handle_missing: str
34-
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
35-
an extra column will be added in if the transform matrix has nan values. This can cause
36-
unexpected changes in dimension in some cases.
37+
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.
38+
39+
'error' will raise a `ValueError` if missing values are encountered.
40+
'return_nan' will encode a missing value as `np.nan` in every dummy column.
41+
'value' will encode a missing value as 0 in every dummy column.
42+
'indicator' will treat missingness as its own category, adding an additional dummy column
43+
(whether there are missing values in the training set or not).
3744
3845
Example
3946
-------
@@ -142,11 +149,18 @@ def fit(self, X, y=None, **kwargs):
142149
if X[self.cols].isnull().any().any():
143150
raise ValueError('Columns to be encoded can not contain null')
144151

152+
oe_missing_strat = {
153+
'error': 'error',
154+
'return_nan': 'return_nan',
155+
'value': 'value',
156+
'indicator': 'return_nan',
157+
}[self.handle_missing]
158+
145159
self.ordinal_encoder = OrdinalEncoder(
146160
verbose=self.verbose,
147161
cols=self.cols,
148162
handle_unknown='value',
149-
handle_missing='value'
163+
handle_missing=oe_missing_strat,
150164
)
151165
self.ordinal_encoder = self.ordinal_encoder.fit(X)
152166
self.mapping = self.generate_mapping()
@@ -184,7 +198,13 @@ def generate_mapping(self):
184198
index = []
185199
new_columns = []
186200

201+
append_nan_to_index = False
187202
for cat_name, class_ in values.iteritems():
203+
if pd.isna(cat_name) and self.handle_missing == 'return_nan':
204+
# we don't want a mapping column if return_nan
205+
# but do add the index to the end
206+
append_nan_to_index = class_
207+
continue
188208
if self.use_cat_names:
189209
n_col_name = str(col) + '_%s' % (cat_name,)
190210
found_count = found_column_counts.get(n_col_name, 0)
@@ -205,7 +225,10 @@ def generate_mapping(self):
205225
new_columns.append(n_col_name)
206226
index.append(-1)
207227

208-
base_matrix = np.eye(N=len(index), dtype=np.int)
228+
if append_nan_to_index:
229+
index.append(append_nan_to_index)
230+
231+
base_matrix = np.eye(N=len(index), M=len(new_columns), dtype=int)
209232
base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index)
210233

211234
if self.handle_unknown == 'value':
@@ -214,7 +237,7 @@ def generate_mapping(self):
214237
base_df.loc[-1] = np.nan
215238

216239
if self.handle_missing == 'return_nan':
217-
base_df.loc[values.loc[np.nan]] = np.nan
240+
base_df.loc[-2] = np.nan
218241
elif self.handle_missing == 'value':
219242
base_df.loc[-2] = 0
220243

@@ -238,17 +261,17 @@ def transform(self, X, override_return_df=False):
238261
239262
"""
240263

241-
if self.handle_missing == 'error':
242-
if X[self.cols].isnull().any().any():
243-
raise ValueError('Columns to be encoded can not contain null')
244-
245264
if self._dim is None:
246265
raise ValueError(
247266
'Must train encoder before it can be used to transform data.')
248267

249268
# first check the type
250269
X = util.convert_input(X)
251270

271+
if self.handle_missing == 'error':
272+
if X[self.cols].isnull().any().any():
273+
raise ValueError('Columns to be encoded can not contain null')
274+
252275
# then make sure that it is the right size
253276
if X.shape[1] != self._dim:
254277
raise ValueError('Unexpected input dimension %d, expected %d' % (
@@ -344,7 +367,7 @@ def get_dummies(self, X_in):
344367
col = switch.get('col')
345368
mod = switch.get('mapping')
346369

347-
base_df = mod.reindex(X[col])
370+
base_df = mod.reindex(X[col].fillna(-2))
348371
base_df = base_df.set_index(X.index)
349372
X = pd.concat([base_df, X], axis=1)
350373

tests/test_one_hot.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,30 @@ def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet(
144144

145145
pd.testing.assert_frame_equal(expected_result, result)
146146

147+
def test_HandleMissingError(self):
148+
data_no_missing = ['A', 'B', 'B']
149+
data_w_missing = [np.nan, 'B', 'B']
150+
encoder = encoders.OneHotEncoder(handle_missing="error")
151+
152+
result = encoder.fit_transform(data_no_missing)
153+
expected = [[1, 0],
154+
[0, 1],
155+
[0, 1]]
156+
self.assertEqual(result.values.tolist(), expected)
157+
158+
self.assertRaisesRegex(ValueError, '.*null.*', encoder.transform, data_w_missing)
159+
160+
self.assertRaisesRegex(ValueError, '.*null.*', encoder.fit, data_w_missing)
161+
162+
def test_HandleMissingReturnNan(self):
163+
train = pd.DataFrame({'x': ['A', np.nan, 'B']})
164+
encoder = encoders.OneHotEncoder(handle_missing='return_nan', use_cat_names=True)
165+
result = encoder.fit_transform(train)
166+
pd.testing.assert_frame_equal(
167+
result,
168+
pd.DataFrame({'x_A': [1, np.nan, 0], 'x_B': [0, np.nan, 1]}),
169+
)
170+
147171
def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
148172
train = ['A', 'B', np.nan]
149173

@@ -170,13 +194,17 @@ def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self):
170194
test = ['A', 'B', np.nan]
171195

172196
encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value')
173-
encoder.fit(train)
174-
result = encoder.transform(test)
197+
encoded_train = encoder.fit_transform(train)
198+
encoded_test = encoder.transform(test)
175199

176-
expected = [[1, 0, 0],
177-
[0, 1, 0],
178-
[0, 0, 1]]
179-
self.assertEqual(result.values.tolist(), expected)
200+
expected_1 = [[1, 0, 0],
201+
[0, 1, 0]]
202+
self.assertEqual(encoded_train.values.tolist(), expected_1)
203+
204+
expected_2 = [[1, 0, 0],
205+
[0, 1, 0],
206+
[0, 0, 1]]
207+
self.assertEqual(encoded_test.values.tolist(), expected_2)
180208

181209
def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self):
182210
train = ['A', 'B']

0 commit comments

Comments
 (0)