Skip to content

Commit c57d7c5

Browse files
committed
convert pd.NA to np.nan
1 parent 721323b commit c57d7c5

File tree

2 files changed

+24
-20
lines changed

2 files changed

+24
-20
lines changed

category_encoders/ordinal.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -193,9 +193,9 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
193193
column = switch.get('col')
194194
col_mapping = switch['mapping']
195195

196-
# Treat None as np.nan
197-
X[column] = pd.Series([el if el is not None else np.NaN for el in X[column]], index=X[column].index)
198-
X[column] = X[column].map(col_mapping)
196+
# Convert to object to accept np.nan (dtype string doesn't)
197+
# fillna changes None and pd.NA to np.nan
198+
X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
199199
if util.is_category(X[column].dtype):
200200
nan_identity = col_mapping.loc[col_mapping.index.isna()].values[0]
201201
X[column] = X[column].cat.add_categories(nan_identity)

tests/test_encoders.py

Lines changed: 21 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -199,33 +199,37 @@ def test_handle_unknown_return_nan(self):
199199
self.assertTrue(result[1:].isnull().all())
200200

201201
def test_handle_missing_return_nan_train(self):
202-
X = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
202+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
203+
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
203204
y = pd.Series([1, 0, 1])
204205

205206
for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
206-
with self.subTest(encoder_name=encoder_name):
207-
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
208-
result = enc.fit_transform(X, y).iloc[2, :]
207+
for X in (X_np, X_pd):
208+
with self.subTest(encoder_name=encoder_name):
209+
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
210+
result = enc.fit_transform(X, y).iloc[2, :]
209211

210-
if len(result) == 1:
211-
self.assertTrue(result.isnull().all())
212-
else:
213-
self.assertTrue(result[1:].isnull().all())
212+
if len(result) == 1:
213+
self.assertTrue(result.isnull().all())
214+
else:
215+
self.assertTrue(result[1:].isnull().all())
214216

215217
def test_handle_missing_return_nan_test(self):
216218
X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})
217-
X_t = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
219+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
220+
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
218221
y = pd.Series([1, 0, 1])
219222

220223
for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
221-
with self.subTest(encoder_name=encoder_name):
222-
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
223-
result = enc.fit(X, y).transform(X_t).iloc[2, :]
224-
225-
if len(result) == 1:
226-
self.assertTrue(result.isnull().all())
227-
else:
228-
self.assertTrue(result[1:].isnull().all())
224+
for X_na in (X_np, X_pd):
225+
with self.subTest(encoder_name=encoder_name):
226+
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
227+
result = enc.fit(X, y).transform(X_na).iloc[2, :]
228+
229+
if len(result) == 1:
230+
self.assertTrue(result.isnull().all())
231+
else:
232+
self.assertTrue(result[1:].isnull().all())
229233

230234
def test_handle_unknown_value(self):
231235
train = pd.DataFrame({'city': ['chicago', 'los angeles']})

0 commit comments

Comments
 (0)