Skip to content

Commit 9dbbd90

Browse files
author
PaulWestenthanner
committed
fix: #443: change np.NaN to np.nan
1 parent 5203c12 commit 9dbbd90

File tree

12 files changed

+36
-36
lines changed

12 files changed

+36
-36
lines changed

category_encoders/count.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
3939
(otherwise it will be a numpy array).
4040
handle_missing: str
4141
how to handle missing values at fit time. Options are 'error', 'return_nan',
42-
and 'value'. Default 'value', which treat NaNs as a countable category at
42+
and 'value'. Default 'value', which treat nans as a countable category at
4343
fit time.
4444
handle_unknown: str, int or dict of {column : option, ...}.
4545
how to handle unknown labels at transform time. Options are 'error'
46-
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
46+
'return_nan', 'value' and int. Defaults to None which uses nan behaviour
4747
specified at fit time. Passing an int will fill with this int value.
4848
normalize: bool or dict of {column : bool, ...}.
4949
whether to normalize the counts to the range (0, 1). See Pandas `value_counts`
@@ -62,9 +62,9 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
6262
Note: The default name can be long and may keep changing, for example,
6363
in cross-validation.
6464
combine_min_nan_groups: bool or dict of {column : bool, ...}.
65-
whether to combine the leftovers group with NaN group. Default True. Can
65+
whether to combine the leftovers group with nan group. Default True. Can
6666
also be forced to combine with 'force' meaning small groups are effectively
67-
counted as NaNs. Force can only be used when 'handle_missing' is 'value' or 'error'.
67+
counted as nans. Force can only be used when 'handle_missing' is 'value' or 'error'.
6868
Note: Will not force if it creates a binary or invariant column.
6969
7070
@@ -137,7 +137,7 @@ def _fit(self, X, y=None, **kwargs):
137137
def _transform(self, X):
138138
for col in self.cols:
139139
# Treat None as np.nan
140-
X[col] = pd.Series([el if el is not None else np.NaN for el in X[col]], index=X[col].index)
140+
X[col] = pd.Series([el if el is not None else np.nan for el in X[col]], index=X[col].index)
141141
if self.handle_missing == "value":
142142
if not util.is_category(X[col].dtype):
143143
X[col] = X[col].fillna(np.nan)
@@ -180,7 +180,7 @@ def _fit_count_encode(self, X_in, y):
180180
self.mapping[col] = mapping_values
181181

182182
if self._handle_missing[col] == 'return_nan':
183-
self.mapping[col][np.NaN] = np.NaN
183+
self.mapping[col][np.nan] = np.nan
184184

185185
# elif self._handle_missing[col] == 'value':
186186
#test_count.py failing self.mapping[col].loc[-2] = 0

category_encoders/rankhot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def apply_coding(row: pd.Series):
154154
elif self.handle_missing == "return_nan":
155155
return [np.nan] * len(default_value)
156156
else:
157-
raise ValueError("Unhandled NaN")
157+
raise ValueError("Unhandled nan")
158158
return encoding_dict.get(row.iloc[0], default_value)
159159

160160
encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand")

category_encoders/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
255255
(otherwise it will be a numpy array).
256256
handle_missing: str
257257
how to handle missing values at fit time. Options are 'error', 'return_nan',
258-
and 'value'. Default 'value', which treat NaNs as a countable category at
258+
and 'value'. Default 'value', which treat nans as a countable category at
259259
fit time.
260260
handle_unknown: str, int or dict of {column : option, ...}.
261261
how to handle unknown labels at transform time. Options are 'error'
262-
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
262+
'return_nan', 'value' and int. Defaults to None which uses nan behaviour
263263
specified at fit time. Passing an int will fill with this int value.
264264
kwargs: dict.
265265
additional encoder specific parameters like regularisation.

examples/benchmarking_large/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def train_encoder(X, y, fold_count, encoder):
1414
"""
15-
Defines folds and performs the data preprocessing (categorical encoding, NaN imputation, normalization)
15+
Defines folds and performs the data preprocessing (categorical encoding, nan imputation, normalization)
1616
Returns a list with {X_train, y_train, X_test, y_test}, average fit_encoder_time and average score_encoder_time
1717
1818
Note: We normalize all features (not only numerical features) because otherwise SVM would

tests/helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
4444
str(row), # Unique strings
4545
random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data
4646
random.choice(['A', 'B_b', 'C_c_c']), # Strings with underscores to test reverse_dummies()
47-
random.choice(['A', 'B', 'C', np.NaN]) if has_missing else random.choice(['A', 'B', 'C']), # None
47+
random.choice(['A', 'B', 'C', np.nan]) if has_missing else random.choice(['A', 'B', 'C']), # None
4848
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), # With a new string value
4949
random.choice(['A', 'B', 'C']), # What is going to become the categorical column
5050
random.choice(['A', 'B', 'C', np.nan]), # Categorical with missing values
@@ -60,7 +60,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
6060

6161
def verify_inverse_transform(x, x_inv):
6262
"""
63-
Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
63+
Verify x is equal to x_inv. The test returns true for nan.equals(nan) as it should.
6464
"""
6565
assert x.equals(x_inv)
6666

tests/test_cat_boost.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ def test_catBoost(self):
2121
self.assertEqual(list(obtained['col1']), [1.6/3, 1.6/3, 2.6/3])
2222

2323
def test_catBoost_missing(self):
24-
X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.NaN, np.NaN, np.NaN]})
24+
X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.nan, np.nan, np.nan]})
2525
y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0])
2626
enc = encoders.CatBoostEncoder(handle_missing='value')
2727
obtained = enc.fit_transform(X, y)
2828
self.assertEqual(list(obtained['col1']), [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3], 'We treat None as another category.')
2929

30-
X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.NaN]})
30+
X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.nan]})
3131
obtained = enc.transform(X_t)
3232
self.assertEqual(list(obtained['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4])
3333

tests/test_count.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def test_count_combine_min_nan_groups_bool(self):
169169
self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
170170
self.assertEqual(out['na_categorical'].unique().shape[0], 3)
171171
self.assertTrue(enc.mapping is not None)
172-
self.assertIn(np.NaN, enc.mapping['na_categorical'])
172+
self.assertIn(np.nan, enc.mapping['na_categorical'])
173173

174174
def test_count_combine_min_nan_groups_dict(self):
175175
"""Test the combine_min_nan_groups dict on 'none' and 'na_categorical'."""

tests/test_encoders.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def test_handle_unknown_return_nan(self):
203203
self.assertTrue(result[1:].isna().all())
204204

205205
def test_handle_missing_return_nan_train(self):
206-
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
206+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
207207
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
208208
y = pd.Series([1, 0, 1])
209209

@@ -220,7 +220,7 @@ def test_handle_missing_return_nan_train(self):
220220

221221
def test_handle_missing_return_nan_test(self):
222222
X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})
223-
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
223+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
224224
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
225225
y = pd.Series([1, 0, 1])
226226

@@ -586,8 +586,8 @@ def test_target_encoders(self):
586586
def test_missing_values(self):
587587
# by default, treat missing values as another valid value
588588
x_placeholder = pd.Series(['a', 'b', 'b', 'c', 'c'])
589-
x_nan = pd.Series(['a', 'b', 'b', np.NaN, np.NaN])
590-
x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.NaN, np.NaN]})
589+
x_nan = pd.Series(['a', 'b', 'b', np.nan, np.nan])
590+
x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.nan, np.nan]})
591591
y = [0, 1, 1, 1, 1]
592592

593593
for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder currently violates it

tests/test_one_hot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def test_HandleMissingReturnNan(self):
163163
def test_HandleMissingIgnore(self):
164164
train = pd.DataFrame({'x': ['A', 'B', np.nan],
165165
'y': ['A', None, 'A'],
166-
'z': [np.NaN, 'B', 'B']})
166+
'z': [np.nan, 'B', 'B']})
167167
train['z'] = train['z'].astype('category')
168168

169169
expected_result = pd.DataFrame({'x_A': [1, 0, 0],
@@ -176,7 +176,7 @@ def test_HandleMissingIgnore(self):
176176
pd.testing.assert_frame_equal(result, expected_result)
177177

178178
def test_HandleMissingIgnore_ExpectMappingUsed(self):
179-
train = pd.DataFrame({'city': ['Chicago', np.NaN, 'Geneva']})
179+
train = pd.DataFrame({'city': ['Chicago', np.nan, 'Geneva']})
180180
expected_result = pd.DataFrame({'city_1': [1, 0, 0],
181181
'city_2': [0, 0, 1]})
182182

tests/test_ordinal.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,19 +135,19 @@ def test_custom_mapping(self):
135135
custom_mapping = [
136136
{
137137
"col": "col1",
138-
"mapping": {np.NaN: 0, "a": 1, "b": 2},
138+
"mapping": {np.nan: 0, "a": 1, "b": 2},
139139
}, # The mapping from the documentation
140-
{"col": "col2", "mapping": {np.NaN: -3, "x": 11, "y": 2}},
140+
{"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}},
141141
]
142142
custom_mapping_series = [
143143
{
144144
"col": "col1",
145-
"mapping": pd.Series({np.NaN: 0, "a": 1, "b": 2}),
145+
"mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}),
146146
}, # The mapping from the documentation
147-
{"col": "col2", "mapping": pd.Series({np.NaN: -3, "x": 11, "y": 2})},
147+
{"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})},
148148
]
149149

150-
train = pd.DataFrame({"col1": ["a", "a", "b", np.NaN], "col2": ["x", "y", np.NaN, np.NaN]})
150+
train = pd.DataFrame({"col1": ["a", "a", "b", np.nan], "col2": ["x", "y", np.nan, np.nan]})
151151

152152
for mapping in [custom_mapping, custom_mapping_series]:
153153
with self.subTest():
@@ -168,7 +168,7 @@ def test_HaveNegativeOneInTrain_ExpectCodedAsOne(self):
168168

169169
self.assertEqual(expected, result)
170170

171-
def test_HaveNaNInTrain_ExpectCodedAsOne(self):
171+
def test_HavenanInTrain_ExpectCodedAsOne(self):
172172
train = pd.DataFrame({"city": [np.nan]})
173173
expected = [1]
174174

@@ -362,16 +362,16 @@ def test_validate_mapping(self):
362362
custom_mapping = [
363363
{
364364
"col": "col1",
365-
"mapping": {np.NaN: 0, "a": 1, "b": 2},
365+
"mapping": {np.nan: 0, "a": 1, "b": 2},
366366
}, # The mapping from the documentation
367-
{"col": "col2", "mapping": {np.NaN: -3, "x": 11, "y": 2}},
367+
{"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}},
368368
]
369369
expected_valid_mapping = [
370370
{
371371
"col": "col1",
372-
"mapping": pd.Series({np.NaN: 0, "a": 1, "b": 2}),
372+
"mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}),
373373
}, # The mapping from the documentation
374-
{"col": "col2", "mapping": pd.Series({np.NaN: -3, "x": 11, "y": 2})},
374+
{"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})},
375375
]
376376
enc = encoders.OrdinalEncoder()
377377
actual_valid_mapping = enc._validate_supplied_mapping(custom_mapping)

0 commit comments

Comments (0)