Skip to content

Commit a33a667

Browse files
Merge pull request #444 from PaulWestenthanner/various_maintenance
Various maintenance
2 parents 11fbba6 + e1652c6 commit a33a667

File tree

14 files changed

+43
-37
lines changed

14 files changed

+43
-37
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
unreleased
22
==========
3+
* fixed: Future Warning in Pandas
4+
* fixed: np.NaNs in numpy 2.x
35
* improved: performance of the hashing encoder (about twice as fast)
46
* deprecate the `max_sample` parameter, it has no use anymore
57
* add `process_creation_method` parameter

category_encoders/count.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
3939
(otherwise it will be a numpy array).
4040
handle_missing: str
4141
how to handle missing values at fit time. Options are 'error', 'return_nan',
42-
and 'value'. Default 'value', which treat NaNs as a countable category at
42+
and 'value'. Default 'value', which treats nans as a countable category at
4343
fit time.
4444
handle_unknown: str, int or dict of {column : option, ...}.
4545
how to handle unknown labels at transform time. Options are 'error',
46-
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
46+
'return_nan', 'value' and int. Defaults to None which uses nan behaviour
4747
specified at fit time. Passing an int will fill with this int value.
4848
normalize: bool or dict of {column : bool, ...}.
4949
whether to normalize the counts to the range (0, 1). See Pandas `value_counts`
@@ -62,9 +62,9 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
6262
Note: The default name can be long and may keep changing, for example,
6363
in cross-validation.
6464
combine_min_nan_groups: bool or dict of {column : bool, ...}.
65-
whether to combine the leftovers group with NaN group. Default True. Can
65+
whether to combine the leftovers group with nan group. Default True. Can
6666
also be forced to combine with 'force' meaning small groups are effectively
67-
counted as NaNs. Force can only be used when 'handle_missing' is 'value' or 'error'.
67+
counted as nans. Force can only be used when 'handle_missing' is 'value' or 'error'.
6868
Note: Will not force if it creates a binary or invariant column.
6969
7070
@@ -137,7 +137,7 @@ def _fit(self, X, y=None, **kwargs):
137137
def _transform(self, X):
138138
for col in self.cols:
139139
# Treat None as np.nan
140-
X[col] = pd.Series([el if el is not None else np.NaN for el in X[col]], index=X[col].index)
140+
X[col] = pd.Series([el if el is not None else np.nan for el in X[col]], index=X[col].index)
141141
if self.handle_missing == "value":
142142
if not util.is_category(X[col].dtype):
143143
X[col] = X[col].fillna(np.nan)
@@ -180,7 +180,7 @@ def _fit_count_encode(self, X_in, y):
180180
self.mapping[col] = mapping_values
181181

182182
if self._handle_missing[col] == 'return_nan':
183-
self.mapping[col][np.NaN] = np.NaN
183+
self.mapping[col][np.nan] = np.nan
184184

185185
# elif self._handle_missing[col] == 'value':
186186
#test_count.py failing self.mapping[col].loc[-2] = 0

category_encoders/ordinal.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,11 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
195195

196196
# Convert to object to accept np.nan (dtype string doesn't)
197197
# fillna changes None and pd.NA to np.nan
198-
X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
198+
try:
199+
with pd.option_context('future.no_silent_downcasting', True):
200+
X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
201+
except pd._config.config.OptionError: # old pandas versions
202+
X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
199203
if util.is_category(X[column].dtype):
200204
nan_identity = col_mapping.loc[col_mapping.index.isna()].array[0]
201205
X[column] = X[column].cat.add_categories(nan_identity)

category_encoders/rankhot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def apply_coding(row: pd.Series):
154154
elif self.handle_missing == "return_nan":
155155
return [np.nan] * len(default_value)
156156
else:
157-
raise ValueError("Unhandled NaN")
157+
raise ValueError("Unhandled nan")
158158
return encoding_dict.get(row.iloc[0], default_value)
159159

160160
encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand")

category_encoders/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,11 +256,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
256256
(otherwise it will be a numpy array).
257257
handle_missing: str
258258
how to handle missing values at fit time. Options are 'error', 'return_nan',
259-
and 'value'. Default 'value', which treat NaNs as a countable category at
259+
and 'value'. Default 'value', which treats nans as a countable category at
260260
fit time.
261261
handle_unknown: str, int or dict of {column : option, ...}.
262262
how to handle unknown labels at transform time. Options are 'error',
263-
'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
263+
'return_nan', 'value' and int. Defaults to None which uses nan behaviour
264264
specified at fit time. Passing an int will fill with this int value.
265265
kwargs: dict.
266266
additional encoder specific parameters like regularisation.

examples/benchmarking_large/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def train_encoder(X, y, fold_count, encoder):
1414
"""
15-
Defines folds and performs the data preprocessing (categorical encoding, NaN imputation, normalization)
15+
Defines folds and performs the data preprocessing (categorical encoding, nan imputation, normalization)
1616
Returns a list with {X_train, y_train, X_test, y_test}, average fit_encoder_time and average score_encoder_time
1717
1818
Note: We normalize all features (not only numerical features) because otherwise SVM would

tests/helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
4444
str(row), # Unique strings
4545
random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data
4646
random.choice(['A', 'B_b', 'C_c_c']), # Strings with underscores to test reverse_dummies()
47-
random.choice(['A', 'B', 'C', np.NaN]) if has_missing else random.choice(['A', 'B', 'C']), # None
47+
random.choice(['A', 'B', 'C', np.nan]) if has_missing else random.choice(['A', 'B', 'C']), # None
4848
random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), # With a new string value
4949
random.choice(['A', 'B', 'C']), # What is going to become the categorical column
5050
random.choice(['A', 'B', 'C', np.nan]), # Categorical with missing values
@@ -60,7 +60,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
6060

6161
def verify_inverse_transform(x, x_inv):
6262
"""
63-
Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
63+
Verify x is equal to x_inv. The test returns true for nan.equals(nan) as it should.
6464
"""
6565
assert x.equals(x_inv)
6666

tests/test_cat_boost.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ def test_catBoost(self):
2121
self.assertEqual(list(obtained['col1']), [1.6/3, 1.6/3, 2.6/3])
2222

2323
def test_catBoost_missing(self):
24-
X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.NaN, np.NaN, np.NaN]})
24+
X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.nan, np.nan, np.nan]})
2525
y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0])
2626
enc = encoders.CatBoostEncoder(handle_missing='value')
2727
obtained = enc.fit_transform(X, y)
2828
self.assertEqual(list(obtained['col1']), [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3], 'We treat None as another category.')
2929

30-
X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.NaN]})
30+
X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.nan]})
3131
obtained = enc.transform(X_t)
3232
self.assertEqual(list(obtained['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4])
3333

tests/test_count.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def test_count_combine_min_nan_groups_bool(self):
169169
self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
170170
self.assertEqual(out['na_categorical'].unique().shape[0], 3)
171171
self.assertTrue(enc.mapping is not None)
172-
self.assertIn(np.NaN, enc.mapping['na_categorical'])
172+
self.assertIn(np.nan, enc.mapping['na_categorical'])
173173

174174
def test_count_combine_min_nan_groups_dict(self):
175175
"""Test the combine_min_nan_groups dict on 'none' and 'na_categorical'."""

tests/test_encoders.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def test_handle_unknown_return_nan(self):
203203
self.assertTrue(result[1:].isna().all())
204204

205205
def test_handle_missing_return_nan_train(self):
206-
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
206+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
207207
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
208208
y = pd.Series([1, 0, 1])
209209

@@ -220,7 +220,7 @@ def test_handle_missing_return_nan_train(self):
220220

221221
def test_handle_missing_return_nan_test(self):
222222
X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})
223-
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
223+
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
224224
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
225225
y = pd.Series([1, 0, 1])
226226

@@ -586,8 +586,8 @@ def test_target_encoders(self):
586586
def test_missing_values(self):
587587
# by default, treat missing values as another valid value
588588
x_placeholder = pd.Series(['a', 'b', 'b', 'c', 'c'])
589-
x_nan = pd.Series(['a', 'b', 'b', np.NaN, np.NaN])
590-
x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.NaN, np.NaN]})
589+
x_nan = pd.Series(['a', 'b', 'b', np.nan, np.nan])
590+
x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.nan, np.nan]})
591591
y = [0, 1, 1, 1, 1]
592592

593593
for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder currently violates it

0 commit comments

Comments
 (0)