Merge pull request #444 from PaulWestenthanner/various_maintenance

PaulWestenthanner · web-flow · commit a33a667dad8c · 2024-10-01T22:37:45.000+02:00
Various maintenance
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
 unreleased
 ==========
+* fixed: pandas `FutureWarning` (silent downcasting) in the ordinal encoder
+* fixed: replaced `np.NaN` with `np.nan` for numpy 2.x compatibility
 * improved: performance of the hashing encoder (about twice as fast)
   * deprecate the `max_sample`` parameter, it has no use anymore
   * add `process_creation_method` parameter
diff --git a/category_encoders/count.py b/category_encoders/count.py
@@ -39,11 +39,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
             (otherwise it will be a numpy array).
         handle_missing: str
             how to handle missing values at fit time. Options are 'error', 'return_nan',
-            and 'value'. Default 'value', which treat NaNs as a countable category at
+            and 'value'. Default 'value', which treats nans as a countable category at
             fit time.
         handle_unknown: str, int or dict of {column : option, ...}.
             how to handle unknown labels at transform time. Options are 'error'
-            'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
+            'return_nan', 'value' and int. Defaults to None which uses nan behaviour
             specified at fit time. Passing an int will fill with this int value.
         normalize: bool or dict of {column : bool, ...}.
             whether to normalize the counts to the range (0, 1). See Pandas `value_counts`
@@ -62,9 +62,9 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False,
             Note: The default name can be long and may keep changing, for example, 
             in cross-validation.
         combine_min_nan_groups: bool or dict of {column : bool, ...}.
-            whether to combine the leftovers group with NaN group. Default True. Can
+            whether to combine the leftovers group with nan group. Default True. Can
             also be forced to combine with 'force' meaning small groups are effectively
-            counted as NaNs. Force can only be used when 'handle_missing' is 'value' or 'error'.
+            counted as nans. Force can only be used when 'handle_missing' is 'value' or 'error'.
             Note: Will not force if it creates a binary or invariant column.
 
 
@@ -137,7 +137,7 @@ def _fit(self, X, y=None, **kwargs):
     def _transform(self, X):
         for col in self.cols:
             # Treat None as np.nan
-            X[col] = pd.Series([el if el is not None else np.NaN for el in X[col]], index=X[col].index)
+            X[col] = pd.Series([el if el is not None else np.nan for el in X[col]], index=X[col].index)
             if self.handle_missing == "value":
                 if not util.is_category(X[col].dtype):
                     X[col] = X[col].fillna(np.nan)
@@ -180,7 +180,7 @@ def _fit_count_encode(self, X_in, y):
             self.mapping[col] = mapping_values
 
             if self._handle_missing[col] == 'return_nan':
-                self.mapping[col][np.NaN] = np.NaN
+                self.mapping[col][np.nan] = np.nan
             
             # elif self._handle_missing[col] == 'value':
             #test_count.py failing     self.mapping[col].loc[-2] = 0
diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py
@@ -195,7 +195,11 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
 
                 # Convert to object to accept np.nan (dtype string doesn't)
                 # fillna changes None and pd.NA to np.nan
-                X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
+                try:
+                    with pd.option_context('future.no_silent_downcasting', True):
+                        X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
+                except pd._config.config.OptionError:  # old pandas versions
+                    X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
                 if util.is_category(X[column].dtype):
                     nan_identity = col_mapping.loc[col_mapping.index.isna()].array[0]
                     X[column] = X[column].cat.add_categories(nan_identity)
diff --git a/category_encoders/rankhot.py b/category_encoders/rankhot.py
@@ -154,7 +154,7 @@ def apply_coding(row: pd.Series):
                     elif self.handle_missing == "return_nan":
                         return [np.nan] * len(default_value)
                     else:
-                        raise ValueError("Unhandled NaN")
+                        raise ValueError("Unhandled nan")
                 return encoding_dict.get(row.iloc[0], default_value)
 
             encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand")
diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -256,11 +256,11 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
             (otherwise it will be a numpy array).
         handle_missing: str
             how to handle missing values at fit time. Options are 'error', 'return_nan',
-            and 'value'. Default 'value', which treat NaNs as a countable category at
+            and 'value'. Default 'value', which treats nans as a countable category at
             fit time.
         handle_unknown: str, int or dict of {column : option, ...}.
             how to handle unknown labels at transform time. Options are 'error'
-            'return_nan', 'value' and int. Defaults to None which uses NaN behaviour
+            'return_nan', 'value' and int. Defaults to None which uses nan behaviour
             specified at fit time. Passing an int will fill with this int value.
         kwargs: dict.
             additional encoder specific parameters like regularisation.
diff --git a/examples/benchmarking_large/util.py b/examples/benchmarking_large/util.py
@@ -12,7 +12,7 @@
 
 def train_encoder(X, y, fold_count, encoder):
     """
-    Defines folds and performs the data preprocessing (categorical encoding, NaN imputation, normalization)
+    Defines folds and performs the data preprocessing (categorical encoding, nan imputation, normalization)
     Returns a list with {X_train, y_train, X_test, y_test}, average fit_encoder_time and average score_encoder_time
 
     Note: We normalize all features (not only numerical features) because otherwise SVM would
diff --git a/tests/helpers.py b/tests/helpers.py
@@ -44,7 +44,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
         str(row),                                                                               # Unique strings
         random.choice(['A', 'B']) if extras else 'A',                                           # Invariant in the training data
         random.choice(['A', 'B_b', 'C_c_c']),                                                   # Strings with underscores to test reverse_dummies()
-        random.choice(['A', 'B', 'C', np.NaN]) if has_missing else random.choice(['A', 'B', 'C']), # None
+        random.choice(['A', 'B', 'C', np.nan]) if has_missing else random.choice(['A', 'B', 'C']), # None
         random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),      # With a new string value
         random.choice(['A', 'B', 'C']),                                                         # What is going to become the categorical column
         random.choice(['A', 'B', 'C', np.nan]),                                                 # Categorical with missing values
@@ -60,7 +60,7 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
 
 def verify_inverse_transform(x, x_inv):
     """
-    Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
+    Verify x is equal to x_inv. The test returns true for nan.equals(nan) as it should.
     """
     assert x.equals(x_inv)
 
diff --git a/tests/test_cat_boost.py b/tests/test_cat_boost.py
@@ -21,13 +21,13 @@ def test_catBoost(self):
         self.assertEqual(list(obtained['col1']), [1.6/3, 1.6/3, 2.6/3])
 
     def test_catBoost_missing(self):
-        X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.NaN, np.NaN, np.NaN]})
+        X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.nan, np.nan, np.nan]})
         y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0])
         enc = encoders.CatBoostEncoder(handle_missing='value')
         obtained = enc.fit_transform(X, y)
         self.assertEqual(list(obtained['col1']), [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3], 'We treat None as another category.')
 
-        X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.NaN]})
+        X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.nan]})
         obtained = enc.transform(X_t)
         self.assertEqual(list(obtained['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4])
 
diff --git a/tests/test_count.py b/tests/test_count.py
@@ -169,7 +169,7 @@ def test_count_combine_min_nan_groups_bool(self):
         self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
         self.assertEqual(out['na_categorical'].unique().shape[0], 3)
         self.assertTrue(enc.mapping is not None)
-        self.assertIn(np.NaN, enc.mapping['na_categorical'])
+        self.assertIn(np.nan, enc.mapping['na_categorical'])
 
     def test_count_combine_min_nan_groups_dict(self):
         """Test the combine_min_nan_groups dict  on 'none' and 'na_categorical'."""
diff --git a/tests/test_encoders.py b/tests/test_encoders.py
@@ -203,7 +203,7 @@ def test_handle_unknown_return_nan(self):
                     self.assertTrue(result[1:].isna().all())
 
     def test_handle_missing_return_nan_train(self):
-        X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
+        X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
         X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
         y = pd.Series([1, 0, 1])
 
@@ -220,7 +220,7 @@ def test_handle_missing_return_nan_train(self):
 
     def test_handle_missing_return_nan_test(self):
         X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})
-        X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
+        X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]})
         X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
         y = pd.Series([1, 0, 1])
 
@@ -586,8 +586,8 @@ def test_target_encoders(self):
     def test_missing_values(self):
         # by default, treat missing values as another valid value
         x_placeholder = pd.Series(['a', 'b', 'b', 'c', 'c'])
-        x_nan = pd.Series(['a', 'b', 'b', np.NaN, np.NaN])
-        x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.NaN, np.NaN]})
+        x_nan = pd.Series(['a', 'b', 'b', np.nan, np.nan])
+        x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.nan, np.nan]})
         y = [0, 1, 1, 1, 1]
 
         for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):   # HashingEncoder currently violates it
diff --git a/tests/test_one_hot.py b/tests/test_one_hot.py
@@ -163,7 +163,7 @@ def test_HandleMissingReturnNan(self):
     def test_HandleMissingIgnore(self):
         train = pd.DataFrame({'x': ['A', 'B', np.nan],
                               'y': ['A', None, 'A'],
-                              'z': [np.NaN, 'B', 'B']})
+                              'z': [np.nan, 'B', 'B']})
         train['z'] = train['z'].astype('category')
 
         expected_result = pd.DataFrame({'x_A': [1, 0, 0],
@@ -176,7 +176,7 @@ def test_HandleMissingIgnore(self):
         pd.testing.assert_frame_equal(result, expected_result)
 
     def test_HandleMissingIgnore_ExpectMappingUsed(self):
-        train = pd.DataFrame({'city': ['Chicago', np.NaN, 'Geneva']})
+        train = pd.DataFrame({'city': ['Chicago', np.nan, 'Geneva']})
         expected_result = pd.DataFrame({'city_1': [1, 0, 0],
                                         'city_2': [0, 0, 1]})
 
diff --git a/tests/test_ordinal.py b/tests/test_ordinal.py
@@ -135,19 +135,19 @@ def test_custom_mapping(self):
         custom_mapping = [
             {
                 "col": "col1",
-                "mapping": {np.NaN: 0, "a": 1, "b": 2},
+                "mapping": {np.nan: 0, "a": 1, "b": 2},
             },  # The mapping from the documentation
-            {"col": "col2", "mapping": {np.NaN: -3, "x": 11, "y": 2}},
+            {"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}},
         ]
         custom_mapping_series = [
             {
                 "col": "col1",
-                "mapping": pd.Series({np.NaN: 0, "a": 1, "b": 2}),
+                "mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}),
             },  # The mapping from the documentation
-            {"col": "col2", "mapping": pd.Series({np.NaN: -3, "x": 11, "y": 2})},
+            {"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})},
         ]
 
-        train = pd.DataFrame({"col1": ["a", "a", "b", np.NaN], "col2": ["x", "y", np.NaN, np.NaN]})
+        train = pd.DataFrame({"col1": ["a", "a", "b", np.nan], "col2": ["x", "y", np.nan, np.nan]})
 
         for mapping in [custom_mapping, custom_mapping_series]:
             with self.subTest():
@@ -168,7 +168,7 @@ def test_HaveNegativeOneInTrain_ExpectCodedAsOne(self):
 
         self.assertEqual(expected, result)
 
-    def test_HaveNaNInTrain_ExpectCodedAsOne(self):
+    def test_HavenanInTrain_ExpectCodedAsOne(self):
         train = pd.DataFrame({"city": [np.nan]})
         expected = [1]
 
@@ -362,16 +362,16 @@ def test_validate_mapping(self):
         custom_mapping = [
             {
                 "col": "col1",
-                "mapping": {np.NaN: 0, "a": 1, "b": 2},
+                "mapping": {np.nan: 0, "a": 1, "b": 2},
             },  # The mapping from the documentation
-            {"col": "col2", "mapping": {np.NaN: -3, "x": 11, "y": 2}},
+            {"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}},
         ]
         expected_valid_mapping = [
             {
                 "col": "col1",
-                "mapping": pd.Series({np.NaN: 0, "a": 1, "b": 2}),
+                "mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}),
             },  # The mapping from the documentation
-            {"col": "col2", "mapping": pd.Series({np.NaN: -3, "x": 11, "y": 2})},
+            {"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})},
         ]
         enc = encoders.OrdinalEncoder()
         actual_valid_mapping = enc._validate_supplied_mapping(custom_mapping)
diff --git a/tests/test_rankhot.py b/tests/test_rankhot.py
@@ -16,7 +16,7 @@
 
 class TestRankHotEncoder(TestCase):
 
-    def test_handleNaNvalue(self):
+    def test_handlenanvalue(self):
         enc = encoders.RankHotEncoder(handle_unknown='value', cols=['none'])
         enc.fit(X)
         t_f = enc.transform(X)
diff --git a/tests/test_woe.py b/tests/test_woe.py
@@ -33,26 +33,26 @@ def test_woe(self):
         X1 = enc.transform(X_t)
         th.verify_numeric(X1[cols])
         self.assertTrue(np.isfinite(X1[cols].to_numpy()).all(),
-                        'There must not be any NaN, inf or -inf in the transformed columns')
+                        'There must not be any nan, inf or -inf in the transformed columns')
         self.assertEqual(len(list(X_t)), len(list(X1)), 'The count of attributes must not change')
         self.assertEqual(len(X_t), len(X1), 'The count of rows must not change')
         X2 = enc.transform(X_t, np_y_t)
         th.verify_numeric(X2)
         self.assertTrue(np.isfinite(X2[cols].to_numpy()).all(),
-                        'There must not be any NaN, inf or -inf in the transformed columns')
+                        'There must not be any nan, inf or -inf in the transformed columns')
         self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change')
         self.assertEqual(len(X_t), len(X2), 'The count of rows must not change')
         X3 = enc.transform(X, np_y)
         th.verify_numeric(X3)
         self.assertTrue(np.isfinite(X3[cols].to_numpy()).all(),
-                        'There must not be any NaN, inf or -inf in the transformed columns')
+                        'There must not be any nan, inf or -inf in the transformed columns')
         self.assertEqual(len(list(X)), len(list(X3)), 'The count of attributes must not change')
         self.assertEqual(len(X), len(X3), 'The count of rows must not change')
         self.assertTrue(X3['unique_str'].var() < 0.001, 'The unique string column must not be predictive of the label')
         X4 = enc.fit_transform(X, np_y)
         th.verify_numeric(X4)
         self.assertTrue(np.isfinite(X4[cols].to_numpy()).all(),
-                        'There must not be any NaN, inf or -inf in the transformed columns')
+                        'There must not be any nan, inf or -inf in the transformed columns')
         self.assertEqual(len(list(X)), len(list(X4)), 'The count of attributes must not change')
         self.assertEqual(len(X), len(X4), 'The count of rows must not change')
         self.assertTrue(X4['unique_str'].var() < 0.001, 'The unique string column must not be predictive of the label')