Force scikit-learn >= 1.0.0 and fix issue #384

PaulWestenthanner · PaulWestenthanner · commit 0f9f86670cfc · 2022-12-27T20:51:09.000+01:00
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 numpy>=1.14.0
-scikit-learn>=0.20.0
+scikit-learn>=1.0.0
 scipy>=1.0.0
 statsmodels>=0.9.0
 pandas>=1.0.5
diff --git a/tests/helpers.py b/tests/helpers.py
@@ -46,13 +46,12 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001
         random.choice(['A', 'B_b', 'C_c_c']),                                                   # Strings with underscores to test reverse_dummies()
         random.choice(['A', 'B', 'C', np.NaN]) if has_missing else random.choice(['A', 'B', 'C']), # None
         random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),      # With a new string value
-        random.choice([12, 43, -32]),                                                           # Number in the column name
         random.choice(['A', 'B', 'C']),                                                         # What is going to become the categorical column
         random.choice(['A', 'B', 'C', np.nan]),                                                 # Categorical with missing values
         random.choice([1, 2, 3])                                                                # Ordinal integers
     ] for row in range(n_rows)]
 
-    df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 321, 'categorical', 'na_categorical', 'categorical_int'])
+    df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 'categorical', 'na_categorical', 'categorical_int'])
     df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C'])
     df['na_categorical'] = pd.Categorical(df['na_categorical'], categories=['A', 'B', 'C'])
     df['categorical_int'] = pd.Categorical(df['categorical_int'], categories=[1, 2, 3])
diff --git a/tests/test_encoders.py b/tests/test_encoders.py
@@ -432,14 +432,17 @@ def test_duplicate_index_value(self):
     def test_string_index(self):
         # https://github.com/scikit-learn-contrib/categorical-encoding/issues/131
 
-        bunch = sklearn.datasets.load_boston()
-        y = (bunch.target > 20)
+        bunch = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)
+        y = (bunch.target > 200000).values
         X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
         X.index = X.index.values.astype(str)
 
+        display_cols = ["Id", "MSSubClass", "MSZoning", "YearBuilt", "Heating", "CentralAir"]
+        X = X[display_cols]
+
         for encoder_name in encoders.__all__:
             with self.subTest(encoder_name=encoder_name):
-                enc = getattr(encoders, encoder_name)(cols=['CHAS', 'RAD'])
+                enc = getattr(encoders, encoder_name)(cols=['CentralAir', 'Heating'])
                 result = enc.fit_transform(X, y)
                 self.assertFalse(result.isnull().values.any(), 'There should not be any missing value!')