Upgrade scikit-learn (#4248)

eccabay · web-flow · commit 0c224ea51349 · 2023-07-24T10:26:09.000-04:00
* Upgrade sktime and vowpalwabbit

* Update tree parameters to remove deprecated value

* Upgrade imbalanced-learn

* Workaround for KNN bug in scikit-learn

* Adjust for ROC threshold change

* Actually unpin and upgrade sklearn
diff --git a/.github/meta.yaml b/.github/meta.yaml
@@ -28,7 +28,7 @@ outputs:
         - pandas >=1.5.0, <2.0.0
         - dask >=2022.2.0, !=2022.10.1
         - scipy >=1.5.0
-        - scikit-learn >=1.2.2
+        - scikit-learn >=1.3.0
         - scikit-optimize >=0.9.0
         - statsmodels >=0.12.2
         - colorama >=0.4.4
@@ -78,8 +78,8 @@ outputs:
         - lightgbm >=4.0.0
         - lime >=0.2.0.1
         - python >=3.8.*
-        - imbalanced-learn >=0.9.1, <0.11.0
-        - sktime >=0.17.0
+        - imbalanced-learn >=0.11.0
+        - sktime >=0.21.0
         - pmdarima >=1.8.5
         - vowpalwabbit >=8.11.0
     test:
diff --git a/core-requirements.txt b/core-requirements.txt
@@ -1,7 +1,7 @@
 numpy>=1.21.0
 pandas>=1.5.0, <2.0.0
 scipy>=1.5.0
-scikit-learn>=1.2.1
+scikit-learn>=1.3.0
 scikit-optimize>=0.9.0
 pyzmq>=20.0.0
 colorama>=0.4.4
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
     * Changes
         * Unpinned sktime version :pr:`4214`
         * Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`
+        * Bumped minimum scikit-learn version to 1.3.0, imbalanced-learn to 0.11.0, and sktime to 0.21.0 :pr:`4248`
     * Documentation Changes
     * Testing Changes
 
diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py
@@ -75,7 +75,7 @@ class ComponentGraph:
         ...                                  'max_depth': 6,
         ...                                  'n_jobs': -1},
         ...     'Decision Tree Classifier': {'criterion': 'gini',
-        ...                                  'max_features': 'auto',
+        ...                                  'max_features': 'sqrt',
         ...                                  'max_depth': 6,
         ...                                  'min_samples_split': 2,
         ...                                  'min_weight_fraction_leaf': 0.0},
diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py
@@ -34,7 +34,7 @@ class StackedEnsembleClassifier(StackedEnsembleBase):
         >>> cg = ComponentGraph(component_graph)
         >>> assert cg.default_parameters == {
         ...     'Decision Tree Classifier': {'criterion': 'gini',
-        ...                                  'max_features': 'auto',
+        ...                                  'max_features': 'sqrt',
         ...                                  'max_depth': 6,
         ...                                  'min_samples_split': 2,
         ...                                  'min_weight_fraction_leaf': 0.0},
diff --git a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py
@@ -14,17 +14,15 @@ class DecisionTreeClassifier(Estimator):
         criterion ({"gini", "entropy"}): The function to measure the quality of a split.
             Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
             Defaults to "gini".
-        max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
+        max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
 
             - If int, then consider max_features features at each split.
             - If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
-            - If "auto", then max_features=sqrt(n_features).
             - If "sqrt", then max_features=sqrt(n_features).
             - If "log2", then max_features=log2(n_features).
             - If None, then max_features = n_features.
 
             The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
-            Defaults to "auto".
         max_depth (int): The maximum depth of the tree. Defaults to 6.
         min_samples_split (int or float): The minimum number of samples required to split an internal node:
 
@@ -40,12 +38,12 @@ class DecisionTreeClassifier(Estimator):
     name = "Decision Tree Classifier"
     hyperparameter_ranges = {
         "criterion": ["gini", "entropy"],
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }
     """{
         "criterion": ["gini", "entropy"],
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }"""
     model_family = ModelFamily.DECISION_TREE
@@ -66,7 +64,7 @@ class DecisionTreeClassifier(Estimator):
     def __init__(
         self,
         criterion="gini",
-        max_features="auto",
+        max_features="sqrt",
         max_depth=6,
         min_samples_split=2,
         min_weight_fraction_leaf=0.0,
diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py
@@ -12,17 +12,15 @@ class ExtraTreesClassifier(Estimator):
 
     Args:
         n_estimators (float): The number of trees in the forest. Defaults to 100.
-        max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
+        max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
 
             - If int, then consider max_features features at each split.
             - If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
-            - If "auto", then max_features=sqrt(n_features).
             - If "sqrt", then max_features=sqrt(n_features).
             - If "log2", then max_features=log2(n_features).
             - If None, then max_features = n_features.
 
             The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
-            Defaults to "auto".
         max_depth (int): The maximum depth of the tree. Defaults to 6.
         min_samples_split (int or float): The minimum number of samples required to split an internal node:
 
@@ -39,12 +37,12 @@ class ExtraTreesClassifier(Estimator):
     name = "Extra Trees Classifier"
     hyperparameter_ranges = {
         "n_estimators": Integer(10, 1000),
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }
     """{
         "n_estimators": Integer(10, 1000),
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }
     """
@@ -66,7 +64,7 @@ class ExtraTreesClassifier(Estimator):
     def __init__(
         self,
         n_estimators=100,
-        max_features="auto",
+        max_features="sqrt",
         max_depth=6,
         min_samples_split=2,
         min_weight_fraction_leaf=0.0,
diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py
@@ -1,11 +1,13 @@
 """K-Nearest Neighbors Classifier."""
 import numpy as np
+import pandas as pd
 from sklearn.neighbors import KNeighborsClassifier as SKKNeighborsClassifier
 from skopt.space import Integer
 
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
+from evalml.utils import infer_feature_types
 
 
 class KNeighborsClassifier(Estimator):
@@ -93,6 +95,34 @@ def __init__(
             random_seed=random_seed,
         )
 
+    def predict(self, X: pd.DataFrame) -> pd.Series:
+        """Make predictions using selected features.
+
+        Args:
+            X (pd.DataFrame): Data of shape [n_samples, n_features].
+
+        Returns:
+            pd.Series: Predicted values.
+        """
+        predictions = self._component_obj.predict(X.to_numpy())
+        predictions = infer_feature_types(predictions)
+        predictions.index = X.index
+        return predictions
+
+    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
+        """Make probability estimates for labels.
+
+        Args:
+            X (pd.DataFrame): Features.
+
+        Returns:
+            pd.Series: Probability estimates.
+        """
+        pred_proba = self._component_obj.predict_proba(X.to_numpy())
+        pred_proba = infer_feature_types(pred_proba)
+        pred_proba.index = X.index
+        return pred_proba
+
     @property
     def feature_importance(self):
         """Returns array of 0's matching the input number of features as feature_importance is not defined for KNN classifiers."""
diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py
@@ -18,11 +18,10 @@ class DecisionTreeRegressor(Estimator):
                 - "friedman_mse", which uses mean squared error with Friedman"s improvement score for potential splits
                 - "absolute_error" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node,
                 - "poisson" which uses reduction in Poisson deviance to find splits.
-        max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
+        max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
 
             - If int, then consider max_features features at each split.
             - If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
-            - If "auto", then max_features=sqrt(n_features).
             - If "sqrt", then max_features=sqrt(n_features).
             - If "log2", then max_features=log2(n_features).
             - If None, then max_features = n_features.
@@ -43,12 +42,12 @@ class DecisionTreeRegressor(Estimator):
     name = "Decision Tree Regressor"
     hyperparameter_ranges = {
         "criterion": ["squared_error", "friedman_mse", "absolute_error"],
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }
     """{
         "criterion": ["squared_error", "friedman_mse", "absolute_error"],
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }"""
     model_family = ModelFamily.DECISION_TREE
@@ -65,7 +64,7 @@ class DecisionTreeRegressor(Estimator):
     def __init__(
         self,
         criterion="squared_error",
-        max_features="auto",
+        max_features="sqrt",
         max_depth=6,
         min_samples_split=2,
         min_weight_fraction_leaf=0.0,
diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py
@@ -18,17 +18,15 @@ class ExtraTreesRegressor(Estimator):
 
     Args:
         n_estimators (float): The number of trees in the forest. Defaults to 100.
-        max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
+        max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
 
             - If int, then consider max_features features at each split.
             - If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
-            - If "auto", then max_features=sqrt(n_features).
             - If "sqrt", then max_features=sqrt(n_features).
             - If "log2", then max_features=log2(n_features).
             - If None, then max_features = n_features.
 
             The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
-            Defaults to "auto".
         max_depth (int): The maximum depth of the tree. Defaults to 6.
         min_samples_split (int or float): The minimum number of samples required to split an internal node:
 
@@ -45,12 +43,12 @@ class ExtraTreesRegressor(Estimator):
     name = "Extra Trees Regressor"
     hyperparameter_ranges = {
         "n_estimators": Integer(10, 1000),
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }
     """{
         "n_estimators": Integer(10, 1000),
-        "max_features": ["auto", "sqrt", "log2"],
+        "max_features": ["sqrt", "log2"],
         "max_depth": Integer(4, 10),
     }"""
     model_family = ModelFamily.EXTRA_TREES
@@ -67,7 +65,7 @@ class ExtraTreesRegressor(Estimator):
     def __init__(
         self,
         n_estimators: int = 100,
-        max_features: str = "auto",
+        max_features: str = "sqrt",
         max_depth: int = 6,
         min_samples_split: int = 2,
         min_weight_fraction_leaf: float = 0.0,
diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py
@@ -361,7 +361,7 @@ def generate_component_code(element):
 
     Examples:
         >>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
-        >>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'squared_error', 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
+        >>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
         ...
         >>> from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer
         >>> assert generate_component_code(SimpleImputer()) == "from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer\n\nsimpleImputer = SimpleImputer(**{'impute_strategy': 'most_frequent', 'fill_value': None})"
diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py
@@ -326,8 +326,8 @@ def test_describe_component():
     lr_classifier = LogisticRegressionClassifier()
     en_classifier = ElasticNetClassifier()
     en_regressor = ElasticNetRegressor()
-    et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
-    et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
+    et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="sqrt")
+    et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="sqrt")
     rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
     rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
     linear_regressor = LinearRegressor()
@@ -374,7 +374,7 @@ def test_describe_component():
         "name": "Extra Trees Classifier",
         "parameters": {
             "n_estimators": 10,
-            "max_features": "auto",
+            "max_features": "sqrt",
             "max_depth": 6,
             "min_samples_split": 2,
             "min_weight_fraction_leaf": 0.0,
@@ -385,7 +385,7 @@ def test_describe_component():
         "name": "Extra Trees Regressor",
         "parameters": {
             "n_estimators": 10,
-            "max_features": "auto",
+            "max_features": "sqrt",
             "max_depth": 6,
             "min_samples_split": 2,
             "min_weight_fraction_leaf": 0.0,
@@ -1615,7 +1615,7 @@ def test_generate_code():
 
     expected_code = (
         "from evalml.pipelines.components.estimators.regressors.et_regressor import ExtraTreesRegressor"
-        "\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})"
+        "\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})"
     )
     component_code = generate_component_code(ExtraTreesRegressor(n_estimators=50))
     assert component_code == expected_code
diff --git a/evalml/tests/component_tests/test_decision_tree_classifier.py b/evalml/tests/component_tests/test_decision_tree_classifier.py
@@ -22,7 +22,7 @@ def test_problem_types():
 def test_fit_predict_binary(X_y_binary):
     X, y = X_y_binary
 
-    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
+    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
     sk_clf.fit(X, y)
     y_pred_sk = sk_clf.predict(X)
     y_pred_proba_sk = sk_clf.predict_proba(X)
@@ -39,7 +39,7 @@ def test_fit_predict_binary(X_y_binary):
 def test_fit_predict_multi(X_y_multi):
     X, y = X_y_multi
 
-    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
+    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
     sk_clf.fit(X, y)
     y_pred_sk = sk_clf.predict(X)
     y_pred_proba_sk = sk_clf.predict_proba(X)
@@ -59,7 +59,7 @@ def test_feature_importance(X_y_binary):
     X, y = X_y_binary
 
     clf = DecisionTreeClassifier()
-    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
+    sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
     sk_clf.fit(X, y)
     sk_feature_importance = sk_clf.feature_importances_
 
diff --git a/evalml/tests/component_tests/test_decision_tree_regressor.py b/evalml/tests/component_tests/test_decision_tree_regressor.py
@@ -20,7 +20,7 @@ def test_problem_types():
 def test_fit_predict(X_y_regression):
     X, y = X_y_regression
 
-    sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0)
+    sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="sqrt", random_state=0)
     sk_clf.fit(X, y)
     y_pred_sk = sk_clf.predict(X)
 
@@ -36,7 +36,7 @@ def test_feature_importance(X_y_regression):
     X, y = X_y_regression
 
     clf = DecisionTreeRegressor()
-    sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0)
+    sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="sqrt", random_state=0)
     sk_clf.fit(X, y)
     sk_feature_importance = sk_clf.feature_importances_
 
diff --git a/evalml/tests/component_tests/test_et_classifier.py b/evalml/tests/component_tests/test_et_classifier.py
@@ -22,7 +22,7 @@ def test_problem_types():
 def test_fit_predict_binary(X_y_binary):
     X, y = X_y_binary
 
-    sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0)
+    sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0, max_features="sqrt")
     sk_clf.fit(X, y)
     y_pred_sk = sk_clf.predict(X)
     y_pred_proba_sk = sk_clf.predict_proba(X)
diff --git a/evalml/tests/component_tests/test_et_regressor.py b/evalml/tests/component_tests/test_et_regressor.py
@@ -20,7 +20,7 @@ def test_problem_types():
 def test_fit_predict(X_y_regression):
     X, y = X_y_regression
 
-    sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0)
+    sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0, max_features="sqrt")
     sk_clf.fit(X, y)
     y_pred_sk = sk_clf.predict(X)
 
@@ -36,7 +36,12 @@ def test_feature_importance(X_y_regression):
     X, y = X_y_regression
 
     clf = ExtraTreesRegressor(n_jobs=1)
-    sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0, n_jobs=1)
+    sk_clf = SKExtraTreesRegressor(
+        max_depth=6,
+        random_state=0,
+        n_jobs=1,
+        max_features="sqrt",
+    )
     sk_clf.fit(X, y)
     sk_feature_importance = sk_clf.feature_importances_
 
diff --git a/evalml/tests/component_tests/test_knn_classifier.py b/evalml/tests/component_tests/test_knn_classifier.py
diff --git a/evalml/tests/component_tests/test_oversampler.py b/evalml/tests/component_tests/test_oversampler.py
diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
diff --git a/evalml/tests/dependency_update_check/minimum_requirements.txt b/evalml/tests/dependency_update_check/minimum_requirements.txt
diff --git a/evalml/tests/dependency_update_check/minimum_test_requirements.txt b/evalml/tests/dependency_update_check/minimum_test_requirements.txt
diff --git a/evalml/tests/model_understanding_tests/test_metrics.py b/evalml/tests/model_understanding_tests/test_metrics.py
diff --git a/pyproject.toml b/pyproject.toml