Skip to content

Commit 0c224ea

Browse files
authored
Upgrade scikit-learn (#4248)
* Upgrade sktime and vowpalwabbit
* Update tree parameters to remove deprecated value
* Upgrade imbalanced-learn
* Workaround for knn bug in scikit-learn
* Adjust for roc threshold change
* Actually unpin and upgrade sklearn
1 parent 5e158ed commit 0c224ea

23 files changed

+98
-80
lines changed

.github/meta.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ outputs:
2828
- pandas >=1.5.0, <2.0.0
2929
- dask >=2022.2.0, !=2022.10.1
3030
- scipy >=1.5.0
31-
- scikit-learn >=1.2.2
31+
- scikit-learn >=1.3.0
3232
- scikit-optimize >=0.9.0
3333
- statsmodels >=0.12.2
3434
- colorama >=0.4.4
@@ -78,8 +78,8 @@ outputs:
7878
- lightgbm >=4.0.0
7979
- lime >=0.2.0.1
8080
- python >=3.8.*
81-
- imbalanced-learn >=0.9.1, <0.11.0
82-
- sktime >=0.17.0
81+
- imbalanced-learn >=0.11.0
82+
- sktime >=0.21.0
8383
- pmdarima >=1.8.5
8484
- vowpalwabbit >=8.11.0
8585
test:

core-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
numpy>=1.21.0
22
pandas>=1.5.0, <2.0.0
33
scipy>=1.5.0
4-
scikit-learn>=1.2.1
4+
scikit-learn>=1.3.0
55
scikit-optimize>=0.9.0
66
pyzmq>=20.0.0
77
colorama>=0.4.4

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Release Notes
77
* Changes
88
* Unpinned sktime version :pr:`4214`
99
* Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`
10+
* Upgraded minimum scikit-learn version to 1.3.0 and minimum imbalanced-learn version to 0.11.0 :pr:`4248`
1011
* Documentation Changes
1112
* Testing Changes
1213

evalml/pipelines/component_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class ComponentGraph:
7575
... 'max_depth': 6,
7676
... 'n_jobs': -1},
7777
... 'Decision Tree Classifier': {'criterion': 'gini',
78-
... 'max_features': 'auto',
78+
... 'max_features': 'sqrt',
7979
... 'max_depth': 6,
8080
... 'min_samples_split': 2,
8181
... 'min_weight_fraction_leaf': 0.0},

evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class StackedEnsembleClassifier(StackedEnsembleBase):
3434
>>> cg = ComponentGraph(component_graph)
3535
>>> assert cg.default_parameters == {
3636
... 'Decision Tree Classifier': {'criterion': 'gini',
37-
... 'max_features': 'auto',
37+
... 'max_features': 'sqrt',
3838
... 'max_depth': 6,
3939
... 'min_samples_split': 2,
4040
... 'min_weight_fraction_leaf': 0.0},

evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,15 @@ class DecisionTreeClassifier(Estimator):
1414
criterion ({"gini", "entropy"}): The function to measure the quality of a split.
1515
Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
1616
Defaults to "gini".
17-
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
17+
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
1818
1919
- If int, then consider max_features features at each split.
2020
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
21-
- If "auto", then max_features=sqrt(n_features).
2221
- If "sqrt", then max_features=sqrt(n_features).
2322
- If "log2", then max_features=log2(n_features).
2423
- If None, then max_features = n_features.
2524
2625
The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features. Defaults to "sqrt".
27-
Defaults to "auto".
2826
max_depth (int): The maximum depth of the tree. Defaults to 6.
2927
min_samples_split (int or float): The minimum number of samples required to split an internal node:
3028
@@ -40,12 +38,12 @@ class DecisionTreeClassifier(Estimator):
4038
name = "Decision Tree Classifier"
4139
hyperparameter_ranges = {
4240
"criterion": ["gini", "entropy"],
43-
"max_features": ["auto", "sqrt", "log2"],
41+
"max_features": ["sqrt", "log2"],
4442
"max_depth": Integer(4, 10),
4543
}
4644
"""{
4745
"criterion": ["gini", "entropy"],
48-
"max_features": ["auto", "sqrt", "log2"],
46+
"max_features": ["sqrt", "log2"],
4947
"max_depth": Integer(4, 10),
5048
}"""
5149
model_family = ModelFamily.DECISION_TREE
@@ -66,7 +64,7 @@ class DecisionTreeClassifier(Estimator):
6664
def __init__(
6765
self,
6866
criterion="gini",
69-
max_features="auto",
67+
max_features="sqrt",
7068
max_depth=6,
7169
min_samples_split=2,
7270
min_weight_fraction_leaf=0.0,

evalml/pipelines/components/estimators/classifiers/et_classifier.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,15 @@ class ExtraTreesClassifier(Estimator):
1212
1313
Args:
1414
n_estimators (int): The number of trees in the forest. Defaults to 100.
15-
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
15+
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
1616
1717
- If int, then consider max_features features at each split.
1818
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
19-
- If "auto", then max_features=sqrt(n_features).
2019
- If "sqrt", then max_features=sqrt(n_features).
2120
- If "log2", then max_features=log2(n_features).
2221
- If None, then max_features = n_features.
2322
2423
The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features. Defaults to "sqrt".
25-
Defaults to "auto".
2624
max_depth (int): The maximum depth of the tree. Defaults to 6.
2725
min_samples_split (int or float): The minimum number of samples required to split an internal node:
2826
@@ -39,12 +37,12 @@ class ExtraTreesClassifier(Estimator):
3937
name = "Extra Trees Classifier"
4038
hyperparameter_ranges = {
4139
"n_estimators": Integer(10, 1000),
42-
"max_features": ["auto", "sqrt", "log2"],
40+
"max_features": ["sqrt", "log2"],
4341
"max_depth": Integer(4, 10),
4442
}
4543
"""{
4644
"n_estimators": Integer(10, 1000),
47-
"max_features": ["auto", "sqrt", "log2"],
45+
"max_features": ["sqrt", "log2"],
4846
"max_depth": Integer(4, 10),
4947
}
5048
"""
@@ -66,7 +64,7 @@ class ExtraTreesClassifier(Estimator):
6664
def __init__(
6765
self,
6866
n_estimators=100,
69-
max_features="auto",
67+
max_features="sqrt",
7068
max_depth=6,
7169
min_samples_split=2,
7270
min_weight_fraction_leaf=0.0,

evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""K-Nearest Neighbors Classifier."""
22
import numpy as np
3+
import pandas as pd
34
from sklearn.neighbors import KNeighborsClassifier as SKKNeighborsClassifier
45
from skopt.space import Integer
56

67
from evalml.model_family import ModelFamily
78
from evalml.pipelines.components.estimators import Estimator
89
from evalml.problem_types import ProblemTypes
10+
from evalml.utils import infer_feature_types
911

1012

1113
class KNeighborsClassifier(Estimator):
@@ -93,6 +95,34 @@ def __init__(
9395
random_seed=random_seed,
9496
)
9597

98+
def predict(self, X: pd.DataFrame) -> pd.Series:
99+
"""Make predictions using selected features.
100+
101+
Args:
102+
X (pd.DataFrame): Data of shape [n_samples, n_features].
103+
104+
Returns:
105+
pd.Series: Predicted values.
106+
"""
107+
predictions = self._component_obj.predict(X.to_numpy())
108+
predictions = infer_feature_types(predictions)
109+
predictions.index = X.index
110+
return predictions
111+
112+
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
113+
"""Make probability estimates for labels.
114+
115+
Args:
116+
X (pd.DataFrame): Features.
117+
118+
Returns:
119+
pd.Series: Probability estimates.
120+
"""
121+
pred_proba = self._component_obj.predict_proba(X.to_numpy())
122+
pred_proba = infer_feature_types(pred_proba)
123+
pred_proba.index = X.index
124+
return pred_proba
125+
96126
@property
97127
def feature_importance(self):
98128
"""Returns array of 0's matching the input number of features as feature_importance is not defined for KNN classifiers."""

evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@ class DecisionTreeRegressor(Estimator):
1818
- "friedman_mse", which uses mean squared error with Friedman's improvement score for potential splits
1919
- "absolute_error" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node,
2020
- "poisson" which uses reduction in Poisson deviance to find splits.
21-
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
21+
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
2222
2323
- If int, then consider max_features features at each split.
2424
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
25-
- If "auto", then max_features=sqrt(n_features).
2625
- If "sqrt", then max_features=sqrt(n_features).
2726
- If "log2", then max_features=log2(n_features).
2827
- If None, then max_features = n_features.
@@ -43,12 +42,12 @@ class DecisionTreeRegressor(Estimator):
4342
name = "Decision Tree Regressor"
4443
hyperparameter_ranges = {
4544
"criterion": ["squared_error", "friedman_mse", "absolute_error"],
46-
"max_features": ["auto", "sqrt", "log2"],
45+
"max_features": ["sqrt", "log2"],
4746
"max_depth": Integer(4, 10),
4847
}
4948
"""{
5049
"criterion": ["squared_error", "friedman_mse", "absolute_error"],
51-
"max_features": ["auto", "sqrt", "log2"],
50+
"max_features": ["sqrt", "log2"],
5251
"max_depth": Integer(4, 10),
5352
}"""
5453
model_family = ModelFamily.DECISION_TREE
@@ -65,7 +64,7 @@ class DecisionTreeRegressor(Estimator):
6564
def __init__(
6665
self,
6766
criterion="squared_error",
68-
max_features="auto",
67+
max_features="sqrt",
6968
max_depth=6,
7069
min_samples_split=2,
7170
min_weight_fraction_leaf=0.0,

evalml/pipelines/components/estimators/regressors/et_regressor.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,15 @@ class ExtraTreesRegressor(Estimator):
1818
1919
Args:
2020
n_estimators (int): The number of trees in the forest. Defaults to 100.
21-
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
21+
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:
2222
2323
- If int, then consider max_features features at each split.
2424
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
25-
- If "auto", then max_features=sqrt(n_features).
2625
- If "sqrt", then max_features=sqrt(n_features).
2726
- If "log2", then max_features=log2(n_features).
2827
- If None, then max_features = n_features.
2928
3029
The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features. Defaults to "sqrt".
31-
Defaults to "auto".
3230
max_depth (int): The maximum depth of the tree. Defaults to 6.
3331
min_samples_split (int or float): The minimum number of samples required to split an internal node:
3432
@@ -45,12 +43,12 @@ class ExtraTreesRegressor(Estimator):
4543
name = "Extra Trees Regressor"
4644
hyperparameter_ranges = {
4745
"n_estimators": Integer(10, 1000),
48-
"max_features": ["auto", "sqrt", "log2"],
46+
"max_features": ["sqrt", "log2"],
4947
"max_depth": Integer(4, 10),
5048
}
5149
"""{
5250
"n_estimators": Integer(10, 1000),
53-
"max_features": ["auto", "sqrt", "log2"],
51+
"max_features": ["sqrt", "log2"],
5452
"max_depth": Integer(4, 10),
5553
}"""
5654
model_family = ModelFamily.EXTRA_TREES
@@ -67,7 +65,7 @@ class ExtraTreesRegressor(Estimator):
6765
def __init__(
6866
self,
6967
n_estimators: int = 100,
70-
max_features: str = "auto",
68+
max_features: str = "sqrt",
7169
max_depth: int = 6,
7270
min_samples_split: int = 2,
7371
min_weight_fraction_leaf: float = 0.0,

0 commit comments

Comments
 (0)