
Commit feae8fc

GPBoost support (#26)
* gpboost clf
* add GPBoost

1 parent 85cd746

6 files changed: +419 -1 lines

setup.cfg

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ install_requires =
     pydantic
     jupyter
     notebook
+    gpboost
     importlib-metadata; python_version<"3.8"
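For context on the new dependency: a minimal sketch of the sklearn-style gpboost API that the plugins below wrap. Every import, constructor argument, and method used here appears in this diff; the concrete values are illustrative only.

# Minimal sketch of the gpboost sklearn-style API (illustrative values).
from gpboost import GPBoostClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
model = GPBoostClassifier(boosting_type="gbdt", n_estimators=50, learning_rate=0.1)
model.fit(X, y)
print(model.predict_proba(X)[:3])  # class probabilities, one row per sample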

src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py

Lines changed: 121 additions & 0 deletions
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.classifiers.base as base


class GPBoostPlugin(base.ClassifierPlugin):
    """Classification plugin based on the GPBoost classifier.

    Args:
        n_estimators: int
            The maximum number of estimators at which boosting is terminated.
        max_depth: int
            Maximum depth of a tree.
        reg_lambda: float
            L2 regularization term on weights.
        reg_alpha: float
            L1 regularization term on weights.
        colsample_bytree: float
            Subsample ratio of columns when constructing each tree.
        subsample: float
            Subsample ratio of the training instances.
        learning_rate: float
            Boosting learning rate.
        boosting_type: int
            Index of the boosting type to use: gbdt, goss or dart.
        min_child_weight: float
            Minimum sum of instance weight (hessian) needed in a child.
        random_state: int
            Random number seed.

    Example:
        >>> from hyperimpute.plugins.prediction import Predictions
        >>> plugin = Predictions(category="classifiers").get("gpboost")
        >>> from sklearn.datasets import load_iris
        >>> X, y = load_iris(return_X_y=True)
        >>> plugin.fit_predict(X, y)
    """

    boosting_type = ["gbdt", "goss", "dart"]

    def __init__(
        self,
        boosting_type: int = 0,
        max_depth: Optional[int] = 3,
        n_estimators: int = 100,
        reg_lambda: float = 0,
        reg_alpha: float = 0,
        colsample_bytree: float = 1.0,
        subsample: float = 1.0,
        learning_rate: float = 1e-3,
        min_child_weight: float = 0.001,
        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
        random_state: int = 0,
        hyperparam_search_iterations: Optional[int] = None,
        **kwargs: Any
    ) -> None:
        super().__init__(**kwargs)
        if hyperparam_search_iterations:
            n_estimators = int(hyperparam_search_iterations)

        self.model = GPBoostClassifier(
            boosting_type=GPBoostPlugin.boosting_type[boosting_type],
            n_estimators=n_estimators,
            max_depth=max_depth,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            random_state=random_state,
            n_jobs=n_jobs,
            **kwargs,
        )

    @staticmethod
    def name() -> str:
        return "gpboost"

    @staticmethod
    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
        return [
            params.Float("reg_lambda", 1e-3, 10.0),
            params.Float("reg_alpha", 1e-3, 10.0),
            params.Float("colsample_bytree", 0.1, 0.9),
            params.Float("subsample", 0.1, 0.9),
            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
            params.Integer("max_depth", 2, 5),
            params.Integer("n_estimators", 10, 300),
            params.Integer("min_child_weight", 0, 300),
            params.Integer("boosting_type", 0, len(GPBoostPlugin.boosting_type) - 1),
        ]

    def _fit(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> "GPBoostPlugin":
        # Encode arbitrary labels to integer classes before fitting.
        y = np.asarray(args[0])
        self.encoder = LabelEncoder()
        y = self.encoder.fit_transform(y)
        self.model.fit(X, y, **kwargs)
        return self

    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
        # Map integer predictions back to the original label space.
        return self.encoder.inverse_transform(self.model.predict(X, *args, **kwargs))

    def _predict_proba(
        self, X: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> pd.DataFrame:
        return self.model.predict_proba(X, *args, **kwargs)


plugin = GPBoostPlugin
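A note on the design: the booster expects numeric class labels, so _fit label-encodes y and _predict inverts the encoding before returning results. A standalone sketch of that round trip (sklearn only, not part of the commit):

# Round trip performed by GPBoostPlugin._fit / _predict (sketch only).
import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array(["cat", "dog", "cat", "bird"])
enc = LabelEncoder()
y_int = enc.fit_transform(y)           # [1, 2, 1, 0]: integer classes for the booster
y_back = enc.inverse_transform(y_int)  # original string labels for the caller
assert (y == y_back).all()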
src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py

Lines changed: 115 additions & 0 deletions
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostRegressor
import pandas as pd

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.regression.base as base


class GPBoostRegressorPlugin(base.RegressionPlugin):
    """Regression plugin based on the GPBoost regressor.

    Args:
        n_estimators: int
            The maximum number of estimators at which boosting is terminated.
        max_depth: int
            Maximum depth of a tree.
        reg_lambda: float
            L2 regularization term on weights.
        reg_alpha: float
            L1 regularization term on weights.
        colsample_bytree: float
            Subsample ratio of columns when constructing each tree.
        subsample: float
            Subsample ratio of the training instances.
        learning_rate: float
            Boosting learning rate.
        boosting_type: int
            Index of the boosting type to use: gbdt, goss or dart.
        min_child_weight: float
            Minimum sum of instance weight (hessian) needed in a child.
        random_state: int
            Random number seed.

    Example:
        >>> from hyperimpute.plugins.prediction import Predictions
        >>> plugin = Predictions(category="regression").get("gpboost_regressor")
        >>> from sklearn.datasets import load_iris
        >>> X, y = load_iris(return_X_y=True)
        >>> plugin.fit_predict(X, y)
    """

    boosting_type = ["gbdt", "goss", "dart"]

    def __init__(
        self,
        boosting_type: int = 0,
        max_depth: Optional[int] = -1,
        n_estimators: int = 100,
        reg_lambda: float = 0,
        reg_alpha: float = 0,
        colsample_bytree: float = 1.0,
        subsample: float = 1.0,
        learning_rate: float = 1e-3,
        min_child_weight: float = 0.001,
        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
        random_state: int = 0,
        hyperparam_search_iterations: Optional[int] = None,
        **kwargs: Any
    ) -> None:
        super().__init__(**kwargs)
        if hyperparam_search_iterations:
            n_estimators = int(hyperparam_search_iterations)

        self.model = GPBoostRegressor(
            boosting_type=GPBoostRegressorPlugin.boosting_type[boosting_type],
            n_estimators=n_estimators,
            max_depth=max_depth,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            random_state=random_state,
            n_jobs=n_jobs,
            **kwargs,
        )

    @staticmethod
    def name() -> str:
        return "gpboost_regressor"

    @staticmethod
    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
        return [
            params.Float("reg_lambda", 1e-3, 10.0),
            params.Float("reg_alpha", 1e-3, 10.0),
            params.Float("colsample_bytree", 0.1, 0.9),
            params.Float("subsample", 0.1, 0.9),
            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
            params.Integer("max_depth", 2, 5),
            params.Integer("n_estimators", 10, 300),
            params.Integer("min_child_weight", 0, 300),
            params.Integer(
                "boosting_type", 0, len(GPBoostRegressorPlugin.boosting_type) - 1
            ),
        ]

    def _fit(
        self, X: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> "GPBoostRegressorPlugin":
        # Regression targets need no label encoding; fit directly.
        self.model.fit(X, *args, **kwargs)
        return self

    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
        return self.model.predict(X, *args, **kwargs)


plugin = GPBoostRegressorPlugin
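A hedged usage sketch for the regression plugin, mirroring the docstring example; load_diabetes is substituted here for a genuinely continuous target (the docstring's load_iris call runs, but iris is a classification dataset):

# Usage sketch of the new regression plugin (load_diabetes is our substitution).
from sklearn.datasets import load_diabetes

from hyperimpute.plugins.prediction import Predictions

X, y = load_diabetes(return_X_y=True)
reg = Predictions(category="regression").get("gpboost_regressor")
print(reg.fit_predict(X, y)[:5])  # in-sample predictions for the first rows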

src/hyperimpute/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
Lines changed: 97 additions & 0 deletions
# stdlib
from typing import Any

# third party
import numpy as np
import optuna
import pytest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# hyperimpute absolute
from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
from hyperimpute.utils.serialization import load_model, save_model
from hyperimpute.utils.tester import evaluate_estimator


def from_api() -> PredictionPlugin:
    return Predictions().get("gpboost")


def from_module() -> PredictionPlugin:
    return plugin()


def from_pickle() -> PredictionPlugin:
    # Round-trip the plugin through serialization to test pickling support.
    buff = save_model(plugin())
    return load_model(buff)


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
    assert test_plugin is not None


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
    assert test_plugin.name() == "gpboost"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None:
    assert test_plugin.type() == "prediction"
    assert test_plugin.subtype() == "classifier"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None:
    assert len(test_plugin.hyperparameter_space()) == 9


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None:
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    y_pred = test_plugin.fit(X_train, y_train).predict(X_test)

    assert np.abs(np.subtract(y_pred.values, y_test.values)).mean() < 1


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_score(test_plugin: PredictionPlugin) -> None:
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    test_plugin.fit(X_train, y_train)

    assert test_plugin.score(X_test, y_test) > 0.5


def test_param_search() -> None:
    if len(plugin.hyperparameter_space()) == 0:
        return

    X, y = load_iris(return_X_y=True, as_frame=True)

    def evaluate_args(**kwargs: Any) -> float:
        # Cap the number of estimators so each trial stays fast.
        kwargs["n_estimators"] = 10

        model = plugin(**kwargs)
        metrics = evaluate_estimator(model, X, y)

        return metrics["clf"]["aucroc"][0]

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        return evaluate_args(**args)

    study = optuna.create_study(
        load_if_exists=True,
        directions=["maximize"],
        study_name=f"test_param_search_{plugin.name()}",
    )
    study.optimize(objective, n_trials=10, timeout=60)

    assert len(study.trials) == 10