
Commit feae8fc

GPBoost support (#26)
* gpboost clf
* add GPBoost

1 parent 85cd746

6 files changed: +419 -1 lines

setup.cfg

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ install_requires =
     pydantic
     jupyter
     notebook
+    gpboost
     importlib-metadata; python_version<"3.8"
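For context on the new dependency: a minimal sketch of the sklearn-style gpboost API that the plugins below wrap. Every import, constructor argument, and method used here appears in this diff; the concrete values are illustrative only.

# Minimal sketch of the gpboost sklearn-style API (illustrative values).
from gpboost import GPBoostClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
model = GPBoostClassifier(boosting_type="gbdt", n_estimators=50, learning_rate=0.1)
model.fit(X, y)
print(model.predict_proba(X)[:3])  # class probabilities, one row per sample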

src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py

Lines changed: 121 additions & 0 deletions
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.classifiers.base as base


class GPBoostPlugin(base.ClassifierPlugin):
    """Classification plugin based on the GPBoost classifier.

    Args:
        n_estimators: int
            The maximum number of estimators at which boosting is terminated.
        max_depth: int
            Maximum depth of a tree.
        reg_lambda: float
            L2 regularization term on weights.
        reg_alpha: float
            L1 regularization term on weights.
        colsample_bytree: float
            Subsample ratio of columns when constructing each tree.
        subsample: float
            Subsample ratio of the training instances.
        learning_rate: float
            Boosting learning rate.
        boosting_type: int
            Index of the boosting type to use: gbdt, goss or dart.
        min_child_weight: float
            Minimum sum of instance weight (hessian) needed in a child.
        random_state: int
            Random number seed.

    Example:
        >>> from hyperimpute.plugins.prediction import Predictions
        >>> plugin = Predictions(category="classifiers").get("gpboost")
        >>> from sklearn.datasets import load_iris
        >>> X, y = load_iris(return_X_y=True)
        >>> plugin.fit_predict(X, y)
    """

    boosting_type = ["gbdt", "goss", "dart"]

    def __init__(
        self,
        boosting_type: int = 0,
        max_depth: Optional[int] = 3,
        n_estimators: int = 100,
        reg_lambda: float = 0,
        reg_alpha: float = 0,
        colsample_bytree: float = 1.0,
        subsample: float = 1.0,
        learning_rate: float = 1e-3,
        min_child_weight: float = 0.001,
        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
        random_state: int = 0,
        hyperparam_search_iterations: Optional[int] = None,
        **kwargs: Any
    ) -> None:
        super().__init__(**kwargs)
        if hyperparam_search_iterations:
            n_estimators = int(hyperparam_search_iterations)

        self.model = GPBoostClassifier(
            boosting_type=GPBoostPlugin.boosting_type[boosting_type],
            n_estimators=n_estimators,
            max_depth=max_depth,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            random_state=random_state,
            n_jobs=n_jobs,
            **kwargs,
        )

    @staticmethod
    def name() -> str:
        return "gpboost"

    @staticmethod
    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
        return [
            params.Float("reg_lambda", 1e-3, 10.0),
            params.Float("reg_alpha", 1e-3, 10.0),
            params.Float("colsample_bytree", 0.1, 0.9),
            params.Float("subsample", 0.1, 0.9),
            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
            params.Integer("max_depth", 2, 5),
            params.Integer("n_estimators", 10, 300),
            params.Integer("min_child_weight", 0, 300),
            params.Integer("boosting_type", 0, len(GPBoostPlugin.boosting_type) - 1),
        ]

    def _fit(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> "GPBoostPlugin":
        # Encode arbitrary labels to integer classes before fitting.
        y = np.asarray(args[0])
        self.encoder = LabelEncoder()
        y = self.encoder.fit_transform(y)
        self.model.fit(X, y, **kwargs)
        return self

    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
        # Map integer predictions back to the original label space.
        return self.encoder.inverse_transform(self.model.predict(X, *args, **kwargs))

    def _predict_proba(
        self, X: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> pd.DataFrame:
        return self.model.predict_proba(X, *args, **kwargs)


plugin = GPBoostPlugin
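A note on the design: the booster expects numeric class labels, so _fit label-encodes y and _predict inverts the encoding before returning results. A standalone sketch of that round trip (sklearn only, not part of the commit):

# Round trip performed by GPBoostPlugin._fit / _predict (sketch only).
import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array(["cat", "dog", "cat", "bird"])
enc = LabelEncoder()
y_int = enc.fit_transform(y)           # [1, 2, 1, 0]: integer classes for the booster
y_back = enc.inverse_transform(y_int)  # original string labels for the caller
assert (y == y_back).all()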
src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py

Lines changed: 115 additions & 0 deletions
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostRegressor
import pandas as pd

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.regression.base as base


class GPBoostRegressorPlugin(base.RegressionPlugin):
    """Regression plugin based on the GPBoost regressor.

    Args:
        n_estimators: int
            The maximum number of estimators at which boosting is terminated.
        max_depth: int
            Maximum depth of a tree.
        reg_lambda: float
            L2 regularization term on weights.
        reg_alpha: float
            L1 regularization term on weights.
        colsample_bytree: float
            Subsample ratio of columns when constructing each tree.
        subsample: float
            Subsample ratio of the training instances.
        learning_rate: float
            Boosting learning rate.
        boosting_type: int
            Index of the boosting type to use: gbdt, goss or dart.
        min_child_weight: float
            Minimum sum of instance weight (hessian) needed in a child.
        random_state: int
            Random number seed.

    Example:
        >>> from hyperimpute.plugins.prediction import Predictions
        >>> plugin = Predictions(category="regression").get("gpboost_regressor")
        >>> from sklearn.datasets import load_iris
        >>> X, y = load_iris(return_X_y=True)
        >>> plugin.fit_predict(X, y)
    """

    boosting_type = ["gbdt", "goss", "dart"]

    def __init__(
        self,
        boosting_type: int = 0,
        max_depth: Optional[int] = -1,
        n_estimators: int = 100,
        reg_lambda: float = 0,
        reg_alpha: float = 0,
        colsample_bytree: float = 1.0,
        subsample: float = 1.0,
        learning_rate: float = 1e-3,
        min_child_weight: float = 0.001,
        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
        random_state: int = 0,
        hyperparam_search_iterations: Optional[int] = None,
        **kwargs: Any
    ) -> None:
        super().__init__(**kwargs)
        if hyperparam_search_iterations:
            n_estimators = int(hyperparam_search_iterations)

        self.model = GPBoostRegressor(
            boosting_type=GPBoostRegressorPlugin.boosting_type[boosting_type],
            n_estimators=n_estimators,
            max_depth=max_depth,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            random_state=random_state,
            n_jobs=n_jobs,
            **kwargs,
        )

    @staticmethod
    def name() -> str:
        return "gpboost_regressor"

    @staticmethod
    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
        return [
            params.Float("reg_lambda", 1e-3, 10.0),
            params.Float("reg_alpha", 1e-3, 10.0),
            params.Float("colsample_bytree", 0.1, 0.9),
            params.Float("subsample", 0.1, 0.9),
            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
            params.Integer("max_depth", 2, 5),
            params.Integer("n_estimators", 10, 300),
            params.Integer("min_child_weight", 0, 300),
            params.Integer(
                "boosting_type", 0, len(GPBoostRegressorPlugin.boosting_type) - 1
            ),
        ]

    def _fit(
        self, X: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> "GPBoostRegressorPlugin":
        # Regression targets need no label encoding; fit directly.
        self.model.fit(X, *args, **kwargs)
        return self

    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
        return self.model.predict(X, *args, **kwargs)


plugin = GPBoostRegressorPlugin
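A hedged usage sketch for the regression plugin, mirroring the docstring example; load_diabetes is substituted here for a genuinely continuous target (the docstring's load_iris call runs, but iris is a classification dataset):

# Usage sketch of the new regression plugin (load_diabetes is our substitution).
from sklearn.datasets import load_diabetes

from hyperimpute.plugins.prediction import Predictions

X, y = load_diabetes(return_X_y=True)
reg = Predictions(category="regression").get("gpboost_regressor")
print(reg.fit_predict(X, y)[:5])  # in-sample predictions for the first rows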

src/hyperimpute/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
Lines changed: 97 additions & 0 deletions
# stdlib
from typing import Any

# third party
import numpy as np
import optuna
import pytest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# hyperimpute absolute
from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
from hyperimpute.utils.serialization import load_model, save_model
from hyperimpute.utils.tester import evaluate_estimator


def from_api() -> PredictionPlugin:
    return Predictions().get("gpboost")


def from_module() -> PredictionPlugin:
    return plugin()


def from_pickle() -> PredictionPlugin:
    # Round-trip the plugin through serialization to test pickling support.
    buff = save_model(plugin())
    return load_model(buff)


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
    assert test_plugin is not None


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
    assert test_plugin.name() == "gpboost"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None:
    assert test_plugin.type() == "prediction"
    assert test_plugin.subtype() == "classifier"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None:
    assert len(test_plugin.hyperparameter_space()) == 9


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None:
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    y_pred = test_plugin.fit(X_train, y_train).predict(X_test)

    assert np.abs(np.subtract(y_pred.values, y_test.values)).mean() < 1


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_score(test_plugin: PredictionPlugin) -> None:
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    test_plugin.fit(X_train, y_train)

    assert test_plugin.score(X_test, y_test) > 0.5


def test_param_search() -> None:
    if len(plugin.hyperparameter_space()) == 0:
        return

    X, y = load_iris(return_X_y=True, as_frame=True)

    def evaluate_args(**kwargs: Any) -> float:
        # Cap the number of estimators so each trial stays fast.
        kwargs["n_estimators"] = 10

        model = plugin(**kwargs)
        metrics = evaluate_estimator(model, X, y)

        return metrics["clf"]["aucroc"][0]

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        return evaluate_args(**args)

    study = optuna.create_study(
        load_if_exists=True,
        directions=["maximize"],
        study_name=f"test_param_search_{plugin.name()}",
    )
    study.optimize(objective, n_trials=10, timeout=60)

    assert len(study.trials) == 10