1 change: 1 addition & 0 deletions geochemistrypi/data_mining/constants.py
@@ -41,6 +41,7 @@
"SGD Regression",
"BayesianRidge Regression",
"Ridge Regression",
"AdaBoost"
# "Bagging Regression",
# "Decision Tree",
# Histogram-based Gradient Boosting,
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/_base.py
@@ -93,7 +93,7 @@ def image_config(self):
"label": "all", # Whether to show informative labels for impurity, etc
"filled": True, # color filling
"impurity": True, # When set to True, show the impurity at each node
"node_ids": None, # When set to True, show the ID number on each node
"node_ids": False, # When set to True, show the ID number on each node
"proportion": False, # When set to True, change the display of ‘values’ and/or ‘samples’ to be proportions and percentages respectively
"rounded": True, # When set to True, draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman
"precision": 3, # Number of digits of precision for floating point in the values of impurity, threshold and value attributes of each node
32 changes: 32 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_regression/_ada_boost.py
@@ -0,0 +1,32 @@
from typing import Dict

from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input


def ada_boost_manual_hyper_parameters() -> Dict:
"""Manually set hyperparameters for AdaBoost.

Returns
-------
hyper_parameters : dict
The hyperparameters.
"""

print("N Estimators: The maximum number of estimators at which boosting is terminated. A good starting range could be between 50 and 500, such as 100.")
n_estimators = num_input(SECTION[2], "@N Estimators: ")
print("Learning Rate: A higher learning rate increases the contribution of each regressor. A good starting range could be between 0.01 and 1.0, such as 0.1.")
learning_rate = float_input(0.1, SECTION[2], "@Learning Rate: ")
print("Loss: The loss function to use when updating the weights after each boosting iteration. It is generally recommended to leave it as 'linear'.")
losses = ["linear", "square", "exponential"]
loss = str_input(losses, SECTION[2])

hyper_parameters = {
"n_estimators": n_estimators,
"learning_rate": learning_rate,
"loss": loss,
}

return hyper_parameters
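For reference, a hypothetical run of this prompt sequence (the values are illustrative, not defaults) yields a plain dict that the process layer unpacks into the workflow constructor:

```python
# Hypothetical session output; values are illustrative only.
from geochemistrypi.data_mining.model.func.algo_regression._ada_boost import ada_boost_manual_hyper_parameters

hyper_parameters = ada_boost_manual_hyper_parameters()
# e.g. {"n_estimators": 100, "learning_rate": 0.1, "loss": "linear"}

# process/regress.py then forwards these keys to AdaBoostRegression(...).
```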
5 changes: 5 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_regression/_enum.py
@@ -81,3 +81,8 @@ class GradientBoostingSpecialFunction(Enum):

class PolynomialSpecialFunction(Enum):
POLYNOMIAL_REGRESSION_FORMULA = "Polynomial Regression Formula"


class AdaBoostSpecialFunction(Enum):
FEATURE_IMPORTANCE_DIAGRAM = "Feature Importance Diagram"
SINGLE_TREE_DIAGRAM = "Single Tree Diagram"
177 changes: 176 additions & 1 deletion geochemistrypi/data_mining/model/regression.py
@@ -10,7 +10,7 @@
from flaml import AutoML
from multipledispatch import dispatch
from rich import print
- from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
+ from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
@@ -21,11 +21,13 @@
from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, RAY_FLAML
from ..utils.base import clear_output, save_data, save_data_without_data_identifier, save_fig, save_text
from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
from .func.algo_regression._ada_boost import ada_boost_manual_hyper_parameters
from .func.algo_regression._bayesianridge_regression import bayesian_ridge_manual_hyper_parameters
from .func.algo_regression._common import cross_validation, plot_predicted_vs_actual, plot_residuals, score
from .func.algo_regression._decision_tree import decision_tree_manual_hyper_parameters
from .func.algo_regression._elastic_net import elastic_net_manual_hyper_parameters
from .func.algo_regression._enum import (
AdaBoostSpecialFunction,
ClassicalLinearSpecialFunction,
DecisionTreeSpecialFunction,
ElasticNetSpecialFunction,
@@ -4532,3 +4534,176 @@ def special_components(self, is_automl: bool, **kwargs) -> None:
)
else:
pass


class AdaBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using AdaBoost Regression algorithm to make insightful products."""

name = "AdaBoost"
special_function = ["Feature Importance Diagram", "Single Tree Diagram"]

def __init__(
self,
estimator: object = None,
*,
n_estimators: int = 50,
learning_rate: float = 1.0,
loss: str = "linear",
random_state: Optional[int] = None,
) -> None:
"""Parameters
----------
estimator : object, default=None
The base estimator from which the boosted ensemble is built.
If ``None``, then the base estimator is
:class:`~sklearn.tree.DecisionTreeRegressor` initialized with
`max_depth=3`.

.. versionadded:: 1.2
`base_estimator` was renamed to `estimator`.

n_estimators : int, default=50
The maximum number of estimators at which boosting is terminated.
In case of perfect fit, the learning procedure is stopped early.
Values must be in the range `[1, inf)`.

learning_rate : float, default=1.0
Weight applied to each regressor at each boosting iteration. A higher
learning rate increases the contribution of each regressor. There is
a trade-off between the `learning_rate` and `n_estimators` parameters.
Values must be in the range `(0.0, inf)`.

loss : {'linear', 'square', 'exponential'}, default='linear'
The loss function to use when updating the weights after each
boosting iteration.

random_state : int, RandomState instance or None, default=None
Controls the random seed given at each `estimator` at each
boosting iteration.
Thus, it is only used when `estimator` exposes a `random_state`.
In addition, it controls the bootstrap of the weights used to train the
`estimator` at each boosting iteration.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

References
----------
Scikit-learn API: sklearn.ensemble.AdaBoostRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
"""
super().__init__()
self.estimator = estimator
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.loss = loss
self.random_state = random_state

self.model = AdaBoostRegressor(
estimator=self.estimator,
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
loss=self.loss,
random_state=self.random_state,
)

self.naming = AdaBoostRegression.name
self.customized = True
self.customized_name = "AdaBoost"

@property
def settings(self) -> Dict:
"""The configuration of AdaBoost Regression to implement AutoML by FLAML framework."""
configuration = {
"time_budget": 10, # total running time in seconds
"metric": "r2",
"estimator_list": [self.customized_name], # list of ML learners
"task": "regression", # task type
}
return configuration

@property
def customization(self) -> object:
"""The customized Adaboost Regression of FLAML framework."""
from flaml import tune
from flaml.data import REGRESSION
from flaml.model import SKLearnEstimator
from sklearn.ensemble import AdaBoostRegressor

class MyAdaBoostRegression(SKLearnEstimator):
def __init__(self, task="regression", n_jobs=None, **config):
super().__init__(task, **config)
if task in REGRESSION:
self.estimator_class = AdaBoostRegressor

@classmethod
def search_space(cls, data_size, task):
space = {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=512),
"init_value": 50,
},
"learning_rate": {
"domain": tune.loguniform(lower=0.001, upper=1.0),
"init_value": 0.1,
},
}
return space

return MyAdaBoostRegression
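For readers unfamiliar with FLAML's custom-learner hook, the `settings` and `customization` pair is consumed roughly as below. This is a sketch with toy data, assuming `MyAdaBoostRegression` (the class returned by `customization`) is in scope; the project's real call site lives in the workflow base class:

```python
# Sketch of how FLAML consumes the custom learner and settings above.
from flaml import AutoML
from sklearn.datasets import make_regression

X_train, y_train = make_regression(n_samples=200, n_features=5, random_state=0)

automl = AutoML()
# Register the custom estimator under the same name used in estimator_list.
automl.add_learner(learner_name="AdaBoost", learner_class=MyAdaBoostRegression)
automl.fit(
    X_train=X_train,
    y_train=y_train,
    time_budget=10,                # mirrors settings["time_budget"]
    metric="r2",                   # mirrors settings["metric"]
    estimator_list=["AdaBoost"],   # mirrors settings["estimator_list"]
    task="regression",             # mirrors settings["task"]
)
print(automl.best_config)  # tuned n_estimators / learning_rate from search_space
```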

@classmethod
def manual_hyper_parameters(cls) -> Dict:
"""Manual hyper-parameters specification."""
print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
hyper_parameters = ada_boost_manual_hyper_parameters()
clear_output()
return hyper_parameters

@dispatch()
def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostRegression.X_train,
name_column=RegressionWorkflowBase.name_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
func_name=AdaBoostSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.model.estimators_[0],
image_config=self.image_config,
algorithm_name=self.naming,
func_name=AdaBoostSpecialFunction.SINGLE_TREE_DIAGRAM.value,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)

@dispatch(bool)
def special_components(self, is_automl: bool, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostRegression.X_train,
name_column=RegressionWorkflowBase.name_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
func_name=AdaBoostSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.auto_model.estimators_[0],  # AdaBoostRegressor.estimators_ is a flat list of fitted sub-estimators
image_config=self.image_config,
algorithm_name=self.naming,
func_name=AdaBoostSpecialFunction.SINGLE_TREE_DIAGRAM.value,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
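Note: the two `special_components` overloads rely on `multipledispatch` choosing an implementation from the call-site argument types. A minimal standalone sketch of that mechanism (toy class, not the project's code):

```python
# Sketch of the multipledispatch pattern used by special_components.
from multipledispatch import dispatch

class Workflow:
    @dispatch()
    def special_components(self, **kwargs):
        return "scikit-learn path"  # chosen by workflow.special_components()

    @dispatch(bool)
    def special_components(self, is_automl, **kwargs):
        return "FLAML path"  # chosen by workflow.special_components(True)

workflow = Workflow()
assert workflow.special_components() == "scikit-learn path"
assert workflow.special_components(True) == "FLAML path"
```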
7 changes: 6 additions & 1 deletion geochemistrypi/data_mining/process/regress.py
@@ -8,6 +8,7 @@
from ..constants import MLFLOW_ARTIFACT_DATA_PATH, SECTION
from ..data.data_readiness import num_input
from ..model.regression import (
AdaBoostRegression,
BayesianRidgeRegression,
ClassicalLinearRegression,
DecisionTreeRegression,
@@ -208,9 +209,11 @@ def activate(
max_iter=hyper_parameters["max_iter"],
tol=hyper_parameters["tol"],
)
elif self.model_name == "AdaBoost":
hyper_parameters = AdaBoostRegression.manual_hyper_parameters()
self.reg_workflow = AdaBoostRegression(
    n_estimators=hyper_parameters["n_estimators"],
    learning_rate=hyper_parameters["learning_rate"],
    loss=hyper_parameters["loss"],
)

self.reg_workflow.show_info()

# Use Scikit-learn style API to process input data
self.reg_workflow.fit(X_train, y_train)
y_train_predict = self.reg_workflow.predict(X_train)
@@ -294,6 +297,8 @@ def activate(
self.reg_workflow = BayesianRidgeRegression()
elif self.model_name == "Ridge Regression":
self.reg_workflow = RidgeRegression()
elif self.model_name == "AdaBoost":
self.reg_workflow = AdaBoostRegression()

self.reg_workflow.show_info()
