|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import Callable, Literal |
| 4 | + |
| 5 | +from numpy.typing import ArrayLike |
| 6 | +from sklearn.base import BaseEstimator, TransformerMixin |
| 7 | + |
| 8 | +from ._weighted_trees import WeightedTreesNNRegressor |
| 9 | +from .transformers import GBNodeTransformer |
| 10 | + |
| 11 | + |
class GBNNRegressor(WeightedTreesNNRegressor):
    """
    Regression using Gradient Boosting Nearest Neighbors (GBNN) imputation.

    New data is predicted by similarity of its node indexes to training
    set node indexes when run through multiple univariate gradient boosting
    models. A gradient boosting model is fit to each target in the training
    set and node indexes are captured for each tree in each forest for each
    training sample. Node indexes are then captured for inference data and
    distance is calculated as the dissimilarity between node indexes.

    Gradient boosting models are constructed using either scikit-learn's
    `GradientBoostingRegressor` or `GradientBoostingClassifier` classes based on
    the data type of each target (`y` or `y_fit`) in the training set. If the
    target is numeric (e.g. `int` or `float`), a `GradientBoostingRegressor` is
    used. If the target is categorical (e.g. `str` or `pd.Categorical`), a
    `GradientBoostingClassifier` is used. The
    `sknnr.transformers.GBNodeTransformer` class is responsible for constructing
    the gradient boosting models and capturing the node indexes.

    See `sklearn.neighbors.KNeighborsRegressor` for more detail on
    parameters associated with nearest neighbors. See
    `sklearn.ensemble.GradientBoostingRegressor` and
    `sklearn.ensemble.GradientBoostingClassifier` for more detail on parameters
    associated with gradient boosting. Note that some parameters (e.g.
    `loss` and `alpha`) are specified separately for regression and
    classification and have `_reg` and `_clf` suffixes.

    Parameters
    ----------
    loss_reg : {"squared_error", "absolute_error", "huber", "quantile"},
        default="squared_error"
        Loss function to be optimized for regression.
    loss_clf : {"log_loss", "exponential"}, default="log_loss"
        The loss function to be used for classification.
    learning_rate : float, default=0.1
        Learning rate shrinks the contribution of each tree by `learning_rate`.
    n_estimators : int, default=100
        The number of boosting stages to perform.
    subsample : float, default=1.0
        The fraction of samples to be used for fitting the individual base
        learners.
    criterion : {"friedman_mse", "squared_error"}, default="friedman_mse"
        The function to measure the quality of a split.
    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node.
    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all the
        input samples) required to be at a leaf node.
    max_depth : int or None, default=3
        Maximum depth of the individual regression estimators.
    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
    init : estimator, "zero" or None, default=None
        An estimator object that is used to compute the initial predictions.
    random_state : int, RandomState instance or None, default=None
        Controls the random seed given to each Tree estimator at each boosting
        iteration.
    max_features : {"sqrt", "log2"}, int or float, default=None
        The number of features to consider when looking for the best split.
    alpha_reg : float, default=0.9
        The alpha-quantile of the huber loss function and the quantile loss
        function.
    verbose : int, default=0
        Enable verbose output.
    max_leaf_nodes : int or None, default=None
        Grow trees with `max_leaf_nodes` in best-first fashion.
    warm_start : bool, default=False
        When set to `True`, reuse the solution of the previous call to fit and
        add more estimators to the ensemble, otherwise, just erase the previous
        solution.
    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping.
    n_iter_no_change : int or None, default=None
        `n_iter_no_change` is used to decide if early stopping will be used to
        terminate training when validation score is not improving.
    tol : float, default=1e-4
        Tolerance for the early stopping.
    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning.
    forest_weights : {"uniform"}, array-like of shape (n_targets), default="uniform"
        Weights assigned to each target in the training set when calculating
        Hamming distance between node indexes. This allows for differential
        weighting of targets when calculating distances. Note that all trees
        associated with a target will receive the same weight. If "uniform",
        each tree is assigned equal weight.
    tree_weighting_method : {"train_improvement", "uniform"},
        default="train_improvement"
        The method used to weight the trees in each gradient boosting model.
    n_neighbors : int, default=5
        Number of neighbors to use by default for `kneighbors` queries.
    weights : {"uniform", "distance"}, callable or None, default="uniform"
        Weight function used in prediction.
    n_jobs : int or None, default=None
        The number of jobs to run in parallel.

    Attributes
    ----------
    effective_metric_ : str
        Always set to 'hamming'.
    effective_metric_params_ : dict
        Always empty.
    hamming_weights_ : np.array
        When `fit`, provides the weights on each tree in each forest when
        calculating the Hamming distance.
    independent_prediction_ : np.array
        When `fit`, provides the prediction for training data not allowing
        self-assignment during neighbor search.
    independent_score_ : double
        When `fit`, the mean coefficient of determination of the independent
        prediction across all features.
    n_features_in_ : int
        Number of features that the transformer outputs. This is equal to the
        number of features in `y` (or `y_fit`) * `n_estimators_per_forest`.
    n_samples_fit_ : int
        Number of samples in the fitted data.
    transformer_ : GBNodeTransformer
        The fitted transformer which holds the built gradient boosting models
        for each feature.
    y_fit_ : np.array or pd.DataFrame
        When `y_fit` is passed to `fit`, the data used to construct the
        individual gradient boosting models. Note that all `y` data is used
        for prediction.

    Notes
    -----
    The `tree_weighting_method` parameter determines how the trees in each
    forest are weighted when calculating distances between node indexes.
    If `tree_weighting_method` is set to "train_improvement", tree weights are
    calculated as a function of the change in loss between successive trees
    in the gradient boosting estimator. As such, weights are directly
    proportional to the loss function specified and the user may want to
    choose the appropriate loss function (i.e. `loss_reg` or `loss_clf`)
    for their task.

    If `tree_weighting_method` is set to "uniform", all trees are weighted
    equally.
    """

    def __init__(
        self,
        *,
        loss_reg: Literal[
            "squared_error", "absolute_error", "huber", "quantile"
        ] = "squared_error",
        loss_clf: Literal["log_loss", "exponential"] = "log_loss",
        learning_rate: float = 0.1,
        n_estimators: int = 100,
        subsample: float = 1.0,
        criterion: Literal["friedman_mse", "squared_error"] = "friedman_mse",
        min_samples_split: int | float = 2,
        min_samples_leaf: int | float = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_depth: int | None = 3,
        min_impurity_decrease: float = 0.0,
        init: BaseEstimator | Literal["zero"] | None = None,
        random_state: int | None = None,
        max_features: Literal["sqrt", "log2"] | int | float | None = None,
        alpha_reg: float = 0.9,
        verbose: int = 0,
        max_leaf_nodes: int | None = None,
        warm_start: bool = False,
        validation_fraction: float = 0.1,
        n_iter_no_change: int | None = None,
        tol: float = 1e-4,
        ccp_alpha: float = 0.0,
        # FIX: `numpy.typing.ArrayLike` is a plain union alias and is not
        # subscriptable — `ArrayLike[float]` is a type error (and raises
        # TypeError if the annotation is ever evaluated at runtime).
        forest_weights: Literal["uniform"] | ArrayLike = "uniform",
        tree_weighting_method: Literal[
            "train_improvement", "uniform"
        ] = "train_improvement",
        n_neighbors: int = 5,
        weights: Literal["uniform", "distance"] | Callable = "uniform",
        n_jobs: int | None = None,
    ):
        # Per the scikit-learn estimator contract, every constructor argument
        # is stored unmodified so that get_params/set_params and cloning work.
        self.loss_reg = loss_reg
        self.loss_clf = loss_clf
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample = subsample
        self.criterion = criterion
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_depth = max_depth
        self.min_impurity_decrease = min_impurity_decrease
        self.init = init
        self.random_state = random_state
        self.max_features = max_features
        self.alpha_reg = alpha_reg
        self.verbose = verbose
        self.max_leaf_nodes = max_leaf_nodes
        self.warm_start = warm_start
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.ccp_alpha = ccp_alpha
        self.forest_weights = forest_weights
        self.tree_weighting_method = tree_weighting_method

        # Nearest-neighbor parameters are handled by the parent class.
        super().__init__(
            n_neighbors=n_neighbors,
            weights=weights,
            n_jobs=n_jobs,
        )

    def _get_transformer(self) -> TransformerMixin:
        """Build the (unfitted) transformer that maps X to GB node indexes.

        NOTE(review): `forest_weights` is deliberately not forwarded here —
        it appears to be consumed by the `WeightedTreesNNRegressor` parent
        when computing Hamming weights, not by the transformer. Confirm
        against `_weighted_trees.py`.
        """
        return GBNodeTransformer(
            loss_reg=self.loss_reg,
            loss_clf=self.loss_clf,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            subsample=self.subsample,
            criterion=self.criterion,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_depth=self.max_depth,
            min_impurity_decrease=self.min_impurity_decrease,
            init=self.init,
            random_state=self.random_state,
            max_features=self.max_features,
            alpha_reg=self.alpha_reg,
            verbose=self.verbose,
            max_leaf_nodes=self.max_leaf_nodes,
            warm_start=self.warm_start,
            validation_fraction=self.validation_fraction,
            n_iter_no_change=self.n_iter_no_change,
            tol=self.tol,
            ccp_alpha=self.ccp_alpha,
            tree_weighting_method=self.tree_weighting_method,
        )
0 commit comments