
Commit 4de2d31

10.17.0
1 parent d1ba994 commit 4de2d31

11 files changed: +224 −62 lines


API_REFERENCE_FOR_REGRESSION.md

Lines changed: 7 additions & 4 deletions
@@ -1,6 +1,6 @@
 # APLRRegressor
 
-## class aplr.APLRRegressor(m:int = 3000, v:float = 0.5, random_state:int = 0, loss_function:str = "mse", link_function:str = "identity", n_jobs:int = 0, cv_folds:int = 5, bins:int = 300, max_interaction_level:int = 1, max_interactions:int = 100000, min_observations_in_split:int = 4, ineligible_boosting_steps_added:int = 15, max_eligible_terms:int = 7, verbosity:int = 0, dispersion_parameter:float = 1.5, validation_tuning_metric:str = "default", quantile:float = 0.5, calculate_custom_validation_error_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatVector, FloatMatrix], float]] = None, calculate_custom_loss_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatVector, FloatMatrix], float]] = None, calculate_custom_negative_gradient_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatMatrix],FloatVector]] = None, calculate_custom_transform_linear_predictor_to_predictions_function:Optional[Callable[[FloatVector], FloatVector]] = None, calculate_custom_differentiate_predictions_wrt_linear_predictor_function:Optional[Callable[[FloatVector], FloatVector]] = None, boosting_steps_before_interactions_are_allowed:int = 0, monotonic_constraints_ignore_interactions:bool = False, group_mse_by_prediction_bins:int = 10, group_mse_cycle_min_obs_in_bin:int = 30, early_stopping_rounds:int = 200, num_first_steps_with_linear_effects_only:int = 0, penalty_for_non_linearity:float = 0.0, penalty_for_interactions:float = 0.0, max_terms:int = 0, ridge_penalty: float = 0.0001, mean_bias_correction:bool = False)
+## class aplr.APLRRegressor(m:int = 3000, v:float = 0.5, random_state:int = 0, loss_function:str = "mse", link_function:str = "identity", n_jobs:int = 0, cv_folds:int = 5, bins:int = 300, max_interaction_level:int = 1, max_interactions:int = 100000, min_observations_in_split:int = 4, ineligible_boosting_steps_added:int = 15, max_eligible_terms:int = 7, verbosity:int = 0, dispersion_parameter:float = 1.5, validation_tuning_metric:str = "default", quantile:float = 0.5, calculate_custom_validation_error_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatVector, FloatMatrix], float]] = None, calculate_custom_loss_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatVector, FloatMatrix], float]] = None, calculate_custom_negative_gradient_function:Optional[Callable[[FloatVector, FloatVector, FloatVector, FloatMatrix],FloatVector]] = None, calculate_custom_transform_linear_predictor_to_predictions_function:Optional[Callable[[FloatVector], FloatVector]] = None, calculate_custom_differentiate_predictions_wrt_linear_predictor_function:Optional[Callable[[FloatVector], FloatVector]] = None, boosting_steps_before_interactions_are_allowed:int = 0, monotonic_constraints_ignore_interactions:bool = False, group_mse_by_prediction_bins:int = 10, group_mse_cycle_min_obs_in_bin:int = 30, early_stopping_rounds:int = 200, num_first_steps_with_linear_effects_only:int = 0, penalty_for_non_linearity:float = 0.0, penalty_for_interactions:float = 0.0, max_terms:int = 0, ridge_penalty: float = 0.0001, mean_bias_correction:bool = False, faster_convergence:bool = False)
 
 ### Constructor parameters
 
@@ -14,7 +14,7 @@ The learning rate. Must be greater than zero and not more than one. The higher t
 Used to randomly split training observations into cv_folds if ***cv_observations*** is not specified when fitting.
 
 #### loss_function (default = "mse")
-Determines the loss function used. Allowed values are "mse", "binomial", "poisson", "gamma", "tweedie", "group_mse", "group_mse_cycle","mae", "quantile", "negative_binomial", "cauchy", "weibull", "huber" and "custom_function". This is used together with ***link_function***. When ***loss_function*** is "group_mse" then the "group" argument in the ***fit*** method must be provided. In the latter case APLR will try to minimize group MSE when training the model. When using "group_mse_cycle", ***group_mse_cycle_min_obs_in_bin*** controls the minimum amount of observations in each group. For a description of "group_mse_cycle" see ***group_mse_cycle_min_obs_in_bin***. The ***loss_function*** "quantile" is used together with the ***quantile*** constructor parameter. When ***loss_function*** is "custom_function" then the constructor parameters ***calculate_custom_loss_function*** and ***calculate_custom_negative_gradient_function***, both described below, must be provided.
+Determines the loss function used. Allowed values are "mse", "binomial", "poisson", "gamma", "tweedie", "group_mse", "group_mse_cycle", "mae", "quantile", "negative_binomial", "cauchy", "weibull", "huber", "exponential_power" and "custom_function". This is used together with ***link_function***. When ***loss_function*** is "group_mse" then the "group" argument in the ***fit*** method must be provided; in that case APLR will try to minimize group MSE when training the model. When using "group_mse_cycle", ***group_mse_cycle_min_obs_in_bin*** controls the minimum number of observations in each group. For a description of "group_mse_cycle" see ***group_mse_cycle_min_obs_in_bin***. The ***loss_function*** "quantile" is used together with the ***quantile*** constructor parameter. When using "exponential_power" it is recommended to also set ***faster_convergence*** to True. When ***loss_function*** is "custom_function" then the constructor parameters ***calculate_custom_loss_function*** and ***calculate_custom_negative_gradient_function***, both described below, must be provided.
 
 #### link_function (default = "identity")
 Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit", "log" and "custom_function". For an ordinary regression model use ***loss_function*** "mse" and ***link_function*** "identity". For logistic regression use ***loss_function*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" ***loss_function***, depending on the data. The ***loss_function*** "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" should only be used with the "log" ***link_function***. Inappropriate combinations of ***loss_function*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. When ***link_function*** is "custom_function" then the constructor parameters ***calculate_custom_transform_linear_predictor_to_predictions_function*** and ***calculate_custom_differentiate_predictions_wrt_linear_predictor_function***, both described below, must be provided.
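
To make these pairings concrete, here is a minimal sketch of constructor calls for the combinations described above; only constructor arguments shown in the signature are used, and the variable names are illustrative:

```python
# A minimal sketch of the loss/link pairings described above.
from aplr import APLRRegressor

# Ordinary regression: "mse" loss with the "identity" link.
ordinary = APLRRegressor(loss_function="mse", link_function="identity")

# Logistic regression: "binomial" loss with the "logit" link.
logistic = APLRRegressor(loss_function="binomial", link_function="logit")

# Multiplicative model: the "log" link, here paired with a "gamma" loss.
multiplicative = APLRRegressor(loss_function="gamma", link_function="log")

# The new "exponential_power" loss; faster_convergence=True is recommended.
exp_power = APLRRegressor(
    loss_function="exponential_power", faster_convergence=True
)
```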
@@ -47,10 +47,10 @@ Limits 1) the number of terms already in the model that can be considered as int
 ***0*** does not print progress reports during fitting. ***1*** prints a summary after running the ***fit*** method. ***2*** prints a summary after each boosting step.
 
 #### dispersion_parameter (default = 1.5)
-Specifies the variance power when ***loss_function*** is "tweedie". Specifies a dispersion parameter when ***loss_function*** is "negative_binomial", "cauchy" or "weibull". For "huber" it specifies the delta parameter.
+Specifies the variance power when ***loss_function*** is "tweedie". Specifies a dispersion parameter when ***loss_function*** is "negative_binomial", "cauchy", "weibull" or "exponential_power". For "huber" it specifies the delta parameter.
 
 #### validation_tuning_metric (default = "default")
-Specifies which metric to use for validating the model and tuning ***m***. The model will try to minimize the validation metric. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "huber", "negative_gini" (normalized), "group_mse", "group_mse_by_prediction", "neg_top_quantile_mean_response", "bottom_quantile_mean_response" and "custom_function". The default is often a choice that fits well with respect to the ***loss_function*** chosen. However, if you want to use ***loss_function*** or ***dispersion_parameter*** as tuning parameters then the default is not suitable. "group_mse" requires that the "group" argument in the ***fit*** method is provided. "group_mse_by_prediction" groups predictions by up to ***group_mse_by_prediction_bins*** groups and calculates groupwise mse. "neg_top_quantile_mean_response" calculates the negative of the sample weighted mean response for observations with predictions in the top quantile (as specified by the ***quantile*** parameter). For example, if ***quantile*** is 0.95, this metric will be the negative of the sample weighted mean response for the 5% of observations with the highest predictions. "bottom_quantile_mean_response" calculates the sample weighted mean response for observations with predictions in the bottom quantile (as specified by the ***quantile*** parameter). For example, if ***quantile*** is 0.05, this metric will be the sample weighted mean response for the 5% of observations with the lowest predictions. For "custom_function" see ***calculate_custom_validation_error_function*** below. Please note that for non-default values a significantly higher ***early_stopping_rounds*** than the default of 200 might be needed.
+Specifies which metric to use for assessing the model's cross-validation (CV) error, which is returned by the `get_cv_error()` method. This metric is also used by `APLRTuner` to select the best model when tuning hyperparameters. Note that for model training and for finding the optimal number of boosting steps (***m***), the "default" metric (which corresponds to the chosen ***loss_function***) is always used to ensure stable convergence. When tuning hyperparameters like `dispersion_parameter` or `loss_function`, you should not use "default" because the resulting CV errors will not be comparable across different parameter values. Available options are "default", "mse", "mae", "huber", "negative_gini" (normalized), "group_mse", "group_mse_by_prediction", "neg_top_quantile_mean_response", "bottom_quantile_mean_response", and "custom_function". "group_mse" requires that the "group" argument in the ***fit*** method is provided. "group_mse_by_prediction" groups predictions into up to ***group_mse_by_prediction_bins*** groups and calculates groupwise MSE. "neg_top_quantile_mean_response" calculates the negative of the sample-weighted mean response for observations with predictions in the top quantile (as specified by the ***quantile*** parameter). For example, if ***quantile*** is 0.95, this metric will be the negative of the sample-weighted mean response for the 5% of observations with the highest predictions. "bottom_quantile_mean_response" calculates the sample-weighted mean response for observations with predictions in the bottom quantile (as specified by the ***quantile*** parameter). For example, if ***quantile*** is 0.05, this metric will be the sample-weighted mean response for the 5% of observations with the lowest predictions. For "custom_function" see ***calculate_custom_validation_error_function*** below.
 
 #### quantile (default = 0.5)
 Specifies the quantile to use when ***loss_function*** is "quantile" or when ***validation_tuning_metric*** is "neg_top_quantile_mean_response" or "bottom_quantile_mean_response".
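
As an illustration of the caveat above about tuning ***dispersion_parameter*** or ***loss_function***: fixing ***validation_tuning_metric*** to a common metric such as "mse" makes the CV errors returned by `get_cv_error()` comparable across candidates. A sketch, where the data generation and the candidate grid are illustrative assumptions:

```python
# Sketch: comparing dispersion_parameter values under a fixed CV metric.
# The synthetic data and the candidate grid are illustrative assumptions.
import numpy as np
from aplr import APLRRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = rng.gamma(2.0, np.exp(X[:, 0]))  # positive, right-skewed response

best_dp, best_cv_error = None, np.inf
for dp in [1.1, 1.5, 1.9]:
    model = APLRRegressor(
        loss_function="tweedie",
        link_function="log",
        dispersion_parameter=dp,
        validation_tuning_metric="mse",  # "default" would not be comparable across dp
    )
    model.fit(X, y)
    cv_error = model.get_cv_error()
    if cv_error < best_cv_error:
        best_dp, best_cv_error = dp, cv_error
```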
@@ -135,6 +135,9 @@ Specifies the (weighted) ridge penalty applied to the model. Positive values can
 #### mean_bias_correction (default = False)
 If true, then a mean bias correction is applied to the model's intercept term. This can be useful for some loss functions, such as "huber", that can otherwise produce biased predictions. The correction is only applied for the "identity" and "log" link functions.
 
+#### faster_convergence (default = False)
+If true, then a scaling is applied to the negative gradient to speed up convergence. This should primarily be used when the algorithm otherwise converges too slowly. This is only applied for the "identity" and "log" link functions.
+This will not speed up the combination of "mse" loss with an "identity" link, as that combination is already optimized for speed within the algorithm. Furthermore, this option is ineffective for some loss functions, such as "mae" and "quantile".
 
 ## Method: fit(X:FloatMatrix, y:FloatVector, sample_weight:FloatVector = np.empty(0), X_names:List[str] = [], cv_observations:IntMatrix = np.empty([0, 0]), prioritized_predictors_indexes:List[int] = [], monotonic_constraints:List[int] = [], group:FloatVector = np.empty(0), interaction_constraints:List[List[int]] = [], other_data:FloatMatrix = np.empty([0, 0]), predictor_learning_rates:List[float] = [], predictor_penalties_for_non_linearity:List[float] = [], predictor_penalties_for_interactions:List[float] = [], predictor_min_observations_in_split: List[int] = [])
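
A minimal end-to-end sketch of the two additions in this commit ("exponential_power" and ***faster_convergence***); the synthetic data is an illustrative assumption:

```python
# Sketch of the new options: the "exponential_power" loss together with
# faster_convergence=True, as the documentation above recommends.
# The synthetic data below is an illustrative assumption.
import numpy as np
from aplr import APLRRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
y = X[:, 0] + 0.5 * X[:, 1] + rng.normal(size=300)

model = APLRRegressor(
    loss_function="exponential_power",
    dispersion_parameter=1.5,  # also serves as the dispersion parameter here
    faster_convergence=True,   # scales the negative gradient to speed up convergence
)
model.fit(X, y, X_names=["x1", "x2"])
```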
140143

aplr/aplr.py

Lines changed: 4 additions & 0 deletions
@@ -76,6 +76,7 @@ def __init__(
         max_terms: int = 0,
         ridge_penalty: float = 0.0001,
         mean_bias_correction: bool = False,
+        faster_convergence: bool = False,
     ):
         self.m = m
         self.v = v
@@ -124,6 +125,7 @@ def __init__(
         self.max_terms = max_terms
         self.ridge_penalty = ridge_penalty
         self.mean_bias_correction = mean_bias_correction
+        self.faster_convergence = faster_convergence
 
         # Creating aplr_cpp and setting parameters
         self.APLRRegressor = aplr_cpp.APLRRegressor()
@@ -186,6 +188,7 @@ def __set_params_cpp(self):
         self.APLRRegressor.max_terms = self.max_terms
         self.APLRRegressor.ridge_penalty = self.ridge_penalty
         self.APLRRegressor.mean_bias_correction = self.mean_bias_correction
+        self.APLRRegressor.faster_convergence = self.faster_convergence
 
     def fit(
         self,
@@ -469,6 +472,7 @@ def get_params(self, deep=True):
             "max_terms": self.max_terms,
             "ridge_penalty": self.ridge_penalty,
             "mean_bias_correction": self.mean_bias_correction,
+            "faster_convergence": self.faster_convergence,
         }
 
         # For sklearn
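
Because `get_params` now includes `faster_convergence`, the parameter should survive scikit-learn utilities such as `clone`; a brief sketch, assuming the usual scikit-learn estimator conventions apply:

```python
# Sketch: faster_convergence round-trips through get_params(), so
# scikit-learn tooling such as clone() should preserve it.
from sklearn.base import clone
from aplr import APLRRegressor

est = APLRRegressor(faster_convergence=True)
print(est.get_params()["faster_convergence"])         # True
print(clone(est).get_params()["faster_convergence"])  # True
```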
