
Commit 3b18837
updated docs
1 parent: 8d8757c

6 files changed: +27 -19 lines

API_REFERENCE_FOR_REGRESSION.md
Lines changed: 1 addition & 1 deletion

```diff
@@ -17,7 +17,7 @@ Used to randomly split training observations into cv_folds if ***cv_observations***
 Determines the loss function used. Allowed values are "mse", "binomial", "poisson", "gamma", "tweedie", "group_mse", "group_mse_cycle", "mae", "quantile", "negative_binomial", "cauchy", "weibull" and "custom_function". This is used together with ***link_function***. When ***loss_function*** is "group_mse" then the "group" argument in the ***fit*** method must be provided. In the latter case APLR will try to minimize group MSE when training the model. When using "group_mse_cycle", ***group_mse_cycle_min_obs_in_bin*** controls the minimum number of observations in each group. For a description of "group_mse_cycle" see ***group_mse_cycle_min_obs_in_bin***. The ***loss_function*** "quantile" is used together with the ***quantile*** constructor parameter. When ***loss_function*** is "custom_function" then the constructor parameters ***calculate_custom_loss_function*** and ***calculate_custom_negative_gradient_function***, both described below, must be provided.
 
 #### link_function (default = "identity")
-Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit", "log" and "custom_function". For an ordinary regression model use ***loss_function*** "mse" and ***link_function*** "identity". For logistic regression use ***loss_function*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" ***loss_function***, depending on the data. The ***loss_function*** "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" should only be used with the "log" ***link_function***. Inappropriate combinations of ***loss_function*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. Please note that values other than "identity" may require a higher ***m*** (or ***v***) in order to converge. When ***link_function*** is "custom_function" then the constructor parameters ***calculate_custom_transform_linear_predictor_to_predictions_function*** and ***calculate_custom_differentiate_predictions_wrt_linear_predictor_function***, both described below, must be provided.
+Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit", "log" and "custom_function". For an ordinary regression model use ***loss_function*** "mse" and ***link_function*** "identity". For logistic regression use ***loss_function*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" ***loss_function***, depending on the data. The ***loss_function*** "poisson", "gamma", "tweedie", "negative_binomial" or "weibull" should only be used with the "log" ***link_function***. Inappropriate combinations of ***loss_function*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. When ***link_function*** is "custom_function" then the constructor parameters ***calculate_custom_transform_linear_predictor_to_predictions_function*** and ***calculate_custom_differentiate_predictions_wrt_linear_predictor_function***, both described below, must be provided.
 
 #### n_jobs (default = 0)
 Multi-threading parameter. If ***0*** then uses all available cores for multi-threading. Any other positive integer specifies the number of cores to use (***1*** means single-threading).
```
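To make the documented loss/link pairing concrete, here is a minimal sketch using the aplr Python package. The synthetic data and all parameter values are illustrative assumptions, not recommendations from the reference.

```python
# Minimal sketch of pairing loss_function and link_function (assumes the aplr
# package is installed; the data is synthetic and illustrative).
import numpy as np
from aplr import APLRRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
# A positive, right-skewed target suits a "gamma" loss with the "log" link.
y = np.exp(X @ np.array([0.5, -0.3, 0.2]) + rng.normal(scale=0.1, size=500))

model = APLRRegressor(
    loss_function="gamma",  # per the reference, use only with the "log" link
    link_function="log",
    n_jobs=0,  # 0 uses all available cores; 1 means single-threading
)
model.fit(X, y)
predictions = model.predict(X)
```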

documentation/APLR 10.9.0.pdf (974 Bytes). Binary file not shown.

examples/train_aplr_classification.py
Lines changed: 4 additions & 3 deletions

```diff
@@ -39,8 +39,8 @@
 param_grid = ParameterGrid(
     {
         "max_interaction_level": [0, 1],
-        "min_observations_in_split": [1, 4, 20, 40],
-        "ridge_penalty": [0.0001, 0.001],
+        "min_observations_in_split": [1, 4, 20],
+        "ridge_penalty": [0, 0.0001, 0.001],
     }
 )
 best_model: APLRClassifier = None
@@ -50,7 +50,8 @@
         verbosity=2,
         m=3000,
         v=0.5,
-        # max_terms=5, # Optionally tune this to find a trade-off between interpretability and predictiveness. May require a higher learning rate for best results.
+        num_first_steps_with_linear_effects_only=0,  # Increasing this will increase interpretability but may decrease predictiveness.
+        boosting_steps_before_interactions_are_allowed=0,  # Increasing this will increase interpretability but may decrease predictiveness.
         **params
     )
     model.fit(
```
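For orientation, the grid-search loop these hunks modify looks roughly like the sketch below. Only the names visible in the diff (ParameterGrid, APLRClassifier, param_grid, best_model, model.fit) come from the source; the synthetic data and accuracy-based scoring are stand-ins for whatever the real example uses.

```python
# Hedged reconstruction of the manual grid search around the hunks above.
import numpy as np
from aplr import APLRClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 3))
y_train = np.where(X_train[:, 0] > 0, "yes", "no").tolist()  # string class labels
X_val = rng.normal(size=(100, 3))
y_val = np.where(X_val[:, 0] > 0, "yes", "no").tolist()

param_grid = ParameterGrid(
    {
        "max_interaction_level": [0, 1],
        "min_observations_in_split": [1, 4, 20],
        "ridge_penalty": [0, 0.0001, 0.001],
    }
)
best_model: APLRClassifier = None
best_score = -float("inf")
for params in param_grid:
    model = APLRClassifier(m=3000, v=0.5, **params)
    model.fit(X_train, y_train)
    score = accuracy_score(y_val, model.predict(X_val))  # assumed scoring
    if score > best_score:
        best_score, best_model = score, model
```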

examples/train_aplr_classification_using_aplr_tuner.py
Lines changed: 8 additions & 3 deletions

```diff
@@ -37,12 +37,17 @@
 parameters = {
     "random_state": [random_state],
     "max_interaction_level": [0, 1],
-    "min_observations_in_split": [1, 4, 20, 40],
+    "min_observations_in_split": [1, 4, 20],
     "verbosity": [2],
     "m": [3000],
     "v": [0.5],
-    "ridge_penalty": [0.0001, 0.001],
-    # "max_terms": [5],
+    "ridge_penalty": [0, 0.0001, 0.001],
+    "num_first_steps_with_linear_effects_only": [
+        0
+    ],  # Increasing num_first_steps_with_linear_effects_only will increase interpretability but may decrease predictiveness.
+    "boosting_steps_before_interactions_are_allowed": [
+        0
+    ],  # Increasing boosting_steps_before_interactions_are_allowed will increase interpretability but may decrease predictiveness.
 }
 aplr_tuner = APLRTuner(parameters=parameters, is_regressor=False)
 aplr_tuner.fit(
```
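The tuner variant replaces the manual loop: APLRTuner fits one model per parameter combination and keeps the best one. A minimal sketch, assuming APLRTuner's fit/predict interface behaves as the example implies and using synthetic data:

```python
# Minimal sketch of the tuner workflow (assumptions: aplr's APLRTuner accepts
# a dict of candidate-value lists and exposes fit/predict; data is synthetic).
import numpy as np
from aplr import APLRTuner

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = np.where(X[:, 0] > 0, "yes", "no").tolist()

parameters = {
    "random_state": [0],
    "max_interaction_level": [0, 1],
    "min_observations_in_split": [1, 4, 20],
    "m": [3000],
    "v": [0.5],
    "ridge_penalty": [0, 0.0001, 0.001],
}
aplr_tuner = APLRTuner(parameters=parameters, is_regressor=False)
aplr_tuner.fit(X, y)  # tries every combination in parameters, keeps the best model
predicted_classes = aplr_tuner.predict(X)
```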

examples/train_aplr_regression.py
Lines changed: 5 additions & 6 deletions

```diff
@@ -38,8 +38,8 @@
 param_grid = ParameterGrid(
     {
         "max_interaction_level": [0, 1],
-        "min_observations_in_split": [1, 4, 20, 50, 100, 200],
-        "ridge_penalty": [0.0001, 0.001],
+        "min_observations_in_split": [1, 4, 20, 50],
+        "ridge_penalty": [0, 0.0001, 0.001],
     }
 )
 best_model: APLRRegressor = None
@@ -55,7 +55,8 @@
         v=0.5,
         loss_function=loss_function,
         link_function=link_function,
-        # max_terms=10, # Optionally tune this to find a trade-off between interpretability and predictiveness. May require a higher learning rate for best results.
+        num_first_steps_with_linear_effects_only=0,  # Increasing this will increase interpretability but may decrease predictiveness.
+        boosting_steps_before_interactions_are_allowed=0,  # Increasing this will increase interpretability but may decrease predictiveness.
         **params,
     )
     model.fit(
@@ -103,10 +104,9 @@
     by="importance", ascending=False
 )
 
-# Shapes for all term affiliations in the model. For each term affiliation, contains predictor values and the corresponding
+# Shapes for all term affiliations in the model. For each term affiliation, shape_df contains predictor values and the corresponding
 # contributions to the linear predictor. Plots are created for main effects and two-way interactions.
 # This is probably the most useful method to use for understanding how the model works.
-shapes: Dict[str, pd.DataFrame] = {}
 predictors_in_each_affiliation = (
     best_model.get_base_predictors_in_each_unique_term_affiliation()
 )
@@ -119,7 +119,6 @@
     shape,
     columns=[predictors[i] for i in predictor_indexes_used] + ["contribution"],
 )
-shapes.update({affiliation: shape_df})
 is_main_effect: bool = len(predictor_indexes_used) == 1
 is_two_way_interaction: bool = len(predictor_indexes_used) == 2
 if is_main_effect:
```
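The shape-related hunks above drop an unused `shapes` dict. For context, here is the surrounding loop repackaged as a hedged sketch. Only the names visible in the diff are from the source; `get_unique_term_affiliations` and `get_unique_term_affiliation_shape` are assumptions from context.

```python
# Hedged reconstruction of the shape-extraction loop, as a self-contained
# helper. Assumes best_model is a fitted APLRRegressor and predictors holds
# the predictor column names; the two get_unique_term_affiliation* method
# names are assumed, not confirmed by the diff.
from typing import Dict, List
import pandas as pd
from aplr import APLRRegressor

def collect_affiliation_shapes(
    best_model: APLRRegressor, predictors: List[str]
) -> Dict[str, pd.DataFrame]:
    """Build one shape DataFrame per unique term affiliation of a fitted model."""
    shapes: Dict[str, pd.DataFrame] = {}
    predictors_in_each_affiliation = (
        best_model.get_base_predictors_in_each_unique_term_affiliation()
    )
    for affiliation, predictor_indexes_used in zip(
        best_model.get_unique_term_affiliations(), predictors_in_each_affiliation
    ):
        # Each shape pairs predictor values with contributions to the linear predictor.
        shape = best_model.get_unique_term_affiliation_shape(affiliation)
        shapes[affiliation] = pd.DataFrame(
            shape,
            columns=[predictors[i] for i in predictor_indexes_used] + ["contribution"],
        )
    return shapes
```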

examples/train_aplr_regression_using_aplr_tuner.py
Lines changed: 9 additions & 6 deletions

```diff
@@ -40,14 +40,19 @@
 parameters = {
     "random_state": [random_state],
     "max_interaction_level": [0, 1],
-    "min_observations_in_split": [1, 4, 20, 50, 100, 200],
+    "min_observations_in_split": [1, 4, 20, 50],
     "verbosity": [2],
     "m": [3000],
     "v": [0.5],
     "loss_function": [loss_function],
     "link_function": [link_function],
-    "ridge_penalty": [0.0001, 0.001],
-    # "max_terms": [10],
+    "ridge_penalty": [0, 0.0001, 0.001],
+    "num_first_steps_with_linear_effects_only": [
+        0
+    ],  # Increasing num_first_steps_with_linear_effects_only will increase interpretability but may decrease predictiveness.
+    "boosting_steps_before_interactions_are_allowed": [
+        0
+    ],  # Increasing boosting_steps_before_interactions_are_allowed will increase interpretability but may decrease predictiveness.
 }
 aplr_tuner = APLRTuner(parameters=parameters, is_regressor=True)
 aplr_tuner.fit(
@@ -90,10 +95,9 @@
     by="importance", ascending=False
 )
 
-# Shapes for all term affiliations in the model. For each term affiliation, contains predictor values and the corresponding
+# Shapes for all term affiliations in the model. For each term affiliation, shape_df contains predictor values and the corresponding
 # contributions to the linear predictor. Plots are created for main effects and two-way interactions.
 # This is probably the most useful method to use for understanding how the model works.
-shapes: Dict[str, pd.DataFrame] = {}
 predictors_in_each_affiliation = (
     best_model.get_base_predictors_in_each_unique_term_affiliation()
 )
@@ -106,7 +110,6 @@
     shape,
     columns=[predictors[i] for i in predictor_indexes_used] + ["contribution"],
 )
-shapes.update({affiliation: shape_df})
 is_main_effect: bool = len(predictor_indexes_used) == 1
 is_two_way_interaction: bool = len(predictor_indexes_used) == 2
 if is_main_effect:
```
