ottenbreit-data-science
diff --git a/API_REFERENCE.md b/API_REFERENCE.md
Lines changed: 9 additions & 9 deletions
diff --git a/aplr/aplr.py b/aplr/aplr.py
Lines changed: 7 additions & 7 deletions
@@ -1,6 +1,6 @@
 # APLRRegressor
 
-## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5)
+## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5)
 
 ### Constructor parameters
 
@@ -13,11 +13,11 @@ The learning rate. Must be greater than zero and not more than one. The higher t
 #### random_state (default = 0)
 Used to randomly split training observations into training and validation if ***validation_set_indexes*** is not specified when fitting.
 
-#### family (default = "gaussian")
-Determines the loss function used. Allowed values are "gaussian", "binomial", "poisson", "gamma", "tweedie", "group_gaussian", "mae" and "quantile". This is used together with ***link_function***. When ***family*** is "group_gaussian" then the "group" argument in the ***fit*** method must be provided. In the latter case APLR will try to minimize group MSE when training the model. The ***family*** "quantile" is used together with the ***quantile*** constructor parameter.
+#### loss_function (default = "mse")
+Determines the loss function used. Allowed values are "mse", "binomial", "poisson", "gamma", "tweedie", "group_mse", "mae", "quantile", "negative_binomial" and "cauchy". This is used together with ***link_function***. When ***loss_function*** is "group_mse" then the "group" argument in the ***fit*** method must be provided. In that case APLR will try to minimize group MSE when training the model. The ***loss_function*** "quantile" is used together with the ***quantile*** constructor parameter.
 
 #### link_function (default = "identity")
-Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit" and "log". For an ordinary regression model use ***family*** "gaussian" and ***link_function*** "identity". For logistic regression use ***family*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma" or "tweedie" ***family***, depending on the data. The ***family*** "poisson", "gamma" or "tweedie" should only be used with the "log" ***link_function***. Inappropriate combinations of ***family*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. Please note that values other than "identity" typically require a significantly higher ***m*** (or ***v***) in order to converge.
+Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit" and "log". For an ordinary regression model use ***loss_function*** "mse" and ***link_function*** "identity". For logistic regression use ***loss_function*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma", "tweedie" or "negative_binomial" ***loss_function***, depending on the data. The ***loss_function*** "poisson", "gamma", "tweedie" or "negative_binomial" should only be used with the "log" ***link_function***. Inappropriate combinations of ***loss_function*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. Please note that values other than "identity" typically require a significantly higher ***m*** (or ***v***) in order to converge.
 
 #### n_jobs (default = 0)
 Multi-threading parameter. If ***0*** then uses all available cores for multi-threading. Any other positive integer specifies the number of cores to use (***1*** means single-threading).
@@ -49,14 +49,14 @@ Limits 1) the number of terms already in the model that can be considered as int
 #### verbosity (default = 0)
 ***0*** does not print progress reports during fitting. ***1*** prints a summary after running the ***fit*** method. ***2*** prints a summary after each boosting step.
 
-#### tweedie_power (default = 1.5)
-Specifies the variance power for the "tweedie" ***family***.
+#### dispersion_parameter (default = 1.5)
+Specifies the variance power when ***loss_function*** is "tweedie", and the dispersion parameter when ***loss_function*** is "negative_binomial" or "cauchy".
 
 #### validation_tuning_metric (default = "default")
-Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini" and "rankability". The default is often a choice that fits well with respect to the ***family*** chosen. However, if you want to use ***family*** or ***tweedie_power*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is inverted and can be weighted by sample weights.
+Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini", "rankability" and "group_mse". The default is often a choice that fits well with respect to the ***loss_function*** chosen. However, if you want to use ***loss_function*** or ***dispersion_parameter*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is inverted and can be weighted by sample weights. "group_mse" requires that the "group" argument in the ***fit*** method is provided.
 
 #### quantile (default = 0.5)
-Specifies the quantile to use when ***family*** is "quantile".
+Specifies the quantile to use when ***loss_function*** is "quantile".
 
 
 ## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[], monotonic_constraints:List[int]=[], group:npt.ArrayLike = np.empty(0), interaction_constraints:List[int]=[])
@@ -87,7 +87,7 @@ An optional list of integers specifying the indexes of predictors (columns) in *
 An optional list of integers specifying monotonic constraints on model terms. For example, if there are three predictors in ***X***, then monotonic_constraints = [1,0,-1] means that 1) the first predictor in ***X*** cannot be used in interaction terms as a secondary effect and all terms using the first predictor in ***X*** as a main effect must have positive regression coefficients, 2) there are no monotonic constraints on terms using the second predictor in ***X***, and 3) the third predictor in ***X*** cannot be used in interaction terms as a secondary effect and all terms using the third predictor in ***X*** as a main effect must have negative regression coefficients.
 
 #### group
-A numpy vector of integers that is used when ***family*** is "group_gaussian". For example, ***group*** may represent year (could be useful in a time series model).
+A numpy vector of integers that is used when ***loss_function*** is "group_mse" or when ***validation_tuning_metric*** is "group_mse". For example, ***group*** may represent year (could be useful in a time series model).
 
 #### interaction_constraints
 An optional list of integers specifying interaction constraints on model terms. For example, if there are three predictors in ***X***, then interaction_constraints = [1,0,2] means that 1) the first predictor in ***X*** cannot be used in interaction terms as a secondary effect, 2) there are no interaction constraints on terms using the second predictor in ***X***, and 3) the third predictor in ***X*** cannot be used in any interaction terms.
 
@@ -5,11 +5,11 @@
 
 
 class APLRRegressor():
-    def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5):
+    def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5):
         self.m=m
         self.v=v
         self.random_state=random_state
-        self.family=family
+        self.loss_function=loss_function
         self.link_function=link_function
         self.n_jobs=n_jobs
         self.validation_ratio=validation_ratio
@@ -21,7 +21,7 @@ def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, family:str="gaus
         self.ineligible_boosting_steps_added=ineligible_boosting_steps_added
         self.max_eligible_terms=max_eligible_terms
         self.verbosity=verbosity
-        self.tweedie_power=tweedie_power
+        self.dispersion_parameter=dispersion_parameter
         self.validation_tuning_metric=validation_tuning_metric
         self.quantile=quantile
 
@@ -34,7 +34,7 @@ def __set_params_cpp(self):
         self.APLRRegressor.m=self.m
         self.APLRRegressor.v=self.v
         self.APLRRegressor.random_state=self.random_state
-        self.APLRRegressor.family=self.family
+        self.APLRRegressor.loss_function=self.loss_function
         self.APLRRegressor.link_function=self.link_function
         self.APLRRegressor.n_jobs=self.n_jobs
         self.APLRRegressor.validation_ratio=self.validation_ratio
@@ -46,7 +46,7 @@ def __set_params_cpp(self):
         self.APLRRegressor.ineligible_boosting_steps_added=self.ineligible_boosting_steps_added
         self.APLRRegressor.max_eligible_terms=self.max_eligible_terms
         self.APLRRegressor.verbosity=self.verbosity
-        self.APLRRegressor.tweedie_power=self.tweedie_power
+        self.APLRRegressor.dispersion_parameter=self.dispersion_parameter
         self.APLRRegressor.validation_tuning_metric=self.validation_tuning_metric
         self.APLRRegressor.quantile=self.quantile
 
@@ -105,7 +105,7 @@ def get_params(self, deep=True):
             "m": self.m,
             "v": self.v,
             "random_state":self.random_state,
-            "family":self.family,
+            "loss_function":self.loss_function,
             "link_function":self.link_function,
             "n_jobs":self.n_jobs,
             "validation_ratio":self.validation_ratio,
@@ -117,7 +117,7 @@ def get_params(self, deep=True):
             "min_observations_in_split":self.min_observations_in_split,
             "ineligible_boosting_steps_added":self.ineligible_boosting_steps_added,
             "max_eligible_terms":self.max_eligible_terms,
-            "tweedie_power":self.tweedie_power,
+            "dispersion_parameter":self.dispersion_parameter,
             "validation_tuning_metric":self.validation_tuning_metric,
             "quantile":self.quantile
         }