Skip to content

Commit bbcb663

Browse files
new and improved fitting with other families and link functions
1 parent 06693b3 commit bbcb663

17 files changed

+111
-104
lines changed

API_REFERENCE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Used to randomly split training observations into training and validation if ***
1717
Determines the loss function used. Allowed values are "gaussian", "binomial", "poisson", "gamma" and "tweedie". This is used together with ***link_function***. Please note that the implementation of values other than "gaussian" is experimental.
1818

1919
#### link_function (default = "identity")
20-
Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit", "log", "inverse" and "tweedie". These are canonical link functions for the "gaussian", "binomial", "poisson", "gamma" and "tweedie" ***family*** respectively. Canonical links usually work fine given that the data is appropriate for the selected combination of ***family*** and ***link_function***. Other combinations of ***family*** and ***link_function*** may or may not work (the model may fit poorly to the data if the wrong combination is used). Please note that the implementation of values other than "identity" is experimental.
20+
Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit" and "log". For logistic regression use ***family***="binomial" and ***link_function***="logit". For a multiplicative model use the "log" ***link_function*** and a ***family*** that is not "binomial". The ***family*** "poisson", "gamma" or "tweedie" should only be used with the "log" ***link_function***. Invalid combinations of ***family*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. Please note that the implementation of values other than "identity" is experimental.
2121

2222
#### n_jobs (default = 0)
2323
Multi-threading parameter. If ***0*** then uses all available cores for multi-threading. Any other positive integer specifies the number of cores to use (***1*** means single-threading).

cpp/APLRRegressor.h

Lines changed: 66 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class APLRRegressor
4646
bool abort_boosting;
4747
VectorXd linear_predictor_current;
4848
VectorXd linear_predictor_current_validation;
49+
double scaling_factor_for_log_link_function;
4950

5051
//Methods
5152
void validate_input_to_fit(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight,const std::vector<std::string> &X_names, const std::vector<size_t> &validation_set_indexes);
@@ -81,11 +82,13 @@ class APLRRegressor
8182
VectorXd calculate_linear_predictor(const MatrixXd &X);
8283
void update_linear_predictor_and_predictors();
8384
void throw_error_if_response_contains_invalid_values(const VectorXd &y);
84-
void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y);
85-
void throw_error_if_response_is_negative(const VectorXd &y);
86-
void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y);
85+
void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y,const std::string &error_message);
86+
void throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message);
87+
void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message);
8788
void throw_error_if_tweedie_power_is_invalid();
8889
VectorXd differentiate_predictions();
90+
void scale_training_observations_if_using_log_link_function();
91+
void revert_scaling_if_using_log_link_function();
8992

9093
public:
9194
//Fields
@@ -187,6 +190,7 @@ void APLRRegressor::fit(const MatrixXd &X,const VectorXd &y,const VectorXd &samp
187190
update_coefficients_for_all_steps();
188191
print_final_summary();
189192
find_optimal_m_and_update_model_accordingly();
193+
revert_scaling_if_using_log_link_function();
190194
name_terms(X, X_names);
191195
calculate_feature_importance_on_validation_set();
192196
cleanup_after_fit();
@@ -218,17 +222,13 @@ void APLRRegressor::throw_error_if_link_function_does_not_exist()
218222
link_function_exists=true;
219223
else if(link_function=="log")
220224
link_function_exists=true;
221-
else if(link_function=="tweedie")
222-
link_function_exists=true;
223-
else if(link_function=="inverse")
224-
link_function_exists=true;
225225
if(!link_function_exists)
226226
throw std::runtime_error("Link function "+link_function+" is not available in APLR.");
227227
}
228228

229229
void APLRRegressor::throw_error_if_tweedie_power_is_invalid()
230230
{
231-
bool tweedie_power_equals_invalid_points{check_if_approximately_equal(tweedie_power,1.0) || check_if_approximately_equal(tweedie_power,2.0)};
231+
bool tweedie_power_equals_invalid_points{is_approximately_equal(tweedie_power,1.0) || is_approximately_equal(tweedie_power,2.0)};
232232
bool tweedie_power_is_in_invalid_range{std::isless(tweedie_power,1.0)};
233233
bool tweedie_power_is_invalid{tweedie_power_equals_invalid_poits || tweedie_power_is_in_invalid_range};
234234
if(tweedie_power_is_invalid)
@@ -262,34 +262,47 @@ void APLRRegressor::throw_error_if_validation_set_indexes_has_invalid_indexes(co
262262

263263
void APLRRegressor::throw_error_if_response_contains_invalid_values(const VectorXd &y)
264264
{
265-
if(link_function=="logit")
266-
throw_error_if_response_is_not_between_0_and_1(y);
267-
else if(link_function=="log" || (link_function=="tweedie" && std::isgreater(tweedie_power,1) && std::isless(tweedie_power,2)) )
268-
throw_error_if_response_is_negative(y);
269-
else if(link_function=="inverse" || (link_function=="tweedie" && std::isgreater(tweedie_power,2)) )
270-
throw_error_if_response_is_not_greater_than_zero(y);
265+
if(link_function=="logit" || family=="binomial")
266+
{
267+
std::string error_message{"Response values for the logit link function or binomial family cannot be less than zero or greater than one."};
268+
throw_error_if_response_is_not_between_0_and_1(y,error_message);
269+
}
270+
else if(family=="gamma" || (family=="tweedie" && std::isgreater(tweedie_power,2)) )
271+
{
272+
std::string error_message;
273+
if(family=="tweedie")
274+
error_message="Response values for the "+family+" family when tweedie_power>2 must be greater than zero.";
275+
else
276+
error_message="Response values for the "+family+" family must be greater than zero.";
277+
throw_error_if_response_is_not_greater_than_zero(y,error_message);
278+
}
279+
else if(link_function=="log" || family=="poisson" || (family=="tweedie" && std::isless(tweedie_power,2) && std::isgreater(tweedie_power,1)))
280+
{
281+
std::string error_message{"Response values for the log link function or poisson family or tweedie family when tweedie_power<2 cannot be less than zero."};
282+
throw_error_if_response_is_negative(y,error_message);
283+
}
271284
}
272285

273-
void APLRRegressor::throw_error_if_response_is_not_between_0_and_1(const VectorXd &y)
286+
void APLRRegressor::throw_error_if_response_is_not_between_0_and_1(const VectorXd &y, const std::string &error_message)
274287
{
275288
bool response_is_less_than_zero{(y.array()<0.0).any()};
276289
bool response_is_greater_than_one{(y.array()>1.0).any()};
277290
if(response_is_less_than_zero || response_is_greater_than_one)
278-
throw std::runtime_error("Response values for "+link_function+" link functions cannot be less than zero or greater than one.");
291+
throw std::runtime_error(error_message);
279292
}
280293

281-
void APLRRegressor::throw_error_if_response_is_negative(const VectorXd &y)
294+
void APLRRegressor::throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message)
282295
{
283296
bool response_is_less_than_zero{(y.array()<0.0).any()};
284297
if(response_is_less_than_zero)
285-
throw std::runtime_error("Response values for "+link_function+" link functions cannot be less than zero.");
298+
throw std::runtime_error(error_message);
286299
}
287300

288-
void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const VectorXd &y)
301+
void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message)
289302
{
290303
bool response_is_not_greater_than_zero{(y.array()<=0.0).any()};
291304
if(response_is_not_greater_than_zero)
292-
throw std::runtime_error("Response values for "+link_function+" link functions must be greater than zero.");
305+
throw std::runtime_error(error_message);
293306

294307
}
295308

@@ -363,6 +376,25 @@ void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X,const
363376
sample_weight_validation[i]=sample_weight[validation_indexes[i]];
364377
}
365378
}
379+
380+
scale_training_observations_if_using_log_link_function();
381+
}
382+
383+
void APLRRegressor::scale_training_observations_if_using_log_link_function()
384+
{
385+
if(link_function=="log")
386+
{
387+
double inverse_scaling_factor{y_train.maxCoeff()/std::exp(1)};
388+
bool inverse_scaling_factor_is_not_zero{!is_approximately_zero(inverse_scaling_factor)};
389+
if(inverse_scaling_factor_is_not_zero)
390+
{
391+
scaling_factor_for_log_link_function=1/inverse_scaling_factor;
392+
y_train*=scaling_factor_for_log_link_function;
393+
y_validation*=scaling_factor_for_log_link_function;
394+
}
395+
else
396+
scaling_factor_for_log_link_function=1.0;
397+
}
366398
}
367399

368400
void APLRRegressor::initialize()
@@ -407,7 +439,7 @@ bool APLRRegressor::check_if_base_term_has_only_one_unique_value(size_t base_ter
407439
bool term_has_one_unique_value{true};
408440
for (size_t i = 1; i < rows; ++i)
409441
{
410-
bool observation_is_equal_to_previous{check_if_approximately_equal(X_train.col(base_term)[i], X_train.col(base_term)[i-1])};
442+
bool observation_is_equal_to_previous{is_approximately_equal(X_train.col(base_term)[i], X_train.col(base_term)[i-1])};
411443
if(!observation_is_equal_to_previous)
412444
{
413445
term_has_one_unique_value=false;
@@ -450,20 +482,7 @@ VectorXd APLRRegressor::differentiate_predictions()
450482
return 1.0/4.0 * (linear_predictor_current.array()/2.0).cosh().array().pow(-2);
451483
else if(link_function=="log")
452484
{
453-
double scaling{linear_predictor_current.maxCoeff()};
454-
return (linear_predictor_current.array()-scaling).array().exp();
455-
}
456-
else if(link_function=="tweedie")
457-
{
458-
VectorXd transformed_linear_predictor{transform_linear_predictor_to_negative(linear_predictor_current)};
459-
double scaling{std::pow((1-tweedie_power)*transformed_linear_predictor.mean(),-tweedie_power/(1-tweedie_power))};
460-
return scaling*((1-tweedie_power)*transformed_linear_predictor.array()).pow(tweedie_power/(1-tweedie_power));
461-
}
462-
else if(link_function=="inverse")
463-
{
464-
VectorXd transformed_linear_predictor{transform_linear_predictor_to_negative(linear_predictor_current)};
465-
double scaling{std::pow(transformed_linear_predictor.mean(),2)};
466-
return scaling * transformed_linear_predictor.array().pow(-2);
485+
return linear_predictor_current.array().exp();
467486
}
468487
return VectorXd(0);
469488
}
@@ -768,10 +787,7 @@ void APLRRegressor::select_the_best_term_and_update_errors(size_t boosting_step)
768787
if(validation_error_is_invalid)
769788
{
770789
abort_boosting=true;
771-
std::string warning_message{"Warning: Encountered numerical problems when calculating prediction errors in the previous boosting step. Not continuing with further boosting steps."};
772-
bool show_additional_warning{family=="poisson" || family=="tweedie" || family=="gamma" || (link_function!="identity" && link_function!="logit")};
773-
if(show_additional_warning)
774-
warning_message+=" For this combination of family and link_function, a reason may be too large or too small response values.";
790+
std::string warning_message{"Warning: Encountered numerical problems when calculating prediction errors in the previous boosting step. Not continuing with further boosting steps. One potential reason is if the combination of family and link_function is invalid."};
775791
std::cout<<warning_message<<"\n";
776792
}
777793
}
@@ -854,15 +870,15 @@ void APLRRegressor::update_coefficients_for_all_steps()
854870
//Filling down coefficient_steps for the intercept
855871
for (size_t j = 0; j < m; ++j) //For each boosting step
856872
{
857-
if(j>0 && check_if_approximately_zero(intercept_steps[j]) && !check_if_approximately_zero(intercept_steps[j-1]))
873+
if(j>0 && is_approximately_zero(intercept_steps[j]) && !is_approximately_zero(intercept_steps[j-1]))
858874
intercept_steps[j]=intercept_steps[j-1];
859875
}
860876
//Filling down coefficient_steps for each term in the model
861877
for (size_t i = 0; i < terms.size(); ++i) //For each term
862878
{
863879
for (size_t j = 0; j < m; ++j) //For each boosting step
864880
{
865-
if(j>0 && check_if_approximately_zero(terms[i].coefficient_steps[j]) && !check_if_approximately_zero(terms[i].coefficient_steps[j-1]))
881+
if(j>0 && is_approximately_zero(terms[i].coefficient_steps[j]) && !is_approximately_zero(terms[i].coefficient_steps[j-1]))
866882
terms[i].coefficient_steps[j]=terms[i].coefficient_steps[j-1];
867883
}
868884
}
@@ -893,12 +909,20 @@ void APLRRegressor::find_optimal_m_and_update_model_accordingly()
893909
terms_new.reserve(terms.size());
894910
for (size_t i = 0; i < terms.size(); ++i)
895911
{
896-
if(!check_if_approximately_zero(terms[i].coefficient))
912+
if(!is_approximately_zero(terms[i].coefficient))
897913
terms_new.push_back(terms[i]);
898914
}
899915
terms=std::move(terms_new);
900916
}
901917

918+
void APLRRegressor::revert_scaling_if_using_log_link_function()
919+
{
920+
if(link_function=="log")
921+
{
922+
intercept+=std::log(1/scaling_factor_for_log_link_function);
923+
}
924+
}
925+
902926
void APLRRegressor::name_terms(const MatrixXd &X, const std::vector<std::string> &X_names)
903927
{
904928
if(X_names.size()==0) //If nothing in X_names

cpp/constants.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#pragma once
22
#include <limits>
33

4-
const double NAN_DOUBLE{ std::numeric_limits<double>::quiet_NaN() };
5-
const double SMALL_NEGATIVE_VALUE{-0.000001};
4+
const double NAN_DOUBLE{ std::numeric_limits<double>::quiet_NaN() };

cpp/functions.h

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ using namespace Eigen;
1515
//implements relative method - do not use for comparing with zero
1616
//use this most of the time, tolerance needs to be meaningful in your context
1717
template<typename TReal>
18-
static bool check_if_approximately_equal(TReal a, TReal b, TReal tolerance = std::numeric_limits<TReal>::epsilon())
18+
static bool is_approximately_equal(TReal a, TReal b, TReal tolerance = std::numeric_limits<TReal>::epsilon())
1919
{
2020
if(std::isinf(a) && std::isinf(b) && std::signbit(a)==std::signbit(b))
2121
return true;
@@ -33,7 +33,7 @@ static bool check_if_approximately_equal(TReal a, TReal b, TReal tolerance = std
3333
//supply tolerance that is meaningful in your context
3434
//for example, default tolerance may not work if you are comparing double with float
3535
template<typename TReal>
36-
static bool check_if_approximately_zero(TReal a, TReal tolerance = std::numeric_limits<TReal>::epsilon())
36+
static bool is_approximately_zero(TReal a, TReal tolerance = std::numeric_limits<TReal>::epsilon())
3737
{
3838
if (std::fabs(a) <= tolerance)
3939
return true;
@@ -146,18 +146,6 @@ double calculate_sum_error(const VectorXd &errors)
146146
return error;
147147
}
148148

149-
VectorXd transform_linear_predictor_to_negative(const VectorXd &linear_predictor)
150-
{
151-
VectorXd transformed_linear_predictor{linear_predictor};
152-
for (size_t i = 0; i < static_cast<size_t>(transformed_linear_predictor.rows()); ++i)
153-
{
154-
bool row_is_positive{std::isgreaterequal(transformed_linear_predictor[i],0.0)};
155-
if(row_is_positive)
156-
transformed_linear_predictor[i]=SMALL_NEGATIVE_VALUE;
157-
}
158-
return transformed_linear_predictor;
159-
}
160-
161149
VectorXd transform_linear_predictor_to_predictions(const VectorXd &linear_predictor, const std::string &link_function="identity", double tweedie_power=1.5)
162150
{
163151
if(link_function=="identity")
@@ -169,10 +157,6 @@ VectorXd transform_linear_predictor_to_predictions(const VectorXd &linear_predic
169157
}
170158
else if(link_function=="log")
171159
return linear_predictor.array().exp();
172-
else if(link_function=="tweedie")
173-
return (transform_linear_predictor_to_negative(linear_predictor).array() * (1-tweedie_power)).array().pow(1/(1-tweedie_power));
174-
else if(link_function=="inverse")
175-
return -1.0 / transform_linear_predictor_to_negative(linear_predictor).array();
176160
return VectorXd(0);
177161
}
178162

@@ -274,10 +258,10 @@ size_t calculate_max_index_in_vector(T &vector)
274258
}
275259

276260
template <typename T> //type must be an Eigen Matrix or Vector
277-
bool check_if_matrix_has_nan_or_infinite_elements(const T &x)
261+
bool matrix_has_nan_or_infinite_elements(const T &x)
278262
{
279-
bool matrix_has_nan_or_infinite_elements{!x.allFinite()};
280-
if(matrix_has_nan_or_infinite_elements)
263+
bool has_nan_or_infinite_elements{!x.allFinite()};
264+
if(has_nan_or_infinite_elements)
281265
return true;
282266
else
283267
return false;
@@ -289,8 +273,8 @@ void throw_error_if_matrix_has_nan_or_infinite_elements(const T &x, const std::s
289273
bool matrix_is_empty{x.size()==0};
290274
if(matrix_is_empty) return;
291275

292-
bool matrix_has_nan_or_infinite_elements{check_if_matrix_has_nan_or_infinite_elements(x)};
293-
if(matrix_has_nan_or_infinite_elements)
276+
bool has_nan_or_infinite_elements{matrix_has_nan_or_infinite_elements(x)};
277+
if(has_nan_or_infinite_elements)
294278
{
295279
throw std::runtime_error(matrix_name + " has nan or infinite elements.");
296280
}

cpp/main.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ int main()
4646
//Saving results
4747
save_data("output.csv",predictions);
4848
std::cout<<"min validation_error "<<model.validation_error_steps.minCoeff()<<"\n\n";
49-
std::cout<<check_if_approximately_equal(model.validation_error_steps.minCoeff(),7.02559,0.00001)<<"\n";
49+
std::cout<<is_approximately_equal(model.validation_error_steps.minCoeff(),7.02559,0.00001)<<"\n";
5050

5151
std::cout<<"mean prediction "<<predictions.mean()<<"\n\n";
52-
std::cout<<check_if_approximately_equal(predictions.mean(),23.9213,0.0001)<<"\n";
52+
std::cout<<is_approximately_equal(predictions.mean(),23.9213,0.0001)<<"\n";
5353

5454
std::cout<<"best_m: "<<model.m<<"\n";
5555

cpp/term.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ Term::~Term()
115115
//Compare everything except given_terms
116116
bool Term::equals_not_comparing_given_terms(const Term &p1,const Term &p2)
117117
{
118-
bool split_point_and_direction{(check_if_approximately_equal(p1.split_point,p2.split_point) && p1.direction_right==p2.direction_right) || (std::isnan(p1.split_point) && std::isnan(p2.split_point))};
118+
bool split_point_and_direction{(is_approximately_equal(p1.split_point,p2.split_point) && p1.direction_right==p2.direction_right) || (std::isnan(p1.split_point) && std::isnan(p2.split_point))};
119119
bool base_term{p1.base_term==p2.base_term};
120120
return split_point_and_direction && base_term;
121121
}
@@ -191,7 +191,7 @@ void Term::calculate_given_terms_indices(const MatrixXd &X)
191191
VectorXd values_given_term{given_terms[j].calculate(X)};
192192
for (size_t i = 0; i < static_cast<size_t>(X.rows()); ++i) //for each row
193193
{
194-
if(check_if_approximately_zero(values_given_term[i])) //if zeroed out by given term
194+
if(is_approximately_zero(values_given_term[i])) //if zeroed out by given term
195195
{
196196
given_terms_indices.zeroed[count_zeroed]=i;
197197
++count_zeroed;
@@ -227,7 +227,7 @@ VectorXd Term::calculate(const MatrixXd &X)
227227
VectorXd values_given_term{given_terms[j].calculate(X)};
228228
for (size_t i = 0; i < static_cast<size_t>(values.size()); ++i) //for each row
229229
{
230-
if(check_if_approximately_zero(values_given_term[i]))
230+
if(is_approximately_zero(values_given_term[i]))
231231
values[i]=0;
232232
}
233233
}
@@ -365,7 +365,7 @@ void Term::setup_bins()
365365
potential_start_indexes.reserve(sorted_vectors.values_sorted.size());
366366
for (size_t i = start_row; i <= end_row; ++i)
367367
{
368-
bool is_eligible_start_index{i>0 && !check_if_approximately_equal(sorted_vectors.values_sorted[i],sorted_vectors.values_sorted[i-1])};
368+
bool is_eligible_start_index{i>0 && !is_approximately_equal(sorted_vectors.values_sorted[i],sorted_vectors.values_sorted[i-1])};
369369
if(is_eligible_start_index)
370370
potential_start_indexes.push_back(i);
371371
}

0 commit comments

Comments
 (0)