Commit 0a531fc

bugfix
1 parent 430a5bb · commit 0a531fc

6 files changed: +114 −189 lines changed

cpp/APLRRegressor.h

Lines changed: 17 additions & 33 deletions
```diff
@@ -551,7 +551,7 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
         std::string error_message{"Response values for the logit link function or binomial loss_function cannot be less than zero or greater than one."};
         throw_error_if_response_is_not_between_0_and_1(y, error_message);
     }
-    else if (loss_function == "gamma" || (loss_function == "tweedie" && std::isgreater(dispersion_parameter, 2)))
+    else if (loss_function == "gamma" || (loss_function == "tweedie" && std::isgreater(dispersion_parameter, 2.0)))
     {
         std::string error_message;
         if (loss_function == "tweedie")
@@ -560,7 +560,7 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
            error_message = "Response values for the " + loss_function + " loss_function must be greater than zero.";
         throw_error_if_vector_contains_non_positive_values(y, error_message);
     }
-    else if (link_function == "log" || loss_function == "poisson" || loss_function == "negative_binomial" || loss_function == "weibull" || (loss_function == "tweedie" && std::isless(dispersion_parameter, 2) && std::isgreater(dispersion_parameter, 1)))
+    else if (link_function == "log" || loss_function == "poisson" || loss_function == "negative_binomial" || loss_function == "weibull" || (loss_function == "tweedie" && std::isless(dispersion_parameter, 2.0) && std::isgreater(dispersion_parameter, 1.0)))
     {
         std::string error_message{"Response values for the log link function or poisson loss_function or negative binomial loss function or weibull loss function or tweedie loss_function when dispersion_parameter<2 cannot be less than zero."};
         throw_error_if_vector_contains_negative_values(y, error_message);
@@ -569,7 +569,7 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
     {
         std::string error_message{"Response values cannot be negative when using the negative_gini validation_tuning_metric."};
         throw_error_if_vector_contains_negative_values(y, error_message);
-        bool sum_is_zero{y.sum() == 0};
+        bool sum_is_zero{is_approximately_zero(y.sum())};
         if (sum_is_zero)
             throw std::runtime_error("Response values cannot sum to zero when using the negative_gini validation_tuning_metric.");
     }
```
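A note on the `negative_gini` hunk above: `y.sum() == 0` compares a floating-point accumulation against an exact literal, which misses sums that are zero only up to rounding. A minimal standalone sketch (the `main()` harness and test values are illustrative, not part of the commit):

```cpp
// Why an exact `== 0` test on a floating-point sum is fragile: entries that
// cancel exactly in real arithmetic can leave a tiny nonzero residue in
// double precision, so only a tolerance-based check is reliable.
#include <cstdio>
#include <Eigen/Dense>

int main()
{
    Eigen::VectorXd y(3);
    y << 0.1, 0.2, -0.3; // sums to zero in exact arithmetic

    double sum{y.sum()};
    std::printf("sum = %.17g\n", sum);         // typically ~5.6e-17, not 0
    std::printf("sum == 0: %d\n", sum == 0.0); // false
    // is_approximately_zero(sum) from cpp/functions.h returns true here,
    // since the residue is below the absolute tolerance floor.
}
```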
```diff
@@ -687,7 +687,10 @@ void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X, const
         {
             sample_weight_train[i] = sample_weight[train_indexes[i]];
         }
+        sample_weight_train /= sample_weight_train.mean();
     }
+    else
+        sample_weight_train = VectorXd::Constant(y_train.rows(), 1.0);
     bool groups_are_provided{group.size() > 0};
     if (groups_are_provided)
     {
@@ -720,7 +723,10 @@ void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X, const
         {
             sample_weight_validation[i] = sample_weight[validation_indexes[i]];
         }
+        sample_weight_validation /= sample_weight_validation.mean();
     }
+    else
+        sample_weight_validation = VectorXd::Constant(y_validation.rows(), 1.0);
     if (groups_are_provided)
     {
         group_validation.resize(validation_indexes.size());
```
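The two hunks above establish the invariant the rest of the commit relies on: `sample_weight_train` and `sample_weight_validation` always exist and have mean 1, whether or not the caller supplied weights. A small self-contained sketch of why the normalization is harmless (the weight values are illustrative, not from the commit):

```cpp
// Dividing a weight vector by its mean rescales it to mean 1 without
// changing relative weights; an absent weight vector is replaced by all
// ones, which already has that property.
#include <cassert>
#include <cmath>
#include <Eigen/Dense>

int main()
{
    Eigen::VectorXd sample_weight(4);
    sample_weight << 2.0, 4.0, 6.0, 8.0;

    Eigen::VectorXd normalized{sample_weight / sample_weight.mean()};
    assert(std::fabs(normalized.mean() - 1.0) < 1e-12);
    // Ratios between observations are preserved:
    assert(std::fabs(normalized[1] / normalized[0] - 2.0) < 1e-12);

    // With no weights provided, the diff uses ones; mean is already 1,
    // so weight sums equal observation counts.
    Eigen::VectorXd ones{Eigen::VectorXd::Constant(4, 1.0)};
    assert(std::fabs(ones.sum() - 4.0) < 1e-12);
}
```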
```diff
@@ -937,7 +943,7 @@ VectorXd APLRRegressor::calculate_neg_gradient_current()
         output = (y_train.array() - predictions_current.array()).sign() * mae;
         for (Eigen::Index i = 0; i < y_train.size(); ++i)
         {
-            if (y_train[i] < predictions_current[i])
+            if (std::isless(y_train[i], predictions_current[i]))
                 output[i] *= 1 - quantile;
             else
                 output[i] *= quantile;
@@ -984,20 +990,9 @@ VectorXd APLRRegressor::calculate_neg_gradient_current_for_group_mse(GroupData &
     }

     VectorXd output{VectorXd(y_train.rows())};
-    bool sample_weight_is_provided{sample_weight_train.size() > 0};
-    if (sample_weight_is_provided)
+    for (Eigen::Index i = 0; i < y_train.size(); ++i)
     {
-        for (Eigen::Index i = 0; i < y_train.size(); ++i)
-        {
-            output[i] = group_residuals_and_count.error[group[i]] * sample_weight_train[i];
-        }
-    }
-    else
-    {
-        for (Eigen::Index i = 0; i < y_train.size(); ++i)
-        {
-            output[i] = group_residuals_and_count.error[group[i]];
-        }
+        output[i] = group_residuals_and_count.error[group[i]] * sample_weight_train[i];
     }

     return output;
@@ -1093,10 +1088,7 @@ void APLRRegressor::execute_boosting_step(size_t boosting_step, Eigen::Index fol
 void APLRRegressor::update_intercept(size_t boosting_step)
 {
     double intercept_update;
-    if (sample_weight_train.size() == 0)
-        intercept_update = v * neg_gradient_current.mean();
-    else
-        intercept_update = v * (neg_gradient_current.array() * sample_weight_train.array()).sum() / sample_weight_train.array().sum();
+    intercept_update = v * (neg_gradient_current.array() * sample_weight_train.array()).sum() / sample_weight_train.array().sum();
     if (model_has_changed_in_this_boosting_step == false)
         model_has_changed_in_this_boosting_step = !is_approximately_equal(intercept_update, 0.0);
     linear_predictor_update = VectorXd::Constant(neg_gradient_current.size(), intercept_update);
```
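With that invariant, the deleted branch in `update_intercept` was redundant: for all-ones weights, the weighted average of the negative gradient is exactly the plain mean. A quick self-contained check (the value of `v` is an arbitrary stand-in for the learning rate, not taken from the library):

```cpp
// When sample_weight_train is all ones, the weighted-average form of the
// intercept update equals the old unweighted branch, which is why the
// if/else could be removed.
#include <cassert>
#include <cmath>
#include <Eigen/Dense>

int main()
{
    const double v{0.1}; // hypothetical learning rate
    Eigen::VectorXd neg_gradient(3);
    neg_gradient << 1.0, 2.0, 6.0;
    Eigen::VectorXd w{Eigen::VectorXd::Constant(3, 1.0)};

    double old_branch{v * neg_gradient.mean()};
    double new_form{v * (neg_gradient.array() * w.array()).sum() / w.array().sum()};
    assert(std::fabs(old_branch - new_form) < 1e-12); // both are 0.3
}
```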
```diff
@@ -1630,7 +1622,7 @@ void APLRRegressor::merge_similar_terms(const MatrixXd &X)
         {
             VectorXd values_i{terms[i].calculate(X)};
             VectorXd values_j{terms[j].calculate(X)};
-            bool terms_are_similar{values_i == values_j};
+            bool terms_are_similar{all_are_equal(values_i, values_j)};
             if (terms_are_similar)
             {
                 if (terms[i].get_interaction_level() > terms[j].get_interaction_level())
```
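On `merge_similar_terms`: two term columns computed along different arithmetic paths can differ in the last bit, so exact elementwise equality can misclassify terms that are equal for all practical purposes; the new `all_are_equal` checks sizes and compares entries with a tolerance instead. A minimal illustration (standalone harness, not from the commit):

```cpp
// Exact elementwise comparison misses values that differ only by rounding.
#include <cstdio>
#include <Eigen/Dense>

int main()
{
    Eigen::VectorXd a(2), b(2);
    a << 0.1 + 0.2, 1.0;
    b << 0.3, 1.0;

    bool exactly_equal{(a.array() == b.array()).all()}; // false: 0.1+0.2 != 0.3
    std::printf("exact: %d\n", exactly_equal);
    // all_are_equal(a, b) from cpp/functions.h would return true here,
    // since each pair of entries is within the relative tolerance used by
    // is_approximately_equal.
}
```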
```diff
@@ -1744,7 +1736,7 @@ std::string APLRRegressor::compute_raw_base_term_name(const Term &term, const st
     {
         double temp_split_point{term.split_point};
         std::string sign{"-"};
-        if (std::isless(temp_split_point, 0))
+        if (std::isless(temp_split_point, 0.0))
         {
             temp_split_point = -temp_split_point;
             sign = "+";
@@ -1839,15 +1831,7 @@ void APLRRegressor::write_output_to_cv_fold_models(Eigen::Index fold_index)
     cv_fold_models[fold_index].fold_index = fold_index;
     cv_fold_models[fold_index].min_training_prediction_or_response = min_training_prediction_or_response;
     cv_fold_models[fold_index].max_training_prediction_or_response = max_training_prediction_or_response;
-    bool sample_weight_is_provided{sample_weight_train.size() > 0};
-    if (sample_weight_is_provided)
-    {
-        cv_fold_models[fold_index].sample_weight_train_sum = sample_weight_train.sum();
-    }
-    else
-    {
-        cv_fold_models[fold_index].sample_weight_train_sum = static_cast<double>(y_train.rows());
-    }
+    cv_fold_models[fold_index].sample_weight_train_sum = sample_weight_train.sum();
 }

 void APLRRegressor::cleanup_after_fit()
@@ -1989,7 +1973,7 @@ void APLRRegressor::sort_terms()
 {
     std::sort(terms.begin(), terms.end(),
               [](const Term &a, const Term &b)
-              { return a.estimated_term_importance > b.estimated_term_importance ||
+              { return std::isgreater(a.estimated_term_importance, b.estimated_term_importance) ||
                        (is_approximately_equal(a.estimated_term_importance, b.estimated_term_importance) && (a.base_term < b.base_term)) ||
                        (is_approximately_equal(a.estimated_term_importance, b.estimated_term_importance) && (a.base_term == b.base_term) &&
                         std::isless(a.coefficient, b.coefficient)); });
```
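On the comparator change in `sort_terms`: `std::isgreater` orders ordinary values exactly like `>`, but unlike `>` it is specified not to raise the FE_INVALID floating-point exception when an argument is NaN, which makes it the quieter choice inside a sort predicate. A short standalone demonstration:

```cpp
// std::isgreater/std::isless agree with >/< on normal values and return
// false (without signaling) when either argument is NaN.
#include <cassert>
#include <cmath>
#include <limits>

int main()
{
    assert(std::isgreater(2.0, 1.0) == (2.0 > 1.0));
    assert(std::isless(-1.0, 0.0) == (-1.0 < 0.0));

    double nan{std::numeric_limits<double>::quiet_NaN()};
    assert(!std::isgreater(nan, 1.0)); // quietly false instead of signaling
    assert(!std::isless(nan, 1.0));
}
```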

cpp/functions.h

Lines changed: 40 additions & 35 deletions
```diff
@@ -13,28 +13,46 @@

 using namespace Eigen;

-template <typename TReal>
-static bool is_approximately_equal(TReal a, TReal b, TReal tolerance = std::numeric_limits<TReal>::epsilon())
+bool is_approximately_equal(double a, double b, double tolerance = std::numeric_limits<double>::epsilon())
 {
-    if (std::isinf(a) && std::isinf(b) && std::signbit(a) == std::signbit(b))
-        return true;
+    if (std::isinf(a) && std::isinf(b))
+    {
+        if (std::signbit(a) == std::signbit(b))
+            return true;
+        else
+            return false;
+    }

-    TReal diff = std::fabs(a - b);
-    if (diff <= tolerance)
-        return true;
+    double relative_tolerance;
+    if (std::isinf(a) || std::isinf(b))
+        relative_tolerance = (fabs(a) > fabs(b) ? fabs(b) : fabs(a)) * tolerance;
+    else
+        relative_tolerance = (fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * tolerance;
+    double absolute_tolerance{std::fmax(relative_tolerance, tolerance)};
+    bool equal{fabs(a - b) <= absolute_tolerance};

-    if (diff < std::fmax(std::fabs(a), std::fabs(b)) * tolerance)
-        return true;
+    return equal;
+}

-    return false;
+bool is_approximately_zero(double a, double tolerance = std::numeric_limits<double>::epsilon())
+{
+    return is_approximately_equal(a, 0.0, tolerance);
 }

-template <typename TReal>
-static bool is_approximately_zero(TReal a, TReal tolerance = std::numeric_limits<TReal>::epsilon())
+bool all_are_equal(VectorXd &v1, VectorXd &v2)
 {
-    if (std::fabs(a) <= tolerance)
-        return true;
-    return false;
+    if (v1.rows() != v2.rows())
+        return false;
+
+    for (Eigen::Index i = 0; i < v1.size(); ++i)
+    {
+        if (!is_approximately_equal(v1[i], v2[i]))
+        {
+            return false;
+        }
+    }
+
+    return true;
 }

 std::set<std::string> get_unique_strings(const std::vector<std::string> &string_vector)
```
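A usage sketch for the rewritten helpers: the tolerance now scales with the larger finite magnitude and is floored at the absolute tolerance, and same-sign infinities compare equal. The two functions below are condensed from the + side of this hunk so the example compiles on its own; the test values are illustrative.

```cpp
// Standalone check of the comparison behavior introduced above.
#include <cassert>
#include <cmath>
#include <limits>

bool is_approximately_equal(double a, double b, double tolerance = std::numeric_limits<double>::epsilon())
{
    if (std::isinf(a) && std::isinf(b))
        return std::signbit(a) == std::signbit(b);

    double relative_tolerance;
    if (std::isinf(a) || std::isinf(b))
        relative_tolerance = (std::fabs(a) > std::fabs(b) ? std::fabs(b) : std::fabs(a)) * tolerance;
    else
        relative_tolerance = (std::fabs(a) < std::fabs(b) ? std::fabs(b) : std::fabs(a)) * tolerance;
    double absolute_tolerance{std::fmax(relative_tolerance, tolerance)};
    return std::fabs(a - b) <= absolute_tolerance;
}

bool is_approximately_zero(double a, double tolerance = std::numeric_limits<double>::epsilon())
{
    return is_approximately_equal(a, 0.0, tolerance);
}

int main()
{
    double inf{std::numeric_limits<double>::infinity()};

    assert(is_approximately_equal(1e15, 1e15 + 0.1)); // difference is below the scaled tolerance
    assert(!is_approximately_equal(1.0, 1.001));      // well outside it
    assert(is_approximately_equal(inf, inf));         // same-sign infinities compare equal
    assert(!is_approximately_equal(inf, -inf));       // opposite signs do not
    assert(is_approximately_zero(1e-17));             // under the absolute floor (machine epsilon)
    assert(!is_approximately_zero(1e-3));
}
```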
```diff
@@ -105,22 +123,10 @@ GroupData calculate_group_errors_and_count(const VectorXd &y, const VectorXd &pr
         group_data.count[unique_group_value] = 0.0;
     }

-    bool sample_weight_is_provided{sample_weight.size() > 0};
-    if (sample_weight_is_provided)
-    {
-        for (Eigen::Index i = 0; i < group.size(); ++i)
-        {
-            group_data.error[group[i]] += (y[i] - predicted[i]) * sample_weight[i];
-            group_data.count[group[i]] += sample_weight[i];
-        }
-    }
-    else
+    for (Eigen::Index i = 0; i < group.size(); ++i)
     {
-        for (Eigen::Index i = 0; i < group.size(); ++i)
-        {
-            group_data.error[group[i]] += y[i] - predicted[i];
-            group_data.count[group[i]] += 1.0;
-        }
+        group_data.error[group[i]] += (y[i] - predicted[i]) * sample_weight[i];
+        group_data.count[group[i]] += sample_weight[i];
     }

     for (int unique_group_value : unique_groups)
```
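With weights guaranteed to exist, `calculate_group_errors_and_count` keeps only the weighted loop. A self-contained sketch (illustrative values, not from the commit) showing that all-ones weights reproduce the deleted unweighted branch:

```cpp
// With the all-ones default weights introduced in this commit, the weighted
// accumulation is identical to the removed unweighted branch: each residual
// is multiplied by 1 and each group count advances by 1.
#include <cassert>
#include <cmath>
#include <map>
#include <Eigen/Dense>

int main()
{
    Eigen::VectorXd y(3), predicted(3);
    y << 1.0, 2.0, 3.0;
    predicted << 0.5, 2.5, 2.0;
    Eigen::VectorXi group(3);
    group << 0, 0, 1;
    Eigen::VectorXd sample_weight{Eigen::VectorXd::Constant(3, 1.0)};

    std::map<int, double> error, count;
    for (Eigen::Index i = 0; i < group.size(); ++i)
    {
        error[group[i]] += (y[i] - predicted[i]) * sample_weight[i];
        count[group[i]] += sample_weight[i];
    }
    assert(std::fabs(error[0] - 0.0) < 1e-12); // 0.5 + (-0.5)
    assert(std::fabs(count[0] - 2.0) < 1e-12); // two observations, weight 1 each
}
```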
```diff
@@ -194,7 +200,7 @@ VectorXd calculate_weibull_errors(const VectorXd &y, const VectorXd &predicted,
     return errors;
 }

-VectorXd calculate_errors(const VectorXd &y, const VectorXd &predicted, const VectorXd &sample_weight = VectorXd(0), const std::string &loss_function = "mse",
+VectorXd calculate_errors(const VectorXd &y, const VectorXd &predicted, const VectorXd &sample_weight, const std::string &loss_function = "mse",
                           double dispersion_parameter = 1.5, const VectorXi &group = VectorXi(0), const std::set<int> &unique_groups = {}, double quantile = 0.5)
 {
     VectorXd errors;
@@ -221,8 +227,7 @@ VectorXd calculate_errors(const VectorXd &y, const VectorXd &predicted, const Ve
     else if (loss_function == "weibull")
         errors = calculate_weibull_errors(y, predicted, dispersion_parameter);

-    if (sample_weight.size() > 0)
-        errors = errors.array() * sample_weight.array();
+    errors = errors.array() * sample_weight.array();

     return errors;
 }
@@ -234,7 +239,7 @@ double calculate_mse_error_one_observation(double y, double predicted)
     return error;
 }

-double calculate_error_one_observation(double y, double predicted, double sample_weight = NAN_DOUBLE)
+double calculate_error_one_observation(double y, double predicted, double sample_weight)
 {
     double error{calculate_mse_error_one_observation(y, predicted)};

@@ -244,7 +249,7 @@ double calculate_error_one_observation(double y, double predicted, double sample
     return error;
 }

-double calculate_mean_error(const VectorXd &errors, const VectorXd &sample_weight = VectorXd(0))
+double calculate_mean_error(const VectorXd &errors, const VectorXd &sample_weight)
 {
     double error{std::numeric_limits<double>::infinity()};

```
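The signature changes in this file all point the same way: the defaults `= VectorXd(0)` and `= NAN_DOUBLE` are gone, so forgetting to pass weights is now a compile error instead of a silent unweighted code path. A minimal analogue (the `weighted_errors` function is hypothetical, not the library's API):

```cpp
// Removing a default argument turns "forgot to pass weights" from a silent
// unweighted code path into a compile error.
#include <cassert>
#include <Eigen/Dense>

// Before: const Eigen::VectorXd &sample_weight = Eigen::VectorXd(0)
// After:  the caller must always pass weights explicitly.
Eigen::VectorXd weighted_errors(const Eigen::VectorXd &errors, const Eigen::VectorXd &sample_weight)
{
    return (errors.array() * sample_weight.array()).matrix();
}

int main()
{
    Eigen::VectorXd errors(2);
    errors << 1.0, 4.0;
    Eigen::VectorXd w{Eigen::VectorXd::Constant(2, 1.0)};
    Eigen::VectorXd out{weighted_errors(errors, w)}; // explicit weights required
    assert(out[0] == 1.0 && out[1] == 4.0);
    // weighted_errors(errors); // would not compile: no default for sample_weight
}
```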