
Commit e05d632 (parent 6e6ec50)

bugfix: support sample weights in the group_mse loss function and validation tuning metric

4 files changed: 58 additions & 28 deletions


cpp/APLRRegressor.h

Lines changed: 24 additions & 15 deletions
@@ -66,7 +66,7 @@ class APLRRegressor
     const std::vector<std::vector<size_t>> &interaction_constraints);
     bool check_if_base_term_has_only_one_unique_value(size_t base_term);
     void add_term_to_terms_eligible_current(Term &term);
-    VectorXd calculate_neg_gradient_current(const VectorXd &sample_weight_train);
+    VectorXd calculate_neg_gradient_current();
     void execute_boosting_steps();
     void execute_boosting_step(size_t boosting_step);
     std::vector<size_t> find_terms_eligible_current_indexes_for_a_base_term(size_t base_term);
@@ -109,7 +109,7 @@ class APLRRegressor
     void throw_error_if_sample_weight_contains_invalid_values(const VectorXd &y, const VectorXd &sample_weight);
     void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y, const std::string &error_message);
     void throw_error_if_vector_contains_negative_values(const VectorXd &y, const std::string &error_message);
-    void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message);
+    void throw_error_if_vector_contains_non_positive_values(const VectorXd &y, const std::string &error_message);
     void throw_error_if_dispersion_parameter_is_invalid();
     VectorXd differentiate_predictions_wrt_linear_predictor();
     void scale_response_if_using_log_link_function();
@@ -366,9 +366,9 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X, const VectorXd &y,
     throw_error_if_interaction_constraints_has_invalid_indexes(X, interaction_constraints);
     throw_error_if_response_contains_invalid_values(y);
     throw_error_if_sample_weight_contains_invalid_values(y, sample_weight);
-    bool group_is_of_incorrect_size{loss_function == "group_mse" && group.rows() != y.rows()};
+    bool group_is_of_incorrect_size{(loss_function == "group_mse" || validation_tuning_metric == "group_mse") && group.rows() != y.rows()};
     if (group_is_of_incorrect_size)
-        throw std::runtime_error("When loss_function is group_mse then y and group must have the same number of rows.");
+        throw std::runtime_error("When loss_function or validation_tuning_metric is group_mse then y and group must have the same number of rows.");
     bool other_data_is_provided{other_data.size() > 0};
     if (other_data_is_provided)
     {
@@ -439,7 +439,7 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
             error_message = "Response values for the " + loss_function + " loss_function when dispersion_parameter>2 must be greater than zero.";
         else
             error_message = "Response values for the " + loss_function + " loss_function must be greater than zero.";
-        throw_error_if_response_is_not_greater_than_zero(y, error_message);
+        throw_error_if_vector_contains_non_positive_values(y, error_message);
     }
     else if (link_function == "log" || loss_function == "poisson" || loss_function == "negative_binomial" || loss_function == "weibull" || (loss_function == "tweedie" && std::isless(dispersion_parameter, 2) && std::isgreater(dispersion_parameter, 1)))
     {
@@ -471,7 +471,7 @@ void APLRRegressor::throw_error_if_vector_contains_negative_values(const VectorX
         throw std::runtime_error(error_message);
 }

-void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message)
+void APLRRegressor::throw_error_if_vector_contains_non_positive_values(const VectorXd &y, const std::string &error_message)
 {
     bool response_is_not_greater_than_zero{(y.array() <= 0.0).any()};
     if (response_is_not_greater_than_zero)
@@ -485,10 +485,7 @@ void APLRRegressor::throw_error_if_sample_weight_contains_invalid_values(const V
     {
         if (sample_weight.size() != y.size())
            throw std::runtime_error("sample_weight must have 0 or as many rows as X and y.");
-        throw_error_if_vector_contains_negative_values(sample_weight, "sample_weight cannot contain negative values.");
-        bool sum_is_zero{sample_weight.sum() == 0};
-        if (sum_is_zero)
-            throw std::runtime_error("sample_weight cannot sum to zero.");
+        throw_error_if_vector_contains_non_positive_values(sample_weight, "all sample_weight values must be greater than zero.");
     }
 }

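In effect, the single check above tightens validation: zero-valued weights, previously accepted as long as the weights did not sum to zero, are now rejected outright.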
@@ -705,7 +702,7 @@ void APLRRegressor::add_term_to_terms_eligible_current(Term &term)
     terms_eligible_current.push_back(term);
 }

-VectorXd APLRRegressor::calculate_neg_gradient_current(const VectorXd &sample_weight_train)
+VectorXd APLRRegressor::calculate_neg_gradient_current()
 {
     VectorXd output;
     if (loss_function == "mse")
@@ -720,17 +717,29 @@ VectorXd APLRRegressor::calculate_neg_gradient_current(const VectorXd &sample_we
         output = (y_train.array() - predictions_current.array()).array() * predictions_current.array().pow(-dispersion_parameter);
     else if (loss_function == "group_mse")
     {
-        GroupData group_residuals_and_count{calculate_group_errors_and_count(y_train, predictions_current, group_train, unique_groups_train)};
+        GroupData group_residuals_and_count{calculate_group_errors_and_count(y_train, predictions_current, group_train, unique_groups_train,
+                                                                             sample_weight_train)};

         for (int unique_group_value : unique_groups_train)
         {
             group_residuals_and_count.error[unique_group_value] /= group_residuals_and_count.count[unique_group_value];
         }

         output = VectorXd(y_train.rows());
-        for (Eigen::Index i = 0; i < y_train.size(); ++i)
+        bool sample_weight_is_provided{sample_weight_train.size() > 0};
+        if (sample_weight_is_provided)
         {
-            output[i] = group_residuals_and_count.error[group_train[i]];
+            for (Eigen::Index i = 0; i < y_train.size(); ++i)
+            {
+                output[i] = group_residuals_and_count.error[group_train[i]] * sample_weight_train[i];
+            }
+        }
+        else
+        {
+            for (Eigen::Index i = 0; i < y_train.size(); ++i)
+            {
+                output[i] = group_residuals_and_count.error[group_train[i]];
+            }
         }
     }
     else if (loss_function == "mae")
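Since the body above still references sample_weight_train after the parameter was dropped, the method now evidently reads it as APLRRegressor state. To illustrate the weighted branch in isolation, here is a minimal standalone sketch using plain STL containers rather than the library's Eigen types, with made-up data; note that it normalizes per group only once, whereas the commit divides both inside calculate_group_errors_and_count and again in the retained loop above.

#include <iostream>
#include <map>
#include <vector>

int main()
{
    // Illustrative data: two groups, four observations.
    std::vector<double> y{1.0, 2.0, 3.0, 4.0};
    std::vector<double> predictions{0.5, 2.5, 2.0, 5.0};
    std::vector<int> group{0, 0, 1, 1};
    std::vector<double> sample_weight{1.0, 3.0, 2.0, 2.0};

    // Weighted residual sums and weight sums per group, as in the weighted
    // branch of calculate_group_errors_and_count.
    std::map<int, double> error;
    std::map<int, double> count;
    for (std::size_t i = 0; i < y.size(); ++i)
    {
        error[group[i]] += (y[i] - predictions[i]) * sample_weight[i];
        count[group[i]] += sample_weight[i];
    }
    // Normalize to a weighted mean residual per group.
    for (auto &entry : error)
        entry.second /= count[entry.first];

    // Each observation's negative gradient: its group's residual scaled by
    // its own weight, mirroring the weighted branch above.
    for (std::size_t i = 0; i < y.size(); ++i)
        std::cout << error[group[i]] * sample_weight[i] << "\n";
    return 0;
}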
@@ -892,7 +901,7 @@ void APLRRegressor::update_linear_predictor_and_predictions()

 void APLRRegressor::update_gradient_and_errors()
 {
-    neg_gradient_current = calculate_neg_gradient_current(sample_weight_train);
+    neg_gradient_current = calculate_neg_gradient_current();
     neg_gradient_nullmodel_errors_sum = calculate_sum_error(calculate_errors(neg_gradient_current, linear_predictor_null_model, sample_weight_train, MSE_LOSS_FUNCTION));
 }

cpp/functions.h

Lines changed: 30 additions & 10 deletions
@@ -94,33 +94,53 @@ VectorXd calculate_tweedie_errors(const VectorXd &y, const VectorXd &predicted,
 struct GroupData
 {
     std::map<int, double> error;
-    std::map<int, size_t> count;
+    std::map<int, double> count;
 };

-GroupData calculate_group_errors_and_count(const VectorXd &y, const VectorXd &predicted, const VectorXi &group, const std::set<int> &unique_groups)
+GroupData calculate_group_errors_and_count(const VectorXd &y, const VectorXd &predicted, const VectorXi &group, const std::set<int> &unique_groups,
+                                           const VectorXd &sample_weight)
 {
     GroupData group_data;
     for (int unique_group_value : unique_groups)
     {
         group_data.error[unique_group_value] = 0.0;
-        group_data.count[unique_group_value] = 0;
+        group_data.count[unique_group_value] = 0.0;
     }
-    for (Eigen::Index i = 0; i < group.size(); ++i)
+
+    bool sample_weight_is_provided{sample_weight.size() > 0};
+    if (sample_weight_is_provided)
     {
-        group_data.error[group[i]] += y[i] - predicted[i];
-        group_data.count[group[i]] += 1;
+        for (Eigen::Index i = 0; i < group.size(); ++i)
+        {
+            group_data.error[group[i]] += (y[i] - predicted[i]) * sample_weight[i];
+            group_data.count[group[i]] += sample_weight[i];
+        }
     }
+    else
+    {
+        for (Eigen::Index i = 0; i < group.size(); ++i)
+        {
+            group_data.error[group[i]] += y[i] - predicted[i];
+            group_data.count[group[i]] += 1.0;
+        }
+    }
+
+    for (int unique_group_value : unique_groups)
+    {
+        group_data.error[unique_group_value] = group_data.error[unique_group_value] / group_data.count[unique_group_value];
+    }
+
     return group_data;
 }

-VectorXd calculate_group_mse_errors(const VectorXd &y, const VectorXd &predicted, const VectorXi &group, const std::set<int> &unique_groups)
+VectorXd calculate_group_mse_errors(const VectorXd &y, const VectorXd &predicted, const VectorXi &group, const std::set<int> &unique_groups,
+                                    const VectorXd &sample_weight)
 {
-    GroupData group_residuals_and_count{calculate_group_errors_and_count(y, predicted, group, unique_groups)};
+    GroupData group_residuals_and_count{calculate_group_errors_and_count(y, predicted, group, unique_groups, sample_weight)};

     for (int unique_group_value : unique_groups)
     {
         group_residuals_and_count.error[unique_group_value] *= group_residuals_and_count.error[unique_group_value];
-        group_residuals_and_count.error[unique_group_value] /= group_residuals_and_count.count[unique_group_value];
     }

     VectorXd errors(y.rows());
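To make the weighted arithmetic concrete: calculate_group_errors_and_count now returns each group's weighted mean residual, which calculate_group_mse_errors then squares, so the earlier division by count there becomes redundant and is removed. A minimal standalone sketch of that flow (plain STL rather than Eigen, made-up data):

#include <iostream>
#include <map>
#include <vector>

int main()
{
    // Illustrative data: group 7 has two observations, group 9 has one.
    std::vector<double> y{1.0, 2.0, 3.0};
    std::vector<double> predicted{0.0, 3.0, 1.0};
    std::vector<int> group{7, 7, 9};
    std::vector<double> sample_weight{2.0, 1.0, 1.0};

    std::map<int, double> error;
    std::map<int, double> count;
    for (std::size_t i = 0; i < y.size(); ++i)
    {
        error[group[i]] += (y[i] - predicted[i]) * sample_weight[i]; // weighted residual sum
        count[group[i]] += sample_weight[i];                         // weight sum per group
    }
    for (auto &entry : error)
    {
        entry.second /= count[entry.first]; // weighted mean residual (calculate_group_errors_and_count)
        entry.second *= entry.second;       // squared (calculate_group_mse_errors)
    }

    // group 7: ((1-0)*2 + (2-3)*1) / (2+1) = 1/3, squared ~ 0.111
    // group 9: (3-1)*1 / 1 = 2, squared = 4
    for (const auto &entry : error)
        std::cout << "group " << entry.first << ": " << entry.second << "\n";
    return 0;
}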
@@ -191,7 +211,7 @@ VectorXd calculate_errors(const VectorXd &y, const VectorXd &predicted, const Ve
     else if (loss_function == "tweedie")
         errors = calculate_tweedie_errors(y, predicted, dispersion_parameter);
     else if (loss_function == "group_mse")
-        errors = calculate_group_mse_errors(y, predicted, group, unique_groups);
+        errors = calculate_group_mse_errors(y, predicted, group, unique_groups, sample_weight);
     else if (loss_function == "mae")
         errors = calculate_absolute_errors(y, predicted);
     else if (loss_function == "quantile")

cpp/tests.cpp

Lines changed: 3 additions & 2 deletions
@@ -600,7 +600,7 @@ class Tests
         VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
         VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};

-        VectorXd sample_weight{VectorXd::Constant(y_train.size(), 1.0)};
+        VectorXd sample_weight{VectorXd::Constant(y_train.size(), 0.5)};

         VectorXi group{X_train.col(0).cast<int>()};

@@ -610,6 +610,7 @@ class Tests
         // model.fit(X_train,y_train);
         // model.fit(X_train,y_train,sample_weight);
         // model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
+        // model.fit(X_train, y_train, VectorXd(0), {}, {}, {}, {}, group);
         model.fit(X_train, y_train, sample_weight, {}, {}, {}, {}, group);
         std::cout << "feature importance\n"
                   << model.feature_importance << "\n\n";
@@ -621,7 +622,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);

         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.4234, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 20.7268, 0.00001));
     }

     void test_aplrregressor_int_constr()
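The expected prediction mean changes here because the test now fits with non-unit sample weights (0.5 throughout), exercising the weighted group_mse paths introduced above; the new commented-out line additionally documents fitting with an empty weight vector and a group column.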

setup.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@

 setuptools.setup(
     name="aplr",
-    version="7.6.1",
+    version="7.6.2",
     description="Automatic Piecewise Linear Regression",
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",
