Commit c246776

refactor
1 parent 1efeea3 commit c246776

4 files changed: 70 additions, 115 deletions


cpp/APLRRegressor.h

Lines changed: 10 additions & 8 deletions
@@ -483,7 +483,8 @@ void APLRRegressor::initialize(const std::vector<size_t> &prioritized_predictors
     intercept_steps=VectorXd::Constant(m,0);

    terms_eligible_current.reserve(X_train.cols()*reserved_terms_times_num_x);
-    for (size_t i = 0; i < static_cast<size_t>(X_train.cols()); ++i)
+    size_t X_train_cols{static_cast<size_t>(X_train.cols())};
+    for (size_t i = 0; i < X_train_cols; ++i)
     {
         bool term_has_one_unique_value{check_if_base_term_has_only_one_unique_value(i)};
         Term copy_of_base_term{Term(i)};
@@ -495,7 +496,7 @@ void APLRRegressor::initialize(const std::vector<size_t> &prioritized_predictors
     }

     predictor_indexes.resize(X_train.cols());
-    for (size_t i = 0; i < static_cast<size_t>(X_train.cols()); ++i)
+    for (size_t i = 0; i < X_train_cols; ++i)
     {
         predictor_indexes[i]=i;
     }
@@ -857,7 +858,7 @@ void APLRRegressor::add_necessary_given_terms_to_interaction(Term &interaction,
     }

     bool given_term_provides_an_unique_zero{false};
-    for (size_t row = 0; row < static_cast<size_t>(X_train.rows()); ++row)
+    for (Eigen::Index row = 0; row < X_train.rows(); ++row)
     {
         given_term_provides_an_unique_zero = combined_value_indicator_for_the_other_given_terms[row]>0 && value_indicator_for_each_given_term.col(col)[row]==0;
         if(given_term_provides_an_unique_zero)
@@ -892,7 +893,7 @@ void APLRRegressor::add_promising_interactions_and_select_the_best_one()
     size_t best_term_before_interactions{best_term_index};
     bool best_term_before_interactions_was_not_selected{best_term_before_interactions==std::numeric_limits<size_t>::max()};
     bool error_is_less_than_for_best_term_before_interactions;
-    for (size_t j = 0; j < static_cast<size_t>(sorted_indexes_of_errors_for_interactions_to_consider.size()); ++j) //for each interaction to consider starting from lowest to highest error
+    for (Eigen::Index j = 0; j < sorted_indexes_of_errors_for_interactions_to_consider.size(); ++j) //for each interaction to consider starting from lowest to highest error
     {
         bool allowed_to_add_one_interaction{interactions_eligible<max_interactions};
         if(allowed_to_add_one_interaction)
@@ -1121,7 +1122,7 @@ void APLRRegressor::revert_scaling_if_using_log_link_function()
 {
     y_train/=scaling_factor_for_log_link_function;
     intercept+=std::log(1/scaling_factor_for_log_link_function);
-    for (size_t i = 0; i < static_cast<size_t>(intercept_steps.size()); ++i)
+    for (Eigen::Index i = 0; i < intercept_steps.size(); ++i)
     {
         intercept_steps[i]+=std::log(1/scaling_factor_for_log_link_function);
     }
@@ -1134,7 +1135,8 @@ void APLRRegressor::name_terms(const MatrixXd &X, const std::vector<std::string>
     if(x_names_not_provided)
     {
         std::vector<std::string> temp(X.cols());
-        for (size_t i = 0; i < static_cast<size_t>(X.cols()); ++i)
+        size_t X_cols{static_cast<size_t>(X.cols())};
+        for (size_t i = 0; i < X_cols; ++i)
         {
             temp[i]="X"+std::to_string(i+1);
         }
@@ -1207,7 +1209,7 @@ void APLRRegressor::calculate_feature_importance_on_validation_set()
 {
     feature_importance=VectorXd::Constant(number_of_base_terms,0);
     MatrixXd li{calculate_local_feature_importance(X_validation)};
-    for (size_t i = 0; i < static_cast<size_t>(li.cols()); ++i) //for each column calculate mean abs values
+    for (Eigen::Index i = 0; i < li.cols(); ++i) //for each column calculate mean abs values
     {
         feature_importance[i]=li.col(i).cwiseAbs().mean();
     }
@@ -1302,7 +1304,7 @@ VectorXd APLRRegressor::calculate_linear_predictor(const MatrixXd &X)

 void APLRRegressor::cap_predictions_to_minmax_in_training(VectorXd &predictions)
 {
-    for (size_t i = 0; i < static_cast<size_t>(predictions.rows()); ++i)
+    for (Eigen::Index i = 0; i < predictions.rows(); ++i)
     {
         if(std::isgreater(predictions[i],max_training_prediction_or_response))
             predictions[i]=max_training_prediction_or_response;
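
Most hunks in this file, and all four in cpp/functions.h below, make the same mechanical change: loop counters that were cast with static_cast<size_t> now use Eigen::Index, the signed integer type (std::ptrdiff_t by default) that Eigen's rows(), cols(), and size() actually return. A minimal standalone sketch of the two idioms, assuming only that the Eigen headers are available (my example, not project code):

// Standalone sketch (not project code) contrasting the two loop idioms.
// Eigen's rows(), cols() and size() return Eigen::Index, a signed type,
// so an unsigned size_t counter forced a cast on every loop bound.
// Requires Eigen; compile with e.g. g++ -I /path/to/eigen main.cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 1.0, 5.0);

    // Old idiom: unsigned counter, signed bound cast on every iteration.
    for (size_t i = 0; i < static_cast<size_t>(v.size()); ++i)
        v[i] *= 2.0;

    // New idiom: signed Eigen::Index counter, no cast needed and no
    // signed/unsigned comparison warnings.
    for (Eigen::Index i = 0; i < v.size(); ++i)
        std::cout << v[i] << '\n';

    return 0;
}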

cpp/functions.h

Lines changed: 4 additions & 4 deletions
@@ -150,7 +150,7 @@ VectorXd calculate_exp_of_linear_predictor_adjusted_for_numerical_problems(const
     VectorXd exp_of_linear_predictor{linear_predictor.array().exp()};
     double min_exp_of_linear_predictor{std::exp(min_exponent)};
     double max_exp_of_linear_predictor{std::exp(max_exponent)};
-    for (size_t i = 0; i < static_cast<size_t>(linear_predictor.rows()); ++i)
+    for (Eigen::Index i = 0; i < linear_predictor.rows(); ++i)
     {
         bool linear_predictor_is_too_small{std::isless(linear_predictor[i], min_exponent)};
         if(linear_predictor_is_too_small)
@@ -270,7 +270,7 @@ void throw_error_if_matrix_has_nan_or_infinite_elements(const T &x, const std::s
 VectorXi calculate_indicator(const VectorXd &v)
 {
     VectorXi indicator{VectorXi::Constant(v.rows(),1)};
-    for (size_t i = 0; i < static_cast<size_t>(v.size()); ++i)
+    for (Eigen::Index i = 0; i < v.size(); ++i)
     {
         if(is_approximately_zero(v[i]))
             indicator[i]=0;
@@ -281,7 +281,7 @@ VectorXi calculate_indicator(const VectorXi &v)
 VectorXi calculate_indicator(const VectorXi &v)
 {
     VectorXi indicator{VectorXi::Constant(v.rows(),1)};
-    for (size_t i = 0; i < static_cast<size_t>(v.size()); ++i)
+    for (Eigen::Index i = 0; i < v.size(); ++i)
     {
         if(v[i]==0)
             indicator[i]=0;
@@ -369,7 +369,7 @@ double trapezoidal_integration(const VectorXd &y, const VectorXd &x)
     if(y_is_large_enough && x_and_y_have_the_same_size)
     {
         output=0;
-        for (size_t i = 1; i < static_cast<size_t>(y.size()); ++i)
+        for (Eigen::Index i = 1; i < y.size(); ++i)
         {
             double delta_y{(y[i]+y[i-1])/2};
             double delta_x{x[i]-x[i-1]};
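
The last hunk above touches trapezoidal_integration. For orientation, a self-contained version of that routine in the new loop style could look as follows; this is a sketch assuming only Eigen, and it omits the input-size validation the real function performs:

// Sketch of trapezoidal integration in the new Eigen::Index loop style.
// Assumes Eigen; omits the size checks present in cpp/functions.h.
#include <Eigen/Dense>
#include <iostream>

double trapezoidal_integration(const Eigen::VectorXd &y, const Eigen::VectorXd &x)
{
    double output{0.0};
    for (Eigen::Index i = 1; i < y.size(); ++i)
    {
        double mean_y{(y[i] + y[i - 1]) / 2}; // average height of segment i
        double delta_x{x[i] - x[i - 1]};      // width of segment i
        output += mean_y * delta_x;
    }
    return output;
}

int main()
{
    Eigen::VectorXd x = Eigen::VectorXd::LinSpaced(101, 0.0, 1.0);
    Eigen::VectorXd y = x.array().square().matrix();
    // Prints about 0.33335; the exact integral of x^2 on [0,1] is 1/3.
    std::cout << trapezoidal_integration(y, x) << '\n';
}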

cpp/term.h

Lines changed: 55 additions & 102 deletions
@@ -36,8 +36,6 @@ class Term
     double error_where_given_terms_are_zero;
     SortedData sorted_vectors;
     VectorXd negative_gradient_discretized;
-    VectorXd errors_initial;
-    double error_initial;
     std::vector<size_t> observations_in_bins;
     int monotonic_constraint;

@@ -49,8 +47,8 @@ class Term
     void setup_bins();
     void discretize_data_by_bin();
     void estimate_split_point_on_discretized_data();
-    void calculate_coefficient_and_error_on_discretized_data(bool direction_right, double split_point);
-    void estimate_coefficient_and_error_on_all_data();
+    void estimate_coefficient_and_error(const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight,double error_added=0.0);
+    double estimate_coefficient(const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight=VectorXd(0));
     void cleanup_after_estimate_split_point();
     void cleanup_after_fit();
     void cleanup_when_this_term_was_added_as_a_given_term();
@@ -174,7 +172,8 @@ void Term::estimate_split_point(const MatrixXd &X,const VectorXd &negative_gradi
     }
     discretize_data_by_bin();
     estimate_split_point_on_discretized_data();
-    estimate_coefficient_and_error_on_all_data();
+    estimate_coefficient_and_error(calculate_without_interactions(sorted_vectors.values_sorted),sorted_vectors.negative_gradient_sorted,
+        sorted_vectors.sample_weight_sorted,error_where_given_terms_are_zero);
     cleanup_after_estimate_split_point();
     determine_if_can_be_used_as_a_given_term(X.col(base_term));
 }
@@ -188,7 +187,7 @@ void Term::calculate_rows_to_zero_out_and_not_due_to_given_terms(const MatrixXd
     for (auto &given_term:given_terms)
     {
         VectorXd values_given_term{given_term.calculate(X)};
-        for (size_t i = 0; i < static_cast<size_t>(X.rows()); ++i)
+        for (Eigen::Index i = 0; i < X.rows(); ++i)
         {
             if(is_approximately_zero(values_given_term[i]))
             {
@@ -200,7 +199,7 @@ void Term::calculate_rows_to_zero_out_and_not_due_to_given_terms(const MatrixXd
     rows_to_zero_out_and_not_due_to_given_terms.zeroed.resize(X.rows()-rows_to_zero_out_and_not_due_to_given_terms.not_zeroed.rows());
     size_t count_zeroed{0};
     size_t count_not_zeroed{0};
-    for (size_t i = 0; i < static_cast<size_t>(X.rows()); ++i)
+    for (Eigen::Index i = 0; i < X.rows(); ++i)
     {
         bool value_is_non_zero{non_zero_values[i]==1};
         if(value_is_non_zero)
@@ -233,7 +232,7 @@ VectorXd Term::calculate(const MatrixXd &X)
     for(auto &given_term:given_terms)
     {
         VectorXd values_given_term{given_term.calculate(X)};
-        for (size_t i = 0; i < static_cast<size_t>(values.size()); ++i)
+        for (Eigen::Index i = 0; i < values.size(); ++i)
         {
             if(is_approximately_zero(values_given_term[i]))
                 values[i]=0;
@@ -276,14 +275,14 @@ void Term::calculate_error_where_given_terms_are_zero(const VectorXd &negative_g
 {
     if(sample_weight.size()==0)
     {
-        for (size_t i = 0; i < static_cast<size_t>(rows_to_zero_out_and_not_due_to_given_terms.zeroed.size()); ++i)
+        for (Eigen::Index i = 0; i < rows_to_zero_out_and_not_due_to_given_terms.zeroed.size(); ++i)
         {
             error_where_given_terms_are_zero+=calculate_error_one_observation(negative_gradient[rows_to_zero_out_and_not_due_to_given_terms.zeroed[i]],0.0,NAN_DOUBLE);
         }
     }
     else
     {
-        for (size_t i = 0; i < static_cast<size_t>(rows_to_zero_out_and_not_due_to_given_terms.zeroed.size()); ++i)
+        for (Eigen::Index i = 0; i < rows_to_zero_out_and_not_due_to_given_terms.zeroed.size(); ++i)
        {
             error_where_given_terms_are_zero+=calculate_error_one_observation(negative_gradient[rows_to_zero_out_and_not_due_to_given_terms.zeroed[i]],0.0,sample_weight[rows_to_zero_out_and_not_due_to_given_terms.zeroed[i]]);
         }
@@ -504,40 +503,32 @@ void Term::discretize_data_by_bin()
 }

 void Term::estimate_split_point_on_discretized_data()
-{
-    errors_initial=calculate_errors(negative_gradient_discretized,VectorXd::Constant(negative_gradient_discretized.size(),0.0),
-        sample_weight_discretized,FAMILY_GAUSSIAN);
-    error_initial=calculate_sum_error(errors_initial);
-
-    double split_point_temp;
-
-    bool SPLIT_POINT_NAN{false};
-    calculate_coefficient_and_error_on_discretized_data(SPLIT_POINT_NAN, NAN_DOUBLE);
-    double error_cp_nan{split_point_search_errors_sum};
+{
+    split_point=NAN_DOUBLE;
+    estimate_coefficient_and_error(calculate_without_interactions(values_discretized),negative_gradient_discretized,sample_weight_discretized);
+    double error_split_point_nan{split_point_search_errors_sum};

-    bool DIRECTION_LEFT{false};
     double split_point_left{NAN_DOUBLE};
-    double error_min_left{error_cp_nan};
-    for (size_t i = 0; i < bins_split_points_left.size(); ++i)
+    double error_min_left{error_split_point_nan};
+    for(auto &bin:bins_split_points_left)
     {
-        split_point_temp=bins_split_points_left[i];
-
-        calculate_coefficient_and_error_on_discretized_data(DIRECTION_LEFT, split_point_temp);
+        split_point=bin;
+        direction_right=false;
+        estimate_coefficient_and_error(calculate_without_interactions(values_discretized),negative_gradient_discretized,sample_weight_discretized);
         if(std::islessequal(split_point_search_errors_sum,error_min_left))
         {
             error_min_left=split_point_search_errors_sum;
             split_point_left=split_point;
         }
     }

-    bool DIRECTION_RIGHT{true};
     double split_point_right{NAN_DOUBLE};
-    double error_min_right{error_cp_nan};
-    for (size_t i = 0; i < bins_split_points_right.size(); ++i)
+    double error_min_right{error_split_point_nan};
+    for(auto &bin:bins_split_points_right)
     {
-        split_point_temp=bins_split_points_right[i];
-
-        calculate_coefficient_and_error_on_discretized_data(DIRECTION_RIGHT, split_point_temp);
+        split_point=bin;
+        direction_right=true;
+        estimate_coefficient_and_error(calculate_without_interactions(values_discretized),negative_gradient_discretized,sample_weight_discretized);
         if(std::islessequal(split_point_search_errors_sum,error_min_right))
         {
             error_min_right=split_point_search_errors_sum;
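
Note that the rewritten search drives estimate_coefficient_and_error through member state: it assigns split_point and direction_right before each call instead of passing them as arguments. The surrounding argmin pattern, reduced to a standalone sketch (evaluate_error is a hypothetical stand-in for estimate_coefficient_and_error, not a project function):

// Simplified sketch of the argmin pattern used by the split-point search:
// evaluate each candidate, keep the one with the lowest error.
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

double evaluate_error(double split_point)
{
    return (split_point - 3.0) * (split_point - 3.0); // toy error surface
}

int main()
{
    std::vector<double> candidates{1.0, 2.0, 3.0, 4.0};
    double best_split{std::numeric_limits<double>::quiet_NaN()};
    double best_error{std::numeric_limits<double>::infinity()};
    for (auto &candidate : candidates)
    {
        double error{evaluate_error(candidate)};
        // islessequal mirrors the diff's comparison and never raises on NaN.
        if (std::islessequal(error, best_error))
        {
            best_error = error;
            best_split = candidate;
        }
    }
    std::cout << best_split << '\n'; // prints 3
}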
@@ -560,52 +551,52 @@ void Term::estimate_split_point_on_discretized_data()
     }
 }

-void Term::calculate_coefficient_and_error_on_discretized_data(bool direction_right, double split_point)
+void Term::estimate_coefficient_and_error(const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight, double error_added)
 {
-    this->direction_right=direction_right;
-    this->split_point=split_point;
-
-    VectorXd values_sorted{calculate_without_interactions(values_discretized)};
-
-    size_t index_start{0};
-    size_t index_end{max_index_discretized};
-
-    double xwx{0};
-    double xwy{0};
-    for (size_t i = index_start; i <= index_end; ++i)
+    coefficient = estimate_coefficient(x,y,sample_weight);
+    if(std::isfinite(coefficient))
     {
-        xwx+=values_sorted[i]*values_sorted[i]*sample_weight_discretized[i];
-        xwy+=values_sorted[i]*negative_gradient_discretized[i]*sample_weight_discretized[i];
-    }
-    if(xwx!=0)
-    {
-        double error_reduction{0};
-        coefficient=xwy/xwx*v;
+        coefficient*=v;
         if(coefficient_adheres_to_monotonic_constraint())
         {
-            double predicted;
-            double sample_weight_one_obs{NAN_DOUBLE};
-            for (size_t i = index_start; i <= index_end; ++i)
-            {
-                predicted=values_sorted[i]*coefficient;
-                if(sample_weight_discretized.size()>0)
-                    sample_weight_one_obs=sample_weight_discretized[i];
-
-                error_reduction+=errors_initial[i]-calculate_error_one_observation(negative_gradient_discretized[i],predicted,sample_weight_one_obs);
-            }
-            split_point_search_errors_sum=error_initial-error_reduction;
+            VectorXd predictions{x*coefficient};
+            split_point_search_errors_sum=calculate_sum_error(calculate_errors(y,predictions,sample_weight,FAMILY_GAUSSIAN))+error_added;
         }
         else
         {
             coefficient=0;
-            split_point_search_errors_sum=error_initial;
+            split_point_search_errors_sum=std::numeric_limits<double>::infinity();
         }
     }
     else
     {
         coefficient=0;
-        split_point_search_errors_sum=error_initial;
+        split_point_search_errors_sum=std::numeric_limits<double>::infinity();
+    }
+}
+
+double Term::estimate_coefficient(const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight)
+{
+    double numerator{0};
+    double denominator{0};
+    bool sample_weight_is_provided{sample_weight.size()>0};
+    if(sample_weight_is_provided)
+    {
+        for (Eigen::Index i = 0; i < y.size(); ++i)
+        {
+            numerator+=x[i]*y[i]*sample_weight[i];
+            denominator+=x[i]*x[i]*sample_weight[i];
+        }
+    }
+    else
+    {
+        for (Eigen::Index i = 0; i < y.size(); ++i)
+        {
+            numerator+=x[i]*y[i];
+            denominator+=x[i]*x[i];
+        }
     }
+    return numerator/denominator;
 }

 bool Term::coefficient_adheres_to_monotonic_constraint()
@@ -620,43 +611,6 @@ bool Term::coefficient_adheres_to_monotonic_constraint()
     return coefficient_adheres;
 }

-void Term::estimate_coefficient_and_error_on_all_data()
-{
-    sorted_vectors.values_sorted=calculate_without_interactions(sorted_vectors.values_sorted);
-    double xwx{0};
-    double xwy{0};
-    if(sorted_vectors.sample_weight_sorted.size()>0)
-    {
-        xwx=(sorted_vectors.values_sorted.array()*sorted_vectors.values_sorted.array()*sorted_vectors.sample_weight_sorted.array()).sum();
-        xwy=(sorted_vectors.values_sorted.array()*sorted_vectors.negative_gradient_sorted.array()*sorted_vectors.sample_weight_sorted.array()).sum();
-    }
-    else
-    {
-        xwx=(sorted_vectors.values_sorted.array()*sorted_vectors.values_sorted.array()).sum();
-        xwy=(sorted_vectors.values_sorted.array()*sorted_vectors.negative_gradient_sorted.array()).sum();
-    }
-    if(xwx!=0)
-    {
-        coefficient=xwy/xwx*v;
-        if(coefficient_adheres_to_monotonic_constraint())
-        {
-            VectorXd predictions{sorted_vectors.values_sorted*coefficient};
-            split_point_search_errors_sum=calculate_sum_error(calculate_errors(sorted_vectors.negative_gradient_sorted,predictions,
-                sorted_vectors.sample_weight_sorted,FAMILY_GAUSSIAN))+error_where_given_terms_are_zero;
-        }
-        else
-        {
-            coefficient=0;
-            split_point_search_errors_sum=std::numeric_limits<double>::infinity();
-        }
-    }
-    else
-    {
-        coefficient=0;
-        split_point_search_errors_sum=std::numeric_limits<double>::infinity();
-    }
-}
-
 void Term::cleanup_after_estimate_split_point()
 {
     rows_to_zero_out_and_not_due_to_given_terms.not_zeroed.resize(0);
@@ -665,7 +619,6 @@ void Term::cleanup_after_estimate_split_point()
     sorted_vectors.negative_gradient_sorted.resize(0);
     sorted_vectors.sample_weight_sorted.resize(0);
     negative_gradient_discretized.resize(0);
-    errors_initial.resize(0);
 }

 void Term::determine_if_can_be_used_as_a_given_term(const VectorXd &x)
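
The heart of the term.h refactor: the two near-duplicate routines, calculate_coefficient_and_error_on_discretized_data and estimate_coefficient_and_error_on_all_data, are folded into estimate_coefficient_and_error plus estimate_coefficient. The latter returns the closed-form slope of a no-intercept weighted least-squares fit, beta = sum(w_i*x_i*y_i) / sum(w_i*x_i*x_i). A standalone sketch of that formula, with std::vector standing in for the Eigen types (my example, not project code):

// Sketch of the no-intercept weighted least-squares slope computed by the
// new Term::estimate_coefficient. An empty weight vector means unweighted.
#include <iostream>
#include <vector>

double weighted_ls_slope(const std::vector<double> &x, const std::vector<double> &y,
                         const std::vector<double> &w = {})
{
    double numerator{0.0};
    double denominator{0.0};
    for (size_t i = 0; i < y.size(); ++i)
    {
        double weight{w.empty() ? 1.0 : w[i]};
        numerator += weight * x[i] * y[i];
        denominator += weight * x[i] * x[i];
    }
    // If x is identically zero this divides by zero, yielding NaN or inf
    // on IEEE doubles rather than trapping.
    return numerator / denominator;
}

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0};
    std::vector<double> y{2.1, 3.9, 6.2};
    std::cout << weighted_ls_slope(x, y) << '\n'; // prints approximately 2.0357
}

One behavioral detail visible in the diff: because a zero denominator produces NaN or infinity instead of trapping, the caller's std::isfinite(coefficient) check replaces the old xwx!=0 guard.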

setup.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@

 setuptools.setup(
     name='aplr',
-    version='2.1.0',
+    version='2.2.0',
     description='Automatic Piecewise Linear Regression',
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",
