@@ -36,8 +36,6 @@ class Term
3636 double error_where_given_terms_are_zero;
3737 SortedData sorted_vectors;
3838 VectorXd negative_gradient_discretized;
39- VectorXd errors_initial;
40- double error_initial;
4139 std::vector<size_t > observations_in_bins;
4240 int monotonic_constraint;
4341
@@ -49,8 +47,8 @@ class Term
4947 void setup_bins ();
5048 void discretize_data_by_bin ();
5149 void estimate_split_point_on_discretized_data ();
52- void calculate_coefficient_and_error_on_discretized_data ( bool direction_right, double split_point );
53- void estimate_coefficient_and_error_on_all_data ( );
50+ void estimate_coefficient_and_error ( const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight, double error_added= 0.0 );
51+ double estimate_coefficient ( const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight=VectorXd( 0 ) );
5452 void cleanup_after_estimate_split_point ();
5553 void cleanup_after_fit ();
5654 void cleanup_when_this_term_was_added_as_a_given_term ();
@@ -174,7 +172,8 @@ void Term::estimate_split_point(const MatrixXd &X,const VectorXd &negative_gradi
174172 }
175173 discretize_data_by_bin ();
176174 estimate_split_point_on_discretized_data ();
177- estimate_coefficient_and_error_on_all_data ();
175+ estimate_coefficient_and_error (calculate_without_interactions (sorted_vectors.values_sorted ),sorted_vectors.negative_gradient_sorted ,
176+ sorted_vectors.sample_weight_sorted ,error_where_given_terms_are_zero);
178177 cleanup_after_estimate_split_point ();
179178 determine_if_can_be_used_as_a_given_term (X.col (base_term));
180179}
@@ -188,7 +187,7 @@ void Term::calculate_rows_to_zero_out_and_not_due_to_given_terms(const MatrixXd
188187 for (auto &given_term:given_terms)
189188 {
190189 VectorXd values_given_term{given_term.calculate (X)};
191- for (size_t i = 0 ; i < static_cast < size_t >( X.rows () ); ++i)
190+ for (Eigen::Index i = 0 ; i < X.rows (); ++i)
192191 {
193192 if (is_approximately_zero (values_given_term[i]))
194193 {
@@ -200,7 +199,7 @@ void Term::calculate_rows_to_zero_out_and_not_due_to_given_terms(const MatrixXd
200199 rows_to_zero_out_and_not_due_to_given_terms.zeroed .resize (X.rows ()-rows_to_zero_out_and_not_due_to_given_terms.not_zeroed .rows ());
201200 size_t count_zeroed{0 };
202201 size_t count_not_zeroed{0 };
203- for (size_t i = 0 ; i < static_cast < size_t >( X.rows () ); ++i)
202+ for (Eigen::Index i = 0 ; i < X.rows (); ++i)
204203 {
205204 bool value_is_non_zero{non_zero_values[i]==1 };
206205 if (value_is_non_zero)
@@ -233,7 +232,7 @@ VectorXd Term::calculate(const MatrixXd &X)
233232 for (auto &given_term:given_terms)
234233 {
235234 VectorXd values_given_term{given_term.calculate (X)};
236- for (size_t i = 0 ; i < static_cast < size_t >( values.size () ); ++i)
235+ for (Eigen::Index i = 0 ; i < values.size (); ++i)
237236 {
238237 if (is_approximately_zero (values_given_term[i]))
239238 values[i]=0 ;
@@ -276,14 +275,14 @@ void Term::calculate_error_where_given_terms_are_zero(const VectorXd &negative_g
276275 {
277276 if (sample_weight.size ()==0 )
278277 {
279- for (size_t i = 0 ; i < static_cast < size_t >( rows_to_zero_out_and_not_due_to_given_terms.zeroed .size () ); ++i)
278+ for (Eigen::Index i = 0 ; i < rows_to_zero_out_and_not_due_to_given_terms.zeroed .size (); ++i)
280279 {
281280 error_where_given_terms_are_zero+=calculate_error_one_observation (negative_gradient[rows_to_zero_out_and_not_due_to_given_terms.zeroed [i]],0.0 ,NAN_DOUBLE);
282281 }
283282 }
284283 else
285284 {
286- for (size_t i = 0 ; i < static_cast < size_t >( rows_to_zero_out_and_not_due_to_given_terms.zeroed .size () ); ++i)
285+ for (Eigen::Index i = 0 ; i < rows_to_zero_out_and_not_due_to_given_terms.zeroed .size (); ++i)
287286 {
288287 error_where_given_terms_are_zero+=calculate_error_one_observation (negative_gradient[rows_to_zero_out_and_not_due_to_given_terms.zeroed [i]],0.0 ,sample_weight[rows_to_zero_out_and_not_due_to_given_terms.zeroed [i]]);
289288 }
@@ -504,40 +503,32 @@ void Term::discretize_data_by_bin()
504503}
505504
506505void Term::estimate_split_point_on_discretized_data ()
507- {
508- errors_initial=calculate_errors (negative_gradient_discretized,VectorXd::Constant (negative_gradient_discretized.size (),0.0 ),
509- sample_weight_discretized,FAMILY_GAUSSIAN);
510- error_initial=calculate_sum_error (errors_initial);
511-
512- double split_point_temp;
513-
514- bool SPLIT_POINT_NAN{false };
515- calculate_coefficient_and_error_on_discretized_data (SPLIT_POINT_NAN, NAN_DOUBLE);
516- double error_cp_nan{split_point_search_errors_sum};
506+ {
507+ split_point=NAN_DOUBLE;
508+ estimate_coefficient_and_error (calculate_without_interactions (values_discretized),negative_gradient_discretized,sample_weight_discretized);
509+ double error_split_point_nan{split_point_search_errors_sum};
517510
518- bool DIRECTION_LEFT{false };
519511 double split_point_left{NAN_DOUBLE};
520- double error_min_left{error_cp_nan };
521- for ( size_t i = 0 ; i < bins_split_points_left. size (); ++i )
512+ double error_min_left{error_split_point_nan };
513+ for ( auto &bin: bins_split_points_left)
522514 {
523- split_point_temp=bins_split_points_left[i] ;
524-
525- calculate_coefficient_and_error_on_discretized_data (DIRECTION_LEFT, split_point_temp );
515+ split_point=bin ;
516+ direction_right= false ;
517+ estimate_coefficient_and_error ( calculate_without_interactions (values_discretized),negative_gradient_discretized,sample_weight_discretized );
526518 if (std::islessequal (split_point_search_errors_sum,error_min_left))
527519 {
528520 error_min_left=split_point_search_errors_sum;
529521 split_point_left=split_point;
530522 }
531523 }
532524
533- bool DIRECTION_RIGHT{true };
534525 double split_point_right{NAN_DOUBLE};
535- double error_min_right{error_cp_nan };
536- for ( size_t i = 0 ; i < bins_split_points_right. size (); ++i )
526+ double error_min_right{error_split_point_nan };
527+ for ( auto &bin: bins_split_points_right)
537528 {
538- split_point_temp=bins_split_points_right[i] ;
539-
540- calculate_coefficient_and_error_on_discretized_data (DIRECTION_RIGHT, split_point_temp );
529+ split_point=bin ;
530+ direction_right= true ;
531+ estimate_coefficient_and_error ( calculate_without_interactions (values_discretized),negative_gradient_discretized,sample_weight_discretized );
541532 if (std::islessequal (split_point_search_errors_sum,error_min_right))
542533 {
543534 error_min_right=split_point_search_errors_sum;
@@ -560,52 +551,52 @@ void Term::estimate_split_point_on_discretized_data()
560551 }
561552}
562553
563- void Term::calculate_coefficient_and_error_on_discretized_data ( bool direction_right, double split_point )
554+ void Term::estimate_coefficient_and_error ( const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight, double error_added )
564555{
565- this ->direction_right =direction_right;
566- this ->split_point =split_point;
567-
568- VectorXd values_sorted{calculate_without_interactions (values_discretized)};
569-
570- size_t index_start{0 };
571- size_t index_end{max_index_discretized};
572-
573- double xwx{0 };
574- double xwy{0 };
575- for (size_t i = index_start; i <= index_end; ++i)
556+ coefficient = estimate_coefficient (x,y,sample_weight);
557+ if (std::isfinite (coefficient))
576558 {
577- xwx+=values_sorted[i]*values_sorted[i]*sample_weight_discretized[i];
578- xwy+=values_sorted[i]*negative_gradient_discretized[i]*sample_weight_discretized[i];
579- }
580- if (xwx!=0 )
581- {
582- double error_reduction{0 };
583- coefficient=xwy/xwx*v;
559+ coefficient*=v;
584560 if (coefficient_adheres_to_monotonic_constraint ())
585561 {
586- double predicted;
587- double sample_weight_one_obs{NAN_DOUBLE};
588- for (size_t i = index_start; i <= index_end; ++i)
589- {
590- predicted=values_sorted[i]*coefficient;
591- if (sample_weight_discretized.size ()>0 )
592- sample_weight_one_obs=sample_weight_discretized[i];
593-
594- error_reduction+=errors_initial[i]-calculate_error_one_observation (negative_gradient_discretized[i],predicted,sample_weight_one_obs);
595- }
596- split_point_search_errors_sum=error_initial-error_reduction;
562+ VectorXd predictions{x*coefficient};
563+ split_point_search_errors_sum=calculate_sum_error (calculate_errors (y,predictions,sample_weight,FAMILY_GAUSSIAN))+error_added;
597564 }
598565 else
599566 {
600567 coefficient=0 ;
601- split_point_search_errors_sum=error_initial ;
568+ split_point_search_errors_sum=std::numeric_limits< double >:: infinity () ;
602569 }
603570 }
604571 else
605572 {
606573 coefficient=0 ;
607- split_point_search_errors_sum=error_initial;
574+ split_point_search_errors_sum=std::numeric_limits<double >::infinity ();
575+ }
576+ }
577+
578+ double Term::estimate_coefficient (const VectorXd &x, const VectorXd &y, const VectorXd &sample_weight)
579+ {
580+ double numerator{0 };
581+ double denominator{0 };
582+ bool sample_weight_is_provided{sample_weight.size ()>0 };
583+ if (sample_weight_is_provided)
584+ {
585+ for (Eigen::Index i = 0 ; i < y.size (); ++i)
586+ {
587+ numerator+=x[i]*y[i]*sample_weight[i];
588+ denominator+=x[i]*x[i]*sample_weight[i];
589+ }
590+ }
591+ else
592+ {
593+ for (Eigen::Index i = 0 ; i < y.size (); ++i)
594+ {
595+ numerator+=x[i]*y[i];
596+ denominator+=x[i]*x[i];
597+ }
608598 }
599+ return numerator/denominator;
609600}
610601
611602bool Term::coefficient_adheres_to_monotonic_constraint ()
@@ -620,43 +611,6 @@ bool Term::coefficient_adheres_to_monotonic_constraint()
620611 return coefficient_adheres;
621612}
622613
623- void Term::estimate_coefficient_and_error_on_all_data ()
624- {
625- sorted_vectors.values_sorted =calculate_without_interactions (sorted_vectors.values_sorted );
626- double xwx{0 };
627- double xwy{0 };
628- if (sorted_vectors.sample_weight_sorted .size ()>0 )
629- {
630- xwx=(sorted_vectors.values_sorted .array ()*sorted_vectors.values_sorted .array ()*sorted_vectors.sample_weight_sorted .array ()).sum ();
631- xwy=(sorted_vectors.values_sorted .array ()*sorted_vectors.negative_gradient_sorted .array ()*sorted_vectors.sample_weight_sorted .array ()).sum ();
632- }
633- else
634- {
635- xwx=(sorted_vectors.values_sorted .array ()*sorted_vectors.values_sorted .array ()).sum ();
636- xwy=(sorted_vectors.values_sorted .array ()*sorted_vectors.negative_gradient_sorted .array ()).sum ();
637- }
638- if (xwx!=0 )
639- {
640- coefficient=xwy/xwx*v;
641- if (coefficient_adheres_to_monotonic_constraint ())
642- {
643- VectorXd predictions{sorted_vectors.values_sorted *coefficient};
644- split_point_search_errors_sum=calculate_sum_error (calculate_errors (sorted_vectors.negative_gradient_sorted ,predictions,
645- sorted_vectors.sample_weight_sorted ,FAMILY_GAUSSIAN))+error_where_given_terms_are_zero;
646- }
647- else
648- {
649- coefficient=0 ;
650- split_point_search_errors_sum=std::numeric_limits<double >::infinity ();
651- }
652- }
653- else
654- {
655- coefficient=0 ;
656- split_point_search_errors_sum=std::numeric_limits<double >::infinity ();
657- }
658- }
659-
660614void Term::cleanup_after_estimate_split_point ()
661615{
662616 rows_to_zero_out_and_not_due_to_given_terms.not_zeroed .resize (0 );
@@ -665,7 +619,6 @@ void Term::cleanup_after_estimate_split_point()
665619 sorted_vectors.negative_gradient_sorted .resize (0 );
666620 sorted_vectors.sample_weight_sorted .resize (0 );
667621 negative_gradient_discretized.resize (0 );
668- errors_initial.resize (0 );
669622}
670623
671624void Term::determine_if_can_be_used_as_a_given_term (const VectorXd &x)
0 commit comments