Skip to content

Commit 1efeea3

Browse files
gini
1 parent 6859bbf commit 1efeea3

File tree

7 files changed

+227
-10
lines changed

7 files changed

+227
-10
lines changed

API_REFERENCE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ Limits 1) the number of terms already in the model that can be considered as int
5353
Specifies the variance power for the "tweedie" ***family***.
5454

5555
#### validation_tuning_metric (default = "default")
56-
Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae" and "rankability". The default is often a choice that fits well with respect to the ***family*** chosen. However, if you want to use ***family*** or ***tweedie_power*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb
56+
Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini" and "rankability". The default is often a choice that fits well with respect to the ***family*** chosen. However, if you want to use ***family*** or ***tweedie_power*** as tuning parameters then the default is not suitable. "negative_gini" is the negated Gini coefficient of the response values ordered by the predictions (optionally weighted by sample weights); it requires response values that are non-negative and do not sum to zero. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is negated (because it is used as a validation error) and can be weighted by sample weights.
5757

5858
## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[], monotonic_constraints:List[int]=[])
5959

cpp/APLRRegressor.h

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,9 @@ class APLRRegressor
9393
VectorXd calculate_linear_predictor(const MatrixXd &X);
9494
void update_linear_predictor_and_predictions();
9595
void throw_error_if_response_contains_invalid_values(const VectorXd &y);
96+
void throw_error_if_sample_weight_contains_invalid_values(const VectorXd &y, const VectorXd &sample_weight);
9697
void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y,const std::string &error_message);
97-
void throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message);
98+
void throw_error_if_vector_contains_negative_values(const VectorXd &y, const std::string &error_message);
9899
void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message);
99100
void throw_error_if_tweedie_power_is_invalid();
100101
VectorXd differentiate_predictions();
@@ -271,7 +272,6 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X,const VectorXd &y,co
271272
{
272273
if(X.rows()!=y.size()) throw std::runtime_error("X and y must have the same number of rows.");
273274
if(X.rows()<2) throw std::runtime_error("X and y cannot have less than two rows.");
274-
if(sample_weight.size()>0 && sample_weight.size()!=y.size()) throw std::runtime_error("sample_weight must have 0 or as many rows as X and y.");
275275
if(X_names.size()>0 && X_names.size()!=static_cast<size_t>(X.cols())) throw std::runtime_error("X_names must have as many columns as X.");
276276
throw_error_if_matrix_has_nan_or_infinite_elements(X, "X");
277277
throw_error_if_matrix_has_nan_or_infinite_elements(y, "y");
@@ -280,6 +280,7 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X,const VectorXd &y,co
280280
throw_error_if_prioritized_predictors_indexes_has_invalid_indexes(X, prioritized_predictors_indexes);
281281
throw_error_if_monotonic_constraints_has_invalid_indexes(X, monotonic_constraints);
282282
throw_error_if_response_contains_invalid_values(y);
283+
throw_error_if_sample_weight_contains_invalid_values(y, sample_weight);
283284
}
284285

285286
void APLRRegressor::throw_error_if_validation_set_indexes_has_invalid_indexes(const VectorXd &y, const std::vector<size_t> &validation_set_indexes)
@@ -332,7 +333,15 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
332333
else if(link_function=="log" || family=="poisson" || (family=="tweedie" && std::isless(tweedie_power,2) && std::isgreater(tweedie_power,1)))
333334
{
334335
std::string error_message{"Response values for the log link function or poisson family or tweedie family when tweedie_power<2 cannot be less than zero."};
335-
throw_error_if_response_is_negative(y,error_message);
336+
throw_error_if_vector_contains_negative_values(y,error_message);
337+
}
338+
else if(validation_tuning_metric=="negative_gini")
339+
{
340+
std::string error_message{"Response values cannot be negative when using the negative_gini validation_tuning_metric."};
341+
throw_error_if_vector_contains_negative_values(y, error_message);
342+
bool sum_is_zero{y.sum()==0};
343+
if(sum_is_zero)
344+
throw std::runtime_error("Response values cannot sum to zero when using the negative_gini validation_tuning_metric.");
336345
}
337346
}
338347

@@ -344,10 +353,10 @@ void APLRRegressor::throw_error_if_response_is_not_between_0_and_1(const VectorX
344353
throw std::runtime_error(error_message);
345354
}
346355

347-
void APLRRegressor::throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message)
356+
// Throws std::runtime_error carrying error_message when at least one element
// of y is below zero; does nothing otherwise. This diff hunk renames the
// function from throw_error_if_response_is_negative; the lines below show both
// the removed and the added version of the local flag.
void APLRRegressor::throw_error_if_vector_contains_negative_values(const VectorXd &y, const std::string &error_message)
348357
{
349-
bool response_is_less_than_zero{(y.array()<0.0).any()};
350-
if(response_is_less_than_zero)
358+
bool vector_is_less_than_zero{(y.array()<0.0).any()};
359+
if(vector_is_less_than_zero)
351360
throw std::runtime_error(error_message);
352361
}
353362

@@ -359,6 +368,19 @@ void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const Vecto
359368

360369
}
361370

371+
void APLRRegressor::throw_error_if_sample_weight_contains_invalid_values(const VectorXd &y, const VectorXd &sample_weight)
372+
{
373+
bool sample_weight_are_provided{sample_weight.size()>0};
374+
if(sample_weight_are_provided)
375+
{
376+
if(sample_weight.size()!=y.size()) throw std::runtime_error("sample_weight must have 0 or as many rows as X and y.");
377+
throw_error_if_vector_contains_negative_values(sample_weight,"sample_weight cannot contain negative values.");
378+
bool sum_is_zero{sample_weight.sum()==0};
379+
if(sum_is_zero)
380+
throw std::runtime_error("sample_weight cannot sum to zero.");
381+
}
382+
}
383+
362384
void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight, const std::vector<size_t> &validation_set_indexes)
363385
{
364386
size_t y_size{static_cast<size_t>(y.size())};
@@ -969,11 +991,11 @@ void APLRRegressor::calculate_and_validate_validation_error(size_t boosting_step
969991
else
970992
calculate_validation_error(boosting_step, predictions_current_validation);
971993

972-
bool validation_error_is_invalid{std::isinf(validation_error_steps[boosting_step])};
994+
bool validation_error_is_invalid{!std::isfinite(validation_error_steps[boosting_step])};
973995
if(validation_error_is_invalid)
974996
{
975997
abort_boosting=true;
976-
std::string warning_message{"Warning: Encountered numerical problems when calculating prediction errors in the previous boosting step. Not continuing with further boosting steps. One potential reason is if the combination of family and link_function is invalid."};
998+
std::string warning_message{"Warning: Encountered numerical problems when calculating validation error in the previous boosting step. Not continuing with further boosting steps. One potential reason is if the combination of family and link_function is invalid."};
977999
std::cout<<warning_message<<"\n";
9781000
}
9791001
}
@@ -986,6 +1008,8 @@ void APLRRegressor::calculate_validation_error(size_t boosting_step, const Vecto
9861008
validation_error_steps[boosting_step]=calculate_mean_error(calculate_errors(y_validation,predictions,sample_weight_validation,FAMILY_GAUSSIAN),sample_weight_validation);
9871009
else if(validation_tuning_metric=="mae")
9881010
validation_error_steps[boosting_step]=calculate_mean_error(calculate_absolute_errors(y_validation,predictions,sample_weight_validation),sample_weight_validation);
1011+
else if(validation_tuning_metric=="negative_gini")
1012+
validation_error_steps[boosting_step]=-calculate_gini(y_validation,predictions,sample_weight_validation);
9891013
else if(validation_tuning_metric=="rankability")
9901014
validation_error_steps[boosting_step]=-calculate_rankability(y_validation,predictions,sample_weight_validation,random_state);
9911015
else

cpp/functions.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,4 +358,57 @@ double calculate_rankability(const VectorXd &y_true, const VectorXd &y_pred, con
358358
rankability=0.5;
359359

360360
return rankability;
361+
}
362+
363+
double trapezoidal_integration(const VectorXd &y, const VectorXd &x)
364+
{
365+
bool y_is_large_enough{y.rows()>1};
366+
bool x_and_y_have_the_same_size{x.rows()==y.rows()};
367+
368+
double output{NAN_DOUBLE};
369+
if(y_is_large_enough && x_and_y_have_the_same_size)
370+
{
371+
output=0;
372+
for (size_t i = 1; i < static_cast<size_t>(y.size()); ++i)
373+
{
374+
double delta_y{(y[i]+y[i-1])/2};
375+
double delta_x{x[i]-x[i-1]};
376+
output += delta_y*delta_x;
377+
}
378+
}
379+
380+
return output;
381+
}
382+
383+
// Returns the weights to use in a weighted calculation: a copy of weights when
// it is non-empty, otherwise a vector of ones with one element per element of
// y_true.
VectorXd calculate_weights_if_they_are_not_provided(const VectorXd &y_true, const VectorXd &weights=VectorXd(0))
{
    const bool weights_are_provided{weights.size()>0};
    if(weights_are_provided)
        return weights;

    return VectorXd::Constant(y_true.size(),1.0);
}
393+
394+
double calculate_gini(const VectorXd &y_true, const VectorXd &y_pred, const VectorXd &weights=VectorXd(0))
395+
{
396+
VectorXd weights_used{calculate_weights_if_they_are_not_provided(y_true,weights)};
397+
398+
VectorXi y_pred_sorted_index{sort_indexes_ascending(y_pred)};
399+
400+
Eigen::Index normalized_cumsum_vector_rows{y_true.size()+1};
401+
VectorXd normalized_cumsum_y_true{VectorXd::Constant(normalized_cumsum_vector_rows, 0.0)};
402+
VectorXd normalized_cumsum_weights{VectorXd::Constant(normalized_cumsum_vector_rows, 0.0)};
403+
for (Eigen::Index i = 1; i < normalized_cumsum_vector_rows; ++i)
404+
{
405+
normalized_cumsum_y_true[i] += normalized_cumsum_y_true[i-1] + y_true[y_pred_sorted_index[i-1]];
406+
normalized_cumsum_weights[i] += normalized_cumsum_weights[i-1] + weights_used[y_pred_sorted_index[i-1]];
407+
}
408+
normalized_cumsum_y_true /= y_true.sum();
409+
normalized_cumsum_weights /= weights_used.sum();
410+
411+
double gini{1.0 - 2 * trapezoidal_integration(normalized_cumsum_y_true, normalized_cumsum_weights)};
412+
413+
return gini;
361414
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include <iostream>
2+
#include "term.h"
3+
#include "../dependencies/eigen-master/Eigen/Dense"
4+
#include <vector>
5+
#include <numeric>
6+
#include "APLRRegressor.h"
7+
#include <cmath>
8+
9+
10+
using namespace Eigen;
11+
12+
int main()
13+
{
14+
std::vector<bool> tests;
15+
tests.reserve(1000);
16+
17+
//Model
18+
APLRRegressor model{APLRRegressor()};
19+
model.m=100;
20+
model.v=0.1;
21+
model.bins=300;
22+
model.n_jobs=0;
23+
model.family="gamma";
24+
model.link_function="log";
25+
model.verbosity=3;
26+
model.max_interaction_level=0;
27+
model.max_interactions=1000;
28+
model.min_observations_in_split=20;
29+
model.ineligible_boosting_steps_added=10;
30+
model.max_eligible_terms=5;
31+
model.validation_tuning_metric="negative_gini";
32+
33+
//Data
34+
MatrixXd X_train{load_csv_into_eigen_matrix<MatrixXd>("data/X_train.csv")};
35+
MatrixXd X_test{load_csv_into_eigen_matrix<MatrixXd>("data/X_test.csv")};
36+
VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
37+
VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};
38+
39+
VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
40+
41+
std::cout<<X_train;
42+
43+
//Fitting
44+
//model.fit(X_train,y_train);
45+
model.fit(X_train,y_train,sample_weight);
46+
//model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
47+
std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
48+
49+
VectorXd predictions{model.predict(X_test)};
50+
MatrixXd li{model.calculate_local_feature_importance(X_test)};
51+
52+
//Saving results
53+
save_as_csv_file("data/output.csv",predictions);
54+
55+
std::cout<<predictions.mean()<<"\n\n";
56+
tests.push_back(is_approximately_equal(predictions.mean(),23.6507,0.00001));
57+
58+
//Test summary
59+
std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
60+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include <iostream>
2+
#include "term.h"
3+
#include "../dependencies/eigen-master/Eigen/Dense"
4+
#include <vector>
5+
#include <numeric>
6+
#include "APLRRegressor.h"
7+
#include <cmath>
8+
9+
10+
using namespace Eigen;
11+
12+
int main()
13+
{
14+
std::vector<bool> tests;
15+
tests.reserve(1000);
16+
17+
//Model
18+
APLRRegressor model{APLRRegressor()};
19+
model.m=100;
20+
model.v=0.1;
21+
model.bins=300;
22+
model.n_jobs=0;
23+
model.family="gamma";
24+
model.link_function="log";
25+
model.verbosity=3;
26+
model.max_interaction_level=0;
27+
model.max_interactions=1000;
28+
model.min_observations_in_split=20;
29+
model.ineligible_boosting_steps_added=10;
30+
model.max_eligible_terms=5;
31+
model.validation_tuning_metric="negative_gini";
32+
33+
//Data
34+
MatrixXd X_train{load_csv_into_eigen_matrix<MatrixXd>("data/X_train.csv")};
35+
MatrixXd X_test{load_csv_into_eigen_matrix<MatrixXd>("data/X_test.csv")};
36+
VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
37+
VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};
38+
39+
VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
40+
41+
std::cout<<X_train;
42+
43+
//Fitting
44+
model.fit(X_train,y_train);
45+
//model.fit(X_train,y_train,sample_weight);
46+
//model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
47+
std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
48+
49+
VectorXd predictions{model.predict(X_test)};
50+
MatrixXd li{model.calculate_local_feature_importance(X_test)};
51+
52+
//Saving results
53+
save_as_csv_file("data/output.csv",predictions);
54+
55+
std::cout<<predictions.mean()<<"\n\n";
56+
tests.push_back(is_approximately_equal(predictions.mean(),23.6507,0.00001));
57+
58+
//Test summary
59+
std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
60+
}

cpp/test functions.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,26 @@ int main()
8383
tests.push_back(is_approximately_equal(rankability_bad_dw,0.0));
8484
tests.push_back(is_approximately_equal(rankability_equal_dw,0.5));
8585

86+
VectorXd y_integration(3);
87+
VectorXd x_integration(3);
88+
y_integration<<1,2,3;
89+
x_integration<<4,6,8;
90+
double integration{trapezoidal_integration(y_integration,x_integration)};
91+
tests.push_back(is_approximately_equal(integration,8.0));
92+
93+
VectorXd weights_none{VectorXd(0)};
94+
VectorXd calculated_weights_if_not_provided{calculate_weights_if_they_are_not_provided(y_true)};
95+
VectorXd calculated_weights_if_provided{calculate_weights_if_they_are_not_provided(y_true,weights_different)};
96+
tests.push_back(calculated_weights_if_not_provided==weights_equal);
97+
tests.push_back(calculated_weights_if_provided==weights_different);
98+
99+
VectorXd y_pred(3);
100+
VectorXd weights_gini(3);
101+
y_pred<<1.0,3.0,2.0;
102+
weights_gini<<0.2,0.5,0.3;
103+
double gini{calculate_gini(y_true,y_pred,weights_gini)};
104+
tests.push_back(is_approximately_equal(gini,-0.1166667,0.0000001));
105+
86106
//Test summary
87107
std::cout<<"Test summary\n\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
88108
}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
setuptools.setup(
1717
name='aplr',
18-
version='2.0.1',
18+
version='2.1.0',
1919
description='Automatic Piecewise Linear Regression',
2020
ext_modules=[sfc_module],
2121
author="Mathias von Ottenbreit",

0 commit comments

Comments
 (0)