gini

mathias-von-ottenbreit · mathias-von-ottenbreit · commit 1efeea340fea · 2023-03-28T17:21:19.000+02:00
diff --git a/API_REFERENCE.md b/API_REFERENCE.md
@@ -53,7 +53,7 @@ Limits 1) the number of terms already in the model that can be considered as int
 Specifies the variance power for the "tweedie" ***family***.
 
 #### validation_tuning_metric (default = "default")
-Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae" and "rankability". The default is often a choice that fits well with respect to the ***family*** chosen. However, if you want to use ***family*** or ***tweedie_power*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb
+Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini" and "rankability". The default is often a choice that fits well with respect to the ***family*** chosen. However, if you want to use ***family*** or ***tweedie_power*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is inverted and can be weighted by sample weights. "negative_gini" is the negated Gini coefficient of the response when observations are ordered by their predictions, optionally weighted by sample weights; it requires response values that are non-negative and do not sum to zero.
 
 ## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[], monotonic_constraints:List[int]=[])
 
diff --git a/cpp/APLRRegressor.h b/cpp/APLRRegressor.h
@@ -93,8 +93,9 @@ class APLRRegressor
     VectorXd calculate_linear_predictor(const MatrixXd &X);
     void update_linear_predictor_and_predictions();
     void throw_error_if_response_contains_invalid_values(const VectorXd &y);
+    void throw_error_if_sample_weight_contains_invalid_values(const VectorXd &y, const VectorXd &sample_weight);
     void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y,const std::string &error_message);
-    void throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message);
+    void throw_error_if_vector_contains_negative_values(const VectorXd &y, const std::string &error_message);
     void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y, const std::string &error_message);
     void throw_error_if_tweedie_power_is_invalid();
     VectorXd differentiate_predictions();
@@ -271,7 +272,6 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X,const VectorXd &y,co
 {
     if(X.rows()!=y.size()) throw std::runtime_error("X and y must have the same number of rows.");
     if(X.rows()<2) throw std::runtime_error("X and y cannot have less than two rows.");
-    if(sample_weight.size()>0 && sample_weight.size()!=y.size()) throw std::runtime_error("sample_weight must have 0 or as many rows as X and y.");
     if(X_names.size()>0 && X_names.size()!=static_cast<size_t>(X.cols())) throw std::runtime_error("X_names must have as many columns as X.");
     throw_error_if_matrix_has_nan_or_infinite_elements(X, "X");
     throw_error_if_matrix_has_nan_or_infinite_elements(y, "y");
@@ -280,6 +280,7 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X,const VectorXd &y,co
     throw_error_if_prioritized_predictors_indexes_has_invalid_indexes(X, prioritized_predictors_indexes);
     throw_error_if_monotonic_constraints_has_invalid_indexes(X, monotonic_constraints);
     throw_error_if_response_contains_invalid_values(y);
+    throw_error_if_sample_weight_contains_invalid_values(y, sample_weight);
 }
 
 void APLRRegressor::throw_error_if_validation_set_indexes_has_invalid_indexes(const VectorXd &y, const std::vector<size_t> &validation_set_indexes)
@@ -332,7 +333,15 @@ void APLRRegressor::throw_error_if_response_contains_invalid_values(const Vector
     else if(link_function=="log" || family=="poisson" || (family=="tweedie" && std::isless(tweedie_power,2) && std::isgreater(tweedie_power,1)))
     {
         std::string error_message{"Response values for the log link function or poisson family or tweedie family when tweedie_power<2 cannot be less than zero."};
-        throw_error_if_response_is_negative(y,error_message);
+        throw_error_if_vector_contains_negative_values(y,error_message);
+    }
+    else if(validation_tuning_metric=="negative_gini")
+    {
+        std::string error_message{"Response values cannot be negative when using the negative_gini validation_tuning_metric."};
+        throw_error_if_vector_contains_negative_values(y, error_message);
+        bool sum_is_zero{y.sum()==0};
+        if(sum_is_zero)
+            throw std::runtime_error("Response values cannot sum to zero when using the negative_gini validation_tuning_metric.");
     }
 }
 
@@ -344,10 +353,10 @@ void APLRRegressor::throw_error_if_response_is_not_between_0_and_1(const VectorX
         throw std::runtime_error(error_message);   
 }
 
-void APLRRegressor::throw_error_if_response_is_negative(const VectorXd &y, const std::string &error_message)
+void APLRRegressor::throw_error_if_vector_contains_negative_values(const VectorXd &y, const std::string &error_message)
 {
-    bool response_is_less_than_zero{(y.array()<0.0).any()};
-    if(response_is_less_than_zero)
+    bool vector_is_less_than_zero{(y.array()<0.0).any()};
+    if(vector_is_less_than_zero)
         throw std::runtime_error(error_message);   
 }
 
@@ -359,6 +368,19 @@ void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const Vecto
 
 }
 
+// Throws std::runtime_error if a non-empty sample_weight differs in size from y, contains negative values or sums to zero.
+void APLRRegressor::throw_error_if_sample_weight_contains_invalid_values(const VectorXd &y, const VectorXd &sample_weight)
+{
+    if(sample_weight.size()==0)
+        return;
+
+    if(sample_weight.size()!=y.size())
+        throw std::runtime_error("sample_weight must have 0 or as many rows as X and y.");
+    throw_error_if_vector_contains_negative_values(sample_weight,"sample_weight cannot contain negative values.");
+    if(sample_weight.sum()==0)
+        throw std::runtime_error("sample_weight cannot sum to zero.");
+}
+
 void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight, const std::vector<size_t> &validation_set_indexes)
 {
     size_t y_size{static_cast<size_t>(y.size())};
@@ -969,11 +991,11 @@ void APLRRegressor::calculate_and_validate_validation_error(size_t boosting_step
     else
         calculate_validation_error(boosting_step, predictions_current_validation);
     
-    bool validation_error_is_invalid{std::isinf(validation_error_steps[boosting_step])};
+    bool validation_error_is_invalid{!std::isfinite(validation_error_steps[boosting_step])};
     if(validation_error_is_invalid)
     {
         abort_boosting=true;
-        std::string warning_message{"Warning: Encountered numerical problems when calculating prediction errors in the previous boosting step. Not continuing with further boosting steps. One potential reason is if the combination of family and link_function is invalid."};
+        std::string warning_message{"Warning: Encountered numerical problems when calculating validation error in the previous boosting step. Not continuing with further boosting steps. One potential reason is if the combination of family and link_function is invalid."};
         std::cout<<warning_message<<"\n";
     }
 }
@@ -986,6 +1008,8 @@ void APLRRegressor::calculate_validation_error(size_t boosting_step, const Vecto
         validation_error_steps[boosting_step]=calculate_mean_error(calculate_errors(y_validation,predictions,sample_weight_validation,FAMILY_GAUSSIAN),sample_weight_validation);
     else if(validation_tuning_metric=="mae")
         validation_error_steps[boosting_step]=calculate_mean_error(calculate_absolute_errors(y_validation,predictions,sample_weight_validation),sample_weight_validation);
+    else if(validation_tuning_metric=="negative_gini")
+        validation_error_steps[boosting_step]=-calculate_gini(y_validation,predictions,sample_weight_validation);
     else if(validation_tuning_metric=="rankability")
         validation_error_steps[boosting_step]=-calculate_rankability(y_validation,predictions,sample_weight_validation,random_state);
     else
diff --git a/cpp/functions.h b/cpp/functions.h
@@ -358,4 +358,57 @@ double calculate_rankability(const VectorXd &y_true, const VectorXd &y_pred, con
         rankability=0.5;
 
     return rankability;
+}
+
+// Approximates the integral of y over x with the trapezoidal rule.
+// Returns NAN_DOUBLE unless x and y have the same number of rows and at least two rows.
+double trapezoidal_integration(const VectorXd &y, const VectorXd &x)
+{
+    const Eigen::Index n_points{y.rows()};
+    const bool input_is_unusable{n_points<2 || x.rows()!=n_points};
+    if(input_is_unusable)
+        return NAN_DOUBLE;
+
+    double area{0};
+    for (Eigen::Index i = 1; i < n_points; ++i)
+    {
+        const double mean_height{(y[i]+y[i-1])/2};
+        const double width{x[i]-x[i-1]};
+        area += mean_height*width;
+    }
+
+    return area;
+}
+
+// Returns the supplied weights unchanged, or a vector of ones with one entry
+// per element of y_true when weights is empty.
+VectorXd calculate_weights_if_they_are_not_provided(const VectorXd &y_true, const VectorXd &weights=VectorXd(0))
+{
+    const bool weights_are_provided{weights.size()!=0};
+    if(weights_are_provided)
+        return weights;
+
+    return VectorXd::Constant(y_true.size(),1.0);
+}
+
+// Gini of y_true ordered by ascending y_pred: 1 - 2 * area under normalized cumulative y_true versus normalized cumulative weight (unit weights when weights is empty).
+// Returns NAN_DOUBLE when y_pred or the weights differ in size from y_true, because the indexed reads below would otherwise go out of range.
+double calculate_gini(const VectorXd &y_true, const VectorXd &y_pred, const VectorXd &weights=VectorXd(0))
+{
+    VectorXd weights_used{calculate_weights_if_they_are_not_provided(y_true,weights)};
+    if(y_pred.size()!=y_true.size() || weights_used.size()!=y_true.size())
+        return NAN_DOUBLE;
+
+    VectorXi y_pred_sorted_index{sort_indexes_ascending(y_pred)};
+    Eigen::Index curve_points{y_true.size()+1};
+    VectorXd normalized_cumsum_y_true{VectorXd::Constant(curve_points, 0.0)};
+    VectorXd normalized_cumsum_weights{VectorXd::Constant(curve_points, 0.0)};
+    for (Eigen::Index i = 1; i < curve_points; ++i)
+    {
+        normalized_cumsum_y_true[i] = normalized_cumsum_y_true[i-1] + y_true[y_pred_sorted_index[i-1]];
+        normalized_cumsum_weights[i] = normalized_cumsum_weights[i-1] + weights_used[y_pred_sorted_index[i-1]];
+    }
+    normalized_cumsum_y_true /= y_true.sum();
+    normalized_cumsum_weights /= weights_used.sum();
+    return 1.0 - 2 * trapezoidal_integration(normalized_cumsum_y_true, normalized_cumsum_weights);
 }
diff --git a/cpp/test ALRRegressor gamma gini weighted.cpp b/cpp/test ALRRegressor gamma gini weighted.cpp
@@ -0,0 +1,60 @@
+#include <iostream>
+#include "term.h"
+#include "../dependencies/eigen-master/Eigen/Dense"
+#include <vector>
+#include <numeric>
+#include "APLRRegressor.h"
+#include <cmath>
+
+
+using namespace Eigen;
+
+// Test program: fits a gamma family / log link model tuned on "negative_gini" with explicit unit sample weights,
+// then compares the mean test-set prediction with a hard-coded expected value.
+int main()
+{
+    std::vector<bool> test_results;
+    test_results.reserve(1000);
+
+    //Model configuration
+    APLRRegressor model{APLRRegressor()};
+    model.family="gamma";
+    model.link_function="log";
+    model.validation_tuning_metric="negative_gini";
+    model.m=100;
+    model.v=0.1;
+    model.bins=300;
+    model.n_jobs=0;
+    model.verbosity=3;
+    model.max_interaction_level=0;
+    model.max_interactions=1000;
+    model.min_observations_in_split=20;
+    model.ineligible_boosting_steps_added=10;
+    model.max_eligible_terms=5;
+
+    //Input data
+    MatrixXd X_train{load_csv_into_eigen_matrix<MatrixXd>("data/X_train.csv")};
+    MatrixXd X_test{load_csv_into_eigen_matrix<MatrixXd>("data/X_test.csv")};
+    VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
+    VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};
+    VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
+
+    std::cout<<X_train;
+    //Fit with the explicit sample weights
+    model.fit(X_train,y_train,sample_weight);
+    std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
+
+    //Predict; the local feature importance result is not inspected further
+    VectorXd predictions{model.predict(X_test)};
+    MatrixXd li{model.calculate_local_feature_importance(X_test)};
+
+    //Save results
+    save_as_csv_file("data/output.csv",predictions);
+
+    const double mean_prediction{predictions.mean()};
+    std::cout<<mean_prediction<<"\n\n";
+    test_results.push_back(is_approximately_equal(mean_prediction,23.6507,0.00001));
+
+    //Test summary
+    std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(test_results.begin(),test_results.end(),0)<<" out of "<<test_results.size()<<" tests.";
+}
diff --git a/cpp/test ALRRegressor gamma gini.cpp b/cpp/test ALRRegressor gamma gini.cpp
@@ -0,0 +1,60 @@
+#include <iostream>
+#include "term.h"
+#include "../dependencies/eigen-master/Eigen/Dense"
+#include <vector>
+#include <numeric>
+#include "APLRRegressor.h"
+#include <cmath>
+
+
+using namespace Eigen;
+
+// Test program: fits a gamma family / log link model tuned on "negative_gini" without sample weights,
+// then compares the mean test-set prediction with a hard-coded expected value.
+int main()
+{
+    std::vector<bool> test_results;
+    test_results.reserve(1000);
+
+    //Model configuration
+    APLRRegressor model{APLRRegressor()};
+    model.family="gamma";
+    model.link_function="log";
+    model.validation_tuning_metric="negative_gini";
+    model.m=100;
+    model.v=0.1;
+    model.bins=300;
+    model.n_jobs=0;
+    model.verbosity=3;
+    model.max_interaction_level=0;
+    model.max_interactions=1000;
+    model.min_observations_in_split=20;
+    model.ineligible_boosting_steps_added=10;
+    model.max_eligible_terms=5;
+
+    //Input data (sample_weight is built but not passed to fit in this test)
+    MatrixXd X_train{load_csv_into_eigen_matrix<MatrixXd>("data/X_train.csv")};
+    MatrixXd X_test{load_csv_into_eigen_matrix<MatrixXd>("data/X_test.csv")};
+    VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
+    VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};
+    VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
+
+    std::cout<<X_train;
+    //Fit without sample weights
+    model.fit(X_train,y_train);
+    std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
+
+    //Predict; the local feature importance result is not inspected further
+    VectorXd predictions{model.predict(X_test)};
+    MatrixXd li{model.calculate_local_feature_importance(X_test)};
+
+    //Save results
+    save_as_csv_file("data/output.csv",predictions);
+
+    const double mean_prediction{predictions.mean()};
+    std::cout<<mean_prediction<<"\n\n";
+    test_results.push_back(is_approximately_equal(mean_prediction,23.6507,0.00001));
+
+    //Test summary
+    std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(test_results.begin(),test_results.end(),0)<<" out of "<<test_results.size()<<" tests.";
+}
diff --git a/cpp/test functions.cpp b/cpp/test functions.cpp
@@ -83,6 +83,26 @@ int main()
     tests.push_back(is_approximately_equal(rankability_bad_dw,0.0));
     tests.push_back(is_approximately_equal(rankability_equal_dw,0.5));
 
+    VectorXd y_integration(3);
+    VectorXd x_integration(3);
+    y_integration<<1,2,3;
+    x_integration<<4,6,8;
+    double integration{trapezoidal_integration(y_integration,x_integration)};
+    tests.push_back(is_approximately_equal(integration,8.0));
+
+    VectorXd weights_none{VectorXd(0)};
+    VectorXd calculated_weights_if_not_provided{calculate_weights_if_they_are_not_provided(y_true)};
+    VectorXd calculated_weights_if_provided{calculate_weights_if_they_are_not_provided(y_true,weights_different)};
+    tests.push_back(calculated_weights_if_not_provided==weights_equal);
+    tests.push_back(calculated_weights_if_provided==weights_different);
+
+    VectorXd y_pred(3);
+    VectorXd weights_gini(3);
+    y_pred<<1.0,3.0,2.0;
+    weights_gini<<0.2,0.5,0.3;
+    double gini{calculate_gini(y_true,y_pred,weights_gini)};
+    tests.push_back(is_approximately_equal(gini,-0.1166667,0.0000001));
+
     //Test summary
     std::cout<<"Test summary\n\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
 }
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 
 setuptools.setup(
     name='aplr',
-    version='2.0.1',
+    version='2.1.0',
     description='Automatic Piecewise Linear Regression',
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",