Skip to content

Commit 0758ac5

Browse files
added the possibility to use a custom validation_tuning_metric via a user-supplied validation error function ("custom_function")
1 parent 5168b14 commit 0758ac5

File tree

6 files changed

+118
-13
lines changed

6 files changed

+118
-13
lines changed

API_REFERENCE_FOR_REGRESSION.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# APLRRegressor
22

3-
## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5)
3+
## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5, calculate_custom_validation_error_function:Optional[Callable[[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike, npt.ArrayLike], float]]=None)
44

55
### Constructor parameters
66

@@ -50,11 +50,19 @@ Limits 1) the number of terms already in the model that can be considered as int
5050
Specifies the variance power when ***loss_function*** is "tweedie". Specifies a dispersion parameter when ***loss_function*** is "negative_binomial", "cauchy" or "weibull".
5151

5252
#### validation_tuning_metric (default = "default")
53-
Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini", "rankability" and "group_mse". The default is often a choice that fits well with respect to the ***loss_function*** chosen. However, if you want to use ***loss_function*** or ***dispersion_parameter*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is inverted and can be weighted by sample weights. "group_mse" requires that the "group" argument in the ***fit*** method is provided.
53+
Specifies which metric to use for validating the model and tuning ***m***. Available options are "default" (using the same methodology as when calculating the training error), "mse", "mae", "negative_gini", "rankability", "group_mse" and "custom_function". The default is often a choice that fits well with respect to the ***loss_function*** chosen. However, if you want to use ***loss_function*** or ***dispersion_parameter*** as tuning parameters then the default is not suitable. "rankability" uses a methodology similar to the one described in https://towardsdatascience.com/how-to-calculate-roc-auc-score-for-regression-models-c0be4fdf76bb except that the metric is inverted and can be weighted by sample weights. "group_mse" requires that the "group" argument in the ***fit*** method is provided. For "custom_function" see ***calculate_custom_validation_error_function*** below.
5454

5555
#### quantile (default = 0.5)
5656
Specifies the quantile to use when ***loss_function*** is "quantile".
5757

58+
#### calculate_custom_validation_error_function (default = None)
59+
An optional Python function that calculates the validation error when ***validation_tuning_metric*** is "custom_function". It receives the validation response, predictions, sample weights and groups as arguments. Example:
60+
61+
```
62+
def custom_validation_error_function(y, predictions, sample_weight, group):
63+
squared_errors = (y-predictions)**2
64+
return squared_errors.mean()
65+
```
5866

5967
## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[], monotonic_constraints:List[int]=[], group:npt.ArrayLike = np.empty(0), interaction_constraints:List[int]=[])
6068

aplr/aplr.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
import numpy as np
22
import numpy.typing as npt
3-
from typing import List
3+
from typing import List, Callable, Optional
44
import aplr_cpp
55

66

77
class APLRRegressor():
8-
def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5):
8+
def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, loss_function:str="mse", link_function:str="identity", n_jobs:int=0,
9+
validation_ratio:float=0.2, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000,
10+
min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0,
11+
dispersion_parameter:float=1.5, validation_tuning_metric:str="default", quantile:float=0.5,
12+
calculate_custom_validation_error_function:Optional[Callable[[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike, npt.ArrayLike], float]]=None):
913
self.m=m
1014
self.v=v
1115
self.random_state=random_state
@@ -23,6 +27,7 @@ def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, loss_function:st
2327
self.dispersion_parameter=dispersion_parameter
2428
self.validation_tuning_metric=validation_tuning_metric
2529
self.quantile=quantile
30+
self.calculate_custom_validation_error_function=calculate_custom_validation_error_function
2631

2732
#Creating aplr_cpp and setting parameters
2833
self.APLRRegressor=aplr_cpp.APLRRegressor()
@@ -47,6 +52,7 @@ def __set_params_cpp(self):
4752
self.APLRRegressor.dispersion_parameter=self.dispersion_parameter
4853
self.APLRRegressor.validation_tuning_metric=self.validation_tuning_metric
4954
self.APLRRegressor.quantile=self.quantile
55+
self.APLRRegressor.calculate_custom_validation_error_function=self.calculate_custom_validation_error_function
5056

5157
def fit(self, X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[], monotonic_constraints:List[int]=[], group:npt.ArrayLike = np.empty(0), interaction_constraints:List[int]=[]):
5258
self.__set_params_cpp()
@@ -116,7 +122,8 @@ def get_params(self, deep=True):
116122
"max_eligible_terms":self.max_eligible_terms,
117123
"dispersion_parameter":self.dispersion_parameter,
118124
"validation_tuning_metric":self.validation_tuning_metric,
119-
"quantile":self.quantile
125+
"quantile":self.quantile,
126+
"calculate_custom_validation_error_function":self.calculate_custom_validation_error_function
120127
}
121128

122129
#For sklearn
@@ -128,7 +135,9 @@ def set_params(self, **parameters):
128135

129136

130137
class APLRClassifier():
131-
def __init__(self, m:int=9000, v:float=0.1, random_state:int=0, n_jobs:int=0, validation_ratio:float=0.2, bins:int=300, verbosity:int=0, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5):
138+
def __init__(self, m:int=9000, v:float=0.1, random_state:int=0, n_jobs:int=0, validation_ratio:float=0.2, bins:int=300, verbosity:int=0,
139+
max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10,
140+
max_eligible_terms:int=5):
132141
self.m=m
133142
self.v=v
134143
self.random_state=random_state

cpp/APLRRegressor.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,14 @@ class APLRRegressor
143143
std::vector<size_t> validation_indexes;
144144
std::string validation_tuning_metric;
145145
double quantile;
146+
std::function<double(const VectorXd &y, const VectorXd &predictions, const VectorXd &sample_weight, const VectorXi &group)> calculate_custom_validation_error_function;
146147

147148
APLRRegressor(size_t m=1000,double v=0.1,uint_fast32_t random_state=std::numeric_limits<uint_fast32_t>::lowest(),std::string loss_function="mse",
148149
std::string link_function="identity", size_t n_jobs=0, double validation_ratio=0.2,
149150
size_t reserved_terms_times_num_x=100, size_t bins=300,size_t verbosity=0,size_t max_interaction_level=1,size_t max_interactions=100000,
150151
size_t min_observations_in_split=20, size_t ineligible_boosting_steps_added=10, size_t max_eligible_terms=5,double dispersion_parameter=1.5,
151-
std::string validation_tuning_metric="default", double quantile=0.5);
152+
std::string validation_tuning_metric="default", double quantile=0.5,
153+
const std::function<double(VectorXd,VectorXd,VectorXd,VectorXi)> &calculate_custom_validation_error_function={});
152154
APLRRegressor(const APLRRegressor &other);
153155
~APLRRegressor();
154156
void fit(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight=VectorXd(0),const std::vector<std::string> &X_names={},
@@ -174,15 +176,16 @@ class APLRRegressor
174176
APLRRegressor::APLRRegressor(size_t m,double v,uint_fast32_t random_state,std::string loss_function,std::string link_function,size_t n_jobs,
175177
double validation_ratio,size_t reserved_terms_times_num_x,size_t bins,size_t verbosity,size_t max_interaction_level,
176178
size_t max_interactions,size_t min_observations_in_split,size_t ineligible_boosting_steps_added,size_t max_eligible_terms,double dispersion_parameter,
177-
std::string validation_tuning_metric, double quantile):
179+
std::string validation_tuning_metric, double quantile,
180+
const std::function<double(VectorXd,VectorXd,VectorXd,VectorXi)> &calculate_custom_validation_error_function):
178181
reserved_terms_times_num_x{reserved_terms_times_num_x},intercept{NAN_DOUBLE},m{m},v{v},
179182
loss_function{loss_function},link_function{link_function},validation_ratio{validation_ratio},n_jobs{n_jobs},random_state{random_state},
180183
bins{bins},verbosity{verbosity},max_interaction_level{max_interaction_level},intercept_steps{VectorXd(0)},
181184
max_interactions{max_interactions},interactions_eligible{0},validation_error_steps{VectorXd(0)},
182185
min_observations_in_split{min_observations_in_split},ineligible_boosting_steps_added{ineligible_boosting_steps_added},
183186
max_eligible_terms{max_eligible_terms},number_of_base_terms{0},dispersion_parameter{dispersion_parameter},min_training_prediction_or_response{NAN_DOUBLE},
184187
max_training_prediction_or_response{NAN_DOUBLE}, validation_tuning_metric{validation_tuning_metric},
185-
validation_indexes{std::vector<size_t>(0)}, quantile{quantile}
188+
validation_indexes{std::vector<size_t>(0)}, quantile{quantile}, calculate_custom_validation_error_function{calculate_custom_validation_error_function}
186189
{
187190
}
188191

@@ -197,7 +200,8 @@ APLRRegressor::APLRRegressor(const APLRRegressor &other):
197200
max_eligible_terms{other.max_eligible_terms},number_of_base_terms{other.number_of_base_terms},
198201
feature_importance{other.feature_importance},dispersion_parameter{other.dispersion_parameter},min_training_prediction_or_response{other.min_training_prediction_or_response},
199202
max_training_prediction_or_response{other.max_training_prediction_or_response},validation_tuning_metric{other.validation_tuning_metric},
200-
validation_indexes{other.validation_indexes}, quantile{other.quantile}, m_optimal{other.m_optimal}
203+
validation_indexes{other.validation_indexes}, quantile{other.quantile}, m_optimal{other.m_optimal},
204+
calculate_custom_validation_error_function{other.calculate_custom_validation_error_function}
201205
{
202206
}
203207

@@ -1157,6 +1161,18 @@ void APLRRegressor::calculate_validation_error(size_t boosting_step, const Vecto
11571161
throw std::runtime_error("When validation_tuning_metric is group_mse then the group argument in fit() must be provided.");
11581162
validation_error_steps[boosting_step]=calculate_mean_error(calculate_errors(y_validation,predictions,sample_weight_validation,"group_mse",dispersion_parameter,group_validation,unique_groups_validation,quantile),sample_weight_validation);
11591163
}
1164+
else if(validation_tuning_metric=="custom_function")
1165+
{
1166+
try
1167+
{
1168+
validation_error_steps[boosting_step] = calculate_custom_validation_error_function(y_validation, predictions, sample_weight_validation, group_validation);
1169+
}
1170+
catch(const std::exception& e)
1171+
{
1172+
std::string error_msg{"Error when calculating custom validation error: " + static_cast<std::string>(e.what())};
1173+
throw std::runtime_error(error_msg);
1174+
}
1175+
}
11601176
else
11611177
throw std::runtime_error(validation_tuning_metric + " is an invalid validation_tuning_metric.");
11621178
}

cpp/pythonbinding.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,28 @@
55
#include <pybind11/operators.h>
66
#include <pybind11/eigen.h>
77
#include <pybind11/iostream.h>
8+
#include <pybind11/functional.h>
89
#include "APLRRegressor.h"
910
#include "APLRClassifier.h"
1011

1112

1213
namespace py = pybind11;
1314

15+
std::function<double(VectorXd,VectorXd,VectorXd,VectorXi)> empty_calculate_custom_validation_error_function={};
16+
1417
PYBIND11_MODULE(aplr_cpp, m) {
1518
py::class_<APLRRegressor>(m, "APLRRegressor",py::module_local())
1619
.def(py::init<int&,double&,int&,std::string&,std::string&,int&,double&,int&,int&,int&,int&,int&,int&,int&,int&,double&,std::string&,
17-
double&>(),
20+
double&,std::function<double(const VectorXd &y, const VectorXd &predictions, const VectorXd &sample_weight, const VectorXi &group)>&>(),
1821
py::arg("m")=1000,py::arg("v")=0.1,py::arg("random_state")=0,py::arg("loss_function")="mse",py::arg("link_function")="identity",
1922
py::arg("n_jobs")=0,py::arg("validation_ratio")=0.2,
2023
py::arg("reserved_terms_times_num_x")=100,py::arg("bins")=300,py::arg("verbosity")=0,
2124
py::arg("max_interaction_level")=1,py::arg("max_interactions")=100000,py::arg("min_observations_in_split")=20,
2225
py::arg("ineligible_boosting_steps_added")=10,py::arg("max_eligible_terms")=5,
2326
py::arg("dispersion_parameter")=1.5,
2427
py::arg("validation_tuning_metric")="default",
25-
py::arg("quantile")=0.5
28+
py::arg("quantile")=0.5,
29+
py::arg("calculate_custom_validation_error_function")=empty_calculate_custom_validation_error_function
2630
)
2731
.def("fit", &APLRRegressor::fit,py::arg("X"),py::arg("y"),py::arg("sample_weight")=VectorXd(0),py::arg("X_names")=std::vector<std::string>(),
2832
py::arg("validation_set_indexes")=std::vector<size_t>(),py::arg("prioritized_predictors_indexes")=std::vector<size_t>(),
@@ -73,6 +77,7 @@ PYBIND11_MODULE(aplr_cpp, m) {
7377
.def_readwrite("validation_tuning_metric",&APLRRegressor::validation_tuning_metric)
7478
.def_readwrite("validation_indexes",&APLRRegressor::validation_indexes)
7579
.def_readwrite("quantile",&APLRRegressor::quantile)
80+
.def_readwrite("calculate_custom_validation_error_function",&APLRRegressor::calculate_custom_validation_error_function)
7681
.def(py::pickle(
7782
[](const APLRRegressor &a) { // __getstate__
7883
/* Return a tuple that fully encodes the state of the object */
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#include <cmath>
2+
#include <iostream>
3+
#include <vector>
4+
#include <numeric>
5+
#include "../dependencies/eigen-master/Eigen/Dense"
6+
#include "APLRRegressor.h"
7+
#include "term.h"
8+
9+
10+
using namespace Eigen;
11+
12+
double calculate_custom_validation_error(const VectorXd &y, const VectorXd &predictions, const VectorXd &sample_weight, const VectorXi &group)
13+
{
14+
VectorXd error{(y.array()-predictions.array()).pow(2)};
15+
return error.sum();
16+
}
17+
18+
int main()
19+
{
20+
std::vector<bool> tests;
21+
tests.reserve(1000);
22+
23+
//Model
24+
APLRRegressor model{APLRRegressor()};
25+
model.m=100;
26+
model.v=0.1;
27+
model.bins=300;
28+
model.n_jobs=0;
29+
model.loss_function="gamma";
30+
model.link_function="log";
31+
model.verbosity=3;
32+
model.max_interaction_level=0;
33+
model.max_interactions=1000;
34+
model.min_observations_in_split=20;
35+
model.ineligible_boosting_steps_added=10;
36+
model.max_eligible_terms=5;
37+
model.validation_tuning_metric="custom_function";
38+
model.calculate_custom_validation_error_function=calculate_custom_validation_error;
39+
40+
//Data
41+
MatrixXd X_train{load_csv_into_eigen_matrix<MatrixXd>("data/X_train.csv")};
42+
MatrixXd X_test{load_csv_into_eigen_matrix<MatrixXd>("data/X_test.csv")};
43+
VectorXd y_train{load_csv_into_eigen_matrix<MatrixXd>("data/y_train.csv")};
44+
VectorXd y_test{load_csv_into_eigen_matrix<MatrixXd>("data/y_test.csv")};
45+
46+
VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
47+
48+
std::cout<<X_train;
49+
50+
//Fitting
51+
//model.fit(X_train,y_train);
52+
model.fit(X_train,y_train,sample_weight);
53+
//model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
54+
std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
55+
56+
VectorXd predictions{model.predict(X_test)};
57+
MatrixXd li{model.calculate_local_feature_importance(X_test)};
58+
59+
//Saving results
60+
save_as_csv_file("data/output.csv",predictions);
61+
62+
std::cout<<predictions.mean()<<"\n\n";
63+
tests.push_back(is_approximately_equal(predictions.mean(),23.6503,0.00001));
64+
65+
//Test summary
66+
std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
67+
}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
setuptools.setup(
1717
name='aplr',
18-
version='6.1.0',
18+
version='6.2.0',
1919
description='Automatic Piecewise Linear Regression',
2020
ext_modules=[sfc_module],
2121
author="Mathias von Ottenbreit",

0 commit comments

Comments
 (0)