Skip to content

Commit 650e2f7

Browse files
Fixed bugs with other model families than gaussian. Added inversegaussian family.
1 parent bd9ab1a commit 650e2f7

10 files changed

+127
-55
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Currently available for Windows and most Linux distributions.
1414
Please see the two example Python scripts in the examples folder. They cover common use cases, but not all of the functionality in this package. For example, fitting with user-specified observation weights is possible but the example scripts do not use this functionality.
1515

1616
# Sponsorship
17-
Please consider sponsoring Ottenbreit Data Science by clicking on the Sponsor button. Sufficient funding will enable maintenance of APLR and further development, such as developing a classifier based on APLR.
17+
Please consider sponsoring Ottenbreit Data Science by clicking on the Sponsor button. Sufficient funding will enable maintenance of APLR and further development.
1818

1919
# API reference
2020
A thorough API reference will be provided in the future.

cpp/APLRRegressor.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ class APLRRegressor
7979
void throw_error_if_family_does_not_exist();
8080
VectorXd calculate_linear_predictor(const MatrixXd &X);
8181
void update_linear_predictor_and_predictors();
82+
void throw_error_if_response_contains_invalid_values(const VectorXd &y);
83+
void throw_error_if_response_is_not_between_0_and_1(const VectorXd &y);
84+
void throw_error_if_response_is_negative(const VectorXd &y);
85+
void throw_error_if_response_is_not_greater_than_zero(const VectorXd &y);
8286

8387
public:
8488
//Fields
@@ -191,6 +195,7 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X,const VectorXd &y,co
191195
throw_error_if_matrix_has_nan_or_infinite_elements(y, "y");
192196
throw_error_if_matrix_has_nan_or_infinite_elements(sample_weight, "sample_weight");
193197
throw_error_if_validation_set_indexes_has_invalid_indexes(y, validation_set_indexes);
198+
throw_error_if_response_contains_invalid_values(y);
194199
}
195200

196201
void APLRRegressor::throw_error_if_validation_set_indexes_has_invalid_indexes(const VectorXd &y, const std::vector<size_t> &validation_set_indexes)
@@ -205,6 +210,39 @@ void APLRRegressor::throw_error_if_validation_set_indexes_has_invalid_indexes(co
205210
}
206211
}
207212

213+
void APLRRegressor::throw_error_if_response_contains_invalid_values(const VectorXd &y)
214+
{
215+
if(family=="logit")
216+
throw_error_if_response_is_not_between_0_and_1(y);
217+
else if(family=="poisson" || family=="poissongamma")
218+
throw_error_if_response_is_negative(y);
219+
else if(family=="gamma" || family=="inversegaussian")
220+
throw_error_if_response_is_not_greater_than_zero(y);
221+
}
222+
223+
void APLRRegressor::throw_error_if_response_is_not_between_0_and_1(const VectorXd &y)
224+
{
225+
bool response_is_less_than_zero{(y.array()<0.0).any()};
226+
bool response_is_greater_than_one{(y.array()>1.0).any()};
227+
if(response_is_less_than_zero || response_is_greater_than_one)
228+
throw std::runtime_error("Response values for "+family+" models cannot be less than zero or greater than one.");
229+
}
230+
231+
void APLRRegressor::throw_error_if_response_is_negative(const VectorXd &y)
232+
{
233+
bool response_is_less_than_zero{(y.array()<0.0).any()};
234+
if(response_is_less_than_zero)
235+
throw std::runtime_error("Response values for "+family+" models cannot be less than zero.");
236+
}
237+
238+
void APLRRegressor::throw_error_if_response_is_not_greater_than_zero(const VectorXd &y)
239+
{
240+
bool response_is_not_greater_than_zero{(y.array()<=0.0).any()};
241+
if(response_is_not_greater_than_zero)
242+
throw std::runtime_error("Response values for "+family+" models must be greater than zero.");
243+
244+
}
245+
208246
void APLRRegressor::define_training_and_validation_sets(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight, const std::vector<size_t> &validation_set_indexes)
209247
{
210248
//Defining train and validation indexes
@@ -349,6 +387,8 @@ VectorXd APLRRegressor::calculate_neg_gradient_current(const VectorXd &y,const V
349387
output=(y.array() - predictions_current.array()) / predictions_current.array() / predictions_current.array();
350388
else if(family=="poissongamma")
351389
output=(y.array() / predictions_current.array().pow(1.5) - predictions_current.array().pow(-0.5));
390+
else if(family=="inversegaussian")
391+
output=y.array() / predictions_current.array().pow(3.0) - predictions_current.array().pow(-2.0);
352392
return output;
353393
}
354394

@@ -648,6 +688,15 @@ void APLRRegressor::select_the_best_term_and_update_errors(size_t boosting_step)
648688
}
649689

650690
validation_error_steps[boosting_step]=calculate_error(calculate_errors(y_validation,predictions_current_validation,sample_weight_validation,family),sample_weight_validation);
691+
bool validation_error_is_invalid{!std::isfinite(validation_error_steps[boosting_step]) || std::isnan(validation_error_steps[boosting_step])};
692+
if(validation_error_is_invalid)
693+
{
694+
abort_boosting=true;
695+
std::string warning_message{"Warning: Encountered numerical problems when calculating prediction errors."};
696+
if(family=="poisson" || family=="poissongamma" ||family=="gamma" || family=="inversegaussian")
697+
warning_message+=" A reason may be too large response values.";
698+
std::cout<<warning_message<<"\n";
699+
}
651700
}
652701

653702
void APLRRegressor::update_linear_predictor_and_predictors()
@@ -1012,6 +1061,8 @@ void APLRRegressor::throw_error_if_family_does_not_exist()
10121061
family_exists=true;
10131062
else if(family=="poissongamma")
10141063
family_exists=true;
1064+
else if(family=="inversegaussian")
1065+
family_exists=true;
10151066
if(!family_exists)
10161067
throw std::runtime_error("Family "+family+" is not available in APLR.");
10171068
}

cpp/constants.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#pragma once
22
#include <limits>
33

4-
const double NAN_DOUBLE{ std::numeric_limits<double>::quiet_NaN() };
5-
const double SMALL_NEGATIVE_VALUE{-0.001};
4+
const double NAN_DOUBLE{ std::numeric_limits<double>::quiet_NaN() };

cpp/functions.h

Lines changed: 9 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ VectorXd calculate_poissongamma_errors(const VectorXd &y,const VectorXd &predict
7171
return errors;
7272
}
7373

74+
VectorXd calculate_inversegaussian_errors(const VectorXd &y,const VectorXd &predicted)
75+
{
76+
VectorXd errors{y.array().pow(-1.0).array() / 2.0 + y.array()*predicted.array().pow(-2.0) / 2.0 + predicted.array().pow(-1.0) / (-1.0)};
77+
return errors;
78+
}
79+
7480
//Computes errors (for each observation) based on error metric for a vector
7581
VectorXd calculate_errors(const VectorXd &y,const VectorXd &predicted,const VectorXd &sample_weight=VectorXd(0),const std::string &family="gaussian")
7682
{
@@ -86,6 +92,8 @@ VectorXd calculate_errors(const VectorXd &y,const VectorXd &predicted,const Vect
8692
errors=calculate_gamma_errors(y,predicted);
8793
else if(family=="poissongamma")
8894
errors=calculate_poissongamma_errors(y,predicted);
95+
else if(family=="inversegaussian")
96+
errors=calculate_inversegaussian_errors(y,predicted);
8997
//Adjusting for sample weights if specified
9098
if(sample_weight.size()>0)
9199
errors=errors.array()*sample_weight.array();
@@ -127,18 +135,6 @@ double calculate_error(const VectorXd &errors,const VectorXd &sample_weight=Vect
127135
return error;
128136
}
129137

130-
VectorXd transform_zero_to_negative(const VectorXd &linear_predictor)
131-
{
132-
VectorXd transformed_linear_predictor{linear_predictor};
133-
for (size_t i = 0; i < static_cast<size_t>(transformed_linear_predictor.rows()); ++i)
134-
{
135-
bool row_is_not_negative{std::isgreaterequal(transformed_linear_predictor[i],0.0)};
136-
if(row_is_not_negative)
137-
transformed_linear_predictor[i]=SMALL_NEGATIVE_VALUE;
138-
}
139-
return transformed_linear_predictor;
140-
}
141-
142138
VectorXd transform_linear_predictor_to_predictions(const VectorXd &linear_predictor, const std::string &family="gaussian")
143139
{
144140
if(family=="gaussian")
@@ -148,45 +144,11 @@ VectorXd transform_linear_predictor_to_predictions(const VectorXd &linear_predic
148144
VectorXd exp_of_linear_predictor{linear_predictor.array().exp()};
149145
return exp_of_linear_predictor.array() / (1.0 + exp_of_linear_predictor.array());
150146
}
151-
else if(family=="poisson")
147+
else if(family=="poisson" || family=="poissongamma" || family=="gamma" || family=="inversegaussian")
152148
return linear_predictor.array().exp();
153-
else if(family=="gamma")
154-
{
155-
VectorXd transformed_linear_predictor{transform_zero_to_negative(linear_predictor)};
156-
return -1/transformed_linear_predictor.array();
157-
}
158-
else if(family=="poissongamma")
159-
{
160-
VectorXd transformed_linear_predictor{transform_zero_to_negative(linear_predictor)};
161-
return transformed_linear_predictor.array().pow(-2).array() * 4.0;
162-
}
163149
return VectorXd(0);
164150
}
165151

166-
double transform_linear_predictor_to_prediction(double linear_predictor, const std::string &family="gaussian")
167-
{
168-
if(family=="gaussian")
169-
return linear_predictor;
170-
else if(family=="logit")
171-
{
172-
double exp_of_linear_predictor{std::exp(linear_predictor)};
173-
return exp_of_linear_predictor / (1.0 + exp_of_linear_predictor);
174-
}
175-
else if(family=="poisson")
176-
return std::exp(linear_predictor);
177-
else if(family=="gamma")
178-
{
179-
double negative_linear_predictor{std::min(linear_predictor,SMALL_NEGATIVE_VALUE)};
180-
return -1/negative_linear_predictor;
181-
}
182-
else if(family=="poissongamma")
183-
{
184-
double negative_linear_predictor{std::min(linear_predictor,SMALL_NEGATIVE_VALUE)};
185-
return 4.0 * std::pow(negative_linear_predictor,-2);
186-
}
187-
return NAN_DOUBLE;
188-
}
189-
190152
//sorts index based on v
191153
VectorXi sort_indexes_ascending(const VectorXd &v)
192154
{

cpp/test ALRRegressor gamma.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ int main()
5151
save_data("data/output.csv",predictions);
5252

5353
std::cout<<predictions.mean()<<"\n\n";
54-
tests.push_back(check_if_approximately_equal(predictions.mean(),23.9757,0.00001));
54+
tests.push_back(check_if_approximately_equal(predictions.mean(),21.0539,0.00001));
5555

5656
//std::cout<<model.validation_error_steps<<"\n\n";
5757

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include <iostream>
2+
#include "term.h"
3+
#include "../dependencies/eigen-master/Eigen/Dense"
4+
#include <vector>
5+
#include <numeric>
6+
#include "APLRRegressor.h"
7+
#include <cmath>
8+
9+
10+
using namespace Eigen;
11+
12+
int main()
13+
{
14+
std::vector<bool> tests;
15+
tests.reserve(1000);
16+
17+
//Model
18+
APLRRegressor model{APLRRegressor()};
19+
model.m=100;
20+
model.v=0.1;
21+
model.bins=300;
22+
model.n_jobs=0;
23+
model.family="inversegaussian";
24+
model.verbosity=3;
25+
model.max_interaction_level=0;
26+
model.max_interactions=1000;
27+
model.min_observations_in_split=20;
28+
model.ineligible_boosting_steps_added=10;
29+
model.max_eligible_terms=5;
30+
31+
//Data
32+
MatrixXd X_train{load_csv<MatrixXd>("data/X_train.csv")};
33+
MatrixXd X_test{load_csv<MatrixXd>("data/X_test.csv")};
34+
VectorXd y_train{load_csv<MatrixXd>("data/y_train.csv")};
35+
VectorXd y_test{load_csv<MatrixXd>("data/y_test.csv")};
36+
37+
VectorXd sample_weight{VectorXd::Constant(y_train.size(),1.0)};
38+
39+
std::cout<<X_train;
40+
41+
//Fitting
42+
//model.fit(X_train,y_train);
43+
model.fit(X_train,y_train,sample_weight);
44+
//model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
45+
std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
46+
47+
VectorXd predictions{model.predict(X_test)};
48+
MatrixXd li{model.calculate_local_feature_importance(X_test)};
49+
50+
//Saving results
51+
save_data("data/output.csv",predictions);
52+
53+
std::cout<<predictions.mean()<<"\n\n";
54+
tests.push_back(check_if_approximately_equal(predictions.mean(),14.8751,0.00001));
55+
56+
//std::cout<<model.validation_error_steps<<"\n\n";
57+
58+
//Test summary
59+
std::cout<<"\n\nTest summary\n"<<"Passed "<<std::accumulate(tests.begin(),tests.end(),0)<<" out of "<<tests.size()<<" tests.";
60+
}

cpp/test ALRRegressor poissongamma.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ int main()
5151
save_data("data/output.csv",predictions);
5252

5353
std::cout<<predictions.mean()<<"\n\n";
54-
tests.push_back(check_if_approximately_equal(predictions.mean(),1.89343,0.00001));
54+
tests.push_back(check_if_approximately_equal(predictions.mean(),1.88045,0.00001));
5555

5656
//std::cout<<model.validation_error_steps<<"\n\n";
5757

examples/train_aplr_cross_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
#Training model
3030
param_grid = {"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]}
31-
family="gaussian" #other available families are logit, poisson, gamma and poissongamma
31+
family="gaussian" #other available families are logit, poisson, poissongamma, gamma and inversegaussian
3232
grid_search_cv = GridSearchCV(APLRRegressor(random_state=random_state,verbosity=1,m=1000,v=0.1,family=family),param_grid,cv=5,n_jobs=4,scoring="neg_mean_squared_error")
3333
grid_search_cv.fit(data_train[predictors].values,data_train[response].values)
3434
best_model:APLRRegressor = grid_search_cv.best_estimator_

examples/train_aplr_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
best_validation_result=np.inf
3333
param_grid=ParameterGrid({"max_interactions":[100000],"max_interaction_level":[0,1,2,3,100],"min_observations_in_split":[1, 20, 50, 100, 200]})
3434
bestmodel=None
35-
family="gaussian" #other available families are logit, poisson, gamma and poissongamma
35+
family="gaussian" #other available families are logit, poisson, poissongamma, gamma and inversegaussian
3636
for params in param_grid:
3737
model = APLRRegressor(random_state=random_state,verbosity=3,m=1000,v=0.1,family=family,**params)
3838
model.fit(data_train[predictors].values,data_train[response].values,X_names=predictors)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
setuptools.setup(
1717
name='aplr',
18-
version='1.1.0',
18+
version='1.1.1',
1919
description='Automatic Piecewise Linear Regression',
2020
ext_modules=[sfc_module],
2121
author="Mathias von Ottenbreit",

0 commit comments

Comments
 (0)