capping of predictions

mathias-von-ottenbreit · mathias-von-ottenbreit · commit 3df7b6de45d0 · 2022-12-02T19:01:50.000+01:00
diff --git a/API_REFERENCE.md b/API_REFERENCE.md
@@ -32,7 +32,7 @@ Specifies the intercept term of the model if you want to predict before doing an
 Specifies the maximum number of bins to discretize the data into when searching for the best split. The default value works well according to empirical results. This hyperparameter is intended for reducing computational costs.
 
 #### max_interaction_level (default = 1)
-Specifies the maximum allowed depth of interaction terms. ***0*** means that interactions are not allowed. This hyperparameter should be tuned. Please note that occasionally a too high value produces a model that performs poorly on an independent test set despite looking good when tuning hyperparameters, typically because of a few outlier predictions. If this happens then capping of predictions should be considered. Alternatively, ***max_interaction_level*** may be decreased until the problem disappears.
+Specifies the maximum allowed depth of interaction terms. ***0*** means that interactions are not allowed. This hyperparameter should be tuned. Please note that occasionally too high a value produces a model that performs poorly on an independent test set despite looking good when tuning hyperparameters, typically because of a few outlier predictions. To alleviate this, the ***predict*** method by default caps predictions to the minimum and maximum predictions observed on the training data (if you need the model to extrapolate then switch off the capping by setting ***cap_predictions_to_minmax_in_training*** to ***False*** when calling ***predict***). Alternatively, ***max_interaction_level*** may be decreased until the problem disappears.
 
 #### max_interactions (default = 100000)
 The maximum number of interactions allowed. A lower value may be used to reduce computational time.
@@ -75,7 +75,7 @@ An optional list of strings containing names for each predictor in ***X***. Nami
 An optional list of integers specifying the indexes of observations to be used for validation instead of training. If this is specified then ***validation_ratio*** is not used. Specifying ***validation_set_indexes*** may be useful for example when modelling time series data (you can place more recent observations in the validation set).
 
 
-## Method: predict(X:npt.ArrayLike)
+## Method: predict(X:npt.ArrayLike, cap_predictions_to_minmax_in_training:bool=True)
 
 ***Returns a numpy vector containing predictions of the data in X. Requires that the model has been fitted with the fit method.***
 
@@ -84,6 +84,9 @@ An optional list of integers specifying the indexes of observations to be used f
 #### X
 A numpy matrix with predictor values.
 
+#### cap_predictions_to_minmax_in_training
+If ***True*** then predictions are capped so that they are not less than the minimum and not greater than the maximum prediction in the training dataset. This is recommended especially if ***max_interaction_level*** is high. However, if you need the model to extrapolate then set this parameter to ***False***.
+
 
 ## Method: set_term_names(X_names:List[str])
 
diff --git a/aplr/aplr.py b/aplr/aplr.py
@@ -50,8 +50,8 @@ def fit(self, X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np
         self.__set_params_cpp()
         self.APLRRegressor.fit(X,y,sample_weight,X_names,validation_set_indexes)
 
-    def predict(self,X:npt.ArrayLike)->npt.ArrayLike:
-        return self.APLRRegressor.predict(X)
+    def predict(self, X:npt.ArrayLike, cap_predictions_to_minmax_in_training:bool=True)->npt.ArrayLike:
+        return self.APLRRegressor.predict(X, cap_predictions_to_minmax_in_training)
 
     def set_term_names(self, X_names:List[str]):
         self.APLRRegressor.set_term_names(X_names)
diff --git a/cpp/APLRRegressor.h b/cpp/APLRRegressor.h
@@ -75,6 +75,7 @@ class APLRRegressor
     void find_optimal_m_and_update_model_accordingly();
     void name_terms(const MatrixXd &X, const std::vector<std::string> &X_names);
     void calculate_feature_importance_on_validation_set();
+    void find_min_and_max_training_predictions();
     void cleanup_after_fit();
     void validate_that_model_can_be_used(const MatrixXd &X);
     void throw_error_if_family_does_not_exist();
@@ -89,6 +90,7 @@ class APLRRegressor
     VectorXd differentiate_predictions();
     void scale_training_observations_if_using_log_link_function();
     void revert_scaling_if_using_log_link_function();
+    void cap_predictions_to_minmax_in_training(VectorXd &predictions);
     
 public:
     //Fields
@@ -118,6 +120,8 @@ class APLRRegressor
     size_t number_of_base_terms; 
     VectorXd feature_importance; //Populated in fit() using validation set. Rows are in the same order as in X.
     double tweedie_power;
+    double min_training_prediction;
+    double max_training_prediction;
 
     //Methods
     APLRRegressor(size_t m=1000,double v=0.1,uint_fast32_t random_state=std::numeric_limits<uint_fast32_t>::lowest(),std::string family="gaussian",
@@ -127,7 +131,7 @@ class APLRRegressor
     APLRRegressor(const APLRRegressor &other);
     ~APLRRegressor();
     void fit(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight=VectorXd(0),const std::vector<std::string> &X_names={},const std::vector<size_t> &validation_set_indexes={});
-    VectorXd predict(const MatrixXd &X);
+    VectorXd predict(const MatrixXd &X, bool cap_predictions_to_minmax_in_training=true);
     void set_term_names(const std::vector<std::string> &X_names);
     MatrixXd calculate_local_feature_importance(const MatrixXd &X);
     MatrixXd calculate_local_feature_importance_for_terms(const MatrixXd &X);
@@ -151,7 +155,8 @@ APLRRegressor::APLRRegressor(size_t m,double v,uint_fast32_t random_state,std::s
         bins{bins},verbosity{verbosity},max_interaction_level{max_interaction_level},
         intercept_steps{VectorXd(0)},max_interactions{max_interactions},interactions_eligible{0},validation_error_steps{VectorXd(0)},
         min_observations_in_split{min_observations_in_split},ineligible_boosting_steps_added{ineligible_boosting_steps_added},
-        max_eligible_terms{max_eligible_terms},number_of_base_terms{0},tweedie_power{tweedie_power}
+        max_eligible_terms{max_eligible_terms},number_of_base_terms{0},tweedie_power{tweedie_power},min_training_prediction{NAN_DOUBLE},
+        max_training_prediction{NAN_DOUBLE}
 {
 }
 
@@ -165,7 +170,8 @@ APLRRegressor::APLRRegressor(const APLRRegressor &other):
     max_interactions{other.max_interactions},interactions_eligible{other.interactions_eligible},validation_error_steps{other.validation_error_steps},
     min_observations_in_split{other.min_observations_in_split},ineligible_boosting_steps_added{other.ineligible_boosting_steps_added},
     max_eligible_terms{other.max_eligible_terms},number_of_base_terms{other.number_of_base_terms},
-    feature_importance{other.feature_importance},tweedie_power{other.tweedie_power}
+    feature_importance{other.feature_importance},tweedie_power{other.tweedie_power},min_training_prediction{other.min_training_prediction},
+    max_training_prediction{other.max_training_prediction}
 {
 }
 
@@ -193,6 +199,7 @@ void APLRRegressor::fit(const MatrixXd &X,const VectorXd &y,const VectorXd &samp
     revert_scaling_if_using_log_link_function();
     name_terms(X, X_names);
     calculate_feature_importance_on_validation_set();
+    find_min_and_max_training_predictions();
     cleanup_after_fit();
 }
 
@@ -1019,6 +1026,13 @@ MatrixXd APLRRegressor::calculate_local_feature_importance(const MatrixXd &X)
     return output;
 }
 
+void APLRRegressor::find_min_and_max_training_predictions() //Records the range of the uncapped predictions on X_train.
+{
+    const VectorXd uncapped_training_predictions{predict(X_train,false)}; //false: capping is skipped because the limits are what is being computed here.
+    max_training_prediction=uncapped_training_predictions.maxCoeff();
+    min_training_prediction=uncapped_training_predictions.minCoeff();
+}
+
 void APLRRegressor::validate_that_model_can_be_used(const MatrixXd &X)
 {
     if(std::isnan(intercept) || number_of_base_terms==0) throw std::runtime_error("Model must be trained before predict() can be run.");
@@ -1056,13 +1070,18 @@ void APLRRegressor::cleanup_after_fit()
     }
 }
 
-VectorXd APLRRegressor::predict(const MatrixXd &X)
+VectorXd APLRRegressor::predict(const MatrixXd &X, bool cap_predictions_to_minmax_in_training)
 {
     validate_that_model_can_be_used(X);
 
     VectorXd linear_predictor{calculate_linear_predictor(X)};
     VectorXd predictions{transform_linear_predictor_to_predictions(linear_predictor,link_function,tweedie_power)};
 
+    if(cap_predictions_to_minmax_in_training)
+    {
+        this->cap_predictions_to_minmax_in_training(predictions);
+    }
+
     return predictions;
 }
 
@@ -1077,6 +1096,17 @@ VectorXd APLRRegressor::calculate_linear_predictor(const MatrixXd &X)
     return predictions;    
 }
 
+void APLRRegressor::cap_predictions_to_minmax_in_training(VectorXd &predictions)
+{
+    //std::isgreater and std::isless are false when an operand is NaN, so NaN predictions and NaN limits cause no capping.
+    for (size_t row = 0; row < static_cast<size_t>(predictions.rows()); ++row)
+    {
+        double &prediction{predictions[row]};
+        if(std::isgreater(prediction,max_training_prediction)) prediction=max_training_prediction;
+        else if(std::isless(prediction,min_training_prediction)) prediction=min_training_prediction;
+    }
+}
+
 MatrixXd APLRRegressor::calculate_local_feature_importance_for_terms(const MatrixXd &X)
 {
     validate_that_model_can_be_used(X);
diff --git a/cpp/main.cpp b/cpp/main.cpp
@@ -49,7 +49,7 @@ int main()
     std::cout<<is_approximately_equal(model.validation_error_steps.minCoeff(),7.02559,0.00001)<<"\n";
 
     std::cout<<"mean prediction "<<predictions.mean()<<"\n\n";
-    std::cout<<is_approximately_equal(predictions.mean(),23.9213,0.0001)<<"\n";
+    std::cout<<is_approximately_equal(predictions.mean(),23.9133,0.0001)<<"\n";
 
     std::cout<<"best_m: "<<model.m<<"\n";
 
diff --git a/cpp/pythonbinding.cpp b/cpp/pythonbinding.cpp
@@ -20,7 +20,7 @@ PYBIND11_MODULE(aplr_cpp, m) {
             py::arg("tweedie_power")=1.5)
         .def("fit", &APLRRegressor::fit,py::arg("X"),py::arg("y"),py::arg("sample_weight")=VectorXd(0),py::arg("X_names")=std::vector<std::string>(),
             py::arg("validation_set_indexes")=std::vector<size_t>(),py::call_guard<py::scoped_ostream_redirect,py::scoped_estream_redirect>())
-        .def("predict", &APLRRegressor::predict,py::arg("X"))
+        .def("predict", &APLRRegressor::predict,py::arg("X"),py::arg("cap_predictions_to_minmax_in_training")=true) //The py::arg string is the Python keyword name, so it must be the bare identifier without the C++ type.
         .def("set_term_names", &APLRRegressor::set_term_names,py::arg("X_names"))
         .def("calculate_local_feature_importance",&APLRRegressor::calculate_local_feature_importance,py::arg("X"))
         .def("calculate_local_feature_importance_for_terms",&APLRRegressor::calculate_local_feature_importance_for_terms,py::arg("X"))
@@ -57,16 +57,18 @@ PYBIND11_MODULE(aplr_cpp, m) {
         .def_readwrite("number_of_base_terms",&APLRRegressor::number_of_base_terms)
         .def_readwrite("feature_importance",&APLRRegressor::feature_importance)
         .def_readwrite("tweedie_power",&APLRRegressor::tweedie_power)
+        .def_readwrite("min_training_prediction",&APLRRegressor::min_training_prediction)
+        .def_readwrite("max_training_prediction",&APLRRegressor::max_training_prediction)
         .def(py::pickle(
             [](const APLRRegressor &a) { // __getstate__
                 /* Return a tuple that fully encodes the state of the object */
                 return py::make_tuple(a.m,a.v,a.random_state,a.family,a.n_jobs,a.validation_ratio,a.intercept,a.bins,a.verbosity,
                     a.max_interaction_level,a.max_interactions,a.validation_error_steps,a.term_names,a.term_coefficients,a.terms,a.intercept_steps,
                     a.interactions_eligible,a.min_observations_in_split,a.ineligible_boosting_steps_added,a.max_eligible_terms,
-                    a.number_of_base_terms,a.feature_importance,a.link_function,a.tweedie_power);
+                    a.number_of_base_terms,a.feature_importance,a.link_function,a.tweedie_power,a.min_training_prediction,a.max_training_prediction);
             },
             [](py::tuple t) { // __setstate__
-                if (t.size() != 24)
+                if (t.size() != 26)
                     throw std::runtime_error("Invalid state!");
 
                 /* Create a new C++ instance */
@@ -85,6 +87,8 @@ PYBIND11_MODULE(aplr_cpp, m) {
                 a.max_eligible_terms=t[19].cast<size_t>();
                 a.number_of_base_terms=t[20].cast<size_t>();
                 a.feature_importance=t[21].cast<VectorXd>();
+                a.min_training_prediction=t[24].cast<double>();
+                a.max_training_prediction=t[25].cast<double>();
 
                 return a;
             }
diff --git a/cpp/test ALRRegressor gamma.cpp b/cpp/test ALRRegressor gamma.cpp
@@ -52,7 +52,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),23.6098,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),23.6065,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/cpp/test ALRRegressor inversegaussian.cpp b/cpp/test ALRRegressor inversegaussian.cpp
@@ -53,7 +53,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),23.4164,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),23.4135,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/cpp/test ALRRegressor logit.cpp b/cpp/test ALRRegressor logit.cpp
@@ -52,7 +52,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),0.104267,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),0.103821,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/cpp/test ALRRegressor poisson.cpp b/cpp/test ALRRegressor poisson.cpp
@@ -52,7 +52,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),1.89421,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),1.89378,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/cpp/test ALRRegressor poissongamma.cpp b/cpp/test ALRRegressor poissongamma.cpp
@@ -53,7 +53,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),1.88935,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),1.88887,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/cpp/test ALRRegressor.cpp b/cpp/test ALRRegressor.cpp
@@ -51,7 +51,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),23.4891,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),23.4569,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 
 setuptools.setup(
     name='aplr',
-    version='1.3.2',
+    version='1.5.0',
     description='Automatic Piecewise Linear Regression',
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",