10.0.0

mathias-von-ottenbreit · mathias-von-ottenbreit · commit 398e8821d038 · 2024-05-23T07:38:20.000+02:00
diff --git a/API_REFERENCE_FOR_REGRESSION.md b/API_REFERENCE_FOR_REGRESSION.md
@@ -312,9 +312,9 @@ A numpy matrix with predictor values.
 ***Returns the validation_tuning_metric used.*** 
 
 
-## Method: get_coefficient_shape_function(predictor_index:int)
+## Method: get_main_effect_shape(predictor_index:int)
 
-***For the predictor in X specified by predictor_index, get_coefficient_shape_function returns a dictionary with keys equal to predictor values and values equal to coefficient. For each predictor value, the coefficient denotes the change in the linear predictor given that the predictor value increases by one unit (interactions with other predictors are ignored). This function makes it easier to interpret APLR models as one can quickly see how the main effects work across relevant values of the predictor. Predictor values lower than the lowest predictor value in the dictionary have the same coefficient that the lowest predictor value in the dictionary has. Predictor values higher than the highest predictor value in the dictionary have the same coefficient that the highest predictor value in the dictionary has.***
+***For the predictor in X specified by predictor_index, get_main_effect_shape returns a dictionary with keys equal to predictor values and values equal to the corresponding contribution to the linear predictor (interactions with other predictors are ignored). The dictionary is empty if the predictor is not used as a main effect in the model. This method makes it easier to interpret main effects.***
 
 ### Parameters
 
diff --git a/README.md b/README.md
@@ -17,4 +17,4 @@ Please see the two example Python scripts [here](https://github.com/ottenbreit-d
 Please consider sponsoring Ottenbreit Data Science by clicking on the Sponsor button. Sufficient funding will enable maintenance of APLR and further development.
 
 # API reference
-Please see the [api reference for regression](https://github.com/ottenbreit-data-science/aplr/blob/main/API_REFERENCE_FOR_REGRESSION.md) and [api reference for classification](https://github.com/ottenbreit-data-science/aplr/blob/main/API_REFERENCE_FOR_CLASSIFICATION.md).
+Please see the [API reference for regression](https://github.com/ottenbreit-data-science/aplr/blob/main/API_REFERENCE_FOR_REGRESSION.md) and [API reference for classification](https://github.com/ottenbreit-data-science/aplr/blob/main/API_REFERENCE_FOR_CLASSIFICATION.md).
diff --git a/aplr/aplr.py b/aplr/aplr.py
@@ -277,10 +277,8 @@ def get_optimal_m(self) -> int:
     def get_validation_tuning_metric(self) -> str:
         return self.APLRRegressor.get_validation_tuning_metric()
 
-    def get_coefficient_shape_function(
-        self, predictor_index: int
-    ) -> Dict[float, float]:
-        return self.APLRRegressor.get_coefficient_shape_function(predictor_index)
+    def get_main_effect_shape(self, predictor_index: int) -> Dict[float, float]:
+        return self.APLRRegressor.get_main_effect_shape(predictor_index)  # Thin wrapper: returns the result of self.APLRRegressor.get_main_effect_shape unchanged; the API reference describes it as a mapping from predictor value to contribution to the linear predictor.
 
     def get_cv_error(self) -> float:
         return self.APLRRegressor.get_cv_error()
diff --git a/cpp/APLRRegressor.h b/cpp/APLRRegressor.h
@@ -100,6 +100,7 @@ class APLRRegressor
     void preprocess_predictor_learning_rates_and_penalties(const MatrixXd &X, const std::vector<double> &predictor_learning_rates,
                                                            const std::vector<double> &predictor_penalties_for_non_linearity,
                                                            const std::vector<double> &predictor_penalties_for_interactions);
+    void calculate_min_and_max_predictor_values_in_training(const MatrixXd &X);
     std::vector<double> preprocess_predictor_learning_rate_or_penalty(const MatrixXd &X, double general_value,
                                                                       const std::vector<double> &predictor_specific_values);
     void fit_model_for_cv_fold(const MatrixXd &X, const VectorXd &y, const VectorXd &sample_weight,
@@ -233,6 +234,8 @@ class APLRRegressor
     double penalty_for_non_linearity;
     double penalty_for_interactions;
     size_t max_terms;
+    VectorXd min_predictor_values_in_training;
+    VectorXd max_predictor_values_in_training;
 
     APLRRegressor(size_t m = 3000, double v = 0.1, uint_fast32_t random_state = std::numeric_limits<uint_fast32_t>::lowest(), std::string loss_function = "mse",
                   std::string link_function = "identity", size_t n_jobs = 0, size_t cv_folds = 5,
@@ -274,7 +277,7 @@ class APLRRegressor
     double get_intercept();
     size_t get_optimal_m();
     std::string get_validation_tuning_metric();
-    std::map<double, double> get_coefficient_shape_function(size_t predictor_index);
+    std::map<double, double> get_main_effect_shape(size_t predictor_index);
     double get_cv_error();
 
     friend class APLRClassifier;
@@ -336,7 +339,8 @@ APLRRegressor::APLRRegressor(const APLRRegressor &other)
       early_stopping_rounds{other.early_stopping_rounds},
       num_first_steps_with_linear_effects_only{other.num_first_steps_with_linear_effects_only},
       penalty_for_non_linearity{other.penalty_for_non_linearity}, penalty_for_interactions{other.penalty_for_interactions},
-      max_terms{other.max_terms}
+      max_terms{other.max_terms}, min_predictor_values_in_training{other.min_predictor_values_in_training},
+      max_predictor_values_in_training{other.max_predictor_values_in_training}
 {
 }
 
@@ -364,6 +368,7 @@ void APLRRegressor::fit(const MatrixXd &X, const VectorXd &y, const VectorXd &sa
     preprocess_penalties();
     preprocess_predictor_learning_rates_and_penalties(X, predictor_learning_rates, predictor_penalties_for_non_linearity,
                                                       predictor_penalties_for_interactions);
+    calculate_min_and_max_predictor_values_in_training(X);
     cv_fold_models.resize(cv_observations_used.cols());
     for (Eigen::Index i = 0; i < cv_observations_used.cols(); ++i)
     {
@@ -447,6 +452,17 @@ std::vector<double> APLRRegressor::preprocess_predictor_learning_rate_or_penalty
     return output;
 }
 
+void APLRRegressor::calculate_min_and_max_predictor_values_in_training(const MatrixXd &X) // Stores the minimum and maximum of every column of X in min_/max_predictor_values_in_training.
+{
+    min_predictor_values_in_training = VectorXd(X.cols()); // Both vectors get one entry per column of X.
+    max_predictor_values_in_training = VectorXd(X.cols());
+    for (Eigen::Index i = 0; i < X.cols(); ++i)
+    {
+        min_predictor_values_in_training[i] = X.col(i).minCoeff(); // NOTE(review): Eigen's minCoeff()/maxCoeff() require at least one coefficient; presumably X is validated as non-empty before this call. Confirm.
+        max_predictor_values_in_training[i] = X.col(i).maxCoeff();
+    }
+}
+
 void APLRRegressor::fit_model_for_cv_fold(const MatrixXd &X, const VectorXd &y, const VectorXd &sample_weight,
                                           const std::vector<std::string> &X_names, const VectorXi &cv_observations_in_fold,
                                           const std::vector<int> &monotonic_constraints, const VectorXi &group, const MatrixXd &other_data,
@@ -575,8 +591,8 @@ void APLRRegressor::validate_input_to_fit(const MatrixXd &X, const VectorXd &y,
         {
             Eigen::Index rows_with_ones{(cv_observations.col(i).array() == 1).count()};
             Eigen::Index rows_with_minus_ones{(cv_observations.col(i).array() == -1).count()};
-            if (rows_with_ones < min_obserations_in_a_cv_fold || rows_with_minus_ones < min_obserations_in_a_cv_fold)
-                throw std::runtime_error("Each column in cv_observations must contain at least " + std::to_string(min_obserations_in_a_cv_fold) + " observations for each of the values 1 and -1.");
+            if (rows_with_ones < MIN_OBSERATIONS_IN_A_CV_FOLD || rows_with_minus_ones < MIN_OBSERATIONS_IN_A_CV_FOLD)
+                throw std::runtime_error("Each column in cv_observations must contain at least " + std::to_string(MIN_OBSERATIONS_IN_A_CV_FOLD) + " observations for each of the values 1 and -1.");
         }
     }
     bool group_is_of_incorrect_size{(loss_function == "group_mse" || validation_tuning_metric == "group_mse") && group.rows() != y.rows()};
@@ -744,7 +760,7 @@ MatrixXi APLRRegressor::preprocess_cv_observations(const MatrixXi &cv_observatio
         {
             Eigen::Index rows_with_ones{(output.col(i).array() == 1).count()};
             Eigen::Index rows_with_minus_ones{(output.col(i).array() == -1).count()};
-            if (rows_with_ones < min_obserations_in_a_cv_fold || rows_with_minus_ones < min_obserations_in_a_cv_fold)
+            if (rows_with_ones < MIN_OBSERATIONS_IN_A_CV_FOLD || rows_with_minus_ones < MIN_OBSERATIONS_IN_A_CV_FOLD)
                 throw std::runtime_error("Did not generate enough observations in a fold. Please try again with a different random_state and/or change cv_folds.");
         }
     }
@@ -2366,20 +2382,21 @@ std::string APLRRegressor::get_validation_tuning_metric()
     return validation_tuning_metric;
 }
 
-std::map<double, double> APLRRegressor::get_coefficient_shape_function(size_t predictor_index)
+std::map<double, double> APLRRegressor::get_main_effect_shape(size_t predictor_index)
 {
     if (model_has_not_been_trained())
-        throw std::runtime_error("The model must have been trained before using get_coefficient_shape_function().");
+        throw std::runtime_error("The model must have been trained before using get_main_effect_shape().");
 
-    std::map<double, double> coefficient_shape_function;
+    std::map<double, double> main_effect_shape;
 
     std::vector<size_t> relevant_term_indexes{compute_relevant_term_indexes(predictor_index)};
     bool relevant_term_indexes_do_not_exist{relevant_term_indexes.size() == 0};
     if (relevant_term_indexes_do_not_exist)
-        return coefficient_shape_function;
+        return main_effect_shape;
 
     std::vector<double> split_points;
-    split_points.reserve(relevant_term_indexes.size() * 4);
+    size_t max_potential_split_points{relevant_term_indexes.size() * 3 + 2};
+    split_points.reserve(max_potential_split_points);
     for (auto &relevant_term_index : relevant_term_indexes)
     {
         bool split_point_exits{std::isfinite(terms[relevant_term_index].split_point)};
@@ -2396,35 +2413,8 @@ std::map<double, double> APLRRegressor::get_coefficient_shape_function(size_t pr
             }
         }
     }
-    bool no_split_points{split_points.size() == 0};
-    if (no_split_points)
-    {
-        split_points.push_back(0);
-        split_points.push_back(1);
-    }
-    split_points = remove_duplicate_elements_from_vector(split_points);
-    bool one_split_point{split_points.size() == 1};
-    if (one_split_point)
-    {
-        split_points.push_back(split_points[0] - 1);
-        split_points = remove_duplicate_elements_from_vector(split_points);
-    }
-
-    VectorXd split_point_increments{VectorXd(split_points.size() - 1)};
-    for (Eigen::Index i = 0; i < split_point_increments.size(); ++i)
-    {
-        split_point_increments[i] = split_points[i + 1] - split_points[i];
-    }
-    double minimum_split_point_increment{split_point_increments.minCoeff()};
-    double increment_around_split_points{minimum_split_point_increment / DIVISOR_IN_GET_COEFFICIENT_SHAPE_FUNCTION};
-
-    size_t num_split_points{split_points.size()};
-    for (size_t i = 0; i < num_split_points; ++i)
-    {
-        split_points.push_back(split_points[i] - increment_around_split_points);
-        split_points.push_back(split_points[i] + increment_around_split_points);
-    }
-    split_points.push_back(split_points[split_points.size() - 1] + increment_around_split_points);
+    split_points.push_back(min_predictor_values_in_training[predictor_index]);
+    split_points.push_back(max_predictor_values_in_training[predictor_index]);
     split_points = remove_duplicate_elements_from_vector(split_points);
     split_points.shrink_to_fit();
 
@@ -2435,12 +2425,12 @@ std::map<double, double> APLRRegressor::get_coefficient_shape_function(size_t pr
     }
 
     VectorXd contribution_to_linear_predictor{calculate_local_contribution_from_selected_terms(X, {predictor_index})};
-    for (size_t i = 0; i < split_points.size() - 1; ++i)
+    for (size_t i = 0; i < split_points.size(); ++i)
     {
-        coefficient_shape_function[split_points[i]] = (contribution_to_linear_predictor[i + 1] - contribution_to_linear_predictor[i]) / (split_points[i + 1] - split_points[i]);
+        main_effect_shape[split_points[i]] = contribution_to_linear_predictor[i];
     }
 
-    return coefficient_shape_function;
+    return main_effect_shape;
 }
 
 std::vector<size_t> APLRRegressor::compute_relevant_term_indexes(size_t predictor_index)
diff --git a/cpp/constants.h b/cpp/constants.h
@@ -5,5 +5,4 @@ const double NAN_DOUBLE{std::numeric_limits<double>::quiet_NaN()};
 const int MAX_ABS_EXPONENT_TO_APPLY_ON_LINEAR_PREDICTOR_IN_LOGIT_MODEL{std::min(16, std::numeric_limits<double>::max_exponent10)};
 const std::string MSE_LOSS_FUNCTION{"mse"};
 const size_t MIN_CATEGORIES_IN_CLASSIFIER{2};
-const double DIVISOR_IN_GET_COEFFICIENT_SHAPE_FUNCTION{1000.0};
-const Eigen::Index min_obserations_in_a_cv_fold{2};
+const Eigen::Index MIN_OBSERATIONS_IN_A_CV_FOLD{2};
diff --git a/cpp/pythonbinding.cpp b/cpp/pythonbinding.cpp
@@ -70,7 +70,7 @@ PYBIND11_MODULE(aplr_cpp, m)
         .def("get_intercept", &APLRRegressor::get_intercept)
         .def("get_optimal_m", &APLRRegressor::get_optimal_m)
         .def("get_validation_tuning_metric", &APLRRegressor::get_validation_tuning_metric)
-        .def("get_coefficient_shape_function", &APLRRegressor::get_coefficient_shape_function, py::arg("predictor_index"))
+        .def("get_main_effect_shape", &APLRRegressor::get_main_effect_shape, py::arg("predictor_index"))
         .def("get_cv_error", &APLRRegressor::get_cv_error)
         .def_readwrite("intercept", &APLRRegressor::intercept)
         .def_readwrite("m", &APLRRegressor::m)
@@ -118,6 +118,8 @@ PYBIND11_MODULE(aplr_cpp, m)
         .def_readwrite("penalty_for_non_linearity", &APLRRegressor::penalty_for_non_linearity)
         .def_readwrite("penalty_for_interactions", &APLRRegressor::penalty_for_interactions)
         .def_readwrite("max_terms", &APLRRegressor::max_terms)
+        .def_readwrite("min_predictor_values_in_training", &APLRRegressor::min_predictor_values_in_training)
+        .def_readwrite("max_predictor_values_in_training", &APLRRegressor::max_predictor_values_in_training)
         .def(py::pickle(
             [](const APLRRegressor &a) { // __getstate__
                 /* Return a tuple that fully encodes the state of the object */
@@ -130,10 +132,11 @@ PYBIND11_MODULE(aplr_cpp, m)
                                       a.monotonic_constraints_ignore_interactions, a.group_mse_by_prediction_bins,
                                       a.group_mse_cycle_min_obs_in_bin, a.cv_error, a.term_importance, a.term_main_predictor_indexes,
                                       a.term_interaction_levels, a.early_stopping_rounds, a.num_first_steps_with_linear_effects_only,
-                                      a.penalty_for_non_linearity, a.penalty_for_interactions, a.max_terms);
+                                      a.penalty_for_non_linearity, a.penalty_for_interactions, a.max_terms,
+                                      a.min_predictor_values_in_training, a.max_predictor_values_in_training);
             },
             [](py::tuple t) { // __setstate__
-                if (t.size() != 41)
+                if (t.size() != 43)
                     throw std::runtime_error("Invalid state!");
 
                 /* Create a new C++ instance */
@@ -178,6 +181,8 @@ PYBIND11_MODULE(aplr_cpp, m)
                 double penalty_for_non_linearity = t[38].cast<double>();
                 double penalty_for_interactions = t[39].cast<double>();
                 size_t max_terms = t[40].cast<size_t>();
+                VectorXd min_predictor_values_in_training = t[41].cast<VectorXd>();
+                VectorXd max_predictor_values_in_training = t[42].cast<VectorXd>();
 
                 APLRRegressor a(m, v, random_state, loss_function, link_function, n_jobs, cv_folds, 100, bins, verbosity, max_interaction_level,
                                 max_interactions, min_observations_in_split, ineligible_boosting_steps_added, max_eligible_terms, dispersion_parameter,
@@ -206,6 +211,8 @@ PYBIND11_MODULE(aplr_cpp, m)
                 a.penalty_for_non_linearity = penalty_for_non_linearity;
                 a.penalty_for_interactions = penalty_for_interactions;
                 a.max_terms = max_terms;
+                a.min_predictor_values_in_training = min_predictor_values_in_training;
+                a.max_predictor_values_in_training = max_predictor_values_in_training;
 
                 return a;
             }));
diff --git a/cpp/tests.cpp b/cpp/tests.cpp
@@ -1605,13 +1605,13 @@ class Tests
         std::cout << predictions.mean() << "\n\n";
         tests.push_back(is_approximately_equal(predictions.mean(), 23.7035, 0.00001));
 
-        std::map<double, double> coefficient_shape_function = model.get_coefficient_shape_function(1);
-        bool coefficient_shape_function_has_correct_length{coefficient_shape_function.size() == 27};
-        bool coefficient_shape_function_value_test{is_approximately_equal(coefficient_shape_function.begin()->second, 0.04175, 0.00001)};
+        std::map<double, double> main_effect_shape = model.get_main_effect_shape(1);
+        bool main_effect_shape_has_correct_length{main_effect_shape.size() == 11};
+        bool main_effect_shape_value_test{is_approximately_equal(main_effect_shape.begin()->second, -0.44924570143235887)};
         bool li_for_particular_terms_has_correct_size{li_for_particular_terms.rows() == X_train.rows()};
         bool li_for_particular_terms_mean_is_correct{is_approximately_equal(li_for_particular_terms.mean(), 0.30321952178814915)};
-        tests.push_back(coefficient_shape_function_has_correct_length);
-        tests.push_back(coefficient_shape_function_value_test);
+        tests.push_back(main_effect_shape_has_correct_length);
+        tests.push_back(main_effect_shape_value_test);
         tests.push_back(li_for_particular_terms_has_correct_size);
         tests.push_back(li_for_particular_terms_mean_is_correct);
     }
diff --git a/documentation/APLR 10.0.0.pdf b/documentation/APLR 10.0.0.pdf
diff --git a/examples/train_aplr_regression.py b/examples/train_aplr_regression.py
@@ -102,12 +102,12 @@
     }
 )
 
-# Coefficient shape for the third predictor. Will be empty if the third predictor is not used as a main effect in the model.
-coefficient_shape = best_model.get_coefficient_shape_function(predictor_index=2)
-coefficient_shape = pd.DataFrame(
+# Main effect shape for the third predictor. Will be empty if the third predictor is not used as a main effect in the model.
+main_effect_shape = best_model.get_main_effect_shape(predictor_index=2)
+main_effect_shape = pd.DataFrame(
     {
-        "predictor_value": coefficient_shape.keys(),
-        "coefficient": coefficient_shape.values(),
+        "predictor_value": main_effect_shape.keys(),
+        "coefficient": main_effect_shape.values(),
     }
 )
 
diff --git a/setup.py b/setup.py
@@ -25,7 +25,7 @@
 
 setuptools.setup(
     name="aplr",
-    version="9.10.1",
+    version="10.0.0",
     description="Automatic Piecewise Linear Regression",
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",

Original file line number	Diff line number	Diff line change
`@@ -102,12 +102,12 @@`
`102`	`102`	`}`
`103`	`103`	`)`
`104`	`104`
`105`		`-# Coefficient shape for the third predictor. Will be empty if the third predictor is not used as a main effect in the model.`
`106`		`-coefficient_shape = best_model.get_coefficient_shape_function(predictor_index=2)`
`107`		`-coefficient_shape = pd.DataFrame(`
	`105`	`+# Main effect shape for the third predictor. Will be empty if the third predictor is not used as a main effect in the model.`
	`106`	`+main_effect_shape = best_model.get_main_effect_shape(predictor_index=2)`
	`107`	`+main_effect_shape = pd.DataFrame(`
`108`	`108`	`{`
`109`		`- "predictor_value": coefficient_shape.keys(),`
`110`		`- "coefficient": coefficient_shape.values(),`
	`109`	`+ "predictor_value": main_effect_shape.keys(),`
	`110`	`+ "coefficient": main_effect_shape.values(),`
`111`	`111`	`}`
`112`	`112`	`)`
`113`	`113`