
Commit d611b57

interactions
1 parent 4ca2628 commit d611b57


6 files changed: 57 additions, 46 deletions


API_REFERENCE_FOR_CLASSIFICATION.md

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ Specifies the maximum number of bins to discretize the data into when searching
 Specifies the maximum allowed depth of interaction terms. ***0*** means that interactions are not allowed. This hyperparameter should be tuned.
 
 #### max_interactions (default = 100000)
-The maximum number of interactions allowed. A lower value may be used to reduce computational time.
+The maximum number of interactions allowed in each underlying model. A lower value may be used to reduce computational time.
 
 #### min_observations_in_split (default = 20)
 The minimum effective number of observations that a term in the model must rely on. This hyperparameter should be tuned. Larger values are more appropriate for larger datasets. Larger values result in more robust models (lower variance), potentially at the expense of increased bias.

API_REFERENCE_FOR_REGRESSION.md

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ Specifies the maximum number of bins to discretize the data into when searching
 Specifies the maximum allowed depth of interaction terms. ***0*** means that interactions are not allowed. This hyperparameter should be tuned.
 
 #### max_interactions (default = 100000)
-The maximum number of interactions allowed. A lower value may be used to reduce computational time.
+The maximum number of interactions allowed in each underlying model. A lower value may be used to reduce computational time.
 
 #### min_observations_in_split (default = 20)
 The minimum effective number of observations that a term in the model must rely on. This hyperparameter should be tuned. Larger values are more appropriate for larger datasets. Larger values result in more robust models (lower variance), potentially at the expense of increased bias.
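
Both reference files describe the same settings; the only wording change is that max_interactions is now documented as a cap per underlying model. Purely as an illustration of the two documented hyperparameters, here is a minimal standalone C++ sketch; the AplrSettings struct and validate function are hypothetical stand-ins for this example, not the library's actual API.

#include <cstddef>
#include <iostream>
#include <stdexcept>

// Hypothetical bundle of the two hyperparameters documented above; the names
// and defaults mirror the reference text, the struct itself is illustrative only.
struct AplrSettings
{
    std::size_t max_interactions{100000};      // cap on interactions in each underlying model
    std::size_t min_observations_in_split{20}; // minimum effective observations behind a term
};

// Illustrative sanity check: a lower max_interactions trades accuracy for speed,
// a higher min_observations_in_split trades potential bias for lower variance.
void validate(const AplrSettings &settings)
{
    if (settings.min_observations_in_split == 0)
        throw std::invalid_argument("min_observations_in_split must be positive");
}

int main()
{
    AplrSettings settings;
    settings.max_interactions = 1000;        // reduce computational time
    settings.min_observations_in_split = 50; // more robust terms on a larger dataset
    validate(settings);
    std::cout << settings.max_interactions << " " << settings.min_observations_in_split << "\n";
}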

cpp/APLRRegressor.h

Lines changed: 2 additions & 1 deletion
@@ -1524,7 +1524,8 @@ void APLRRegressor::add_promising_interactions_and_select_the_best_one()
             bool is_best_interaction{j == 0};
             if (is_best_interaction)
                 best_term_index = terms_eligible_current.size() - 1;
-            ++interactions_eligible;
+            if (interactions_to_consider[sorted_indexes_of_errors_for_interactions_to_consider[j]].get_interaction_level() > 0)
+                ++interactions_eligible;
         }
         else
             break;
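
The behavioral change in this hunk: a newly added eligible term now counts against the interaction budget (interactions_eligible, bounded by max_interactions) only when its interaction level is positive, so candidates that resolve to main effects no longer consume the budget. A simplified standalone sketch of that counting rule follows; TermStub and count_eligible_interactions are illustrative names, not the library's types.

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for the library's term type, reduced to what the counting rule needs.
struct TermStub
{
    std::size_t interaction_level; // 0 means a pure main effect
};

// Count how many candidate terms consume the interaction budget.
// After this commit, only terms with interaction_level > 0 are counted,
// so main-effect candidates no longer use up max_interactions.
std::size_t count_eligible_interactions(const std::vector<TermStub> &candidates)
{
    std::size_t interactions_eligible{0};
    for (const TermStub &candidate : candidates)
    {
        if (candidate.interaction_level > 0)
            ++interactions_eligible;
    }
    return interactions_eligible;
}

int main()
{
    std::vector<TermStub> candidates{{0}, {1}, {2}, {0}};
    std::cout << count_eligible_interactions(candidates) << "\n"; // prints 2
}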

cpp/term.h

Lines changed: 11 additions & 1 deletion
@@ -715,7 +715,17 @@ VectorXd Term::calculate_contribution_to_linear_predictor(const MatrixXd &X)
 
 size_t Term::get_interaction_level()
 {
-    return given_terms.size();
+    std::vector<size_t> terms_used;
+    terms_used.reserve(1 + given_terms.size());
+    terms_used.push_back(base_term);
+    for (auto &given_term : given_terms)
+    {
+        terms_used.push_back(given_term.base_term);
+    }
+    std::set<size_t> unique_predictors_used{get_unique_integers(terms_used)};
+    size_t interaction_level{unique_predictors_used.size() - 1};
+
+    return interaction_level;
 }
 
 bool Term::get_can_be_used_as_a_given_term()
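
With this change, get_interaction_level() reports the number of distinct predictors the term touches (its own base_term plus the base_terms of its given terms) minus one, instead of simply given_terms.size(). A term whose given terms reuse its own predictor therefore gets a lower level than before, which is consistent with the expected value for p5il dropping from 3 to 2 in cpp/tests.cpp below. A standalone sketch of the counting rule, using std::set directly in place of the library's get_unique_integers helper:

#include <cstddef>
#include <iostream>
#include <set>
#include <vector>

// Sketch of the new rule: interaction level = number of distinct predictors
// used by the term (base term plus the base terms of its given terms) minus one.
std::size_t interaction_level(std::size_t base_term, const std::vector<std::size_t> &given_base_terms)
{
    std::set<std::size_t> unique_predictors{given_base_terms.begin(), given_base_terms.end()};
    unique_predictors.insert(base_term);
    return unique_predictors.size() - 1;
}

int main()
{
    // A term on predictor 2 with given terms on predictors 2 and 5:
    // the old definition (given_terms.size()) would report 2,
    // the new definition reports 1 because only two distinct predictors are involved.
    std::cout << interaction_level(2, {2, 5}) << "\n"; // prints 1
}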

cpp/tests.cpp

Lines changed: 41 additions & 41 deletions
@@ -167,7 +167,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 18.507179601395375));
+        tests.push_back(is_approximately_equal(predictions.mean(), 18.534016846656947));
     }
 
     void test_aplrregressor_cauchy_penalties()
@@ -221,7 +221,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 20.037964498663037));
+        tests.push_back(is_approximately_equal(predictions.mean(), 20.146282076477394));
     }
 
     void test_aplrregressor_cauchy_linear_effects_only_first()
@@ -275,7 +275,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 17.971623307698852));
+        tests.push_back(is_approximately_equal(predictions.mean(), 17.964887018234787));
     }
 
     void test_aplrregressor_cauchy_group_mse_validation()
@@ -333,7 +333,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 20.170939369337834));
+        tests.push_back(is_approximately_equal(predictions.mean(), 20.096177156192478));
 
         VectorXd feature_importance_on_test_set{model.calculate_feature_importance(X_test)};
         double feature_importance_on_test_set_mean{feature_importance_on_test_set.mean()};
@@ -349,12 +349,12 @@ class Tests
         std::cout << term_importance_first << "\n\n";
         std::cout << term_base_predictor_index_max << "\n\n";
         std::cout << term_interaction_level_max << "\n\n";
-        tests.push_back(is_approximately_equal(feature_importance_on_test_set_mean, 0.3745208543129413));
-        tests.push_back(is_approximately_equal(feature_importance_mean, 0.37558643075803277));
-        tests.push_back(is_approximately_equal(term_importance_mean, 0.12150899891033366));
-        tests.push_back(is_approximately_equal(feature_importance_first, 0.74048121167747938));
-        tests.push_back(is_approximately_equal(term_importance_first, 0.85496610382351568));
-        tests.push_back(term_base_predictor_index_max == 5);
+        tests.push_back(is_approximately_equal(feature_importance_on_test_set_mean, 0.37735253878466402));
+        tests.push_back(is_approximately_equal(feature_importance_mean, 0.37820511700233239));
+        tests.push_back(is_approximately_equal(term_importance_mean, 0.12843198080249971));
+        tests.push_back(is_approximately_equal(feature_importance_first, 0.73797521341670724));
+        tests.push_back(is_approximately_equal(term_importance_first, 1.0431553101537596));
+        tests.push_back(term_base_predictor_index_max == 6);
         tests.push_back(term_interaction_level_max == 1);
     }
 
@@ -412,7 +412,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 20.104856610039558));
+        tests.push_back(is_approximately_equal(predictions.mean(), 19.804431518585918));
     }
 
     void test_aplrregressor_cauchy()
@@ -466,7 +466,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 20.850035037781723));
+        tests.push_back(is_approximately_equal(predictions.mean(), 20.873594934501561));
     }
 
     void test_aplrregressor_custom_loss_and_validation()
@@ -526,7 +526,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.944797684016745));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.91507568241019));
     }
 
     void test_aplrregressor_custom_loss()
@@ -585,7 +585,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.7035, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.703500296203778, 0.00001));
     }
 
     void test_aplrregressor_gamma_custom_link()
@@ -640,7 +640,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.5266, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.526613939603266, 0.00001));
     }
 
     void test_aplrregressor_gamma_custom_validation()
@@ -695,7 +695,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.5512, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.551175298027964, 0.00001));
     }
 
     void test_aplrregressor_gamma_gini_weighted()
@@ -749,7 +749,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.3198, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.319789512734854, 0.00001));
     }
 
     void test_aplrregressor_gamma_gini()
@@ -803,7 +803,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.3198, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.319789512734854, 0.00001));
     }
 
     void test_aplrregressor_gamma()
@@ -857,7 +857,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.5512, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.551175298027964, 0.00001));
     }
 
     void test_aplrregressor_group_mse()
@@ -957,7 +957,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.533140818895273));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.526475166355244));
     }
 
     void test_aplrregressor_int_constr()
@@ -1010,7 +1010,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.606326522845816));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.576830262038001));
     }
 
     void test_aplrregressor_inversegaussian()
@@ -1065,7 +1065,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.3198, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.31977985222057, 0.00001));
     }
 
     void test_aplrregressor_logit()
@@ -1118,7 +1118,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 0.0875969, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 0.087596882912220717, 0.00001));
     }
 
     void test_aplrregressor_mae()
@@ -1171,7 +1171,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.557834093496929));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.563270291507191));
     }
 
     void test_aplrregressor_monotonic()
@@ -1224,7 +1224,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.337125831228487));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.47597042545404));
     }
 
     void test_aplrregressor_monotonic_ignore_interactions()
@@ -1278,7 +1278,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 24.3013, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 24.301339246925711, 0.00001));
     }
 
     void test_aplrregressor_negative_binomial()
@@ -1332,7 +1332,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 1.8694, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 1.8694002118421278, 0.00001));
     }
 
     void test_aplrregressor_poisson()
@@ -1385,7 +1385,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 1.88727, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 1.8872692088161898, 0.00001));
     }
 
     void test_aplrregressor_poissongamma()
@@ -1439,7 +1439,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 1.88553, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 1.8855344167602603, 0.00001));
     }
 
     void test_aplrregressor_quantile()
@@ -1492,7 +1492,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.65630148869738));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.610872525541577));
     }
 
     void test_aplrregressor_weibull()
@@ -1546,7 +1546,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.6406, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.640555263512187, 0.00001));
     }
 
     void test_aplrregressor()
@@ -1603,7 +1603,7 @@ class Tests
         save_as_csv_file("data/output.csv", predictions);
 
         std::cout << predictions.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(predictions.mean(), 23.7035, 0.00001));
+        tests.push_back(is_approximately_equal(predictions.mean(), 23.703500296203778, 0.00001));
 
         std::map<double, double> main_effect_shape = model.get_main_effect_shape(1);
         bool main_effect_shape_has_correct_length{main_effect_shape.size() == 11};
@@ -1688,15 +1688,15 @@ class Tests
 
         std::cout << "cv_error\n"
                   << model.get_cv_error() << "\n\n";
-        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.246477, 0.000001));
+        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.24647671959943313, 0.000001));
 
         std::cout << "predicted_class_prob_mean\n"
                   << predicted_class_probabilities.mean() << "\n\n";
         tests.push_back(is_approximately_equal(predicted_class_probabilities.mean(), 0.2, 0.00001));
 
         std::cout << "local_feature_importance_mean\n"
                   << local_feature_importance.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.15805, 0.00001));
+        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.1580504780375889, 0.00001));
     }
 
     void test_aplrclassifier_multi_class()
@@ -1911,15 +1911,15 @@ class Tests
 
         std::cout << "cv_error\n"
                   << model.get_cv_error() << "\n\n";
-        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.22862513689095387));
+        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.23802511407945728));
 
         std::cout << "predicted_class_prob_mean\n"
                   << predicted_class_probabilities.mean() << "\n\n";
         tests.push_back(is_approximately_equal(predicted_class_probabilities.mean(), 0.5, 0.00001));
 
         std::cout << "local_feature_importance_mean\n"
                   << local_feature_importance.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.14062146736733369));
+        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.13431844066700888));
     }
 
     void test_aplrclassifier_two_class()
@@ -2049,15 +2049,15 @@ class Tests
 
         std::cout << "cv_error\n"
                   << model.get_cv_error() << "\n\n";
-        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.16172925575492014, 0.000001));
+        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.16317052975361318, 0.000001));
 
         std::cout << "predicted_class_prob_mean\n"
                   << predicted_class_probabilities.mean() << "\n\n";
         tests.push_back(is_approximately_equal(predicted_class_probabilities.mean(), 0.5, 0.00001));
 
         std::cout << "local_feature_importance_mean\n"
                   << local_feature_importance.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.12241915926968391, 0.00001));
+        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.12221717377018071, 0.00001));
     }
 
     void test_aplrclassifier_two_class_predictor_specific_penalties_and_learning_rates()
@@ -2119,15 +2119,15 @@ class Tests
 
         std::cout << "cv_error\n"
                   << model.get_cv_error() << "\n\n";
-        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.17021028319567164, 0.000001));
+        tests.push_back(is_approximately_equal(model.get_cv_error(), 0.16984042158451909, 0.000001));
 
         std::cout << "predicted_class_prob_mean\n"
                   << predicted_class_probabilities.mean() << "\n\n";
         tests.push_back(is_approximately_equal(predicted_class_probabilities.mean(), 0.5, 0.00001));
 
         std::cout << "local_feature_importance_mean\n"
                   << local_feature_importance.mean() << "\n\n";
-        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.18697312064762112, 0.00001));
+        tests.push_back(is_approximately_equal(local_feature_importance.mean(), 0.18613865090207235, 0.00001));
     }
 
     void test_aplrclassifier_two_class_max_terms()
@@ -2368,7 +2368,7 @@ class Tests
         size_t p7il{p7.get_interaction_level()};
         size_t p8il{p8.get_interaction_level()};
         tests.push_back(pil == 2 ? true : false);
-        tests.push_back(p5il == 3 ? true : false);
+        tests.push_back(p5il == 2 ? true : false);
         tests.push_back(p7il == 1 ? true : false);
         tests.push_back(p8il == 0 ? true : false);
     }
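
Nearly every change in cpp/tests.cpp is an updated expected value passed to is_approximately_equal, which compares a computed statistic against a reference within a tolerance. The helper's definition is not part of this diff; the sketch below shows a typical implementation, with the default tolerance of 1e-10 being an assumption for illustration only.

#include <cmath>
#include <iostream>

// Minimal sketch of an approximate-equality helper like the one the tests call.
// The real signature and default tolerance are not shown in this diff.
bool is_approximately_equal(double computed, double expected, double tolerance = 1e-10)
{
    return std::fabs(computed - expected) <= tolerance;
}

int main()
{
    std::cout << std::boolalpha
              << is_approximately_equal(23.7035, 23.703500296203778, 0.00001) << "\n" // true
              << is_approximately_equal(23.7035, 23.703500296203778) << "\n";         // false
}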

setup.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 
 setuptools.setup(
     name="aplr",
-    version="10.0.0",
+    version="10.1.0",
     description="Automatic Piecewise Linear Regression",
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",
