Commit 1a9f5b4

Merge pull request #13 from ottenbreit-data-science/f
prioritized predictors
2 parents 6a71069 + c160156 commit 1a9f5b4

9 files changed: +224 -172 lines changed

API_REFERENCE.md

Lines changed: 4 additions & 1 deletion
@@ -56,7 +56,7 @@ Specifies the variance power for the "tweedie" ***family***.
 APLR calculates a tuning metric, mean squared error for groups of observations in the validation set. This metric is provided by the method ***get_validation_group_mse()***. The metric may be useful for tuning ***tweedie_power*** and to some extent ***family*** or ***link_function***. The reasoning behind this is that mean squared error (MSE) is often appropriate for evaluating goodness of fit on approximately normally distributed data. The mean of a group of observations is approximately normally distributed according to the Central Limit Theorem (CLT) if there are enough observations in the group, regardless of how individual observations are distributed. Ideally, ***group_size_for_validation_group_mse*** should be large enough so that the Central Limit Theorem holds (at least 30, but the default of 100 is a safer choice). Also, the number of observations in the validation set should be substantially higher than ***group_size_for_validation_group_mse***.
 
 
-## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[])
+## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[])
 
 ***This method fits the model to data.***
 
@@ -77,6 +77,9 @@ An optional list of strings containing names for each predictor in ***X***. Nami
 #### validation_set_indexes
 
 An optional list of integers specifying the indexes of observations to be used for validation instead of training. If this is specified then ***validation_ratio*** is not used. Specifying ***validation_set_indexes*** may be useful for example when modelling time series data (you can place more recent observations in the validation set).
 
+#### prioritized_predictors_indexes
+
+An optional list of integers specifying the indexes of predictors (columns) in ***X*** that should be prioritized. Terms of the prioritized predictors will enter the model as long as they reduce the training error and do not contain too few effective observations. They will also be updated more often.
 
 ## Method: predict(X:npt.ArrayLike, cap_predictions_to_minmax_in_training:bool=True)
 
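A hedged usage sketch of the extended ***fit*** signature follows. The toy data, predictor names, and index choices are illustrative only; just the keyword names come from the signature documented above.

```python
import random

# Toy data: 200 observations, 5 predictors (made up for illustration;
# a real X would typically be a numpy array or pandas DataFrame).
random.seed(0)
X = [[random.gauss(0.0, 1.0) for _ in range(5)] for _ in range(200)]
y = [row[0] * 2.0 + random.gauss(0.0, 1.0) for row in X]

X_names = ["x1", "x2", "x3", "x4", "x5"]

# For time series data, the most recent observations can be placed in the
# validation set instead of relying on a random split via validation_ratio.
validation_set_indexes = list(range(150, 200))

# Prioritize the first and third predictors (column indexes into X).
prioritized_predictors_indexes = [0, 2]

# With the aplr package installed, the updated fit call would look like:
# from aplr import APLRRegressor
# model = APLRRegressor()
# model.fit(X, y, X_names=X_names,
#           validation_set_indexes=validation_set_indexes,
#           prioritized_predictors_indexes=prioritized_predictors_indexes)
```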

aplr/aplr.py

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@ def __set_params_cpp(self):
         self.APLRRegressor.tweedie_power=self.tweedie_power
         self.APLRRegressor.group_size_for_validation_group_mse=self.group_size_for_validation_group_mse
 
-    def fit(self, X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[]):
+    def fit(self, X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[], prioritized_predictors_indexes:List[int]=[]):
         self.__set_params_cpp()
-        self.APLRRegressor.fit(X,y,sample_weight,X_names,validation_set_indexes)
+        self.APLRRegressor.fit(X,y,sample_weight,X_names,validation_set_indexes,prioritized_predictors_indexes)
 
     def predict(self, X:npt.ArrayLike, cap_predictions_to_minmax_in_training:bool=True)->npt.ArrayLike:
         return self.APLRRegressor.predict(X, cap_predictions_to_minmax_in_training)

cpp/APLRRegressor.h

Lines changed: 187 additions & 151 deletions
Large diffs are not rendered by default.

cpp/pythonbinding.cpp

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ PYBIND11_MODULE(aplr_cpp, m) {
         py::arg("tweedie_power")=1.5,
         py::arg("group_size_for_validation_group_mse")=100)
     .def("fit", &APLRRegressor::fit,py::arg("X"),py::arg("y"),py::arg("sample_weight")=VectorXd(0),py::arg("X_names")=std::vector<std::string>(),
-        py::arg("validation_set_indexes")=std::vector<size_t>(),py::call_guard<py::scoped_ostream_redirect,py::scoped_estream_redirect>())
+        py::arg("validation_set_indexes")=std::vector<size_t>(),py::arg("prioritized_predictors_indexes")=std::vector<size_t>(),
+        py::call_guard<py::scoped_ostream_redirect,py::scoped_estream_redirect>())
     .def("predict", &APLRRegressor::predict,py::arg("X"),py::arg("bool cap_predictions_to_minmax_in_training")=true)
     .def("set_term_names", &APLRRegressor::set_term_names,py::arg("X_names"))
     .def("calculate_local_feature_importance",&APLRRegressor::calculate_local_feature_importance,py::arg("X"))

cpp/term.h

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -685,16 +685,10 @@ size_t Term::get_interaction_level(size_t previous_int_level)
 }
 
 
-//Distribution of terms to multiple cores
-std::vector<std::vector<size_t>> distribute_terms_to_cores(std::vector<Term> &terms,size_t n_jobs)
+std::vector<std::vector<size_t>> distribute_terms_indexes_to_cores(std::vector<size_t> &term_indexes,size_t n_jobs)
 {
     //Determining number of terms actually eligible
-    size_t num_eligible_terms{0};
-    for (size_t i = 0; i < terms.size(); ++i)
-    {
-        if(terms[i].ineligible_boosting_steps==0)
-            ++num_eligible_terms;
-    }
+    size_t num_eligible_terms{term_indexes.size()};
 
     //Determining how many items to evaluate per core
     size_t available_cores{static_cast<size_t>(std::thread::hardware_concurrency())};
@@ -713,13 +707,10 @@ std::vector<std::vector<size_t>> distribute_terms_to_cores(std::vector<Term> &te
     //Distributing
     size_t core{0};
     size_t count{0};
-    for (size_t i = 0; i < terms.size(); ++i) //for each term
+    for (size_t i = 0; i < term_indexes.size(); ++i) //for each term
     {
-        if(terms[i].ineligible_boosting_steps==0) //if can be distributed to cores
-        {
-            output[core].push_back(i);
-            ++count;
-        }
+        output[core].push_back(i);
+        ++count;
         if(count>=units_per_core)
         {
             if(core<available_cores-1)
@@ -737,4 +728,18 @@ std::vector<std::vector<size_t>> distribute_terms_to_cores(std::vector<Term> &te
     }
 
     return output;
+}
+
+std::vector<size_t> create_term_indexes(std::vector<Term> &terms)
+{
+    std::vector<size_t> term_indexes;
+    term_indexes.reserve(terms.size());
+    for (size_t i = 0; i < terms.size(); ++i)
+    {
+        bool term_is_eligible{terms[i].ineligible_boosting_steps==0};
+        if(term_is_eligible)
+            term_indexes.push_back(i);
+    }
+    term_indexes.shrink_to_fit();
+    return term_indexes;
 }
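In outline, this refactoring separates eligibility filtering (the new ***create_term_indexes***) from the round-robin distribution over cores (***distribute_terms_indexes_to_cores***). A simplified Python sketch of the new flow follows; details the hunks truncate, such as how ***available_cores*** is derived and the reset of ***count***, are assumptions.

```python
import math

def create_term_indexes(ineligible_boosting_steps):
    # Collect indexes of terms that are currently eligible for boosting
    # (a term is eligible when its ineligible_boosting_steps counter is 0),
    # mirroring the new create_term_indexes() helper.
    return [i for i, steps in enumerate(ineligible_boosting_steps) if steps == 0]

def distribute_term_indexes_to_cores(term_indexes, available_cores):
    # Chunk positions within term_indexes evenly over cores. Taking
    # available_cores as a parameter is a simplification; the C++ version
    # derives it from hardware_concurrency() and n_jobs.
    units_per_core = max(1, math.ceil(len(term_indexes) / available_cores))
    output = [[] for _ in range(available_cores)]
    core = 0
    count = 0
    for i in range(len(term_indexes)):
        output[core].append(i)  # positions into term_indexes, as in the C++
        count += 1
        if count >= units_per_core:
            if core < available_cores - 1:
                core += 1
            count = 0  # assumed reset; the hunk cuts off here
    return output

# Terms 0, 2 and 3 are eligible; their positions are split over two cores.
chunks = distribute_term_indexes_to_cores(create_term_indexes([0, 3, 0, 0, 1]), 2)
# chunks == [[0, 1], [2]]
```

The payoff of the split is that the ineligibility check runs once up front instead of inside every distribution pass.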

cpp/test APLRRegressor.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ int main()
     //Fitting
     //model.fit(X_train,y_train);
     //model.fit(X_train,y_train,sample_weight);
-    model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
+    //model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)});
+    model.fit(X_train,y_train,sample_weight,{},{0,1,2,3,4,5,10,static_cast<size_t>(y_train.size()-1)},{1,8});
     std::cout<<"feature importance\n"<<model.feature_importance<<"\n\n";
 
     VectorXd predictions{model.predict(X_test)};
@@ -51,7 +52,7 @@ int main()
     save_data("data/output.csv",predictions);
 
     std::cout<<predictions.mean()<<"\n\n";
-    tests.push_back(is_approximately_equal(predictions.mean(),23.7858,0.00001));
+    tests.push_back(is_approximately_equal(predictions.mean(),23.6889,0.00001));
 
     //std::cout<<model.validation_error_steps<<"\n\n";
 

examples/train_aplr_cross_validation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
 #This means that if you have missing values in the data then you need to either drop rows with missing data or impute them.
 #This also means that if you have a categorical text variable then you need to convert it to for example dummy variables for each category.
 
+#Please also note that APLR may be vulnerable to outliers in predictor values. If you experience this problem then please consider winsorising
+#the predictors (or similar methods) before passing them to APLR.
+
 #Randomly splitting data into training and test sets
 data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
 del data
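The winsorising advice added above can be sketched in plain Python. The helper name and the percentile cut-offs are illustrative choices, not part of APLR; in practice a library routine such as `scipy.stats.mstats.winsorize` would typically be used.

```python
def winsorise_column(values, lower_pct=1.0, upper_pct=99.0):
    # Clip one predictor column to its empirical percentile bounds, using
    # linear interpolation between closest ranks. Values outside the bounds
    # are pulled in rather than dropped, so no observations are lost.
    ordered = sorted(values)
    def percentile(p):
        k = (len(ordered) - 1) * p / 100.0
        f = int(k)
        c = min(f + 1, len(ordered) - 1)
        return ordered[f] + (ordered[c] - ordered[f]) * (k - f)
    lo, hi = percentile(lower_pct), percentile(upper_pct)
    return [min(max(v, lo), hi) for v in values]

# 100.0 is an outlier; clipping at the 75th percentile pulls it in.
clipped = winsorise_column([1.0, 2.0, 3.0, 100.0], lower_pct=0.0, upper_pct=75.0)
# clipped == [1.0, 2.0, 3.0, 27.25]
```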

examples/train_aplr_validation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
 #This means that if you have missing values in the data then you need to either drop rows with missing data or impute them.
 #This also means that if you have a categorical text variable then you need to convert it to for example dummy variables for each category.
 
+#Please also note that APLR may be vulnerable to outliers in predictor values. If you experience this problem then please consider winsorising
+#the predictors (or similar methods) before passing them to APLR.
+
 #Randomly splitting data into training and test sets
 data_train, data_test = train_test_split(data, test_size=0.3, random_state=random_state)
 del data

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
 
 setuptools.setup(
     name='aplr',
-    version='1.9.0',
+    version='1.10.0',
     description='Automatic Piecewise Linear Regression',
     ext_modules=[sfc_module],
     author="Mathias von Ottenbreit",

0 commit comments