Commit ff83ad8

added possibility to compare models with different family or tweedie_power parameters

1 parent 7715e27 commit ff83ad8

10 files changed: +183 −25 lines

API_REFERENCE.md

Lines changed: 12 additions & 4 deletions

@@ -1,6 +1,6 @@
 # APLRRegressor
 
-## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5)
+## class aplr.APLRRegressor(m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5, group_size_for_validation_group_mse:int=100)
 
 ### Constructor parameters
 
@@ -14,7 +14,7 @@ The learning rate. Must be greater than zero and not more than one. The higher t
 Used to randomly split training observations into training and validation if ***validation_set_indexes*** is not specified when fitting.
 
 #### family (default = "gaussian")
-Determines the loss function used. Allowed values are "gaussian", "binomial", "poisson", "gamma" and "tweedie". This is used together with ***link_function***. Please note that this is not a tuning parameter because it defines how the loss function is calculated.
+Determines the loss function used. Allowed values are "gaussian", "binomial", "poisson", "gamma" and "tweedie". This is used together with ***link_function***. Please note that this is not an ordinary tuning parameter because it defines how the loss function is calculated. However, it can be tuned with ***get_validation_group_mse()*** as the tuning metric.
 
 #### link_function (default = "identity")
 Determines how the linear predictor is transformed to predictions. Allowed values are "identity", "logit" and "log". For an ordinary regression model use ***family*** "gaussian" and ***link_function*** "identity". For logistic regression use ***family*** "binomial" and ***link_function*** "logit". For a multiplicative model use the "log" ***link_function***. The "log" ***link_function*** often works best with a "poisson", "gamma" or "tweedie" ***family***, depending on the data. The ***family*** "poisson", "gamma" or "tweedie" should only be used with the "log" ***link_function***. Inappropriate combinations of ***family*** and ***link_function*** may result in a warning message when fitting the model and/or a poor model fit. Please note that values other than "identity" typically require a significantly higher ***m*** (or ***v***) in order to converge.
@@ -50,7 +50,10 @@ Limits 1) the number of terms already in the model that can be considered as int
 ***0*** does not print progress reports during fitting. ***1*** prints a summary after running the ***fit*** method. ***2*** prints a summary after each boosting step.
 
 #### tweedie_power (default = 1.5)
-Species the variance power for the "tweedie" ***family*** and ***link_function***. Please note that this is not a tuning parameter because it defines how the loss function is calculated.
+Specifies the variance power for the "tweedie" ***family*** and ***link_function***. Please note that this is not an ordinary tuning parameter because it defines how the loss function is calculated. However, it can be tuned with ***get_validation_group_mse()*** as the tuning metric.
+
+#### group_size_for_validation_group_mse (default = 100)
+APLR calculates mean squared error on grouped data in the validation set. This can be useful for comparing models that have different ***family*** or ***tweedie_power*** parameters. The maximum number of observations in each group is specified by ***group_size_for_validation_group_mse***. Some of the observations with the lowest or highest response values will belong to groups with fewer than ***group_size_for_validation_group_mse*** observations; the minimum number of observations in a group is ***group_size_for_validation_group_mse/2***. If ***group_size_for_validation_group_mse*** is equal to or higher than the number of observations in the validation set, then there will be only one group (in which case the grouped validation MSE is not very useful). ***group_size_for_validation_group_mse*** should be large enough for the Central Limit Theorem to hold (at least 60, but 100 is a safer choice). Also, the number of observations in the validation set should be substantially higher than ***group_size_for_validation_group_mse*** for the grouped validation MSE to be useful.
 
 
 ## Method: fit(X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[])
@@ -170,4 +173,9 @@ The index of the term selected. So ***0*** is the first term, ***1*** is the sec
 
 ## Method: get_m()
 
-***Returns the number of boosting steps in the model (the value that minimized validation error).***
+***Returns the number of boosting steps in the model (the value that minimized validation error).***
+
+
+## Method: get_validation_group_mse()
+
+***Returns the mean squared error on grouped data in the validation set.*** See ***group_size_for_validation_group_mse*** for more information.
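The grouped validation MSE documented above can be emulated outside the library to see what the metric measures. A minimal NumPy sketch (the function name `grouped_validation_mse` is hypothetical; it follows the rolling-window scheme used internally, unweighted case only): observations are ordered by the response, both the response and the predictions are averaged over a centered rolling window of up to `group_size` observations, and the MSE between the two smoothed vectors is returned.

```python
import numpy as np

def grouped_validation_mse(y_validation, predictions, group_size=100):
    """Sketch of APLR's grouped validation MSE: smooth y and the
    predictions over rolling groups ordered by y, then take the MSE
    between the two smoothed vectors."""
    n = y_validation.shape[0]
    if group_size >= n:
        # Only one group: compare overall means (mirrors the special case
        # where the window encompasses the whole validation set).
        return float((y_validation.mean() - predictions.mean()) ** 2)
    order = np.argsort(y_validation, kind="stable")  # ascending by response
    half = (group_size - 1) // 2
    y_centered = np.empty(n)
    pred_centered = np.empty(n)
    for i in range(n):
        lo = max(i - half, 0)            # groups shrink at both tails
        hi = min(i + half, n - 1)
        idx = order[lo:hi + 1]           # window in sorted order
        y_centered[order[i]] = y_validation[idx].mean()
        pred_centered[order[i]] = predictions[idx].mean()
    return float(np.mean((y_centered - pred_centered) ** 2))
```

Because the group averages are approximately normal for large enough groups (the Central Limit Theorem argument above), this quantity is comparable across models fitted with different loss functions, which a raw deviance-based validation error is not.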

aplr/aplr.py

Lines changed: 7 additions & 2 deletions

@@ -5,7 +5,7 @@
 
 
 class APLRRegressor():
-    def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5):
+    def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, family:str="gaussian", link_function:str="identity", n_jobs:int=0, validation_ratio:float=0.2, intercept:float=np.nan, bins:int=300, max_interaction_level:int=1, max_interactions:int=100000, min_observations_in_split:int=20, ineligible_boosting_steps_added:int=10, max_eligible_terms:int=5, verbosity:int=0, tweedie_power:float=1.5, group_size_for_validation_group_mse:int=100):
         self.m=m
         self.v=v
         self.random_state=random_state
@@ -22,6 +22,7 @@ def __init__(self, m:int=1000, v:float=0.1, random_state:int=0, family:str="gaus
         self.max_eligible_terms=max_eligible_terms
         self.verbosity=verbosity
         self.tweedie_power=tweedie_power
+        self.group_size_for_validation_group_mse=group_size_for_validation_group_mse
 
         #Creating aplr_cpp and setting parameters
         self.APLRRegressor=aplr_cpp.APLRRegressor()
@@ -45,6 +46,7 @@ def __set_params_cpp(self):
         self.APLRRegressor.max_eligible_terms=self.max_eligible_terms
         self.APLRRegressor.verbosity=self.verbosity
         self.APLRRegressor.tweedie_power=self.tweedie_power
+        self.APLRRegressor.group_size_for_validation_group_mse=self.group_size_for_validation_group_mse
 
     def fit(self, X:npt.ArrayLike, y:npt.ArrayLike, sample_weight:npt.ArrayLike = np.empty(0), X_names:List[str]=[], validation_set_indexes:List[int]=[]):
         self.__set_params_cpp()
@@ -89,9 +91,12 @@ def get_intercept_steps(self)->npt.ArrayLike:
     def get_m(self)->int:
         return self.APLRRegressor.get_m()
 
+    def get_validation_group_mse(self)->float:
+        return self.APLRRegressor.get_validation_group_mse()
+
     #For sklearn
     def get_params(self, deep=True):
-        return {"m": self.m, "v": self.v,"random_state":self.random_state,"family":self.family,"link_function":self.link_function,"n_jobs":self.n_jobs,"validation_ratio":self.validation_ratio,"intercept":self.intercept,"bins":self.bins,"max_interaction_level":self.max_interaction_level,"max_interactions":self.max_interactions,"verbosity":self.verbosity,"min_observations_in_split":self.min_observations_in_split,"ineligible_boosting_steps_added":self.ineligible_boosting_steps_added,"max_eligible_terms":self.max_eligible_terms,"tweedie_power":self.tweedie_power}
+        return {"m": self.m, "v": self.v,"random_state":self.random_state,"family":self.family,"link_function":self.link_function,"n_jobs":self.n_jobs,"validation_ratio":self.validation_ratio,"intercept":self.intercept,"bins":self.bins,"max_interaction_level":self.max_interaction_level,"max_interactions":self.max_interactions,"verbosity":self.verbosity,"min_observations_in_split":self.min_observations_in_split,"ineligible_boosting_steps_added":self.ineligible_boosting_steps_added,"max_eligible_terms":self.max_eligible_terms,"tweedie_power":self.tweedie_power,"group_size_for_validation_group_mse":self.group_size_for_validation_group_mse}
 
     #For sklearn
     def set_params(self, **parameters):
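The aplr/aplr.py changes follow a fixed pattern: every new constructor parameter must be mirrored in three places — `__init__`, `__set_params_cpp` (which pushes parameters to the C++ backend before fitting), and `get_params` (for sklearn compatibility). A minimal sketch of that wrapper pattern, with a hypothetical stub standing in for the real `aplr_cpp.APLRRegressor` pybind11 object:

```python
class _CppBackendStub:
    """Hypothetical stand-in for the pybind11 aplr_cpp.APLRRegressor object."""
    pass

class WrapperRegressor:
    def __init__(self, m: int = 1000, group_size_for_validation_group_mse: int = 100):
        # 1) Store every parameter as a Python attribute.
        self.m = m
        self.group_size_for_validation_group_mse = group_size_for_validation_group_mse
        self.backend = _CppBackendStub()

    def _set_params_cpp(self):
        # 2) Push every parameter onto the C++ backend before fitting.
        self.backend.m = self.m
        self.backend.group_size_for_validation_group_mse = self.group_size_for_validation_group_mse

    def get_params(self, deep=True):
        # 3) Expose every parameter to sklearn. A parameter missing here is
        # silently dropped by sklearn's clone() and GridSearchCV.
        return {"m": self.m,
                "group_size_for_validation_group_mse": self.group_size_for_validation_group_mse}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
```

Forgetting step 3 is the classic failure mode: the model still fits, but hyperparameter search over the new parameter silently uses the default.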

cpp/APLRRegressor.h

Lines changed: 28 additions & 4 deletions

@@ -76,6 +76,7 @@ class APLRRegressor
     void name_terms(const MatrixXd &X, const std::vector<std::string> &X_names);
     void calculate_feature_importance_on_validation_set();
     void find_min_and_max_training_predictions();
+    void calculate_validation_group_mse();
     void cleanup_after_fit();
     void validate_that_model_can_be_used(const MatrixXd &X);
     void throw_error_if_family_does_not_exist();
@@ -122,12 +123,15 @@ class APLRRegressor
     double tweedie_power;
     double min_training_prediction;
     double max_training_prediction;
+    double validation_group_mse;
+    size_t group_size_for_validation_group_mse;
 
     //Methods
     APLRRegressor(size_t m=1000,double v=0.1,uint_fast32_t random_state=std::numeric_limits<uint_fast32_t>::lowest(),std::string family="gaussian",
         std::string link_function="identity", size_t n_jobs=0, double validation_ratio=0.2,double intercept=NAN_DOUBLE,
         size_t reserved_terms_times_num_x=100, size_t bins=300,size_t verbosity=0,size_t max_interaction_level=1,size_t max_interactions=100000,
-        size_t min_observations_in_split=20, size_t ineligible_boosting_steps_added=10, size_t max_eligible_terms=5,double tweedie_power=1.5);
+        size_t min_observations_in_split=20, size_t ineligible_boosting_steps_added=10, size_t max_eligible_terms=5,double tweedie_power=1.5,
+        size_t group_size_for_validation_group_mse=100);
     APLRRegressor(const APLRRegressor &other);
     ~APLRRegressor();
     void fit(const MatrixXd &X,const VectorXd &y,const VectorXd &sample_weight=VectorXd(0),const std::vector<std::string> &X_names={},const std::vector<size_t> &validation_set_indexes={});
@@ -144,19 +148,21 @@ class APLRRegressor
     double get_intercept();
     VectorXd get_intercept_steps();
     size_t get_m();
+    double get_validation_group_mse();
 };
 
 //Regular constructor
 APLRRegressor::APLRRegressor(size_t m,double v,uint_fast32_t random_state,std::string family,std::string link_function,size_t n_jobs,
     double validation_ratio,double intercept,size_t reserved_terms_times_num_x,size_t bins,size_t verbosity,size_t max_interaction_level,
-    size_t max_interactions,size_t min_observations_in_split,size_t ineligible_boosting_steps_added,size_t max_eligible_terms,double tweedie_power):
+    size_t max_interactions,size_t min_observations_in_split,size_t ineligible_boosting_steps_added,size_t max_eligible_terms,double tweedie_power,
+    size_t group_size_for_validation_group_mse):
     reserved_terms_times_num_x{reserved_terms_times_num_x},intercept{intercept},m{m},v{v},
     family{family},link_function{link_function},validation_ratio{validation_ratio},n_jobs{n_jobs},random_state{random_state},
     bins{bins},verbosity{verbosity},max_interaction_level{max_interaction_level},
     intercept_steps{VectorXd(0)},max_interactions{max_interactions},interactions_eligible{0},validation_error_steps{VectorXd(0)},
     min_observations_in_split{min_observations_in_split},ineligible_boosting_steps_added{ineligible_boosting_steps_added},
     max_eligible_terms{max_eligible_terms},number_of_base_terms{0},tweedie_power{tweedie_power},min_training_prediction{NAN_DOUBLE},
-    max_training_prediction{NAN_DOUBLE}
+    max_training_prediction{NAN_DOUBLE},validation_group_mse{NAN_DOUBLE},group_size_for_validation_group_mse{group_size_for_validation_group_mse}
 {
 }
 
@@ -171,7 +177,8 @@ APLRRegressor::APLRRegressor(const APLRRegressor &other):
     min_observations_in_split{other.min_observations_in_split},ineligible_boosting_steps_added{other.ineligible_boosting_steps_added},
     max_eligible_terms{other.max_eligible_terms},number_of_base_terms{other.number_of_base_terms},
     feature_importance{other.feature_importance},tweedie_power{other.tweedie_power},min_training_prediction{other.min_training_prediction},
-    max_training_prediction{other.max_training_prediction}
+    max_training_prediction{other.max_training_prediction},validation_group_mse{other.validation_group_mse},
+    group_size_for_validation_group_mse{other.group_size_for_validation_group_mse}
 {
 }
 
@@ -200,6 +207,7 @@ void APLRRegressor::fit(const MatrixXd &X,const VectorXd &y,const VectorXd &samp
     name_terms(X, X_names);
     calculate_feature_importance_on_validation_set();
     find_min_and_max_training_predictions();
+    calculate_validation_group_mse();
     cleanup_after_fit();
 }
 
@@ -1042,6 +1050,17 @@ void APLRRegressor::find_min_and_max_training_predictions()
     max_training_prediction=training_predictions.maxCoeff();
 }
 
+void APLRRegressor::calculate_validation_group_mse()
+{
+    VectorXd validation_predictions{predict(X_validation,false)};
+    VectorXi y_validation_sorted_index{sort_indexes_ascending(y_validation)};
+    VectorXd y_validation_centered{calculate_rolling_centered_mean(y_validation,y_validation_sorted_index,group_size_for_validation_group_mse,sample_weight_validation)};
+    VectorXd validation_predictions_centered{calculate_rolling_centered_mean(validation_predictions,y_validation_sorted_index,group_size_for_validation_group_mse,sample_weight_validation)};
+
+    VectorXd squared_residuals{(y_validation_centered-validation_predictions_centered).array().pow(2)};
+    validation_group_mse = squared_residuals.mean();
+}
+
 void APLRRegressor::validate_that_model_can_be_used(const MatrixXd &X)
 {
     if(std::isnan(intercept) || number_of_base_terms==0) throw std::runtime_error("Model must be trained before predict() can be run.");
@@ -1186,4 +1205,9 @@ VectorXd APLRRegressor::get_intercept_steps()
 size_t APLRRegressor::get_m()
 {
     return m;
+}
+
+double APLRRegressor::get_validation_group_mse()
+{
+    return validation_group_mse;
 }

cpp/functions.h

Lines changed: 61 additions & 0 deletions

@@ -310,4 +310,65 @@ void throw_error_if_matrix_has_nan_or_infinite_elements(const T &x, const std::s
     {
         throw std::runtime_error(matrix_name + " has nan or infinite elements.");
     }
+}
+
+VectorXd calculate_rolling_centered_mean(const VectorXd &vector, const VectorXi &sorted_index, size_t rolling_window, const VectorXd &sample_weight=VectorXd(0))
+{
+    bool sample_weight_is_provided{sample_weight.rows()==vector.rows()};
+    bool rolling_window_contains_one_observation{rolling_window<=1};
+    bool rolling_window_encompasses_all_observations_in_validation_set{rolling_window >= static_cast<size_t>(vector.rows())};
+    size_t half_rolling_window{(rolling_window-1)/2};
+
+    VectorXd rolling_centered_mean;
+    if(rolling_window_contains_one_observation)
+        rolling_centered_mean = vector;
+    else if(rolling_window_encompasses_all_observations_in_validation_set)
+    {
+        if(sample_weight_is_provided)
+        {
+            double weighted_centered_mean{(vector.array() * sample_weight.array()).sum() / sample_weight.sum()};
+            rolling_centered_mean = VectorXd::Constant(vector.rows(),weighted_centered_mean);
+        }
+        else
+            rolling_centered_mean = VectorXd::Constant(vector.rows(),vector.mean());
+    }
+    else
+    {
+        rolling_centered_mean = VectorXd::Constant(vector.rows(),0);
+
+        size_t vector_size{static_cast<size_t>(sorted_index.rows())};
+        for (size_t i = 0; i < vector_size; ++i)
+        {
+            size_t min_index;
+            if(i<half_rolling_window)
+                min_index=0;
+            else
+                min_index=i-half_rolling_window;
+
+            size_t max_index{std::min(vector_size-1, i+half_rolling_window)};
+
+            double rolling_centered_weighted_sum{0};
+            if(sample_weight_is_provided)
+            {
+                double rolling_centered_sample_weight_sum{0};
+                for (size_t j = min_index; j <= max_index; ++j)
+                {
+                    rolling_centered_weighted_sum += vector[sorted_index[j]] * sample_weight[sorted_index[j]];
+                    rolling_centered_sample_weight_sum += sample_weight[sorted_index[j]];
+                }
+                rolling_centered_mean[sorted_index[i]] = rolling_centered_weighted_sum / rolling_centered_sample_weight_sum;
+            }
+            else
+            {
+                size_t observations{max_index-min_index+1};
+                for (size_t j = min_index; j <= max_index; ++j)
+                {
+                    rolling_centered_mean[sorted_index[i]] += vector[sorted_index[j]];
+                }
+                rolling_centered_mean[sorted_index[i]] /= observations;
+            }
+        }
+    }
+
+    return rolling_centered_mean;
 }
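For experimenting with the smoothing behaviour without building the C++ code, the new `calculate_rolling_centered_mean` above can be sketched in NumPy. This is a hedged port, not the library's API: same three branches (window of one observation, window encompassing everything, general windowed case), including the optional sample weights.

```python
import numpy as np

def rolling_centered_mean(vector, sorted_index, rolling_window, sample_weight=None):
    """Python sketch of calculate_rolling_centered_mean from cpp/functions.h:
    for each element, average a window of up to rolling_window neighbours
    taken in sorted_index order, writing results back in original positions."""
    n = vector.shape[0]
    if rolling_window <= 1:
        return vector.copy()                 # window of one: no smoothing
    if rolling_window >= n:
        # Window covers the whole vector: every element gets the overall
        # (weighted) mean.
        if sample_weight is not None:
            mean = np.sum(vector * sample_weight) / np.sum(sample_weight)
        else:
            mean = vector.mean()
        return np.full(n, mean)

    half = (rolling_window - 1) // 2
    out = np.zeros(n)
    for i in range(n):
        lo = 0 if i < half else i - half     # clamp window at both ends
        hi = min(n - 1, i + half)
        idx = sorted_index[lo:hi + 1]        # neighbours in sorted order
        if sample_weight is not None:
            out[sorted_index[i]] = np.sum(vector[idx] * sample_weight[idx]) / np.sum(sample_weight[idx])
        else:
            out[sorted_index[i]] = vector[idx].mean()
    return out
```

Note the design choice visible in both versions: results are indexed through `sorted_index`, so the output stays aligned with the original (unsorted) observations while the averaging happens over neighbours in response order.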
