Skip to content

Commit 101d501

Browse files
committed
add argument base_model_method for predict function
1 parent d544262 commit 101d501

File tree

4 files changed

+124
-14
lines changed

4 files changed

+124
-14
lines changed

stemflow/model/AdaSTEM.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,11 @@ def stixel_predict(self, stixel: pd.core.frame.DataFrame) -> Union[None, pd.core
672672
if model_x_names_tuple[0] is None:
673673
return None
674674

675-
pred = predict_one_stixel(stixel, self.task, model_x_names_tuple, **self.base_model_prediction_param)
675+
pred = predict_one_stixel(X_test_stixel=stixel,
676+
task=self.task,
677+
model_x_names_tuple=model_x_names_tuple,
678+
base_model_method=self.base_model_method,
679+
**self.base_model_prediction_param)
676680

677681
if pred is None:
678682
return None
@@ -814,6 +818,7 @@ def predict_proba(
814818
aggregation: str = "mean",
815819
return_by_separate_ensembles: bool = False,
816820
logit_agg: bool = False,
821+
base_model_method: Union[None, str] = None,
817822
**base_model_prediction_param
818823
) -> Union[np.ndarray, Tuple[np.ndarray]]:
819824
"""Predict probability
@@ -836,7 +841,11 @@ def predict_proba(
836841
return_by_separate_ensembles (bool, optional):
837842
Experimental function. return not by aggregation, but by separate ensembles.
838843
logit_agg:
839-
Whether to use logit aggregation for the classification task. If True, the model is averaging the probability prediction estimated by all ensembles in logit scale, and then back-tranforms it to probability scale. It's recommended to be jointly used with the CalibratedClassifierCV class in sklearn as a wrapper of the classifier to estimate the calibrated probability. If False, the output is essentially the proportion of "1s" across the related ensembles; e.g., if 100 stixels covers this spatiotemporal points, and 90% of them predict that it is a "1", then the output probability is 0.9; Therefore it would be a probability estimated by the spatiotemporal neighborhood. Default is False, but can be set to truth for "real" probability averaging.
844+
Whether to use logit aggregation for the classification task. Most likely only used when you are predicting "real" calibrated probability. If True, the model is averaging the probability prediction estimated by all ensembles in logit scale, and then back-transforms it to probability scale. It's recommended to be jointly used with the CalibratedClassifierCV class in sklearn as a wrapper of the classifier to estimate the calibrated probability. Default is False, but can be set to True for "real" probability averaging.
845+
base_model_method:
846+
The name of the prediction method for base models. If None, `predict` or `predict_proba` will be used depending on the tasks. This argument is handy if you have a custom base model class that has a special prediction function. Notice that the dummy model will still predict 0, so the ensemble-aggregated result is still an average of zeros and your special prediction function output. Therefore, it may only make sense if your special prediction function predicts 0 as the absence/control value. Defaults to None.
847+
base_model_prediction_param:
848+
Any other parameters to pass into the prediction method of the base models. e.g., base_model_prediction_param={'n_jobs':1}.
840849
Raises:
841850
TypeError:
842851
X_test is not of type pd.core.frame.DataFrame.
@@ -855,6 +864,7 @@ def predict_proba(
855864
return_by_separate_ensembles, return_std = check_prediction_return(return_by_separate_ensembles, return_std)
856865
verbosity = check_verbosity(self, verbosity)
857866
n_jobs = check_transform_n_jobs(self, n_jobs)
867+
self.base_model_method = base_model_method
858868
self.base_model_prediction_param = base_model_prediction_param
859869

860870
# predict
@@ -889,7 +899,7 @@ def predict_proba(
889899
res_mean = res.mean(axis=1, skipna=True) # mean of all grid model that predicts this stixel
890900
elif aggregation == "median":
891901
res_mean = res.median(axis=1, skipna=True)
892-
902+
893903
res_std = res.std(axis=1, skipna=True)
894904

895905
# Nan count
@@ -935,6 +945,7 @@ def predict(
935945
aggregation: str = "mean",
936946
return_by_separate_ensembles: bool = False,
937947
logit_agg: bool = False,
948+
base_model_method: Union[None, str] = None,
938949
**base_model_prediction_param
939950
) -> Union[np.ndarray, Tuple[np.ndarray]]:
940951
pass
@@ -1406,6 +1417,7 @@ def predict(
14061417
aggregation: str = "mean",
14071418
return_by_separate_ensembles: bool = False,
14081419
logit_agg: bool = False,
1420+
base_model_method: Union[None, str] = None,
14091421
**base_model_prediction_param
14101422
) -> Union[np.ndarray, Tuple[np.ndarray]]:
14111423
"""A rewrite of predict_proba adapted for Classifier
@@ -1431,10 +1443,12 @@ def predict(
14311443
'mean' or 'median' for aggregation method across ensembles.
14321444
return_by_separate_ensembles (bool, optional):
14331445
Experimental function. return not by aggregation, but by separate ensembles.
1434-
base_model_prediction_param:
1435-
Additional parameter passed to base_model.predict_proba or base_model.predict
14361446
logit_agg:
1437-
Whether to use logit aggregation for the classification task. If True, the model is averaging the probability prediction estimated by all ensembles in logit scale, and then back-tranform it to probability scale. It's recommened to be combinedly used with the CalibratedClassifierCV class in sklearn as a wrapper of the classifier to estimate the calibrated probability. If False, the output is the essentially the proportion of "1s" acorss the related ensembles; e.g., if 100 stixels covers this spatiotemporal points, and 90% of them predict that it is a "1", then the ouput probability is 0.9; Therefore it would be a probability estimated by the spatiotemporal neiborhood.
1447+
Whether to use logit aggregation for the classification task. If True, the model is averaging the probability prediction estimated by all ensembles in logit scale, and then back-transforms it to probability scale. It's recommended to be used in combination with the CalibratedClassifierCV class in sklearn as a wrapper of the classifier to estimate the calibrated probability.
1448+
base_model_method:
1449+
The name of the prediction method for base models. If None, `predict` or `predict_proba` will be used depending on the tasks. This argument is handy if you have a custom base model class that has a special prediction function. Defaults to None.
1450+
base_model_prediction_param:
1451+
Any other parameters to pass into the prediction method of the base models. e.g., base_model_prediction_param={'n_jobs':1}.
14381452
Raises:
14391453
TypeError:
14401454
X_test is not of type pd.core.frame.DataFrame.
@@ -1457,6 +1471,7 @@ def predict(
14571471
aggregation=aggregation,
14581472
return_by_separate_ensembles=return_by_separate_ensembles,
14591473
logit_agg=logit_agg,
1474+
base_model_method=base_model_method,
14601475
**base_model_prediction_param
14611476
)
14621477
mean = mean[:,1].flatten()
@@ -1473,6 +1488,7 @@ def predict(
14731488
aggregation=aggregation,
14741489
return_by_separate_ensembles=return_by_separate_ensembles,
14751490
logit_agg=logit_agg,
1491+
base_model_method=base_model_method,
14761492
**base_model_prediction_param
14771493
)
14781494
mean = mean[:,1].flatten()
@@ -1588,6 +1604,7 @@ def predict(
15881604
n_jobs: Union[None, int] = 1,
15891605
aggregation: str = "mean",
15901606
return_by_separate_ensembles: bool = False,
1607+
base_model_method: Union[None, str] = None,
15911608
**base_model_prediction_param
15921609
) -> Union[np.ndarray, Tuple[np.ndarray]]:
15931610
"""A rewrite of predict_proba
@@ -1609,8 +1626,10 @@ def predict(
16091626
'mean' or 'median' for aggregation method across ensembles.
16101627
return_by_separate_ensembles (bool, optional):
16111628
Experimental function. return not by aggregation, but by separate ensembles.
1629+
base_model_method:
1630+
The name of the prediction method for base models. If None, `predict` or `predict_proba` will be used depending on the tasks. This argument is handy if you have a custom base model class that has a special prediction function. Defaults to None.
16121631
base_model_prediction_param:
1613-
Additional parameter passed to base_model.predict_proba or base_model.predict
1632+
Any other parameters to pass into the prediction method of the base models. e.g., base_model_prediction_param={'n_jobs':1}.
16141633
16151634
Raises:
16161635
TypeError:
@@ -1633,6 +1652,7 @@ def predict(
16331652
n_jobs=n_jobs,
16341653
aggregation=aggregation,
16351654
return_by_separate_ensembles=return_by_separate_ensembles,
1655+
base_model_method = base_model_method,
16361656
**base_model_prediction_param
16371657
)
16381658

stemflow/model/static_func_AdaSTEM.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ def predict_one_stixel(
431431
X_test_stixel: pd.core.frame.DataFrame,
432432
task: str,
433433
model_x_names_tuple: Tuple[Union[None, BaseEstimator], list],
434+
base_model_method: Union[None, str],
434435
**base_model_prediction_param
435436
) -> pd.core.frame.DataFrame:
436437
"""predict_one_stixel
@@ -439,6 +440,7 @@ def predict_one_stixel(
439440
X_test_stixel (pd.core.frame.DataFrame): Input testing variables
440441
task (str): One of 'regression', 'classification' and 'hurdle'
441442
model_x_names_tuple (tuple[Union[None, BaseEstimator], list]): A tuple of (model, stixel_specific_x_names)
443+
base_model_method (Union[None, str]): The name of the prediction method for base models. If None, `predict` or `predict_proba` will be used depending on the tasks. This argument is handy if you have a custom base model class that has a special prediction function.
442444
base_model_prediction_param: Additional parameter passed to base_model.predict_proba or base_model.predict
443445
444446
Returns:
@@ -452,13 +454,25 @@ def predict_one_stixel(
452454
return None
453455

454456
# get test data
455-
if task == "regression":
456-
pred = model_x_names_tuple[0].predict(X_test_stixel[model_x_names_tuple[1]])
457-
else:
458-
pred = model_x_names_tuple[0].predict_proba(X_test_stixel[model_x_names_tuple[1]], **base_model_prediction_param)
459-
pred = pred[:,1]
460-
461-
457+
pred = None
458+
if base_model_method is not None:
459+
if hasattr(model_x_names_tuple[0], base_model_method):
460+
pred_func = getattr(model_x_names_tuple[0], base_model_method)
461+
pred = pred_func(X_test_stixel[model_x_names_tuple[1]], **base_model_prediction_param)
462+
else:
463+
if isinstance(model_x_names_tuple[0], dummy_model1):
464+
pass
465+
else:
466+
raise TypeError(f"{base_model_method} does not exists for base model {type(model_x_names_tuple[0])}")
467+
468+
if pred is None:
469+
# Still haven't found the pred function
470+
if task == "regression":
471+
pred = model_x_names_tuple[0].predict(X_test_stixel[model_x_names_tuple[1]])
472+
else:
473+
pred = model_x_names_tuple[0].predict_proba(X_test_stixel[model_x_names_tuple[1]], **base_model_prediction_param)
474+
pred = pred[:,1]
475+
462476
res = pd.DataFrame({"index": list(X_test_stixel.index), "pred": np.array(pred).flatten()}).set_index("index")
463477

464478
return res

tests/make_models.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,4 +365,31 @@ def make_AdaSTEMClassifier_caliP(fold_=2, min_req=1, **kwargs):
365365
min_class_sample=3,
366366
**kwargs
367367
)
368+
return model
369+
370+
371+
def make_AdaSTEMClassifier_custom_pred_method(base_model_class, fold_=2, min_req=1, **kwargs):
372+
373+
model = AdaSTEMClassifier(
374+
base_model=base_model_class(),
375+
save_gridding_plot=True,
376+
ensemble_fold=fold_,
377+
min_ensemble_required=min_req,
378+
grid_len_upper_threshold=50,
379+
grid_len_lower_threshold=20,
380+
temporal_start=1,
381+
temporal_end=366,
382+
temporal_step=40,
383+
temporal_bin_interval=80,
384+
points_lower_threshold=30,
385+
Spatio1="longitude",
386+
Spatio2="latitude",
387+
Temporal1="DOY",
388+
temporal_bin_start_jitter="adaptive",
389+
spatio_bin_jitter_magnitude="adaptive",
390+
use_temporal_to_train=True,
391+
n_jobs=1,
392+
sample_weights_for_classifier=False,
393+
**kwargs
394+
)
368395
return model
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from stemflow.model.AdaSTEM import AdaSTEM
5+
from stemflow.model_selection import ST_train_test_split
6+
from xgboost import XGBClassifier, XGBRegressor
7+
8+
from .make_models import (
9+
make_AdaSTEMClassifier,
10+
make_AdaSTEMClassifier_custom_pred_method
11+
)
12+
from .set_up_data import set_up_data
13+
14+
x_names, (X, y) = set_up_data()
15+
X_train, X_test, y_train, y_test = ST_train_test_split(
16+
X, y, Spatio_blocks_count=100, Temporal_blocks_count=100, random_state=42, test_size=0.3
17+
)
18+
def test_AdaSTEMClassifier():
19+
20+
class my_base_model:
21+
def __init__(self):
22+
self.model = XGBClassifier(tree_method="hist", random_state=42, verbosity=0, n_jobs=1)
23+
pass
24+
def fit(self, X_train, y_train):
25+
self.model.fit(X_train, y_train)
26+
return self
27+
def predict(self, X_test):
28+
return self.model.predict(X_test)
29+
def predict_proba(self, X_test):
30+
return self.model.predict_proba(X_test)
31+
def special_predict(self, X_test):
32+
# Fold change
33+
pred1 = self.model.predict_proba(X_test)[:,1]
34+
pred2 = self.model.predict_proba(X_test + np.random.normal(loc=0, scale=1, size=X_test.shape))[:,1]
35+
# Interaction
36+
i_ = np.log(np.clip(1e-6, 1-1e-6, pred1) / np.clip(1e-6, 1-1e-6, pred2))
37+
pred = i_ # Should be -inf to inf, 0 as no interaction
38+
return pred
39+
40+
model = make_AdaSTEMClassifier_custom_pred_method(base_model_class=my_base_model)
41+
# model = make_AdaSTEMClassifier()
42+
model = model.fit(X_train, np.where(y_train > 0, 1, 0))
43+
44+
pred_mean = model.predict_proba(X_test.reset_index(drop=True), return_std=False, verbosity=1, n_jobs=1, base_model_method='special_predict')[:,1]
45+
pred_mean = pred_mean[~np.isnan(pred_mean)]
46+
47+
# print(pred_mean)
48+
# assert(np.sum((pred_mean < 1000) & (pred_mean > 1001)))
49+

0 commit comments

Comments
 (0)