
Commit 47267ed

Author: ZebinYang (committed)
skip grid search cv in build leaf if param_dict is empty; version 0.2.4
Parent: f1a780e

File tree: 6 files changed (+95 −42 lines)


examples/demo.ipynb

Lines changed: 2 additions & 2 deletions
@@ -147,7 +147,7 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "py37",
+     "display_name": "Python (py37)",
      "language": "python",
      "name": "py37"
     },
@@ -161,7 +161,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.7.9"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,

setup.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='simtree',
-      version='0.2.3',
+      version='0.2.4',
       description='Single-index model tree',
       url='https://github.com/ZebinYang/SIMTree',
       author='Zebin Yang',

simtree/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,5 +8,5 @@
            "SIMTreeRegressor", "SIMTreeClassifier",
            "CustomMobTreeRegressor", "CustomMobTreeClassifier"]
 
-__version__ = '0.2.3'
+__version__ = '0.2.4'
 __author__ = 'Zebin Yang'
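
Both setup.py and simtree/__init__.py carry the version string, so the bump to 0.2.4 is visible at runtime. A quick check after installing this revision (a minimal sketch, assuming the package is importable):

import simtree

# __version__ lives in simtree/__init__.py and was updated by this commit
print(simtree.__version__)  # expected output: 0.2.4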

simtree/customtree.py

Lines changed: 45 additions & 16 deletions
@@ -3,7 +3,7 @@
 from sklearn.utils.validation import check_is_fitted
 from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import make_scorer, roc_auc_score, mean_squared_error
-from sklearn.base import RegressorMixin, ClassifierMixin, is_regressor, is_classifier
+from sklearn.base import RegressorMixin, ClassifierMixin, is_regressor, is_classifier, clone
 
 from .mobtree import MoBTreeRegressor, MoBTreeClassifier
 
@@ -42,11 +42,23 @@ def build_root(self):
 
     def build_leaf(self, sample_indice):
 
-        grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
-                      scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
-                      cv=5, refit="mse", n_jobs=1, error_score=np.nan)
-        grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
-        best_estimator = grid.best_estimator_
+        if len(self.param_dict) == 0:
+            self.base_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+            best_estimator = self.base_estimator
+        else:
+            param_size = 1
+            for key, item in self.param_dict.items():
+                param_size *= len(item)
+            if param_size == 1:
+                self.base_estimator.set_params(**{key: item[0] for key, item in self.param_dict.items()})
+                self.base_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+                best_estimator = self.base_estimator
+            else:
+                grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
+                              scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
+                              cv=5, refit="mse", n_jobs=1, error_score=np.nan)
+                grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+                best_estimator = grid.best_estimator_
         predict_func = lambda x: best_estimator.predict(x)
         best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
         return predict_func, best_estimator, best_impurity
@@ -80,16 +92,33 @@ def build_root(self):
 
     def build_leaf(self, sample_indice):
 
-        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
-            best_estimator = None
-            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
-            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
-        else:
-            grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
-                          scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
-                          cv=5, refit="auc", n_jobs=1, error_score=np.nan)
-            grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
-            best_estimator = grid.best_estimator_
+        if len(self.param_dict) == 0:
+            best_estimator = clone(self.base_estimator)
+            best_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
             predict_func = lambda x: best_estimator.decision_function(x)
             best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
+        else:
+            param_size = 1
+            for key, item in self.param_dict.items():
+                param_size *= len(item)
+            if param_size == 1:
+                best_estimator = clone(self.base_estimator)
+                best_estimator.set_params(**{key: item[0] for key, item in self.param_dict.items()})
+                best_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+                predict_func = lambda x: best_estimator.decision_function(x)
+                best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
+            else:
+                if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
+                    best_estimator = None
+                    predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
+                    best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
+                else:
+                    grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
+                                  scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
+                                  cv=5, refit="auc", n_jobs=1, error_score=np.nan)
+                    grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+                    best_estimator = grid.best_estimator_
+
+                    predict_func = lambda x: best_estimator.decision_function(x)
+                    best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
         return predict_func, best_estimator, best_impurity
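
The rewritten build_leaf short-circuits the grid search twice: once when param_dict is empty, and once when the grid holds only a single parameter combination. A standalone sketch of the same dispatch (fit_leaf, Ridge, and the toy data are illustrative stand-ins, not names from this repository):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.base import clone

def fit_leaf(base_estimator, param_dict, x, y):
    # Empty grid: nothing to search over, just fit the base estimator.
    if len(param_dict) == 0:
        best = clone(base_estimator)
        best.fit(x, y)
        return best
    # Count the candidate parameter combinations.
    param_size = 1
    for key, item in param_dict.items():
        param_size *= len(item)
    # A single combination: set it directly and fit once, skipping CV.
    if param_size == 1:
        best = clone(base_estimator)
        best.set_params(**{key: item[0] for key, item in param_dict.items()})
        best.fit(x, y)
        return best
    # Otherwise fall back to the original 5-fold grid search.
    grid = GridSearchCV(base_estimator, param_grid=param_dict,
                        scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
                        cv=5, refit="mse", n_jobs=1, error_score=np.nan)
    grid.fit(x, y)
    return grid.best_estimator_

rng = np.random.RandomState(0)
x, y = rng.randn(100, 3), rng.randn(100)
print(fit_leaf(Ridge(), {}, x, y))                     # plain fit
print(fit_leaf(Ridge(), {"alpha": [0.5]}, x, y))       # one candidate, no CV
print(fit_leaf(Ridge(), {"alpha": [0.1, 1.0]}, x, y))  # real grid search

The set_params call mirrors the committed code: a one-point grid is applied to the estimator directly, so the fivefold cross-validation fit is avoided entirely.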

simtree/glmtree.py

Lines changed: 16 additions & 4 deletions
@@ -42,8 +42,16 @@ def build_leaf(self, sample_indice):
         sx = self.x[sample_indice].std(0) + self.EPSILON
         nx = (self.x[sample_indice] - mx) / sx
 
-        best_estimator = LassoCV(alphas=self.reg_lambda, cv=5, n_jobs=1, random_state=self.random_state)
-        best_estimator.fit(nx, self.y[sample_indice])
+        if len(self.reg_lambda) > 1:
+            best_estimator = LassoCV(alphas=self.reg_lambda, cv=5, n_jobs=self.n_jobs, precompute=False, random_state=self.random_state)
+            best_estimator.fit(nx, self.y[sample_indice], self.sample_weight[sample_indice])
+        else:
+            if self.reg_lambda[0] > 0:
+                best_estimator = Lasso(alpha=self.reg_lambda[0], precompute=False, random_state=self.random_state)
+            else:
+                best_estimator = LinearRegression()
+            best_estimator.fit(nx, self.y[sample_indice])
+
         best_estimator.coef_ = best_estimator.coef_ / sx
         best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
         xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_) + best_estimator.intercept_)
@@ -83,8 +91,12 @@ def build_leaf(self, sample_indice):
             predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
             best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
         else:
-            best_estimator = LogisticRegressionCV(Cs=self.reg_lambda, penalty="l1", solver="liblinear", scoring="roc_auc",
-                             cv=5, n_jobs=1, random_state=self.random_state)
+            if len(self.reg_lambda) > 1:
+                best_estimator = LogisticRegressionCV(Cs=self.reg_lambda, penalty="l1", solver="liblinear", scoring="roc_auc",
+                                 cv=5, n_jobs=self.n_jobs, random_state=self.random_state)
+            else:
+                best_estimator = LogisticRegression(alpha=self.reg_lambda[0], precompute=False, random_state=self.random_state)
+
             mx = self.x[sample_indice].mean(0)
             sx = self.x[sample_indice].std(0) + self.EPSILON
             nx = (self.x[sample_indice] - mx) / sx
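
The GLM leaves get the same treatment: the cross-validated estimators are only built when reg_lambda actually offers a choice. A minimal standalone sketch of the regression-side dispatch (fit_glm_leaf and the toy data are illustrative, not names from this repository):

import numpy as np
from sklearn.linear_model import LassoCV, Lasso, LinearRegression

def fit_glm_leaf(reg_lambda, nx, y, random_state=0):
    # Multiple candidate penalties: keep the cross-validated estimator.
    if len(reg_lambda) > 1:
        est = LassoCV(alphas=reg_lambda, cv=5, n_jobs=1,
                      precompute=False, random_state=random_state)
    # One positive penalty: fit a plain Lasso with it, no CV needed.
    elif reg_lambda[0] > 0:
        est = Lasso(alpha=reg_lambda[0], precompute=False,
                    random_state=random_state)
    # Zero penalty: ordinary least squares.
    else:
        est = LinearRegression()
    est.fit(nx, y)
    return est

rng = np.random.RandomState(0)
nx, y = rng.randn(100, 3), rng.randn(100)
print(fit_glm_leaf([0.1, 1.0], nx, y))  # LassoCV
print(fit_glm_leaf([0.5], nx, y))       # Lasso
print(fit_glm_leaf([0.0], nx, y))       # LinearRegression

One caveat on the classification side: sklearn's LogisticRegression takes the inverse-regularization parameter C and has no alpha or precompute arguments, so the single-candidate branch in the hunk above would raise a TypeError as written.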

simtree/simtree.py

Lines changed: 30 additions & 18 deletions
@@ -434,13 +434,17 @@ def build_root(self):
 
     def build_leaf(self, sample_indice):
 
-        base = SimRegressor(reg_gamma=self.reg_gamma, degree=self.degree,
-                            knot_num=self.knot_num, random_state=self.random_state)
-        grid = GridSearchCV(base, param_grid={"reg_lambda": self.reg_lambda},
-                      scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
-                      cv=5, refit="mse", n_jobs=1, error_score=np.nan)
-        grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
-        best_estimator = grid.best_estimator_
+        param_size = len(self.reg_lambda)
+        if param_size == 1:
+            best_estimator = SimRegressor(reg_lambda=[self.reg_lambda[0]], reg_gamma=self.reg_gamma, degree=self.degree,
+                                knot_num=self.knot_num, random_state=self.random_state)
+            best_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+        else:
+            grid = GridSearchCV(base, param_grid={"reg_lambda": self.reg_lambda},
+                          scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
+                          cv=5, refit="mse", n_jobs=1, error_score=np.nan)
+            grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+            best_estimator = grid.best_estimator_
         predict_func = lambda x: best_estimator.predict(x)
         best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
         return predict_func, best_estimator, best_impurity
@@ -477,18 +481,26 @@ def build_root(self):
 
     def build_leaf(self, sample_indice):
 
-        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
-            best_estimator = None
-            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
-            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
-        else:
-            base = SimClassifier(reg_gamma=self.reg_gamma, degree=self.degree,
+        param_size = len(self.reg_lambda)
+        if param_size == 1:
+            best_estimator = SimRegressor(reg_lambda=[self.reg_lambda[0]], reg_gamma=self.reg_gamma, degree=self.degree,
                                 knot_num=self.knot_num, random_state=self.random_state)
-            grid = GridSearchCV(base, param_grid={"reg_lambda": self.reg_lambda},
-                          scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
-                          cv=5, refit="auc", n_jobs=1, error_score=np.nan)
-            grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
-            best_estimator = grid.best_estimator_
+            best_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
             predict_func = lambda x: best_estimator.decision_function(x)
             best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
+        else:
+            if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
+                best_estimator = None
+                predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
+                best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
+            else:
+                base = SimClassifier(reg_gamma=self.reg_gamma, degree=self.degree,
+                                    knot_num=self.knot_num, random_state=self.random_state)
+                grid = GridSearchCV(base, param_grid={"reg_lambda": self.reg_lambda},
+                              scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
+                              cv=5, refit="auc", n_jobs=1, error_score=np.nan)
+                grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
+                best_estimator = grid.best_estimator_
+                predict_func = lambda x: best_estimator.decision_function(x)
+                best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
         return predict_func, best_estimator, best_impurity
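
Two details in this file are worth flagging: the regressor's grid-search branch still passes base to GridSearchCV although the rewritten code no longer defines it, and the classifier's single-candidate branch constructs a SimRegressor rather than a SimClassifier. A sketch of what the regressor dispatch presumably intends, with base rebuilt in the else branch as in the pre-commit code (a hypothetical reconstruction, not the committed code; SimRegressor and the self attributes come from the surrounding module):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

def build_leaf(self, sample_indice):
    param_size = len(self.reg_lambda)
    if param_size == 1:
        # Single candidate penalty: fit directly, skipping cross-validation.
        best_estimator = SimRegressor(reg_lambda=[self.reg_lambda[0]], reg_gamma=self.reg_gamma,
                                      degree=self.degree, knot_num=self.knot_num,
                                      random_state=self.random_state)
        best_estimator.fit(self.x[sample_indice], self.y[sample_indice].ravel())
    else:
        # `base` must be constructed here; the committed hunk omits this line.
        base = SimRegressor(reg_gamma=self.reg_gamma, degree=self.degree,
                            knot_num=self.knot_num, random_state=self.random_state)
        grid = GridSearchCV(base, param_grid={"reg_lambda": self.reg_lambda},
                            scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
                            cv=5, refit="mse", n_jobs=1, error_score=np.nan)
        grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
        best_estimator = grid.best_estimator_
    predict_func = lambda x: best_estimator.predict(x)
    best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
    return predict_func, best_estimator, best_impurity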
