Skip to content

Commit e723256

Browse files
author
[zebinyang]
committed
Change the glmtree classifier's default predict function from predict_proba to decision_function; bump version to 0.2.2
1 parent e6f43f8 commit e723256

File tree

6 files changed

+44
-30
lines changed

6 files changed

+44
-30
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from setuptools import setup
22

33
setup(name='simtree',
4-
version='0.2.1',
4+
version='0.2.2',
55
description='Single-index model tree',
66
url='https://github.com/ZebinYang/SIMTree',
77
author='Zebin Yang',

simtree/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
"SIMTreeRegressor", "SIMTreeClassifier",
99
"CustomMobTreeRegressor", "CustomMobTreeClassifier"]
1010

11-
__version__ = '0.2.1'
11+
__version__ = '0.2.2'
1212
__author__ = 'Zebin Yang'

simtree/customtree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,6 @@ def build_leaf(self, sample_indice):
9090
cv=5, refit="auc", n_jobs=1, error_score=np.nan)
9191
grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
9292
best_estimator = grid.best_estimator_
93-
predict_func = lambda x: best_estimator.predict_proba(x)[:, 1]
93+
predict_func = lambda x: best_estimator.decision_function(x)
9494
best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
9595
return predict_func, best_estimator, best_impurity

simtree/glmtree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,6 @@ def build_leaf(self, sample_indice):
9393
best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
9494
xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
9595
xmax = np.max(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
96-
predict_func = lambda x: 1 / (1 + np.exp(- np.clip(np.dot(x, best_estimator.coef_.ravel()), xmin, xmax) - best_estimator.intercept_))
96+
predict_func = lambda x: np.clip(np.dot(x, best_estimator.coef_.ravel()), xmin, xmax) + best_estimator.intercept_
9797
best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
9898
return predict_func, best_estimator, best_impurity

simtree/mobtree.py

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from matplotlib import pyplot as plt
55
from abc import ABCMeta, abstractmethod
66

7+
from sklearn.utils.extmath import softmax
78
from sklearn.preprocessing import LabelBinarizer
89
from sklearn.utils import check_X_y, column_or_1d
910
from sklearn.utils.validation import check_is_fitted
@@ -475,41 +476,53 @@ def decision_path_indice(self, x, node_id):
475476

476477
def decision_path(self, x):
477478

479+
check_is_fitted(self, "tree")
480+
478481
n_samples = x.shape[0]
479482
path_all = np.zeros((n_samples, self.node_count))
480-
for idx, row in enumerate(x):
483+
for node_id in self.leaf_idx_list:
481484
path = []
482-
node = self.tree[1]
483-
while not node['is_leaf']:
484-
path.append(node["node_id"] - 1)
485-
if row[node['feature']] <= node['threshold']:
486-
node = self.tree[node['left_child_id']]
485+
idx = node_id
486+
sample_indice = np.ones((x.shape[0], )).astype(np.bool)
487+
while True:
488+
path.append(idx - 1)
489+
current_node = self.tree[idx]
490+
if current_node["parent_id"] is None:
491+
break
487492
else:
488-
node = self.tree[node['right_child_id']]
489-
path.append(node["node_id"] - 1)
490-
path_all[idx][path] = 1
493+
parent_node = self.tree[current_node["parent_id"]]
494+
if current_node["is_left"]:
495+
sample_indice = np.logical_and(sample_indice, x[:, parent_node["feature"]] <= parent_node["threshold"])
496+
else:
497+
sample_indice = np.logical_and(sample_indice, x[:, parent_node["feature"]] > parent_node["threshold"])
498+
idx = current_node["parent_id"]
499+
if sample_indice.sum() > 0:
500+
path_all[np.ix_(np.where(sample_indice)[0], path)] = 1
491501
return path_all
492502

493503
def decision_function(self, x):
494504

495505
check_is_fitted(self, "tree")
496506

497-
leaf_idx = []
498507
x = np.array(x)
499-
for row in x:
500-
node = self.tree[1]
501-
while not node['is_leaf']:
502-
if row[node['feature']] <= node['threshold']:
503-
node = self.tree[node['left_child_id']]
504-
else:
505-
node = self.tree[node['right_child_id']]
506-
leaf_idx.append(node['node_id'])
507-
508508
n_samples = x.shape[0]
509509
pred = np.zeros((n_samples))
510-
for node_id in np.unique(leaf_idx):
511-
sample_indice = np.array(leaf_idx) == node_id
512-
pred[sample_indice] = self.tree[node_id]['predict_func'](x[sample_indice, :]).ravel()
510+
for node_id in self.leaf_idx_list:
511+
idx = node_id
512+
sample_indice = np.ones((x.shape[0], )).astype(np.bool)
513+
while True:
514+
current_node = self.tree[idx]
515+
if current_node["parent_id"] is None:
516+
break
517+
else:
518+
parent_node = self.tree[current_node["parent_id"]]
519+
if current_node["is_left"]:
520+
sample_indice = np.logical_and(sample_indice, x[:, parent_node["feature"]] <= parent_node["threshold"])
521+
else:
522+
sample_indice = np.logical_and(sample_indice, x[:, parent_node["feature"]] > parent_node["threshold"])
523+
idx = current_node["parent_id"]
524+
if sample_indice.sum() > 0:
525+
pred[sample_indice] = self.tree[node_id]['predict_func'](x[sample_indice, :]).ravel()
513526
return pred
514527

515528

@@ -647,9 +660,10 @@ def evaluate_estimator(self, estimator, x, y):
647660
return loss
648661

649662
def predict_proba(self, x):
650-
proba = self.decision_function(x).reshape(-1, 1)
651-
return np.hstack([1 - proba, proba])
663+
pred = self.decision_function(x).reshape(-1, 1)
664+
pred_proba = softmax(np.hstack([-pred, pred]) / 2, copy=False)
665+
return pred_proba
652666

653667
def predict(self, x):
654-
pred_proba = self.decision_function(x)
668+
pred_proba = self.predict_proba(x)
655669
return self._label_binarizer.inverse_transform(pred_proba)

simtree/simtree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,6 @@ def build_leaf(self, sample_indice):
489489
cv=5, refit="auc", n_jobs=1, error_score=np.nan)
490490
grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
491491
best_estimator = grid.best_estimator_
492-
predict_func = lambda x: best_estimator.predict_proba(x)[:, 1]
492+
predict_func = lambda x: best_estimator.decision_function(x)
493493
best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
494494
return predict_func, best_estimator, best_impurity

0 commit comments

Comments (0)