
Commit a9b40cd

Author: zebinyang

fix a bug in CART Regressor and Classifier (sum_left update should be executed before checking min_samples_leaf); version 0.2.3

1 parent e723256 · commit a9b40cd
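The commit message pins down the bug: inside node_split, sum_left was accumulated only after the min_samples_leaf check passed, so positions skipped by the check never contributed their y value and every later impurity computation used a stale running sum. A self-contained sketch of the two orderings (illustrative data and min_samples_leaf; identifiers mirror node_split in cart.py below):

import numpy as np

node_y = np.array([5.0, 4.0, 1.0, 1.2, 0.8, 1.1])  # targets in sorted-feature order (hypothetical)
n_samples = len(node_y)
min_samples_leaf = 2

def best_split(update_before_check):
    sum_left, best = 0.0, (np.inf, None)
    sum_total = np.sum(node_y)
    sq_sum_total = np.sum(node_y ** 2)
    for i in range(n_samples):
        if update_before_check:
            sum_left += node_y[i]  # fixed ordering: accumulate at every position
        if ((i + 1) < min_samples_leaf) or ((n_samples - i - 1) < min_samples_leaf):
            continue
        if not update_before_check:
            sum_left += node_y[i]  # buggy ordering: skipped positions are lost for good
        n_left, n_right = i + 1, n_samples - i - 1
        impurity = (sq_sum_total / n_samples
                    - (sum_left / n_left) ** 2 * n_left / n_samples
                    - ((sum_total - sum_left) / n_right) ** 2 * n_right / n_samples)
        if impurity < best[0]:
            best = (impurity, i + 1)
    return best  # (impurity, split position)

print("buggy:", best_split(False))
print("fixed:", best_split(True))

The two calls report different best split positions, which is exactly the failure mode this commit fixes. Note that the simtree-copy/ files below still carry the pre-fix ordering; their __init__.py records __version__ = '0.2.2'.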

File tree: 12 files changed, +2336 −29 lines


examples/demo.ipynb

Lines changed: 20 additions & 24 deletions
Large diffs are not rendered by default.

setup.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from setuptools import setup

 setup(name='simtree',
-      version='0.2.2',
+      version='0.2.3',
       description='Single-index model tree',
       url='https://github.com/ZebinYang/SIMTree',
       author='Zebin Yang',

simtree-copy/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
from .cart import CARTRegressor, CARTClassifier
from .glmtree import GLMTreeRegressor, GLMTreeClassifier
from .simtree import SIMTreeRegressor, SIMTreeClassifier
from .customtree import CustomMobTreeRegressor, CustomMobTreeClassifier

__all__ = ["CARTRegressor", "CARTClassifier",
           "GLMTreeRegressor", "GLMTreeClassifier",
           "SIMTreeRegressor", "SIMTreeClassifier",
           "CustomMobTreeRegressor", "CustomMobTreeClassifier"]

__version__ = '0.2.2'
__author__ = 'Zebin Yang'

simtree-copy/cart.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
import numpy as np
from sklearn.base import RegressorMixin, ClassifierMixin
from .mobtree import MoBTreeRegressor, MoBTreeClassifier


__all__ = ["CARTRegressor", "CARTClassifier"]


class CARTRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0,
                 split_features=None, feature_names=None, random_state=0):

        super(CARTRegressor, self).__init__(max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            min_impurity_decrease=min_impurity_decrease,
                                            split_features=split_features,
                                            feature_names=feature_names,
                                            random_state=random_state)

    def build_root(self):

        root_impurity = self.y.var()
        return root_impurity

    def build_leaf(self, sample_indice):

        best_estimator = None
        predict_func = lambda x: np.mean(self.y[sample_indice])
        best_impurity = self.y[sample_indice].var()
        return predict_func, best_estimator, best_impurity

    def node_split(self, sample_indice):

        node_x = self.x[sample_indice]
        node_y = self.y[sample_indice]
        n_samples, n_features = node_x.shape

        best_impurity = np.inf
        best_feature = None
        best_threshold = None
        best_left_indice = None
        best_right_indice = None
        for feature_indice in self.split_features:

            current_feature = node_x[:, feature_indice]
            sortted_indice = np.argsort(current_feature)
            sortted_feature = current_feature[sortted_indice]
            feature_range = sortted_feature[-1] - sortted_feature[0]
            if feature_range < self.EPSILON:
                continue

            sum_left = 0
            sum_total = np.sum(node_y)
            sq_sum_total = np.sum(node_y ** 2)
            for i, _ in enumerate(sortted_indice):

                # skip positions that would leave fewer than min_samples_leaf samples on a side;
                # in this 0.2.2 snapshot sum_left is updated only after this check, which is
                # the ordering bug described in the commit message
                if ((i + 1) < self.min_samples_leaf) or ((n_samples - i - 1) < self.min_samples_leaf):
                    continue

                n_left = i + 1
                n_right = n_samples - i - 1
                sum_left += node_y[sortted_indice[i]]
                # size-weighted within-child variance, computed incrementally
                current_impurity = (sq_sum_total / n_samples - (sum_left / n_left) ** 2 * n_left / n_samples -
                                    ((sum_total - sum_left) / n_right) ** 2 * n_right / n_samples)

                if current_impurity < best_impurity:
                    best_position = i + 1
                    best_feature = feature_indice
                    best_impurity = current_impurity
                    best_threshold = (sortted_feature[i] + sortted_feature[i + 1]) / 2

        sortted_indice = np.argsort(node_x[:, best_feature])
        best_left_indice = sample_indice[sortted_indice[:best_position]]
        best_right_indice = sample_indice[sortted_indice[best_position:]]
        best_left_impurity = node_y[sortted_indice[:best_position]].var()
        best_right_impurity = node_y[sortted_indice[best_position:]].var()
        node = {"feature": best_feature, "threshold": best_threshold, "left": best_left_indice, "right": best_right_indice,
                "impurity": best_impurity, "left_impurity": best_left_impurity, "right_impurity": best_right_impurity}
        return node


class CARTClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0,
                 split_features=None, feature_names=None, random_state=0):

        super(CARTClassifier, self).__init__(max_depth=max_depth,
                                             min_samples_leaf=min_samples_leaf,
                                             min_impurity_decrease=min_impurity_decrease,
                                             split_features=split_features,
                                             feature_names=feature_names,
                                             random_state=random_state)

    def build_root(self):

        p = self.y.mean()
        root_impurity = - p * np.log2(p) - (1 - p) * np.log2((1 - p)) if (p > 0) and (p < 1) else 0
        return root_impurity

    def build_leaf(self, sample_indice):

        best_estimator = None
        predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
        best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity

    def node_split(self, sample_indice):

        node_x = self.x[sample_indice]
        node_y = self.y[sample_indice]
        n_samples, n_features = node_x.shape

        best_feature = None
        best_position = None
        best_threshold = None
        best_left_indice = None
        best_right_indice = None
        best_impurity = np.inf
        best_left_impurity = np.inf
        best_right_impurity = np.inf
        for feature_indice in self.split_features:

            current_feature = node_x[:, feature_indice]
            sortted_indice = np.argsort(current_feature)
            sortted_feature = current_feature[sortted_indice]
            feature_range = sortted_feature[-1] - sortted_feature[0]
            if feature_range < self.EPSILON:
                continue

            sum_left = 0
            sum_total = np.sum(node_y)
            for i, _ in enumerate(sortted_indice):

                # same ordering issue as the regressor: the sum_left update follows the check
                if ((i + 1) < self.min_samples_leaf) or ((n_samples - i - 1) < self.min_samples_leaf):
                    continue

                n_left = i + 1
                n_right = n_samples - i - 1
                sum_left += node_y[sortted_indice[i]]

                left_impurity = 0
                right_impurity = 0
                pleft = sum_left / n_left
                pright = (sum_total - sum_left) / n_right
                if (pleft > 0) and (pleft < 1):
                    left_impurity = (- pleft * np.log2(pleft) - (1 - pleft) * np.log2((1 - pleft)))
                if (pright > 0) and (pright < 1):
                    right_impurity = (- pright * np.log2(pright) - (1 - pright) * np.log2((1 - pright)))
                # size-weighted entropy of the two children
                current_impurity = (n_left / n_samples * left_impurity + n_right / n_samples * right_impurity)

                if current_impurity < best_impurity:
                    best_position = i + 1
                    best_feature = feature_indice
                    best_impurity = current_impurity
                    best_threshold = (sortted_feature[i] + sortted_feature[i + 1]) / 2

        if best_position is not None:
            sortted_indice = np.argsort(node_x[:, best_feature])
            best_left_indice = sample_indice[sortted_indice[:best_position]]
            best_right_indice = sample_indice[sortted_indice[best_position:]]

            pleft = node_y[sortted_indice[:best_position]].mean()
            pright = node_y[sortted_indice[best_position:]].mean()
            best_left_impurity = - pleft * np.log2(pleft) - (1 - pleft) * np.log2((1 - pleft)) if (pleft > 0) and (pleft < 1) else 0
            best_right_impurity = - pright * np.log2(pright) - (1 - pright) * np.log2((1 - pright)) if (pright > 0) and (pright < 1) else 0
        node = {"feature": best_feature, "threshold": best_threshold, "left": best_left_indice, "right": best_right_indice,
                "impurity": best_impurity, "left_impurity": best_left_impurity, "right_impurity": best_right_impurity}
        return node
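The incremental expression in CARTRegressor.node_split is the standard within-group variance decomposition: (1/n)Σy² − (n_L/n)·μ_L² − (n_R/n)·μ_R² equals the size-weighted sum of the two child variances, which is what lets the best split be scanned in one pass over the sorted feature. A quick numerical check of that identity (plain NumPy, illustrative data):

import numpy as np

rng = np.random.RandomState(0)
y = rng.randn(100)
n = len(y)
split = 40  # left child takes the first 40 samples in sorted order

sum_total = y.sum()
sq_sum_total = (y ** 2).sum()
sum_left = y[:split].sum()
n_left, n_right = split, n - split

# the expression used in node_split
incremental = (sq_sum_total / n - (sum_left / n_left) ** 2 * n_left / n
               - ((sum_total - sum_left) / n_right) ** 2 * n_right / n)

# the quantity it is meant to equal: size-weighted child variances
direct = n_left / n * y[:split].var() + n_right / n * y[split:].var()

assert np.isclose(incremental, direct)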

simtree-copy/customtree.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, mean_squared_error
from sklearn.base import RegressorMixin, ClassifierMixin, is_regressor, is_classifier

from .mobtree import MoBTreeRegressor, MoBTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

__all__ = ["CustomMobTreeRegressor", "CustomMobTreeClassifier"]


class CustomMobTreeRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, base_estimator, param_dict={}, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, random_state=0, **kargs):

        super(CustomMobTreeRegressor, self).__init__(max_depth=max_depth,
                                                     min_samples_leaf=min_samples_leaf,
                                                     min_impurity_decrease=min_impurity_decrease,
                                                     feature_names=feature_names,
                                                     split_features=split_features,
                                                     n_screen_grid=n_screen_grid,
                                                     n_feature_search=n_feature_search,
                                                     n_split_grid=n_split_grid,
                                                     random_state=random_state)
        self.param_dict = param_dict
        self.base_estimator = base_estimator
        if "random_state" in self.base_estimator.get_params().keys():
            self.base_estimator.set_params(**{"random_state": self.random_state})
        self.base_estimator.set_params(**kargs)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
                            scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
                            cv=5, refit="mse", n_jobs=1, error_score=np.nan)
        grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
        best_estimator = grid.best_estimator_
        predict_func = lambda x: best_estimator.predict(x)
        best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity


class CustomMobTreeClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, base_estimator, param_dict={}, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, random_state=0, **kargs):

        super(CustomMobTreeClassifier, self).__init__(max_depth=max_depth,
                                                      min_samples_leaf=min_samples_leaf,
                                                      min_impurity_decrease=min_impurity_decrease,
                                                      feature_names=feature_names,
                                                      split_features=split_features,
                                                      n_screen_grid=n_screen_grid,
                                                      n_feature_search=n_feature_search,
                                                      n_split_grid=n_split_grid,
                                                      random_state=random_state)
        self.param_dict = param_dict
        self.base_estimator = base_estimator
        if "random_state" in self.base_estimator.get_params().keys():
            self.base_estimator.set_params(**{"random_state": self.random_state})
        self.base_estimator.set_params(**kargs)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        # fall back to a constant leaf when the leaf is pure or a class has fewer than 5 samples
        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
            best_estimator = None
            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        else:
            grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
                                scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
                                cv=5, refit="auc", n_jobs=1, error_score=np.nan)
            grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
            best_estimator = grid.best_estimator_
            predict_func = lambda x: best_estimator.decision_function(x)
            best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
        return predict_func, best_estimator, best_impurity
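Within each leaf, build_leaf tunes the user-supplied base_estimator with 5-fold GridSearchCV over param_dict (MSE for regression, ROC AUC for classification). A hypothetical usage sketch; it assumes the MoBTree base classes expose a scikit-learn-style fit/predict, which this diff does not show:

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from simtree import CustomMobTreeRegressor

rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, (500, 5))
y = np.where(x[:, 0] > 0, x[:, 1], -x[:, 1]) + 0.1 * rng.randn(500)

model = CustomMobTreeRegressor(base_estimator=DecisionTreeRegressor(),
                               param_dict={"max_depth": [1, 2, 3]},  # searched per leaf
                               max_depth=2, min_samples_leaf=50)
model.fit(x, y)  # y may need reshaping to a column vector, depending on the MoBTree API
pred = model.predict(x)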

simtree-copy/glmtree.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.base import RegressorMixin, ClassifierMixin

from .mobtree import MoBTreeRegressor, MoBTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


__all__ = ["GLMTreeRegressor", "GLMTreeClassifier"]


class GLMTreeRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, reg_lambda=0, random_state=0):

        super(GLMTreeRegressor, self).__init__(max_depth=max_depth,
                                               min_samples_leaf=min_samples_leaf,
                                               min_impurity_decrease=min_impurity_decrease,
                                               feature_names=feature_names,
                                               split_features=split_features,
                                               n_screen_grid=n_screen_grid,
                                               n_feature_search=n_feature_search,
                                               n_split_grid=n_split_grid,
                                               random_state=random_state)
        self.reg_lambda = reg_lambda
        self.base_estimator = LinearRegression()

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        mx = self.x[sample_indice].mean(0)
        sx = self.x[sample_indice].std(0) + self.EPSILON
        nx = (self.x[sample_indice] - mx) / sx

        best_estimator = LassoCV(alphas=self.reg_lambda, cv=5, n_jobs=1, random_state=self.random_state)
        best_estimator.fit(nx, self.y[sample_indice])
        best_estimator.coef_ = best_estimator.coef_ / sx
        best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
        xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_) + best_estimator.intercept_)
        xmax = np.max(np.dot(self.x[sample_indice], best_estimator.coef_) + best_estimator.intercept_)
        predict_func = lambda x: np.clip(best_estimator.predict(x), xmin, xmax)
        best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity


class GLMTreeClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, reg_lambda=0, random_state=0):

        super(GLMTreeClassifier, self).__init__(max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf,
                                                min_impurity_decrease=min_impurity_decrease,
                                                feature_names=feature_names,
                                                split_features=split_features,
                                                n_screen_grid=n_screen_grid,
                                                n_feature_search=n_feature_search,
                                                n_split_grid=n_split_grid,
                                                random_state=random_state)
        self.reg_lambda = reg_lambda
        self.base_estimator = LogisticRegression(penalty='none', random_state=self.random_state)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
            best_estimator = None
            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        else:
            best_estimator = LogisticRegressionCV(Cs=self.reg_lambda, penalty="l1", solver="liblinear", scoring="roc_auc",
                                                  cv=5, n_jobs=1, random_state=self.random_state)
            mx = self.x[sample_indice].mean(0)
            sx = self.x[sample_indice].std(0) + self.EPSILON
            nx = (self.x[sample_indice] - mx) / sx
            best_estimator.fit(nx, self.y[sample_indice])
            best_estimator.coef_ = best_estimator.coef_ / sx
            best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
            xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
            xmax = np.max(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
            predict_func = lambda x: np.clip(np.dot(x, best_estimator.coef_.ravel()), xmin, xmax) + best_estimator.intercept_
            best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
        return predict_func, best_estimator, best_impurity
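Both leaf fitters standardize the features, fit, and then fold the scaling back into coef_ and intercept_, so the stored model applies directly to raw inputs. A quick check of that identity with plain scikit-learn (illustrative data, 1e-8 standing in for self.EPSILON):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
x = rng.randn(200, 4) * np.array([1.0, 5.0, 0.1, 2.0]) + np.array([0.0, 3.0, -1.0, 10.0])
y = x @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(200)

mx, sx = x.mean(0), x.std(0) + 1e-8
est = LinearRegression().fit((x - mx) / sx, y)
pred_standardized = est.predict((x - mx) / sx)  # predictions before rescaling

# fold the standardization back into the parameters, as glmtree.py does
est.coef_ = est.coef_ / sx
est.intercept_ = est.intercept_ - np.dot(mx, est.coef_.T)

# the rescaled model reproduces the same predictions on the raw scale
assert np.allclose(pred_standardized, est.predict(x))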
