
Commit a9b40cd

Author: zebinyang

fix a bug in CART Regressor and Classifier (sum_left update should be executed before checking min_samples_leaf); version 0.2.3

1 parent e723256 · commit a9b40cd
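The commit message pins down the bug: inside node_split, sum_left was accumulated only after the min_samples_leaf check passed, so positions skipped by the check never contributed their y value and every later impurity computation used a stale running sum. A self-contained sketch of the two orderings (illustrative data and min_samples_leaf; identifiers mirror node_split in cart.py below):

import numpy as np

node_y = np.array([5.0, 4.0, 1.0, 1.2, 0.8, 1.1])  # targets in sorted-feature order (hypothetical)
n_samples = len(node_y)
min_samples_leaf = 2

def best_split(update_before_check):
    sum_left, best = 0.0, (np.inf, None)
    sum_total = np.sum(node_y)
    sq_sum_total = np.sum(node_y ** 2)
    for i in range(n_samples):
        if update_before_check:
            sum_left += node_y[i]  # fixed ordering: accumulate at every position
        if ((i + 1) < min_samples_leaf) or ((n_samples - i - 1) < min_samples_leaf):
            continue
        if not update_before_check:
            sum_left += node_y[i]  # buggy ordering: skipped positions are lost for good
        n_left, n_right = i + 1, n_samples - i - 1
        impurity = (sq_sum_total / n_samples
                    - (sum_left / n_left) ** 2 * n_left / n_samples
                    - ((sum_total - sum_left) / n_right) ** 2 * n_right / n_samples)
        if impurity < best[0]:
            best = (impurity, i + 1)
    return best  # (impurity, split position)

print("buggy:", best_split(False))
print("fixed:", best_split(True))

The two calls report different best split positions, which is exactly the failure mode this commit fixes. Note that the simtree-copy/ files below still carry the pre-fix ordering; their __init__.py records __version__ = '0.2.2'.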

File tree: 12 files changed, +2336 −29 lines


examples/demo.ipynb

Lines changed: 20 additions & 24 deletions
Large diffs are not rendered by default.

setup.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from setuptools import setup

 setup(name='simtree',
-      version='0.2.2',
+      version='0.2.3',
       description='Single-index model tree',
       url='https://github.com/ZebinYang/SIMTree',
       author='Zebin Yang',

simtree-copy/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
from .cart import CARTRegressor, CARTClassifier
from .glmtree import GLMTreeRegressor, GLMTreeClassifier
from .simtree import SIMTreeRegressor, SIMTreeClassifier
from .customtree import CustomMobTreeRegressor, CustomMobTreeClassifier

__all__ = ["CARTRegressor", "CARTClassifier",
           "GLMTreeRegressor", "GLMTreeClassifier",
           "SIMTreeRegressor", "SIMTreeClassifier",
           "CustomMobTreeRegressor", "CustomMobTreeClassifier"]

__version__ = '0.2.2'
__author__ = 'Zebin Yang'

simtree-copy/cart.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
import numpy as np
from sklearn.base import RegressorMixin, ClassifierMixin
from .mobtree import MoBTreeRegressor, MoBTreeClassifier


__all__ = ["CARTRegressor", "CARTClassifier"]


class CARTRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0,
                 split_features=None, feature_names=None, random_state=0):

        super(CARTRegressor, self).__init__(max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            min_impurity_decrease=min_impurity_decrease,
                                            split_features=split_features,
                                            feature_names=feature_names,
                                            random_state=random_state)

    def build_root(self):

        root_impurity = self.y.var()
        return root_impurity

    def build_leaf(self, sample_indice):

        best_estimator = None
        predict_func = lambda x: np.mean(self.y[sample_indice])
        best_impurity = self.y[sample_indice].var()
        return predict_func, best_estimator, best_impurity

    def node_split(self, sample_indice):

        node_x = self.x[sample_indice]
        node_y = self.y[sample_indice]
        n_samples, n_features = node_x.shape

        best_impurity = np.inf
        best_feature = None
        best_threshold = None
        best_left_indice = None
        best_right_indice = None
        for feature_indice in self.split_features:

            current_feature = node_x[:, feature_indice]
            sortted_indice = np.argsort(current_feature)
            sortted_feature = current_feature[sortted_indice]
            feature_range = sortted_feature[-1] - sortted_feature[0]
            if feature_range < self.EPSILON:
                continue

            sum_left = 0
            sum_total = np.sum(node_y)
            sq_sum_total = np.sum(node_y ** 2)
            for i, _ in enumerate(sortted_indice):

                # skip positions that would leave fewer than min_samples_leaf samples on a side;
                # in this 0.2.2 snapshot sum_left is updated only after this check, which is
                # the ordering bug described in the commit message
                if ((i + 1) < self.min_samples_leaf) or ((n_samples - i - 1) < self.min_samples_leaf):
                    continue

                n_left = i + 1
                n_right = n_samples - i - 1
                sum_left += node_y[sortted_indice[i]]
                # size-weighted within-child variance, computed incrementally
                current_impurity = (sq_sum_total / n_samples - (sum_left / n_left) ** 2 * n_left / n_samples -
                                    ((sum_total - sum_left) / n_right) ** 2 * n_right / n_samples)

                if current_impurity < best_impurity:
                    best_position = i + 1
                    best_feature = feature_indice
                    best_impurity = current_impurity
                    best_threshold = (sortted_feature[i] + sortted_feature[i + 1]) / 2

        sortted_indice = np.argsort(node_x[:, best_feature])
        best_left_indice = sample_indice[sortted_indice[:best_position]]
        best_right_indice = sample_indice[sortted_indice[best_position:]]
        best_left_impurity = node_y[sortted_indice[:best_position]].var()
        best_right_impurity = node_y[sortted_indice[best_position:]].var()
        node = {"feature": best_feature, "threshold": best_threshold, "left": best_left_indice, "right": best_right_indice,
                "impurity": best_impurity, "left_impurity": best_left_impurity, "right_impurity": best_right_impurity}
        return node


class CARTClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0,
                 split_features=None, feature_names=None, random_state=0):

        super(CARTClassifier, self).__init__(max_depth=max_depth,
                                             min_samples_leaf=min_samples_leaf,
                                             min_impurity_decrease=min_impurity_decrease,
                                             split_features=split_features,
                                             feature_names=feature_names,
                                             random_state=random_state)

    def build_root(self):

        p = self.y.mean()
        root_impurity = - p * np.log2(p) - (1 - p) * np.log2((1 - p)) if (p > 0) and (p < 1) else 0
        return root_impurity

    def build_leaf(self, sample_indice):

        best_estimator = None
        predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
        best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity

    def node_split(self, sample_indice):

        node_x = self.x[sample_indice]
        node_y = self.y[sample_indice]
        n_samples, n_features = node_x.shape

        best_feature = None
        best_position = None
        best_threshold = None
        best_left_indice = None
        best_right_indice = None
        best_impurity = np.inf
        best_left_impurity = np.inf
        best_right_impurity = np.inf
        for feature_indice in self.split_features:

            current_feature = node_x[:, feature_indice]
            sortted_indice = np.argsort(current_feature)
            sortted_feature = current_feature[sortted_indice]
            feature_range = sortted_feature[-1] - sortted_feature[0]
            if feature_range < self.EPSILON:
                continue

            sum_left = 0
            sum_total = np.sum(node_y)
            for i, _ in enumerate(sortted_indice):

                # same ordering issue as the regressor: the sum_left update follows the check
                if ((i + 1) < self.min_samples_leaf) or ((n_samples - i - 1) < self.min_samples_leaf):
                    continue

                n_left = i + 1
                n_right = n_samples - i - 1
                sum_left += node_y[sortted_indice[i]]

                left_impurity = 0
                right_impurity = 0
                pleft = sum_left / n_left
                pright = (sum_total - sum_left) / n_right
                if (pleft > 0) and (pleft < 1):
                    left_impurity = (- pleft * np.log2(pleft) - (1 - pleft) * np.log2((1 - pleft)))
                if (pright > 0) and (pright < 1):
                    right_impurity = (- pright * np.log2(pright) - (1 - pright) * np.log2((1 - pright)))
                # size-weighted entropy of the two children
                current_impurity = (n_left / n_samples * left_impurity + n_right / n_samples * right_impurity)

                if current_impurity < best_impurity:
                    best_position = i + 1
                    best_feature = feature_indice
                    best_impurity = current_impurity
                    best_threshold = (sortted_feature[i] + sortted_feature[i + 1]) / 2

        if best_position is not None:
            sortted_indice = np.argsort(node_x[:, best_feature])
            best_left_indice = sample_indice[sortted_indice[:best_position]]
            best_right_indice = sample_indice[sortted_indice[best_position:]]

            pleft = node_y[sortted_indice[:best_position]].mean()
            pright = node_y[sortted_indice[best_position:]].mean()
            best_left_impurity = - pleft * np.log2(pleft) - (1 - pleft) * np.log2((1 - pleft)) if (pleft > 0) and (pleft < 1) else 0
            best_right_impurity = - pright * np.log2(pright) - (1 - pright) * np.log2((1 - pright)) if (pright > 0) and (pright < 1) else 0
        node = {"feature": best_feature, "threshold": best_threshold, "left": best_left_indice, "right": best_right_indice,
                "impurity": best_impurity, "left_impurity": best_left_impurity, "right_impurity": best_right_impurity}
        return node
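The incremental expression in CARTRegressor.node_split is the standard within-group variance decomposition: (1/n)Σy² − (n_L/n)·μ_L² − (n_R/n)·μ_R² equals the size-weighted sum of the two child variances, which is what lets the best split be scanned in one pass over the sorted feature. A quick numerical check of that identity (plain NumPy, illustrative data):

import numpy as np

rng = np.random.RandomState(0)
y = rng.randn(100)
n = len(y)
split = 40  # left child takes the first 40 samples in sorted order

sum_total = y.sum()
sq_sum_total = (y ** 2).sum()
sum_left = y[:split].sum()
n_left, n_right = split, n - split

# the expression used in node_split
incremental = (sq_sum_total / n - (sum_left / n_left) ** 2 * n_left / n
               - ((sum_total - sum_left) / n_right) ** 2 * n_right / n)

# the quantity it is meant to equal: size-weighted child variances
direct = n_left / n * y[:split].var() + n_right / n * y[split:].var()

assert np.isclose(incremental, direct)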

simtree-copy/customtree.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, mean_squared_error
from sklearn.base import RegressorMixin, ClassifierMixin, is_regressor, is_classifier

from .mobtree import MoBTreeRegressor, MoBTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

__all__ = ["CustomMobTreeRegressor", "CustomMobTreeClassifier"]


class CustomMobTreeRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, base_estimator, param_dict={}, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, random_state=0, **kargs):

        super(CustomMobTreeRegressor, self).__init__(max_depth=max_depth,
                                                     min_samples_leaf=min_samples_leaf,
                                                     min_impurity_decrease=min_impurity_decrease,
                                                     feature_names=feature_names,
                                                     split_features=split_features,
                                                     n_screen_grid=n_screen_grid,
                                                     n_feature_search=n_feature_search,
                                                     n_split_grid=n_split_grid,
                                                     random_state=random_state)
        self.param_dict = param_dict
        self.base_estimator = base_estimator
        if "random_state" in self.base_estimator.get_params().keys():
            self.base_estimator.set_params(**{"random_state": self.random_state})
        self.base_estimator.set_params(**kargs)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
                            scoring={"mse": make_scorer(mean_squared_error, greater_is_better=False)},
                            cv=5, refit="mse", n_jobs=1, error_score=np.nan)
        grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
        best_estimator = grid.best_estimator_
        predict_func = lambda x: best_estimator.predict(x)
        best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity


class CustomMobTreeClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, base_estimator, param_dict={}, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, random_state=0, **kargs):

        super(CustomMobTreeClassifier, self).__init__(max_depth=max_depth,
                                                      min_samples_leaf=min_samples_leaf,
                                                      min_impurity_decrease=min_impurity_decrease,
                                                      feature_names=feature_names,
                                                      split_features=split_features,
                                                      n_screen_grid=n_screen_grid,
                                                      n_feature_search=n_feature_search,
                                                      n_split_grid=n_split_grid,
                                                      random_state=random_state)
        self.param_dict = param_dict
        self.base_estimator = base_estimator
        if "random_state" in self.base_estimator.get_params().keys():
            self.base_estimator.set_params(**{"random_state": self.random_state})
        self.base_estimator.set_params(**kargs)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        # fall back to a constant leaf when the leaf is pure or a class has fewer than 5 samples
        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
            best_estimator = None
            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        else:
            grid = GridSearchCV(self.base_estimator, param_grid=self.param_dict,
                                scoring={"auc": make_scorer(roc_auc_score, needs_proba=True)},
                                cv=5, refit="auc", n_jobs=1, error_score=np.nan)
            grid.fit(self.x[sample_indice], self.y[sample_indice].ravel())
            best_estimator = grid.best_estimator_
            predict_func = lambda x: best_estimator.decision_function(x)
            best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
        return predict_func, best_estimator, best_impurity
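Within each leaf, build_leaf tunes the user-supplied base_estimator with 5-fold GridSearchCV over param_dict (MSE for regression, ROC AUC for classification). A hypothetical usage sketch; it assumes the MoBTree base classes expose a scikit-learn-style fit/predict, which this diff does not show:

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from simtree import CustomMobTreeRegressor

rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, (500, 5))
y = np.where(x[:, 0] > 0, x[:, 1], -x[:, 1]) + 0.1 * rng.randn(500)

model = CustomMobTreeRegressor(base_estimator=DecisionTreeRegressor(),
                               param_dict={"max_depth": [1, 2, 3]},  # searched per leaf
                               max_depth=2, min_samples_leaf=50)
model.fit(x, y)  # y may need reshaping to a column vector, depending on the MoBTree API
pred = model.predict(x)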

simtree-copy/glmtree.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.base import RegressorMixin, ClassifierMixin

from .mobtree import MoBTreeRegressor, MoBTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


__all__ = ["GLMTreeRegressor", "GLMTreeClassifier"]


class GLMTreeRegressor(MoBTreeRegressor, RegressorMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, reg_lambda=0, random_state=0):

        super(GLMTreeRegressor, self).__init__(max_depth=max_depth,
                                               min_samples_leaf=min_samples_leaf,
                                               min_impurity_decrease=min_impurity_decrease,
                                               feature_names=feature_names,
                                               split_features=split_features,
                                               n_screen_grid=n_screen_grid,
                                               n_feature_search=n_feature_search,
                                               n_split_grid=n_split_grid,
                                               random_state=random_state)
        self.reg_lambda = reg_lambda
        self.base_estimator = LinearRegression()

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        mx = self.x[sample_indice].mean(0)
        sx = self.x[sample_indice].std(0) + self.EPSILON
        nx = (self.x[sample_indice] - mx) / sx

        best_estimator = LassoCV(alphas=self.reg_lambda, cv=5, n_jobs=1, random_state=self.random_state)
        best_estimator.fit(nx, self.y[sample_indice])
        best_estimator.coef_ = best_estimator.coef_ / sx
        best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
        xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_) + best_estimator.intercept_)
        xmax = np.max(np.dot(self.x[sample_indice], best_estimator.coef_) + best_estimator.intercept_)
        predict_func = lambda x: np.clip(best_estimator.predict(x), xmin, xmax)
        best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict(self.x[sample_indice]))
        return predict_func, best_estimator, best_impurity


class GLMTreeClassifier(MoBTreeClassifier, ClassifierMixin):

    def __init__(self, max_depth=3, min_samples_leaf=50, min_impurity_decrease=0, feature_names=None,
                 split_features=None, n_screen_grid=1, n_feature_search=10, n_split_grid=20, reg_lambda=0, random_state=0):

        super(GLMTreeClassifier, self).__init__(max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf,
                                                min_impurity_decrease=min_impurity_decrease,
                                                feature_names=feature_names,
                                                split_features=split_features,
                                                n_screen_grid=n_screen_grid,
                                                n_feature_search=n_feature_search,
                                                n_split_grid=n_split_grid,
                                                random_state=random_state)
        self.reg_lambda = reg_lambda
        self.base_estimator = LogisticRegression(penalty='none', random_state=self.random_state)

    def build_root(self):

        self.base_estimator.fit(self.x, self.y)
        root_impurity = self.evaluate_estimator(self.base_estimator, self.x, self.y.ravel())
        return root_impurity

    def build_leaf(self, sample_indice):

        if (self.y[sample_indice].std() == 0) | (self.y[sample_indice].sum() < 5) | ((1 - self.y[sample_indice]).sum() < 5):
            best_estimator = None
            predict_func = lambda x: np.ones(x.shape[0]) * self.y[sample_indice].mean()
            best_impurity = self.get_loss(self.y[sample_indice], predict_func(self.x[sample_indice]))
        else:
            best_estimator = LogisticRegressionCV(Cs=self.reg_lambda, penalty="l1", solver="liblinear", scoring="roc_auc",
                                                  cv=5, n_jobs=1, random_state=self.random_state)
            mx = self.x[sample_indice].mean(0)
            sx = self.x[sample_indice].std(0) + self.EPSILON
            nx = (self.x[sample_indice] - mx) / sx
            best_estimator.fit(nx, self.y[sample_indice])
            best_estimator.coef_ = best_estimator.coef_ / sx
            best_estimator.intercept_ = best_estimator.intercept_ - np.dot(mx, best_estimator.coef_.T)
            xmin = np.min(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
            xmax = np.max(np.dot(self.x[sample_indice], best_estimator.coef_.ravel()))
            predict_func = lambda x: np.clip(np.dot(x, best_estimator.coef_.ravel()), xmin, xmax) + best_estimator.intercept_
            best_impurity = self.get_loss(self.y[sample_indice], best_estimator.predict_proba(self.x[sample_indice])[:, 1])
        return predict_func, best_estimator, best_impurity
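Both leaf fitters standardize the features, fit, and then fold the scaling back into coef_ and intercept_, so the stored model applies directly to raw inputs. A quick check of that identity with plain scikit-learn (illustrative data, 1e-8 standing in for self.EPSILON):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
x = rng.randn(200, 4) * np.array([1.0, 5.0, 0.1, 2.0]) + np.array([0.0, 3.0, -1.0, 10.0])
y = x @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(200)

mx, sx = x.mean(0), x.std(0) + 1e-8
est = LinearRegression().fit((x - mx) / sx, y)
pred_standardized = est.predict((x - mx) / sx)  # predictions before rescaling

# fold the standardization back into the parameters, as glmtree.py does
est.coef_ = est.coef_ / sx
est.intercept_ = est.intercept_ - np.dot(mx, est.coef_.T)

# the rescaled model reproduces the same predictions on the raw scale
assert np.allclose(pred_standardized, est.predict(x))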
