
Commit ea4c5d3

Jean-Matthieu Schertzer committed: clean score_top_rules and examples
1 parent 5955a0f · commit ea4c5d3

File tree: 4 files changed (+97, -29 lines)


examples/plot_credit_default.py

Lines changed: 20 additions & 15 deletions
@@ -92,7 +92,7 @@
 rf = GridSearchCV(
     RandomForestClassifier(
         random_state=rng,
-        n_estimators=30,
+        n_estimators=50,
         class_weight='balanced'),
     param_grid={'max_depth': range(3, 8, 1),
                 'max_features': np.linspace(0.1, 1., 5)},
@@ -149,19 +149,20 @@
 # fit the model

 clf = SkopeRules(
-    similarity_thres=.9, max_depth=3, max_features=0.5,
-    max_samples_features=0.5, random_state=rng, n_estimators=30,
-    feature_names=feature_names, recall_min=0.02, precision_min=0.6)
+    similarity_thres=.8, max_depth=3, max_features=0.5,
+    max_samples_features=0.5, random_state=rng, n_estimators=20,
+    feature_names=feature_names, recall_min=0.04, precision_min=0.6)
 clf.fit(X_train, y_train)

-# in the separate_rules_score method, a score of k means that rule number k
+# in the score_top_rules method, a score of k means that rule number k
 # vote positively, but not rules 1, ..., k-1. It will allow us to plot
-# performance of each rule separately on ROC and PR plots.
-scoring = clf.separate_rules_score(X_test)
+# performance of each rule separately on the ROC and PR plots.
+scoring = clf.score_top_rules(X_test)

 print(str(len(clf.rules_)) + ' rules have been built.')
-print('The most precise rules are the following:')
-print(clf.rules_[:5])
+print('The 5 most precise rules are the following:')
+for rule in clf.rules_[:5]:
+    print(rule[0])

 curves = [roc_curve, precision_recall_curve]
 xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
@@ -174,8 +175,9 @@
 ax = axes[0]
 fpr, tpr, _ = roc_curve(y_test, scoring)
 fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
-ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10)
-ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
+ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10, label="rules of SkopeRules")
+ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post',
+        label="Random Forest")
 ax.set_title("ROC", fontsize=20)
 ax.legend(loc='upper center', fontsize=8)
 ax.set_xlabel('False Positive Rate', fontsize=18)
@@ -184,8 +186,10 @@
 ax = axes[1]
 precision, recall, _ = precision_recall_curve(y_test, scoring)
 precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf)
-ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10)
-ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
+ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10,
+           label="rules of SkopeRules")
+ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post',
+        label="Random Forest")
 ax.set_title("Precision-Recall", fontsize=20)
 ax.set_xlabel('Recall (True Positive Rate)', fontsize=18)
 ax.set_ylabel('Precision', fontsize=18)
@@ -195,10 +199,11 @@
 # The ROC and Precision-Recall curves show the performance of the rules
 # generated by SkopeRules the (the blue points) and the performance of the
 # Random Forest classifier fitted above.
-# Each blue point represents the performance of a set of rules: The kth point
+# Each blue point represents the performance of a set of rules: Starting from
+# the left on the precision-recall cruve, the kth point
 # represents the score associated to the concatenation (union) of the k first
 # rules, etc. Thus, each blue point is associated with an interpretable
-# classifier.
+# classifier, which is a combination of a few rules.
 # In terms of performance, each of these interpretable classifiers compare well
 # with Random Forest, while offering complete interpretation.
 # The range of recall and precision can be controlled by the precision_min and
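
The comments above describe each blue point as the union of the k most precise rules. Below is a minimal sketch of how those points can be recomputed directly from score_top_rules, assuming the fitted clf, X_test and y_test of this example; the explicit loop is only illustrative and is not part of the commit.

from sklearn.metrics import precision_score, recall_score

scoring = clf.score_top_rules(X_test)
n_rules = len(clf.rules_)

for k in range(1, min(5, n_rules) + 1):
    # Union of the k most precise rules: score strictly above n_rules - k,
    # which is what predict_top_rules(X_test, k) would also return.
    y_pred_k = (scoring > n_rules - k).astype(int)
    print('union of top-%d rules: precision=%.3f, recall=%.3f'
          % (k, precision_score(y_test, y_pred_k),
             recall_score(y_test, y_pred_k)))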

examples/plot_skope_rules.py

Lines changed: 38 additions & 1 deletion
@@ -33,6 +33,11 @@
 X_train = np.r_[X_inliers, X_outliers]
 y_train = [0] * n_inliers + [1] * n_outliers

+
+###############################################################################
+# Training the SkopeRules classifier
+# ..................................
+
 # fit the model
 clf = SkopeRules(random_state=rng, n_estimators=10)
 clf.fit(X_train, y_train)
@@ -42,7 +47,7 @@
 Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)

-plt.title("Skope Rules")
+plt.title("Skope Rules, value of the decision_function method")
 plt.contourf(xx, yy, Z, cmap=plt.cm.Blues)

 a = plt.scatter(X_inliers[:, 0], X_inliers[:, 1], c='white',
@@ -56,3 +61,35 @@
            ["inliers", "outliers"],
            loc="upper left")
 plt.show()
+
+###############################################################################
+# Extracting top rules
+# ....................
+#
+# On the 4 following figures, the predict_top_rules method is used with
+# several values of n_rules. n_rules = 2 means that the prediction is
+# done using only the 2 best rules.
+
+print('The 4 most precise rules are the following:')
+for rule in clf.rules_[:4]:
+    print(rule[0])
+
+fig, axes = plt.subplots(2, 2, figsize=(12, 5),
+                         sharex=True, sharey=True)
+for i_ax, ax in enumerate(np.ravel(axes)):
+    Z = clf.predict_top_rules(np.c_[xx.ravel(), yy.ravel()], i_ax+1)
+    Z = Z.reshape(xx.shape)
+    ax.set_title("Prediction with predict_top_rules, n_rules="+str(i_ax+1))
+    ax.contourf(xx, yy, Z, cmap=plt.cm.Blues)
+
+    a = ax.scatter(X_inliers[:, 0], X_inliers[:, 1], c='white',
+                   s=20, edgecolor='k')
+    b = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
+                   s=20, edgecolor='k')
+    ax.axis('tight')
+plt.xlim((-5, 5))
+plt.ylim((-5, 5))
+plt.legend([a, b],
+           ["inliers", "outliers"],
+           loc="upper left")
+plt.show()
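
The new section prints rule[0] for each of the top rules. Each entry of clf.rules_ is a pair whose first element is the rule itself, a string of feature comparisons that pandas.DataFrame.query can evaluate, which is how decision_function and rules_vote apply rules in this commit. A short sketch, assuming the fitted clf and the X_train of this example; the manual query call is only for illustration.

import pandas as pd

# Each entry of clf.rules_ is (rule_string, ...); the rule string is a
# conjunction of comparisons over clf.feature_names_.
best_rule = clf.rules_[0][0]
print('most precise rule:', best_rule)

# Applying the rule by hand to the training data of this example:
df = pd.DataFrame(X_train, columns=clf.feature_names_)
print('training samples matched by this rule:', len(df.query(best_rule)))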

skrules/skope_rules.py

Lines changed: 33 additions & 8 deletions
@@ -138,7 +138,7 @@ def __init__(self,
                  precision_min=0.5,
                  recall_min=0.01,
                  n_estimators=10,
-                 similarity_thres=0.99,
+                 similarity_thres=0.95,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
@@ -438,8 +438,8 @@ def decision_function(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (r, w) in selected_rules:
@@ -479,20 +479,19 @@ def rules_vote(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (r, _) in selected_rules:
             scores[list(df.query(r).index)] += 1

         return scores

-    def separate_rules_score(self, X):
+    def score_top_rules(self, X):
         """Score representing an ordering between the base classifiers (rules).

-        The score of an input sample is computed as the number of the more
-        precise rule voting positively.
+        The score is high when the instance is detected by a performing rule.
         If there are n rules, ordered by increasing OOB precision, a score of k
         means than the kth rule has voted positively, but not the (k-1) first
         rules.
@@ -522,16 +521,42 @@ def separate_rules_score(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (k, r) in enumerate(list((selected_rules))):
             scores[list(df.query(r[0]).index)] = np.maximum(
-                k + 1, scores[list(df.query(r[0]).index)])
+                len(selected_rules) - k,
+                scores[list(df.query(r[0]).index)])

         return scores

+    def predict_top_rules(self, X, n_rules):
+        """Predict if a particular sample is an outlier or not,
+        using the n_rules most performing rules.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32``
+
+        n_rules : int
+            The number of rules used for the prediction. If one of the
+            n_rules most performing rules is activated, the prediction
+            is equal to 1.
+
+        Returns
+        -------
+        is_outlier : array, shape (n_samples,)
+            For each observations, tells whether or not (1 or 0) it should
+            be considered as an outlier according to the selected rules.
+        """
+
+        return np.array((self.score_top_rules(X) > len(self.rules_) - n_rules),
+                        dtype=int)
+
     def _tree_to_rules(self, tree, feature_names):
         """
         Return a list of rules from a tree
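
Taken together, the modified score_top_rules and the new predict_top_rules define one convention: with n = len(rules_) rules sorted with the most precise first (as the examples' use of rules_[:5] suggests), a sample whose best activated rule has 0-based index k receives a score of n - k, and predict_top_rules(X, n_rules) flags the samples whose score is strictly greater than n - n_rules. A small consistency check, assuming a fitted clf with at least two rules and some test matrix X_test; this check is illustrative and is not part of the commit's test suite.

import numpy as np

scores = clf.score_top_rules(X_test)   # values in {0, 1, ..., n}
n = len(clf.rules_)

# A sample activating the most precise rule gets score n, the second most
# precise rule gives n - 1, and so on; 0 means no rule fired.
# predict_top_rules(X, n_rules) keeps samples whose score is strictly
# above n - n_rules, i.e. at least one of the n_rules best rules fired.
pred_top2 = clf.predict_top_rules(X_test, 2)
np.testing.assert_array_equal(pred_top2, (scores > n - 2).astype(int))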

skrules/tests/test_skope_rules.py

Lines changed: 6 additions & 5 deletions
@@ -100,7 +100,7 @@ def test_skope_rules_error():
     assert_raises(ValueError, SkopeRules().fit(X, y).decision_function,
                   X[:, 1:])
     assert_raises(ValueError, SkopeRules().fit(X, y).rules_vote, X[:, 1:])
-    assert_raises(ValueError, SkopeRules().fit(X, y).separate_rules_score,
+    assert_raises(ValueError, SkopeRules().fit(X, y).score_top_rules,
                   X[:, 1:])

@@ -133,15 +133,16 @@ def test_skope_rules_works():
     clf.fit(X, y)
     decision_func = clf.decision_function(X_test)
     rules_vote = clf.rules_vote(X_test)
-    separate_rules_score = clf.separate_rules_score(X_test)
+    score_top_rules = clf.score_top_rules(X_test)
     pred = clf.predict(X_test)
+    pred_score_top_rules = clf.predict_top_rules(X_test,1)
     # assert detect outliers:
     assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
     assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
-    assert_greater(np.min(separate_rules_score[-2:]),
-                   np.max(separate_rules_score[:-2]))
+    assert_greater(np.min(score_top_rules[-2:]),
+                   np.max(score_top_rules[:-2]))
     assert_array_equal(pred, 6 * [0] + 2 * [1])
-
+    assert_array_equal(pred_score_top_rules, 6 * [0] + 2 * [1])

 def test_performances():
     X, y = make_blobs(n_samples=1000, random_state=0, centers=2)
