
Commit ea4c5d3

Jean-Matthieu Schertzer committed: clean score_top_rules and examples
1 parent 5955a0f · commit ea4c5d3

File tree: 4 files changed (+97, -29 lines)


examples/plot_credit_default.py

Lines changed: 20 additions & 15 deletions
@@ -92,7 +92,7 @@
 rf = GridSearchCV(
     RandomForestClassifier(
         random_state=rng,
-        n_estimators=30,
+        n_estimators=50,
         class_weight='balanced'),
     param_grid={'max_depth': range(3, 8, 1),
                 'max_features': np.linspace(0.1, 1., 5)},
@@ -149,19 +149,20 @@
 # fit the model

 clf = SkopeRules(
-    similarity_thres=.9, max_depth=3, max_features=0.5,
-    max_samples_features=0.5, random_state=rng, n_estimators=30,
-    feature_names=feature_names, recall_min=0.02, precision_min=0.6)
+    similarity_thres=.8, max_depth=3, max_features=0.5,
+    max_samples_features=0.5, random_state=rng, n_estimators=20,
+    feature_names=feature_names, recall_min=0.04, precision_min=0.6)
 clf.fit(X_train, y_train)

-# in the separate_rules_score method, a score of k means that rule number k
+# in the score_top_rules method, a score of k means that rule number k
 # vote positively, but not rules 1, ..., k-1. It will allow us to plot
-# performance of each rule separately on ROC and PR plots.
-scoring = clf.separate_rules_score(X_test)
+# performance of each rule separately on the ROC and PR plots.
+scoring = clf.score_top_rules(X_test)

 print(str(len(clf.rules_)) + ' rules have been built.')
-print('The most precise rules are the following:')
-print(clf.rules_[:5])
+print('The 5 most precise rules are the following:')
+for rule in clf.rules_[:5]:
+    print(rule[0])

 curves = [roc_curve, precision_recall_curve]
 xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
@@ -174,8 +175,9 @@
 ax = axes[0]
 fpr, tpr, _ = roc_curve(y_test, scoring)
 fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
-ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10)
-ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
+ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10, label="rules of SkopeRules")
+ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post',
+        label="Random Forest")
 ax.set_title("ROC", fontsize=20)
 ax.legend(loc='upper center', fontsize=8)
 ax.set_xlabel('False Positive Rate', fontsize=18)
@@ -184,8 +186,10 @@
 ax = axes[1]
 precision, recall, _ = precision_recall_curve(y_test, scoring)
 precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf)
-ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10)
-ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
+ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10,
+           label="rules of SkopeRules")
+ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post',
+        label="Random Forest")
 ax.set_title("Precision-Recall", fontsize=20)
 ax.set_xlabel('Recall (True Positive Rate)', fontsize=18)
 ax.set_ylabel('Precision', fontsize=18)
@@ -195,10 +199,11 @@
 # The ROC and Precision-Recall curves show the performance of the rules
 # generated by SkopeRules the (the blue points) and the performance of the
 # Random Forest classifier fitted above.
-# Each blue point represents the performance of a set of rules: The kth point
+# Each blue point represents the performance of a set of rules: Starting from
+# the left on the precision-recall cruve, the kth point
 # represents the score associated to the concatenation (union) of the k first
 # rules, etc. Thus, each blue point is associated with an interpretable
-# classifier.
+# classifier, which is a combination of a few rules.
 # In terms of performance, each of these interpretable classifiers compare well
 # with Random Forest, while offering complete interpretation.
 # The range of recall and precision can be controlled by the precision_min and
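
The comments above describe each blue point as the union of the k most precise rules. Below is a minimal sketch of how those points can be recomputed directly from score_top_rules, assuming the fitted clf, X_test and y_test of this example; the explicit loop is only illustrative and is not part of the commit.

from sklearn.metrics import precision_score, recall_score

scoring = clf.score_top_rules(X_test)
n_rules = len(clf.rules_)

for k in range(1, min(5, n_rules) + 1):
    # Union of the k most precise rules: score strictly above n_rules - k,
    # which is what predict_top_rules(X_test, k) would also return.
    y_pred_k = (scoring > n_rules - k).astype(int)
    print('union of top-%d rules: precision=%.3f, recall=%.3f'
          % (k, precision_score(y_test, y_pred_k),
             recall_score(y_test, y_pred_k)))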

examples/plot_skope_rules.py

Lines changed: 38 additions & 1 deletion
@@ -33,6 +33,11 @@
 X_train = np.r_[X_inliers, X_outliers]
 y_train = [0] * n_inliers + [1] * n_outliers

+
+###############################################################################
+# Training the SkopeRules classifier
+# ..................................
+
 # fit the model
 clf = SkopeRules(random_state=rng, n_estimators=10)
 clf.fit(X_train, y_train)
@@ -42,7 +47,7 @@
 Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)

-plt.title("Skope Rules")
+plt.title("Skope Rules, value of the decision_function method")
 plt.contourf(xx, yy, Z, cmap=plt.cm.Blues)

 a = plt.scatter(X_inliers[:, 0], X_inliers[:, 1], c='white',
@@ -56,3 +61,35 @@
            ["inliers", "outliers"],
            loc="upper left")
 plt.show()
+
+###############################################################################
+# Extracting top rules
+# ....................
+#
+# On the 4 following figures, the predict_top_rules method is used with
+# several values of n_rules. n_rules = 2 means that the prediction is
+# done using only the 2 best rules.
+
+print('The 4 most precise rules are the following:')
+for rule in clf.rules_[:4]:
+    print(rule[0])
+
+fig, axes = plt.subplots(2, 2, figsize=(12, 5),
+                         sharex=True, sharey=True)
+for i_ax, ax in enumerate(np.ravel(axes)):
+    Z = clf.predict_top_rules(np.c_[xx.ravel(), yy.ravel()], i_ax+1)
+    Z = Z.reshape(xx.shape)
+    ax.set_title("Prediction with predict_top_rules, n_rules="+str(i_ax+1))
+    ax.contourf(xx, yy, Z, cmap=plt.cm.Blues)
+
+    a = ax.scatter(X_inliers[:, 0], X_inliers[:, 1], c='white',
+                   s=20, edgecolor='k')
+    b = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
+                   s=20, edgecolor='k')
+    ax.axis('tight')
+plt.xlim((-5, 5))
+plt.ylim((-5, 5))
+plt.legend([a, b],
+           ["inliers", "outliers"],
+           loc="upper left")
+plt.show()
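
The new section prints rule[0] for each of the top rules. Each entry of clf.rules_ is a pair whose first element is the rule itself, a string of feature comparisons that pandas.DataFrame.query can evaluate, which is how decision_function and rules_vote apply rules in this commit. A short sketch, assuming the fitted clf and the X_train of this example; the manual query call is only for illustration.

import pandas as pd

# Each entry of clf.rules_ is (rule_string, ...); the rule string is a
# conjunction of comparisons over clf.feature_names_.
best_rule = clf.rules_[0][0]
print('most precise rule:', best_rule)

# Applying the rule by hand to the training data of this example:
df = pd.DataFrame(X_train, columns=clf.feature_names_)
print('training samples matched by this rule:', len(df.query(best_rule)))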

skrules/skope_rules.py

Lines changed: 33 additions & 8 deletions
@@ -138,7 +138,7 @@ def __init__(self,
                  precision_min=0.5,
                  recall_min=0.01,
                  n_estimators=10,
-                 similarity_thres=0.99,
+                 similarity_thres=0.95,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
@@ -438,8 +438,8 @@ def decision_function(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (r, w) in selected_rules:
@@ -479,20 +479,19 @@ def rules_vote(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (r, _) in selected_rules:
             scores[list(df.query(r).index)] += 1

         return scores

-    def separate_rules_score(self, X):
+    def score_top_rules(self, X):
         """Score representing an ordering between the base classifiers (rules).

-        The score of an input sample is computed as the number of the more
-        precise rule voting positively.
+        The score is high when the instance is detected by a performing rule.
         If there are n rules, ordered by increasing OOB precision, a score of k
         means than the kth rule has voted positively, but not the (k-1) first
         rules.
@@ -522,16 +521,42 @@ def separate_rules_score(self, X):
                              " Please reshape your data."
                              % (X.shape[1], self.n_features_))

-        selected_rules = self.rules_[:self.n_estimators]
         df = pandas.DataFrame(X, columns=self.feature_names_)
+        selected_rules = self.rules_

         scores = np.zeros(X.shape[0])
         for (k, r) in enumerate(list((selected_rules))):
             scores[list(df.query(r[0]).index)] = np.maximum(
-                k + 1, scores[list(df.query(r[0]).index)])
+                len(selected_rules) - k,
+                scores[list(df.query(r[0]).index)])

         return scores

+    def predict_top_rules(self, X, n_rules):
+        """Predict if a particular sample is an outlier or not,
+        using the n_rules most performing rules.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32``
+
+        n_rules : int
+            The number of rules used for the prediction. If one of the
+            n_rules most performing rules is activated, the prediction
+            is equal to 1.
+
+        Returns
+        -------
+        is_outlier : array, shape (n_samples,)
+            For each observations, tells whether or not (1 or 0) it should
+            be considered as an outlier according to the selected rules.
+        """
+
+        return np.array((self.score_top_rules(X) > len(self.rules_) - n_rules),
+                        dtype=int)
+
     def _tree_to_rules(self, tree, feature_names):
         """
         Return a list of rules from a tree
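
Taken together, the modified score_top_rules and the new predict_top_rules define one convention: with n = len(rules_) rules sorted with the most precise first (as the examples' use of rules_[:5] suggests), a sample whose best activated rule has 0-based index k receives a score of n - k, and predict_top_rules(X, n_rules) flags the samples whose score is strictly greater than n - n_rules. A small consistency check, assuming a fitted clf with at least two rules and some test matrix X_test; this check is illustrative and is not part of the commit's test suite.

import numpy as np

scores = clf.score_top_rules(X_test)   # values in {0, 1, ..., n}
n = len(clf.rules_)

# A sample activating the most precise rule gets score n, the second most
# precise rule gives n - 1, and so on; 0 means no rule fired.
# predict_top_rules(X, n_rules) keeps samples whose score is strictly
# above n - n_rules, i.e. at least one of the n_rules best rules fired.
pred_top2 = clf.predict_top_rules(X_test, 2)
np.testing.assert_array_equal(pred_top2, (scores > n - 2).astype(int))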

skrules/tests/test_skope_rules.py

Lines changed: 6 additions & 5 deletions
@@ -100,7 +100,7 @@ def test_skope_rules_error():
     assert_raises(ValueError, SkopeRules().fit(X, y).decision_function,
                   X[:, 1:])
     assert_raises(ValueError, SkopeRules().fit(X, y).rules_vote, X[:, 1:])
-    assert_raises(ValueError, SkopeRules().fit(X, y).separate_rules_score,
+    assert_raises(ValueError, SkopeRules().fit(X, y).score_top_rules,
                   X[:, 1:])

@@ -133,15 +133,16 @@ def test_skope_rules_works():
     clf.fit(X, y)
     decision_func = clf.decision_function(X_test)
     rules_vote = clf.rules_vote(X_test)
-    separate_rules_score = clf.separate_rules_score(X_test)
+    score_top_rules = clf.score_top_rules(X_test)
     pred = clf.predict(X_test)
+    pred_score_top_rules = clf.predict_top_rules(X_test,1)
     # assert detect outliers:
     assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
     assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
-    assert_greater(np.min(separate_rules_score[-2:]),
-                   np.max(separate_rules_score[:-2]))
+    assert_greater(np.min(score_top_rules[-2:]),
+                   np.max(score_top_rules[:-2]))
     assert_array_equal(pred, 6 * [0] + 2 * [1])
-
+    assert_array_equal(pred_score_top_rules, 6 * [0] + 2 * [1])

 def test_performances():
     X, y = make_blobs(n_samples=1000, random_state=0, centers=2)
