|
17 | 17 |
|
18 | 18 | ############################################################################### |
19 | 19 | # Data import and preparation |
20 | | -# .................. |
| 20 | +# ........................... |
21 | 21 | # |
22 | 22 | # There are 3 categorical variables (SEX, EDUCATION and MARRIAGE) and 20 |
23 | 23 | # numerical variables. |
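
The import and encoding code sits outside this hunk. As a minimal sketch of
how the three categorical columns could be handled (assuming `data` is the
pandas DataFrame built during import; the example itself may keep the integer
codes as-is):

    import pandas as pd

    # One-hot encode the categorical columns so that models do not read
    # their integer codes as ordered quantities.
    data = pd.get_dummies(data, columns=['SEX', 'EDUCATION', 'MARRIAGE'])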
|
69 | 69 | data['PAY_AMT_old_std'] = data[old_PAY_AMT].apply( |
70 | 70 | lambda x: np.std(x), axis=1) |
71 | 71 |
|
72 | | -data = data.drop(old_PAY_AMT, axis=1) |
73 | | -data = data.drop(old_BILL_AMT, axis=1) |
74 | | -data = data.drop(old_PAY, axis=1) |
| 72 | +data.drop(old_PAY_AMT + old_BILL_AMT + old_PAY, axis=1, inplace=True) |
75 | 73 |
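A side note on the feature built above: the row-wise apply can be vectorized.
np.std defaults to ddof=0 while the pandas .std() method defaults to ddof=1,
so the equivalent call must set ddof explicitly. A sketch:

    # Vectorized equivalent of the apply/np.std construction above
    data['PAY_AMT_old_std'] = data[old_PAY_AMT].std(axis=1, ddof=0)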
|
76 | 74 | # Creating the train/test split |
77 | 75 | feature_names = list(data.columns) |
|
85 | 83 | X_test = data[n_samples_train:] |
86 | 84 |
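The split above is positional, which assumes the rows carry no ordering. A
stratified alternative, sketched under the assumption that `y` holds the
default labels (this hunk does not show how the target is built):

    from sklearn.model_selection import train_test_split

    # Keep the default rate identical in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.25, stratify=y, random_state=42)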
|
87 | 85 | ############################################################################### |
88 | | -# Benchmark with a Random Forest classifier. |
89 | | -# .................. |
| 86 | +# Benchmark with a Random Forest classifier |
| 87 | +# ......................................... |
90 | 88 | # |
91 | 89 | # This part shows the training and performance evaluation of a random forest |
92 | 90 | # model. The objective remains to extract rules that target credit defaults.
93 | 91 |
|
94 | | -RF = GridSearchCV( |
| 92 | +rf = GridSearchCV( |
95 | 93 | RandomForestClassifier( |
96 | 94 | random_state=rng, |
97 | 95 | n_estimators=30, |
98 | 96 | class_weight='balanced'), |
99 | | - param_grid={ |
100 | | - 'max_depth': range(3, 8, 1), |
101 | | - 'max_features': np.linspace(0.1, 1., 5) |
102 | | - }, |
| 97 | + param_grid={'max_depth': range(3, 8, 1), |
| 98 | + 'max_features': np.linspace(0.1, 1., 5)}, |
103 | 99 | scoring={'AUC': 'roc_auc'}, cv=5, |
104 | 100 | refit='AUC', n_jobs=-1) |
105 | 101 |
|
106 | | -RF.fit(X_train, y_train) |
107 | | -scoring_RF = RF.predict_proba(X_test)[:, 1] |
| 102 | +rf.fit(X_train, y_train) |
| 103 | +scoring_rf = rf.predict_proba(X_test)[:, 1] |
108 | 104 |
|
109 | | -print("Random Forest selected parameters : " + str(RF.best_params_)) |
| 105 | +print("Random Forest selected parameters: %s" % rf.best_params_)
110 | 106 |
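For reference, the refit search object also exposes the cross-validated score
of the selected parameters; a minimal sketch:

    # Mean cross-validated AUC of the best parameter combination
    print("Best CV AUC: %.3f" % rf.best_score_)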
|
111 | 107 | # Plot ROC and PR curves |
112 | 108 |
|
113 | 109 | fig, axes = plt.subplots(1, 2, figsize=(12, 5), |
114 | 110 | sharex=True, sharey=True) |
115 | 111 |
|
116 | 112 | ax = axes[0] |
117 | | -fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF)
| 113 | +fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
118 | | -ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
| 114 | +ax.step(fpr_rf, tpr_rf, linestyle='-.', c='g', lw=1, where='post', label='RF')
119 | 115 | ax.set_title("ROC", fontsize=20) |
120 | 116 | ax.legend(loc='upper center', fontsize=8) |
121 | 117 | ax.set_xlabel('False Positive Rate', fontsize=18) |
122 | 118 | ax.set_ylabel('True Positive Rate (Recall)', fontsize=18) |
123 | 119 |
|
124 | 120 | ax = axes[1] |
125 | | -precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF)
| 121 | +precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf)
126 | | -ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
| 122 | +ax.step(recall_rf, precision_rf, linestyle='-.', c='g', lw=1, where='post')
127 | 123 | ax.set_title("Precision-Recall", fontsize=20) |
128 | 124 | ax.set_xlabel('Recall (True Positive Rate)', fontsize=18) |
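
The curves can be complemented by scalar summaries; a short sketch using the
standard scikit-learn metrics:

    from sklearn.metrics import average_precision_score, roc_auc_score

    # Area under the ROC curve and average precision on the test set
    print("RF test AUC: %.3f" % roc_auc_score(y_test, scoring_rf))
    print("RF test AP: %.3f" % average_precision_score(y_test, scoring_rf))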
|
145 | 141 |
|
146 | 142 | ############################################################################### |
147 | 143 | # Getting rules with skrules |
148 | | -# .................. |
| 144 | +# .......................... |
149 | 145 | # |
150 | 146 | # This part shows how SkopeRules can be fitted to detect credit defaults. |
151 | 147 | # Performance is compared with the random forest model trained above.
|
155 | 151 | clf = SkopeRules( |
156 | 152 | similarity_thres=.9, max_depth=3, max_features=0.5, |
157 | 153 | max_samples_features=0.5, random_state=rng, n_estimators=30, |
158 | | - feature_names=feature_names, recall_min=0.02, precision_min=0.6 |
159 | | - ) |
| 154 | + feature_names=feature_names, recall_min=0.02, precision_min=0.6) |
160 | 155 | clf.fit(X_train, y_train) |
161 | 156 |
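Once fitted, the extracted rules themselves can be printed. A sketch, assuming
`rules_` stores (rule, (precision, recall, n_samples)) tuples as in the
released skope-rules package:

    # Show the first rules with their precision/recall/support estimates
    for rule, stats in clf.rules_[:3]:
        print(rule, stats)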
|
162 | 157 | # In the separate_rules_score method, a score of k means that rule number k
|
178 | 173 |
|
179 | 174 | ax = axes[0] |
180 | 175 | fpr, tpr, _ = roc_curve(y_test, scoring) |
181 | | -fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF) |
| 176 | +fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf) |
182 | 177 | ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10) |
183 | | -ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
| 178 | +ax.step(fpr_rf, tpr_rf, linestyle='-.', c='g', lw=1, where='post')
184 | 179 | ax.set_title("ROC", fontsize=20) |
|
188 | 183 |
|
189 | 184 | ax = axes[1] |
190 | 185 | precision, recall, _ = precision_recall_curve(y_test, scoring) |
191 | | -precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF) |
| 186 | +precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf) |
192 | 187 | ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10) |
193 | | -ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
| 188 | +ax.step(recall_rf, precision_rf, linestyle='-.', c='g', lw=1, where='post')
194 | 189 | ax.set_title("Precision-Recall", fontsize=20) |
|
198 | 193 |
|
199 | 194 | ############################################################################### |
200 | 195 | # The ROC and Precision-Recall curves show the performance of the rules |
201 | | -# generated by SkopeRulesthe (the blue points) and the performance of the |
| 196 | +# generated by SkopeRules (the blue points) and the performance of the
202 | 197 | # Random Forest classifier fitted above. |
203 | 198 | # Each blue point represents the performance of a set of rules: the kth point
204 | 199 | # represents the score associated with the union of the first k
|