|
17 | 17 |
|
18 | 18 | ############################################################################### |
19 | 19 | # Data import and preparation |
20 | | -# .................. |
| 20 | +# ........................... |
21 | 21 | # |
22 | 22 | # There are 3 categorical variables (SEX, EDUCATION and MARRIAGE) and 20 |
23 | 23 | # numerical variables. |
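
The import and encoding code sits outside this hunk. As a minimal sketch of
how the three categorical columns could be handled (assuming `data` is the
pandas DataFrame built during import; the example itself may keep the integer
codes as-is):

    import pandas as pd

    # One-hot encode the categorical columns so that models do not read
    # their integer codes as ordered quantities.
    data = pd.get_dummies(data, columns=['SEX', 'EDUCATION', 'MARRIAGE'])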
|
69 | 69 | data['PAY_AMT_old_std'] = data[old_PAY_AMT].apply( |
70 | 70 | lambda x: np.std(x), axis=1) |
71 | 71 |
|
72 | | -data = data.drop(old_PAY_AMT, axis=1) |
73 | | -data = data.drop(old_BILL_AMT, axis=1) |
74 | | -data = data.drop(old_PAY, axis=1) |
| 72 | +data.drop(old_PAY_AMT + old_BILL_AMT + old_PAY, axis=1, inplace=True) |
75 | 73 |
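A side note on the feature built above: the row-wise apply can be vectorized.
np.std defaults to ddof=0 while the pandas .std() method defaults to ddof=1,
so the equivalent call must set ddof explicitly. A sketch:

    # Vectorized equivalent of the apply/np.std construction above
    data['PAY_AMT_old_std'] = data[old_PAY_AMT].std(axis=1, ddof=0)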
|
76 | 74 | # Creating the train/test split |
77 | 75 | feature_names = list(data.columns) |
|
85 | 83 | X_test = data[n_samples_train:] |
86 | 84 |
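The split above is positional, which assumes the rows carry no ordering. A
stratified alternative, sketched under the assumption that `y` holds the
default labels (this hunk does not show how the target is built):

    from sklearn.model_selection import train_test_split

    # Keep the default rate identical in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.25, stratify=y, random_state=42)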
|
87 | 85 | ############################################################################### |
88 | | -# Benchmark with a Random Forest classifier. |
89 | | -# .................. |
| 86 | +# Benchmark with a Random Forest classifier |
| 87 | +# ......................................... |
90 | 88 | # |
91 | 89 | # This part shows the training and performance evaluation of a random forest |
92 | 90 | # model. The objective remains to extract rules that target credit defaults.
93 | 91 |
|
94 | | -RF = GridSearchCV( |
| 92 | +rf = GridSearchCV( |
95 | 93 | RandomForestClassifier( |
96 | 94 | random_state=rng, |
97 | 95 | n_estimators=30, |
98 | 96 | class_weight='balanced'), |
99 | | - param_grid={ |
100 | | - 'max_depth': range(3, 8, 1), |
101 | | - 'max_features': np.linspace(0.1, 1., 5) |
102 | | - }, |
| 97 | + param_grid={'max_depth': range(3, 8, 1), |
| 98 | + 'max_features': np.linspace(0.1, 1., 5)}, |
103 | 99 | scoring={'AUC': 'roc_auc'}, cv=5, |
104 | 100 | refit='AUC', n_jobs=-1) |
105 | 101 |
|
106 | | -RF.fit(X_train, y_train) |
107 | | -scoring_RF = RF.predict_proba(X_test)[:, 1] |
| 102 | +rf.fit(X_train, y_train) |
| 103 | +scoring_rf = rf.predict_proba(X_test)[:, 1] |
108 | 104 |
|
109 | | -print("Random Forest selected parameters : " + str(RF.best_params_)) |
| 105 | +print("Random Forest selected parameters: %s" % rf.best_params_)
110 | 106 |
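For reference, the refit search object also exposes the cross-validated score
of the selected parameters; a minimal sketch:

    # Mean cross-validated AUC of the best parameter combination
    print("Best CV AUC: %.3f" % rf.best_score_)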
|
111 | 107 | # Plot ROC and PR curves |
112 | 108 |
|
113 | 109 | fig, axes = plt.subplots(1, 2, figsize=(12, 5), |
114 | 110 | sharex=True, sharey=True) |
115 | 111 |
|
116 | 112 | ax = axes[0] |
117 | | -fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF)
| 113 | +fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
118 | | -ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
| 114 | +ax.step(fpr_rf, tpr_rf, linestyle='-.', c='g', lw=1, where='post', label='RF')
119 | 115 | ax.set_title("ROC", fontsize=20) |
120 | 116 | ax.legend(loc='upper center', fontsize=8) |
121 | 117 | ax.set_xlabel('False Positive Rate', fontsize=18) |
122 | 118 | ax.set_ylabel('True Positive Rate (Recall)', fontsize=18) |
123 | 119 |
|
124 | 120 | ax = axes[1] |
125 | | -precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF)
| 121 | +precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf)
126 | | -ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
| 122 | +ax.step(recall_rf, precision_rf, linestyle='-.', c='g', lw=1, where='post')
127 | 123 | ax.set_title("Precision-Recall", fontsize=20) |
128 | 124 | ax.set_xlabel('Recall (True Positive Rate)', fontsize=18) |
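
The curves can be complemented by scalar summaries; a short sketch using the
standard scikit-learn metrics:

    from sklearn.metrics import average_precision_score, roc_auc_score

    # Area under the ROC curve and average precision on the test set
    print("RF test AUC: %.3f" % roc_auc_score(y_test, scoring_rf))
    print("RF test AP: %.3f" % average_precision_score(y_test, scoring_rf))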
|
145 | 141 |
|
146 | 142 | ############################################################################### |
147 | 143 | # Getting rules with skrules |
148 | | -# .................. |
| 144 | +# .......................... |
149 | 145 | # |
150 | 146 | # This part shows how SkopeRules can be fitted to detect credit defaults. |
151 | 147 | # Performance is compared with the random forest model trained above.
|
155 | 151 | clf = SkopeRules( |
156 | 152 | similarity_thres=.9, max_depth=3, max_features=0.5, |
157 | 153 | max_samples_features=0.5, random_state=rng, n_estimators=30, |
158 | | - feature_names=feature_names, recall_min=0.02, precision_min=0.6 |
159 | | - ) |
| 154 | + feature_names=feature_names, recall_min=0.02, precision_min=0.6) |
160 | 155 | clf.fit(X_train, y_train) |
161 | 156 |
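Once fitted, the extracted rules themselves can be printed. A sketch, assuming
`rules_` stores (rule, (precision, recall, n_samples)) tuples as in the
released skope-rules package:

    # Show the first rules with their precision/recall/support estimates
    for rule, stats in clf.rules_[:3]:
        print(rule, stats)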
|
162 | 157 | # In the separate_rules_score method, a score of k means that rule number k
|
178 | 173 |
|
179 | 174 | ax = axes[0] |
180 | 175 | fpr, tpr, _ = roc_curve(y_test, scoring) |
181 | | -fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF) |
| 176 | +fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf) |
182 | 177 | ax.scatter(fpr[:-1], tpr[:-1], c='b', s=10) |
183 | | -ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
| 178 | +ax.step(fpr_rf, tpr_rf, linestyle='-.', c='g', lw=1, where='post')
184 | 179 | ax.set_title("ROC", fontsize=20) |
|
188 | 183 |
|
189 | 184 | ax = axes[1] |
190 | 185 | precision, recall, _ = precision_recall_curve(y_test, scoring) |
191 | | -precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF) |
| 186 | +precision_rf, recall_rf, _ = precision_recall_curve(y_test, scoring_rf) |
192 | 187 | ax.scatter(recall[1:-1], precision[1:-1], c='b', s=10) |
193 | | -ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
| 188 | +ax.step(recall_rf, precision_rf, linestyle='-.', c='g', lw=1, where='post')
194 | 189 | ax.set_title("Precision-Recall", fontsize=20) |
|
198 | 193 |
|
199 | 194 | ############################################################################### |
200 | 195 | # The ROC and Precision-Recall curves show the performance of the rules |
201 | | -# generated by SkopeRulesthe (the blue points) and the performance of the |
| 196 | +# generated by SkopeRules (the blue points) and the performance of the
202 | 197 | # Random Forest classifier fitted above. |
203 | 198 | # Each blue point represents the performance of a set of rules: the kth point
204 | 199 | # represents the score associated with the union of the first k
|