SkopeRules finds logical rules with high precision and fuses them. Finding
-good rules is done by fitting classification or regression trees
+good rules is done by fitting classification and regression trees
to sub-samples.
A fitted tree defines a set of rules (each tree node defines a rule); rules
are then tested out of the bag, and the ones with the highest precision are kept.
-This set of rules is decision function, reflecting for
-each new samples how many rules have find it abnormal.

This example aims at finding logical rules to predict credit defaults. The
rules' performance is then compared with that of a Random Forest classifier
used as a benchmark.

-The dataset comes from BLABLABLA.
"""
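
###############################################################################
# As a minimal, hypothetical sketch of the decision function described above
# (not part of the original example): given rules written in the pandas
# ``DataFrame.eval`` syntax, the score of a sample is the number of rules
# that flag it. The rules and the toy data below are made up; only the
# feature names mirror the credit-default dataset.

import pandas as pd

hypothetical_rules = ["PAY_1 > 1 and BILL_AMT1 > 5000",
                      "AGE < 25 and LIMIT_BAL < 10000"]

toy = pd.DataFrame({"PAY_1": [2, 0], "BILL_AMT1": [9000, 100],
                    "AGE": [22, 40], "LIMIT_BAL": [5000, 200000]})

# The decision function counts how many rules find each sample abnormal.
decision = sum(toy.eval(rule).astype(int) for rule in hypothetical_rules)
print(decision.tolist())  # [2, 0]: the first sample triggers both rules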

###############################################################################

# ...

for col in ['ID']:
    del data[col]

-# data = pd.get_dummies(data, columns = ['SEX', 'EDUCATION', 'MARRIAGE'])
-
# Quick feature engineering
data = data.rename(columns={"PAY_0": "PAY_1"})
old_PAY = ['PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# ...

# Creating the train/test split
feature_names = list(data.columns)
-print(feature_names)
+print("List of variables used to train models: " + str(feature_names))
data = data.values
n_samples = data.shape[0]
n_samples_train = int(n_samples / 2)

# ...

X_test = data[n_samples_train:]

###############################################################################
-# Benchmark with a Decision Tree and Random Forests
+# Benchmark with a Random Forest classifier
# ..................
#
-# This part shows the training and performance evaluation of
-# two tree-based models.
-# The objective remains to extract rules which targets credit defaults.
-# This benchmark shows the performance reached with a decision tree and a
-# random forest.
+# This part shows the training and performance evaluation of a random forest
+# model. The objective remains to extract rules that target credit defaults.

RF = GridSearchCV(
    RandomForestClassifier(
        # ...
        class_weight='balanced'),
    param_grid={
        'max_depth': range(3, 8, 1),
-        'max_features': np.linspace(0.1, 0.2, 1.)
+        'max_features': np.linspace(0.1, 1., 5)
    },
    scoring={'AUC': 'roc_auc'}, cv=5,
    refit='AUC', n_jobs=-1)

RF.fit(X_train, y_train)
scoring_RF = RF.predict_proba(X_test)[:, 1]

-# print("Decision Tree selected parameters : "+str(DT.best_params_))
-print("Random Forest selected parameters : "+str(RF.best_params_))
+print("Random Forest selected parameters: " + str(RF.best_params_))

# Plot ROC and PR curves

fig, axes = plt.subplots(1, 2, figsize=(12, 5),
                         sharex=True, sharey=True)

ax = axes[0]
-# fpr_DT, tpr_DT, _ = roc_curve(y_test, scoring_DT)
fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF)
ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
ax.set_title("ROC", fontsize=20)
# ...
ax.set_ylabel('True Positive Rate (Recall)', fontsize=18)

ax = axes[1]
-# precision_DT, recall_DT, _ = precision_recall_curve(y_test, scoring_DT)
precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF)
ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
ax.set_title("Precision-Recall", fontsize=20)

# ...

###############################################################################
# The ROC and Precision-Recall curves show the performance of the rules
-# generated by SkopeRulesthe (blue points) and the performance of the Random
-# Forest classifier fitted above.
+# generated by SkopeRules (the blue points) and the performance of the
+# Random Forest classifier fitted above.
# Each blue point represents the performance of a set of rules: the kth point
# represents the score associated with the concatenation (union) of the k first
# rules, etc. Thus, each blue point is associated with an interpretable
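
###############################################################################
# A hedged sketch of how these per-k points could be recomputed. It assumes
# ``clf`` is the SkopeRules model fitted in the elided part of this example,
# and it relies on the ``rules_`` attribute and the ``predict_top_rules``
# method of the skope-rules API; treat both as assumptions to check against
# the library version in use.

from sklearn.metrics import precision_score, recall_score

# For each k, evaluate the union (vote) of the k best rules found by
# SkopeRules on the test set.
for k in range(1, len(clf.rules_) + 1):
    y_pred_k = clf.predict_top_rules(X_test, k)
    print("k=%d precision=%.3f recall=%.3f"
          % (k, precision_score(y_test, y_pred_k),
             recall_score(y_test, y_pred_k)))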