11 | 11 | from sklearn.ensemble import RandomForestClassifier
12 | 12 | from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
13 | 13 | from sklearn.preprocessing import MinMaxScaler
14 | | -from sklearn.ensemble import BaggingClassifier
15 | 14 | from sklearn.pipeline import Pipeline
16 | | -from sklearn.svm import SVC
| 15 | +from sklearn.svm import SVC, LinearSVC
17 | 16 | from utils import load_data
18 | 17 |
19 | 18 |
27 | 26 | database = database[database.nrow >= 50]
28 | 27 |
29 | 28 |
30 | | -def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):
| 29 | +def evaluate_pipeline_helper(X, y, pipeline, param_grid, scoring="roc_auc_ovr_weighted", random_state=0):
31 | 30 | inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
32 | 31 | outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
33 | | - clf = GridSearchCV(
34 | | - estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS
35 | | - )
36 | | - nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv, scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS)
| 32 | + clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring=scoring, n_jobs=N_JOBS)
| 33 | + nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv, scoring=scoring, n_jobs=N_JOBS)
37 | 34 | return nested_score
38 | 35 |
39 | 36 |
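For context, evaluate_pipeline_helper implements nested cross-validation: GridSearchCV tunes hyperparameters on the 4 inner folds, and cross_val_score reports the tuned model's score on each of the 4 outer folds, so the returned array estimates generalization without the optimistic bias of tuning and scoring on the same splits. A minimal usage sketch, assuming the module-level N_JOBS constant is defined as in the script; the synthetic data and the demo_pipeline/demo_grid names are illustrative only:

    from sklearn.datasets import make_classification
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.svm import SVC

    # Illustrative multiclass problem; any (X, y) with at least 4 samples per class works with the 4-fold splitters.
    X_demo, y_demo = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
    demo_pipeline = Pipeline([("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True))])
    demo_grid = {"svc__C": [1e-2, 1e0, 1e2]}

    # One score per outer fold; hyperparameters are re-tuned inside each outer training split.
    nested = evaluate_pipeline_helper(X_demo, y_demo, demo_pipeline, demo_grid, scoring="roc_auc_ovr_weighted")
    print(nested.mean(), nested.std())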
40 | 37 | def define_and_evaluate_pipelines(X, y, random_state=0):
41 | 38 | # LinearSVC
42 | 39 | pipeline1 = Pipeline(
43 | | - [("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
| 40 | + [
| 41 | + ("scaler", MinMaxScaler()),
| 42 | + (
| 43 | + "svc",
| 44 | + SVC(kernel="linear", class_weight="balanced", probability=True, tol=1e-4, random_state=random_state),
| 45 | + ),
| 46 | + ]
44 | 47 | )
45 | 48 | param_grid1 = {
46 | 49 | "svc__C": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],
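The import change brings in LinearSVC alongside SVC. One practical note: LinearSVC (liblinear) is usually much faster than SVC(kernel="linear") on larger datasets, but it has no predict_proba, so a probability-based scorer such as roc_auc_ovr_weighted needs a calibration wrapper. A hedged sketch of how a LinearSVC variant could be slotted into the same helper; the pipeline_linearsvc/param_grid_linearsvc names are illustrative and not part of this change:

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.svm import LinearSVC

    pipeline_linearsvc = Pipeline(
        [
            ("scaler", MinMaxScaler()),
            # CalibratedClassifierCV adds predict_proba on top of LinearSVC's decision_function.
            ("svc", CalibratedClassifierCV(LinearSVC(class_weight="balanced", dual=False), cv=3)),
        ]
    )
    # The wrapped C is reached through the nested prefix; older scikit-learn releases
    # use "svc__base_estimator__C" instead of "svc__estimator__C".
    param_grid_linearsvc = {"svc__estimator__C": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}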
@@ -97,6 +100,7 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
97 | 100 | evaluated_datasets.append(dataset_name)
98 | 101 | times.append(elapsed)
99 | 102 | print("done. elapsed:", elapsed)
| 103 | + print("scores:", np.mean(nested_scores1), np.mean(nested_scores2), np.mean(nested_scores3))
100 | 104 |
101 | 105 | #
102 | 106 | results1 = np.array(results1)
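Downstream, the results lists collect one array of outer-fold scores per dataset (results1 = np.array(results1) above), and the new print line reports each pipeline's mean nested score as soon as a dataset finishes. A small sketch of how such arrays are typically summarized across datasets; the numbers below are random placeholders, not actual results:

    import numpy as np

    rng = np.random.default_rng(0)
    # Stand-ins for collected scores: one row per dataset, one column per outer fold.
    results1 = rng.uniform(0.7, 1.0, size=(5, 4))
    results2 = rng.uniform(0.7, 1.0, size=(5, 4))

    # Average over outer folds first, then compare pipelines dataset by dataset.
    means1, means2 = results1.mean(axis=1), results2.mean(axis=1)
    print("per-dataset means:", means1, means2)
    print("datasets where pipeline1 wins:", int((means1 > means2).sum()))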