@@ -5,17 +5,18 @@
import numpy as np
import pandas as pd
import os
+ import time
from scipy.io import arff
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
- from sklearn.calibration import CalibratedClassifierCV
+ from sklearn.svm import SVC


- N_JOBS = 24
+ N_JOBS = 4 * 4 * 9


database = pd.read_json("database.json").T
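The hunk below carries a context reference to evaluate_pipeline_helper, whose body is unchanged and therefore not shown in this diff. For orientation, a minimal sketch of such a nested cross-validation helper, assuming only the imports above; the 4x4 fold counts and default scoring are assumptions, not taken from this commit (N_JOBS = 4 * 4 * 9 works out to 144 parallel workers):

def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):
    # Inner CV: grid-search the pipeline's hyperparameters.
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    search = GridSearchCV(pipeline, param_grid, cv=inner_cv, n_jobs=N_JOBS)
    # Outer CV: score the tuned model on folds never seen during tuning.
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    return cross_val_score(search, X, y, cv=outer_cv)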
@@ -53,12 +54,10 @@ def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):


def define_and_evaluate_pipelines(X, y, random_state=0):
-     # SVC
-     pipeline1 = Pipeline(
-         [("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
-     )
+     # LinearSVC
+     pipeline1 = Pipeline([("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))])
    param_grid1 = {
-         "svc__C": np.logspace(-7, 2, 10),
+         "svc__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
    }

    # logistic regression
@@ -69,15 +68,15 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
        ]
    )
    param_grid2 = {
-         "logistic__C": np.logspace(-7, 2, 10),
+         "logistic__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
    }

-     # ridge has no predict_proba, but can become probabilistic with a bagging classifier
+     # bagged ridge
    pipeline3 = BaggingClassifier(
        Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(random_state=random_state))])
    )
    param_grid3 = {
-         "base_estimator__ridge__alpha": np.logspace(-7, 2, 10),
+         "base_estimator__ridge__alpha": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
    }

    nested_scores1 = evaluate_pipeline_helper(X, y, pipeline1, param_grid1, random_state=random_state)
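An aside on the grid key above, not part of the diff: RidgeClassifier has no predict_proba (the rationale the deleted comment spelled out), while BaggingClassifier does, falling back to averaging hard votes when its base estimator lacks probabilities. The double-underscore key walks from the bagger into the inner pipeline step, and its spelling matches scikit-learn releases before 1.2, where BaggingClassifier's first argument was still named base_estimator (later renamed estimator):

clf = BaggingClassifier(
    Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier())])
)
# The grid key resolves to the ridge step's alpha inside the bagged pipeline:
assert "base_estimator__ridge__alpha" in clf.get_params()
clf.set_params(base_estimator__ridge__alpha=1e-2)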
@@ -91,21 +90,27 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
results1 = []
results2 = []
results3 = []
+ results4 = []
evaluated_datasets = []
+ times = []
for i, dataset_name in enumerate(database.index.values):
    if dataset_name not in evaluated_datasets:
        X, y = load_data(dataset_name)
        # datasets might have too few samples per class
        if len(y) > 0 and np.sum(pd.value_counts(y) <= 15) == 0:
            np.random.seed(0)
            if len(y) > 10000:
-                 # subset to 10000
+                 # subset to 10000 if too large
                random_idx = np.random.choice(len(y), 10000, replace=False)
                X = X[random_idx, :]
                y = y[random_idx]
-             print(i, dataset_name, len(y))
+             print("starting:", dataset_name, X.shape)
+             start = time.time()
            nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
            results1.append(nested_scores1)
            results2.append(nested_scores2)
            results3.append(nested_scores3)
+             elapsed = time.time() - start
            evaluated_datasets.append(dataset_name)
+             times.append(elapsed)
+             print("done. elapsed:", elapsed)