Skip to content

Commit c3952b6

Browse files
author
Sergey Feldman
committed
more script refinements
1 parent bca6885 commit c3952b6

File tree

1 file changed

+17
-12
lines changed

1 file changed

+17
-12
lines changed

01_compare_linear_models.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,18 @@
55
import numpy as np
66
import pandas as pd
77
import os
8+
import time
89
from scipy.io import arff
910
from sklearn.svm import SVC
1011
from sklearn.linear_model import RidgeClassifier, LogisticRegression
1112
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
1213
from sklearn.preprocessing import MinMaxScaler
1314
from sklearn.ensemble import BaggingClassifier
1415
from sklearn.pipeline import Pipeline
15-
from sklearn.calibration import CalibratedClassifierCV
16+
from sklearn.svm import SVC
1617

1718

18-
N_JOBS = 24
19+
N_JOBS = 4 * 4 * 9
1920

2021

2122
database = pd.read_json("database.json").T
@@ -53,12 +54,10 @@ def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):
5354

5455

5556
def define_and_evaluate_pipelines(X, y, random_state=0):
56-
# SVC
57-
pipeline1 = Pipeline(
58-
[("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
59-
)
57+
# linear-kernel SVC (sklearn.svm.SVC with kernel="linear", not the LinearSVC class)
58+
pipeline1 = Pipeline([("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))])
6059
param_grid1 = {
61-
"svc__C": np.logspace(-7, 2, 10),
60+
"svc__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
6261
}
6362

6463
# logistic regression
@@ -69,15 +68,15 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
6968
]
7069
)
7170
param_grid2 = {
72-
"logistic__C": np.logspace(-7, 2, 10),
71+
"logistic__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
7372
}
7473

75-
# ridge has no predict_proba, but can become probabilistic with bagging classifier
74+
# bagged ridge
7675
pipeline3 = BaggingClassifier(
7776
Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(random_state=random_state)),])
7877
)
7978
param_grid3 = {
80-
"base_estimator__ridge__alpha": np.logspace(-7, 2, 10),
79+
"base_estimator__ridge__alpha": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
8180
}
8281

8382
nested_scores1 = evaluate_pipeline_helper(X, y, pipeline1, param_grid1, random_state=random_state)
@@ -91,21 +90,27 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
9190
results1 = []
9291
results2 = []
9392
results3 = []
93+
results4 = []
9494
evaluated_datasets = []
95+
times = []
9596
for i, dataset_name in enumerate(database.index.values):
9697
if dataset_name not in evaluated_datasets:
9798
X, y = load_data(dataset_name)
9899
# datasets might have too few samples per class
99100
if len(y) > 0 and np.sum(pd.value_counts(y) <= 15) == 0:
100101
np.random.seed(0)
101102
if len(y) > 10000:
102-
# subset to 10000
103+
# subset to 10000 if too large
103104
random_idx = np.random.choice(len(y), 10000, replace=False)
104105
X = X[random_idx, :]
105106
y = y[random_idx]
106-
print(i, dataset_name, len(y))
107+
print("starting:", dataset_name, X.shape)
108+
start = time.time()
107109
nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
108110
results1.append(nested_scores1)
109111
results2.append(nested_scores2)
110112
results3.append(nested_scores3)
113+
elapsed = time.time() - start
111114
evaluated_datasets.append(dataset_name)
115+
times.append(elapsed)
116+
print("done. elapsed:", elapsed)

0 commit comments

Comments
 (0)