1
1
"""
2
- Which linear models are optimal ?
2
+ Which baseline models are best ?
3
3
"""
4
4
5
- import os
6
5
import time
7
6
import pickle
8
7
import numpy as np
9
8
import pandas as pd
10
- from scipy .io import arff
11
9
from sklearn .svm import SVC
12
10
from sklearn .linear_model import LogisticRegression
13
11
from sklearn .ensemble import RandomForestClassifier
16
14
from sklearn .ensemble import BaggingClassifier
17
15
from sklearn .pipeline import Pipeline
18
16
from sklearn .svm import SVC
17
+ from utils import load_data
19
18
20
19
21
20
N_JOBS = 4 * 4 * 9
28
27
database = database [database .nrow >= 50 ]
29
28
30
29
31
def _to_numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged if parsing fails."""
    # Explicit replacement for ``pd.to_numeric(..., errors="ignore")``, which is
    # deprecated in recent pandas releases.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def load_data(data_name):
    """Load ``datasets/<data_name>.arff`` as a feature matrix and encoded labels.

    Every column except ``"Class"`` is used as a feature; non-numeric feature
    columns are one-hot encoded with ``pd.get_dummies``. The ``"Class"`` column
    is encoded as integers ``0..k-1`` in order of first appearance.

    Args:
        data_name: Dataset name, i.e. the file stem under ``datasets/``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays, or ``([], [])`` when the file does
        not exist.
    """
    # Imported locally so the function does not depend on module-level imports
    # of ``os`` / ``scipy.io.arff`` being present.
    import os
    from scipy.io import arff

    file_path = f"datasets/{data_name}.arff"
    if not os.path.exists(file_path):
        # Missing datasets are reported with empty containers, not an
        # exception, so callers can skip them.
        return [], []

    data, _meta = arff.loadarff(file_path)
    df = pd.DataFrame(data).apply(_to_numeric_if_possible)

    X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values

    # Encode only the label column; feature cells are never passed through the
    # label lookup.
    unique_labels = df["Class"].unique()
    labels_dict = {label: code for code, label in enumerate(unique_labels)}
    y = df["Class"].map(labels_dict).values
    return X, y
43
-
44
-
45
30
def evaluate_pipeline_helper (X , y , pipeline , param_grid , random_state = 0 ):
46
31
inner_cv = StratifiedKFold (n_splits = 4 , shuffle = True , random_state = random_state )
47
32
outer_cv = StratifiedKFold (n_splits = 4 , shuffle = True , random_state = random_state )
@@ -58,7 +43,7 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
58
43
[("scaler" , MinMaxScaler ()), ("svc" , SVC (kernel = "linear" , probability = True , random_state = random_state ))]
59
44
)
60
45
param_grid1 = {
61
- "svc__C" : [1e-4 , 1e-3 , 5e-3 , 1e-2 , 5e-2 , 1e-1 , 1e1 , 1e2 ],
46
+ "svc__C" : [1e-4 , 1e-3 , 1e-2 , 1e-1 , 1e0 , 1e1 , 1e2 ],
62
47
}
63
48
64
49
# logistic regression
@@ -69,7 +54,7 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
69
54
]
70
55
)
71
56
param_grid2 = {
72
- "logistic__C" : [1e-4 , 1e-3 , 5e-3 , 1e-2 , 5e-2 , 1e-1 , 1e1 , 1e2 ],
57
+ "logistic__C" : [1e-4 , 1e-3 , 1e-2 , 1e-1 , 1e0 , 1e1 , 1e2 ],
73
58
}
74
59
75
60
# random forest
0 commit comments