import numpy as np
import pandas as pd
+ import os
from scipy.io import arff
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
# which means we are just going to be using each feature as continuous even though it
# may not be
database = database[database.mv == 0]
- database = database[databaes.nrow >= 50]
+ database = database[database.nrow >= 50]
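For intuition, a toy version of the metadata filter above. This is only a sketch: the real `database` table is built elsewhere in the script, and the entries here are invented; only the `mv` (missing values) and `nrow` (row count) column names are taken from the snippet.

import pandas as pd

# invented metadata entries, mirroring the mv/nrow filter above
meta = pd.DataFrame({"mv": [0, 3, 0], "nrow": [150, 60, 20]},
                    index=["dataset_a", "dataset_b", "dataset_c"])
meta = meta[meta.mv == 0]
meta = meta[meta.nrow >= 50]
print(meta.index.tolist())  # ['dataset_a']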


def load_data(data_name):
-     data, meta = arff.loadarff(f"datasets/{data_name}.arff")
-     df = pd.DataFrame(data).apply(lambda x: pd.to_numeric(x, errors="ignore"))
-     X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values
-     unique_labels = df["Class"].unique()
-     labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
-     df.loc[:, "Class"] = df.applymap(lambda s: labels_dict.get(s) if s in labels_dict else s)
-     y = df["Class"].values
-     return X, y
+     file_path = f"datasets/{data_name}.arff"
+     if os.path.exists(file_path):
+         data, meta = arff.loadarff(file_path)
+         df = pd.DataFrame(data).apply(lambda x: pd.to_numeric(x, errors="ignore"))
+         X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values
+         unique_labels = df["Class"].unique()
+         labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
+         # map raw class labels (bytes for nominal ARFF attributes) to integer codes
+         df["Class"] = df["Class"].map(labels_dict)
+         y = df["Class"].values
+         return X, y
+     return [], []
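One note on the mapping step: `scipy.io.arff.loadarff` returns nominal attributes as byte strings, which is why the class labels have to be re-encoded by hand. A toy illustration (labels invented):

import pandas as pd

labels = pd.Series([b"cat", b"dog", b"cat"])  # invented byte-string class labels
labels_dict = dict(zip(labels.unique(), range(labels.nunique())))
print(labels.map(labels_dict).tolist())  # [0, 1, 0]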


def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):
@@ -49,13 +53,15 @@ def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):


def define_and_evaluate_pipelines(X, y, random_state=0):
+     # SVC
    pipeline1 = Pipeline(
        [("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
    )
    param_grid1 = {
        "svc__C": np.logspace(-7, 2, 10),
    }
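For reference, the same log-spaced grid is reused for every model in this function; `np.logspace(-7, 2, 10)` covers nine orders of magnitude in ten steps:

import numpy as np

print(np.logspace(-7, 2, 10))
# [1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02]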

+     # logistic regression
    pipeline2 = Pipeline(
        [
            ("scaler", MinMaxScaler()),
@@ -66,8 +72,9 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
        "logistic__C": np.logspace(-7, 2, 10),
    }

+     # ridge has no predict_proba, but can become probabilistic with a bagging classifier
    pipeline3 = BaggingClassifier(
-         Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(solver="saga", random_state=random_state))])
+         Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(random_state=random_state))])
    )
    param_grid3 = {
        "base_estimator__ridge__alpha": np.logspace(-7, 2, 10),
@@ -87,15 +94,17 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
evaluated_datasets = []
for i, dataset_name in enumerate(database.index.values):
    X, y = load_data(dataset_name)
-     numpy.random.seed(0)
-     if len(y) > 10000:
-         # subset to 10000
-         random_idx = np.random.choice(len(y), 10000, replace=False)
-         X = X[random_idx, :]
-         y = y[random_idx]
-     print(i, dataset_name, len(y))
-     nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
-     results1.append(nested_scores1)
-     results2.append(nested_scores2)
-     results3.append(nested_scores3)
-     evaluated_datasets.append(dataset_name)
+     # datasets might have too few samples overall or per class
+     if len(y) > 50 and pd.value_counts(y).min() > 16:
+         np.random.seed(0)
+         if len(y) > 10000:
+             # subset to 10000
+             random_idx = np.random.choice(len(y), 10000, replace=False)
+             X = X[random_idx, :]
+             y = y[random_idx]
+         print(i, dataset_name, len(y))
+         nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
+         results1.append(nested_scores1)
+         results2.append(nested_scores2)
+         results3.append(nested_scores3)
+         evaluated_datasets.append(dataset_name)
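A toy check of the per-class filter (labels invented): `pd.value_counts` gives class frequencies, and requiring the smallest count to exceed 16 ensures every class has enough samples for the cross-validation splits.

import numpy as np
import pandas as pd

y_toy = np.array([0] * 40 + [1] * 12)  # invented labels: minority class has 12 samples
print(len(y_toy) > 50)                    # True: enough samples overall
print(pd.value_counts(y_toy).min() > 16)  # False: minority class too small, dataset skipped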