@@ -42,11 +42,11 @@ def load_data(data_name):
42
42
return [], []
43
43
44
44
45
- def evaluate_pipeline_helper (X , y , pipeline , p_grid , random_state = 0 ):
45
+ def evaluate_pipeline_helper (X , y , pipeline , param_grid , random_state = 0 ):
46
46
inner_cv = StratifiedKFold (n_splits = 4 , shuffle = True , random_state = random_state )
47
47
outer_cv = StratifiedKFold (n_splits = 4 , shuffle = True , random_state = random_state )
48
48
clf = GridSearchCV (
49
- estimator = pipeline , param_grid = p_grid , cv = inner_cv , scoring = "roc_auc_ovr_weighted" , n_jobs = N_JOBS
49
+ estimator = pipeline , param_grid = param_grid , cv = inner_cv , scoring = "roc_auc_ovr_weighted" , n_jobs = N_JOBS
50
50
)
51
51
nested_score = cross_val_score (clf , X = X , y = y , cv = outer_cv , scoring = "roc_auc_ovr_weighted" , n_jobs = N_JOBS )
52
52
return nested_score
@@ -93,18 +93,19 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
93
93
results3 = []
94
94
evaluated_datasets = []
95
95
for i , dataset_name in enumerate (database .index .values ):
96
- X , y = load_data (dataset_name )
97
- # datasets might have too few samples per class
98
- if np .sum (pd .value_counts (y ) <= 15 ) == 0 :
99
- np .random .seed (0 )
100
- if len (y ) > 10000 :
101
- # subset to 10000
102
- random_idx = np .random .choice (len (y ), 10000 , replace = False )
103
- X = X [random_idx , :]
104
- y = y [random_idx ]
105
- print (i , dataset_name , len (y ))
106
- nested_scores1 , nested_scores2 , nested_scores3 = define_and_evaluate_pipelines (X , y )
107
- results1 .append (nested_scores1 )
108
- results2 .append (nested_scores2 )
109
- results3 .append (nested_scores3 )
110
- evaluated_datasets .append (dataset_name )
96
+ if dataset_name not in evaluated_datasets :
97
+ X , y = load_data (dataset_name )
98
+ # datasets might have too few samples per class
99
+ if len (y ) > 0 and np .sum (pd .value_counts (y ) <= 15 ) == 0 :
100
+ np .random .seed (0 )
101
+ if len (y ) > 10000 :
102
+ # subset to 10000
103
+ random_idx = np .random .choice (len (y ), 10000 , replace = False )
104
+ X = X [random_idx , :]
105
+ y = y [random_idx ]
106
+ print (i , dataset_name , len (y ))
107
+ nested_scores1 , nested_scores2 , nested_scores3 = define_and_evaluate_pipelines (X , y )
108
+ results1 .append (nested_scores1 )
109
+ results2 .append (nested_scores2 )
110
+ results3 .append (nested_scores3 )
111
+ evaluated_datasets .append (dataset_name )
0 commit comments