
Commit bca6885

Sergey Feldman committed: delete another one
1 parent 699845c commit bca6885

2 files changed: +18 −6312 lines changed


01_compare_linear_models.py

Lines changed: 18 additions & 17 deletions
@@ -42,11 +42,11 @@ def load_data(data_name):
         return [], []


-def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):
+def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):
     inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
     outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
     clf = GridSearchCV(
-        estimator=pipeline, param_grid=p_grid, cv=inner_cv, scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS
+        estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS
     )
     nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv, scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS)
     return nested_score
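For context, a minimal sketch of how the renamed helper might be exercised. It assumes the imports and the N_JOBS constant already defined in 01_compare_linear_models.py are in scope; the synthetic data, pipeline, and parameter grid below are illustrative and not part of the commit:

# Illustrative usage only (not part of the diff). Assumes evaluate_pipeline_helper,
# StratifiedKFold, GridSearchCV, cross_val_score and N_JOBS from 01_compare_linear_models.py
# are already in scope; the data, pipeline, and grid here are made up.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
pipeline = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression(max_iter=1000))])
param_grid = {"clf__C": [0.01, 0.1, 1.0, 10.0]}

# Returns one weighted one-vs-rest ROC AUC per outer fold (4 values with the 4-fold outer CV)
nested_scores = evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0)
print(nested_scores.mean())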
@@ -93,18 +93,19 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
 results3 = []
 evaluated_datasets = []
 for i, dataset_name in enumerate(database.index.values):
-    X, y = load_data(dataset_name)
-    # datasets might have too few samples per class
-    if np.sum(pd.value_counts(y) <= 15) == 0:
-        np.random.seed(0)
-        if len(y) > 10000:
-            # subset to 10000
-            random_idx = np.random.choice(len(y), 10000, replace=False)
-            X = X[random_idx, :]
-            y = y[random_idx]
-        print(i, dataset_name, len(y))
-        nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
-        results1.append(nested_scores1)
-        results2.append(nested_scores2)
-        results3.append(nested_scores3)
-        evaluated_datasets.append(dataset_name)
+    if dataset_name not in evaluated_datasets:
+        X, y = load_data(dataset_name)
+        # datasets might have too few samples per class
+        if len(y) > 0 and np.sum(pd.value_counts(y) <= 15) == 0:
+            np.random.seed(0)
+            if len(y) > 10000:
+                # subset to 10000
+                random_idx = np.random.choice(len(y), 10000, replace=False)
+                X = X[random_idx, :]
+                y = y[random_idx]
+            print(i, dataset_name, len(y))
+            nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
+            results1.append(nested_scores1)
+            results2.append(nested_scores2)
+            results3.append(nested_scores3)
+            evaluated_datasets.append(dataset_name)
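One note on the new len(y) > 0 guard: load_data falls back to return [], [] (visible in the first hunk's context), and an empty y would otherwise slip past the rare-class filter, since the value counts of an empty sequence are empty and their sum is 0. A quick illustration, assuming current NumPy/pandas behavior:

import numpy as np
import pandas as pd

y = []  # what load_data returns when a dataset cannot be loaded
# Without the len(y) > 0 guard the rare-class filter would pass for empty data:
print(np.sum(pd.value_counts(y) <= 15) == 0)  # True -> would try to evaluate an empty dataset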
