|
24 | 24 | # which means we are just going to be using each feature as continuous even though it
|
25 | 25 | # may not be
|
26 | 26 | database = database[database.mv == 0]
|
| 27 | +database = database[database.nrow >= 50] |
27 | 28 |
|
28 | 29 |
|
29 | 30 | def load_data(data_name):
|
@@ -86,10 +87,15 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
|
86 | 87 | evaluated_datasets = []
|
87 | 88 | for i, dataset_name in enumerate(database.index.values):
|
88 | 89 | X, y = load_data(dataset_name)
|
89 |
| - if len(y) > 25 and len(y) < 1000: |
90 |
| - print(i, dataset_name, len(y)) |
91 |
| - nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y) |
92 |
| - results1.append(nested_scores1) |
93 |
| - results2.append(nested_scores2) |
94 |
| - results3.append(nested_scores3) |
95 |
| - evaluated_datasets.append(dataset_name) |
| 90 | + np.random.seed(0) |
| 91 | + if len(y) > 10000: |
| 92 | + # subset to 10000 |
| 93 | + random_idx = np.random.choice(len(y), 10000, replace=False) |
| 94 | + X = X[random_idx, :] |
| 95 | + y = y[random_idx] |
| 96 | + print(i, dataset_name, len(y)) |
| 97 | + nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y) |
| 98 | + results1.append(nested_scores1) |
| 99 | + results2.append(nested_scores2) |
| 100 | + results3.append(nested_scores3) |
| 101 | + evaluated_datasets.append(dataset_name) |
0 commit comments