Skip to content

Commit b725a9d

Browse files
committed
random subset of big datasets
1 parent 16ccbee commit b725a9d

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

01_compare_linear_models.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
# which means we are just going to be using each feature as continuous even though it
2525
# may not be
2626
database = database[database.mv == 0]
27+
database = database[database.nrow >= 50]
2728

2829

2930
def load_data(data_name):
@@ -86,10 +87,15 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
8687
evaluated_datasets = []
8788
for i, dataset_name in enumerate(database.index.values):
8889
X, y = load_data(dataset_name)
89-
if len(y) > 25 and len(y) < 1000:
90-
print(i, dataset_name, len(y))
91-
nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
92-
results1.append(nested_scores1)
93-
results2.append(nested_scores2)
94-
results3.append(nested_scores3)
95-
evaluated_datasets.append(dataset_name)
90+
np.random.seed(0)
91+
if len(y) > 10000:
92+
# subset to 10000
93+
random_idx = np.random.choice(len(y), 10000, replace=False)
94+
X = X[random_idx, :]
95+
y = y[random_idx]
96+
print(i, dataset_name, len(y))
97+
nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
98+
results1.append(nested_scores1)
99+
results2.append(nested_scores2)
100+
results3.append(nested_scores3)
101+
evaluated_datasets.append(dataset_name)

0 commit comments

Comments
 (0)