Skip to content

Commit 1511fb8

Browse files
author
Sergey Feldman
committed
starting autogluon
1 parent 7e8e825 commit 1511fb8

File tree

3 files changed

+77
-2
lines changed

3 files changed

+77
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ dmypy.json
132132
.pyre/
133133

134134
.vscode
135+
.autogluon_temp
135136
foo-*
136137
*.dat
137138
*.cpp

03_autogluon.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import time
2+
import pickle
3+
import numpy as np
4+
import pandas as pd
5+
import autogluon as ag
6+
from autogluon import TabularPrediction as task
7+
from sklearn.model_selection import cross_val_score, StratifiedKFold
8+
from sklearn.metrics import roc_auc_score
9+
from utils import load_data
10+
11+
12+
# NOTE(review): N_JOBS is not referenced anywhere else in the visible part of this file
N_JOBS = 4 * 4 * 9
13+
# passed as time_limits to task.fit for every fold; also embedded in the results filename
SEC = 60
14+
15+
16+
def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
    """Score AutoGluon on (X, y) with 4-fold stratified cross-validation.

    For each outer fold, ``task.fit`` is called on the training rows with
    ``time_limits=SEC`` and ``auto_stack=True``; the held-out rows are then
    scored with weighted one-vs-rest ROC AUC.

    Args:
        X: 2-D feature array, one row per sample.
        y: labels, one per row of ``X``.
        random_state: seed for the shuffled ``StratifiedKFold`` split.

    Returns:
        A list with one ROC AUC score per outer fold.
    """
    # features and label live in one dataframe; the label column is named "y"
    data_df = pd.DataFrame(X)
    data_df["y"] = y
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    nested_scores = []
    for train_inds, test_inds in outer_cv.split(X, y):
        data_df_train = data_df.iloc[train_inds, :]
        data_df_test = data_df.iloc[test_inds, :]
        # NOTE(review): fitting optimises f1_weighted while the fold is scored
        # with ROC AUC below -- confirm this mismatch is intended
        predictor = task.fit(
            data_df_train,
            "y",
            time_limits=SEC,
            auto_stack=True,
            output_directory=".autogluon_temp",
            eval_metric="f1_weighted",
            verbosity=0,
        )
        # data_df_test still carries the "y" column; it is passed through unchanged
        y_pred = predictor.predict_proba(data_df_test)
        # same as roc_auc_ovr_weighted
        score = roc_auc_score(data_df_test["y"], y_pred, average="weighted", multi_class="ovr")
        nested_scores.append(score)
    return nested_scores
39+
40+
41+
# run model on all datasets
# NOTE: pickle.load can execute arbitrary code; only load result files produced by this project
with open("results/01_compare_baseline_models.pickle", "rb") as f:
    _, _, random_forest_results, evaluated_datasets, _ = pickle.load(f)

results = []
times = []
for i, dataset_name in enumerate(evaluated_datasets):
    X, y = load_data(dataset_name)
    # re-seed the global NumPy RNG for every dataset
    np.random.seed(0)
    if len(y) > 10000:
        # subset to 10000 if too large
        random_idx = np.random.choice(len(y), 10000, replace=False)
        X = X[random_idx, :]
        y = y[random_idx]
    print("starting:", dataset_name, X.shape)
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments
    start = time.perf_counter()
    nested_scores = define_and_evaluate_autogluon_pipeline(X, y)
    results.append(nested_scores)
    elapsed = time.perf_counter() - start
    times.append(elapsed)
    print("done. elapsed:", elapsed)
    # assumes random_forest_results is ordered like evaluated_datasets -- TODO confirm
    print(f"AutoGluon score: {np.mean(nested_scores)}, Random Forest score: {np.mean(random_forest_results[i])}")

# turn the per-dataset lists into arrays before saving
results = np.array(results)
times = np.array(times)

# save everything to disk so we can make plots elsewhere
# NOTE: the "autoglun" spelling is kept on purpose; other scripts may read this exact path
with open(f"results/03_autoglun_sec_{SEC}.pickle", "wb") as f:
    pickle.dump((results, times), f)

requirements.in

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1+
numpy
12
scipy
23
pandas
3-
scikit-learn
4-
seaborn
4+
scikit-learn>=0.24
5+
lightgbm>=3.1.1
6+
seaborn
7+
mxnet_cu101  # or mxnet if you don't have a GPU
8+
autogluon

0 commit comments

Comments
 (0)