|
| 1 | +import time |
| 2 | +import pickle |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +import autogluon as ag |
| 6 | +from autogluon import TabularPrediction as task |
| 7 | +from sklearn.model_selection import cross_val_score, StratifiedKFold |
| 8 | +from sklearn.metrics import roc_auc_score |
| 9 | +from utils import load_data |
| 10 | + |
| 11 | + |
# NOTE(review): N_JOBS is not referenced anywhere in the visible part of this
# script; presumably a leftover parallelism budget -- confirm before removing.
N_JOBS = 4 * 4 * 9

# Budget handed to AutoGluon as ``time_limits`` for every fit; it is also
# embedded in the name of the results file written at the end of the script.
SEC = 60
| 14 | + |
| 15 | + |
def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
    """Score AutoGluon on ``(X, y)`` with 4-fold stratified cross-validation.

    The features are wrapped in a DataFrame with the labels stored in an extra
    column named ``"y"``. For each fold a fresh predictor is fitted on the
    training rows (time budget ``SEC``, auto-stacking enabled, models written
    to ``.autogluon_temp``) and its class probabilities on the held-out rows
    are scored with weighted one-vs-rest ROC AUC.

    Args:
        X: Feature matrix; anything ``pd.DataFrame`` and
            ``StratifiedKFold.split`` both accept.
        y: Class labels aligned with the rows of ``X``.
        random_state: Seed for the shuffled stratified split.

    Returns:
        A list with one ROC AUC score per fold, in fold order.
    """
    # autogluon dataframes: features plus the label column it is told to predict
    frame = pd.DataFrame(X)
    frame["y"] = y

    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)

    def _score_fold(fit_rows, holdout_rows):
        """Fit on ``fit_rows`` and return the ROC AUC on ``holdout_rows``."""
        fit_frame = frame.iloc[fit_rows, :]
        holdout_frame = frame.iloc[holdout_rows, :]
        model = task.fit(
            fit_frame,
            "y",
            time_limits=SEC,
            auto_stack=True,
            output_directory=".autogluon_temp",
            eval_metric="f1_weighted",
            verbosity=0,
        )
        # NOTE(review): the held-out frame still carries the "y" column when it
        # is handed to predict_proba; presumably AutoGluon ignores it -- confirm.
        probabilities = model.predict_proba(holdout_frame)
        # same as roc_auc_ovr_weighted
        return roc_auc_score(
            holdout_frame["y"],
            probabilities,
            average="weighted",
            multi_class="ovr",
        )

    return [
        _score_fold(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X, y)
    ]
| 39 | + |
| 40 | + |
# run model on all datasets
# The baseline pickle holds a 5-tuple; only the random-forest scores (third
# item) and the list of evaluated dataset names (fourth item) are used here.
with open("results/01_compare_baseline_models.pickle", "rb") as f:
    baseline_payload = pickle.load(f)
(_, _, random_forest_results, evaluated_datasets, _) = baseline_payload
| 44 | + |
results, times = [], []
for dataset_index, dataset_name in enumerate(evaluated_datasets):
    X, y = load_data(dataset_name)
    # Reset the global NumPy seed for every dataset so the sub-sample drawn
    # below is the same on every run.
    np.random.seed(0)
    n_rows = len(y)
    if n_rows > 10000:
        # subset to 10000 if too large
        keep = np.random.choice(n_rows, 10000, replace=False)
        X, y = X[keep, :], y[keep]
    print("starting:", dataset_name, X.shape)

    started_at = time.time()
    fold_scores = define_and_evaluate_autogluon_pipeline(X, y)
    results.append(fold_scores)
    elapsed = time.time() - started_at
    times.append(elapsed)
    print("done. elapsed:", elapsed)

    # Compare against the random-forest baseline stored at the same index.
    autogluon_mean = np.mean(fold_scores)
    forest_mean = np.mean(random_forest_results[dataset_index])
    # NOTE(review): "AutoGluone" is misspelled in the message; kept as-is so the
    # printed output stays unchanged.
    print(f"AutoGluone score: {autogluon_mean}, Random Forest score: {forest_mean}")
| 63 | + |
# Turn the collected per-dataset fold scores and wall-clock times into arrays.
results, times = np.array(results), np.array(times)

# save everything to disk so we can make plots elsewhere
# NOTE(review): the filename spells "autoglun"; presumably the plotting code
# expects this exact name -- confirm before renaming.
output_path = f"results/03_autoglun_sec_{SEC}.pickle"
with open(output_path, "wb") as f:
    pickle.dump((results, times), f)
0 commit comments