|
| 1 | +from datasets import load_dataset, concatenate_datasets |
| 2 | +import pandas as pd |
| 3 | +import logging |
| 4 | +import numpy as np |
| 5 | +import argparse |
| 6 | +from sklearn.metrics import classification_report |
| 7 | + |
| 8 | +import wandb |
| 9 | + |
# Configure the root logger at import time so INFO-level records are emitted.
logging.basicConfig(level="INFO")
| 11 | + |
| 12 | + |
def load_data(dataset_name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load a Hugging Face dataset and return its train and test splits as DataFrames.

    Some datasets publish a split as numbered shards (``train_0``, ``train_1``, ...)
    instead of one split. Any such shards of ``train`` / ``validation`` are
    concatenated, in numeric order, into the plain split name and then removed.
    A split that has no shards is left as it is.

    Rows whose ``label`` is missing receive one extra class id,
    ``max(train labels) + 1``, in both the train and the test frame, so the same
    id is used on both sides.

    Args:
        dataset_name (str): The name of the dataset to load.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The train and test splits, in that order.

    Raises:
        KeyError: If the loaded dataset has no ``train`` or ``test`` split, or a
            split has no ``label`` column.
    """
    dataset = load_dataset(dataset_name)

    for split in ("train", "validation"):
        prefix = f"{split}_"
        # Materialise the shard names first: the mapping is mutated below.
        shard_names = sorted(
            (name for name in dataset if name.startswith(prefix) and name[len(prefix):].isdecimal()),
            key=lambda name: int(name[len(prefix):]),
        )
        if shard_names:
            dataset[split] = concatenate_datasets([dataset.pop(name) for name in shard_names])

    train_df = dataset["train"].to_pandas()
    test_df = dataset["test"].to_pandas()

    # The fill value is derived from the training labels only and reused for test.
    fill_label = train_df["label"].max() + 1
    for frame in (train_df, test_df):
        frame.loc[frame["label"].isna(), "label"] = fill_label
    return train_df, test_df
| 42 | + |
| 43 | + |
def evalute_fedot(train_df: pd.DataFrame, test_df: pd.DataFrame, timeout: float = 5):
    """
    Fit a FEDOT classifier on the training utterances and predict the test utterances.

    Only the ``utterance`` column is used as input; training labels are cast to
    ``int`` before fitting. Test labels are not read here.

    Args:
        train_df (pd.DataFrame): Training data with ``utterance`` and ``label`` columns.
        test_df (pd.DataFrame): Test data with an ``utterance`` column.
        timeout (float): Forwarded unchanged as ``Fedot(timeout=...)``.
            NOTE(review): FEDOT's API describes this budget in minutes; confirm
            against the installed version.

    Returns:
        The object returned by ``Fedot.predict`` for the test utterances.
    """
    # !pip install fedot
    from fedot.api.main import Fedot

    train_features = train_df[["utterance"]]
    train_target = train_df["label"].astype(int)
    test_features = test_df[["utterance"]]

    model = Fedot(problem="classification", timeout=timeout, preset="best_quality", n_jobs=-1)
    model.fit(features=train_features, target=train_target)
    return model.predict(features=test_features)
| 61 | + |
| 62 | + |
def evaluate_h2o(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    max_models: int = 20,
    max_runtime_secs: int = 600,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Run H2O AutoML on averaged Word2Vec embeddings of the utterances.

    Steps: start/connect to H2O, split the training frame 80/20 into
    train/validation, tokenize the ``utterance`` column on whitespace, train a
    Word2Vec model on the training tokens, average the word vectors per
    utterance, and run ``H2OAutoML`` on those vectors with ``label`` as a
    categorical target.

    The leaderboard is ranked on the held-out validation part, not on the test
    set, so the test data plays no role in choosing ``aml.leader``.

    Args:
        train_df (pd.DataFrame): Training data with ``utterance`` and ``label`` columns.
        test_df (pd.DataFrame): Test data with ``utterance`` and ``label`` columns.
        max_models (int): Forwarded to ``H2OAutoML(max_models=...)``.
        max_runtime_secs (int): Forwarded to ``H2OAutoML(max_runtime_secs=...)``.
        seed (int): Seed for the train/validation split and for AutoML.

    Returns:
        pd.DataFrame: A single ``predict`` column with the leader model's
        predicted class for each test row.
    """
    # !pip install h2o
    import h2o
    from h2o.automl import H2OAutoML
    from h2o.estimators.word2vec import H2OWord2vecEstimator

    text_col = "utterance"
    label_col = "label"

    h2o.init()

    train_h2o = h2o.H2OFrame(train_df)
    test_h2o = h2o.H2OFrame(test_df)
    # The target must be categorical for H2O to treat this as classification.
    train_h2o[label_col] = train_h2o[label_col].asfactor()
    test_h2o[label_col] = test_h2o[label_col].asfactor()
    # Seed the split so repeated runs use the same train/validation rows.
    train, valid = train_h2o.split_frame(ratios=[0.8], seed=seed)

    # Word2Vec consumes token frames, so each text column is tokenized first.
    train_tokens = train[text_col].tokenize("\\s+")
    valid_tokens = valid[text_col].tokenize("\\s+")
    test_tokens = test_h2o[text_col].tokenize("\\s+")

    # The embedding model is trained on the training tokens only.
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    w2v_model.train(training_frame=train_tokens)

    train_vecs = w2v_model.transform(train_tokens, aggregate_method="AVERAGE")
    valid_vecs = w2v_model.transform(valid_tokens, aggregate_method="AVERAGE")
    test_vecs = w2v_model.transform(test_tokens, aggregate_method="AVERAGE")

    train_ext = train_vecs.cbind(train[label_col])
    valid_ext = valid_vecs.cbind(valid[label_col])
    test_ext = test_vecs.cbind(test_h2o[label_col])

    aml = H2OAutoML(
        max_models=max_models,
        max_runtime_secs=max_runtime_secs,
        seed=seed,
        balance_classes=True,
        sort_metric="mean_per_class_error",
    )
    aml.train(
        x=train_vecs.columns,
        y=label_col,
        training_frame=train_ext,
        validation_frame=valid_ext,
        # Rank models on validation data; ranking on the test frame would leak
        # test information into the choice of the leader.
        leaderboard_frame=valid_ext,
    )

    preds = aml.leader.predict(test_ext)
    # Hand back pandas data so callers can feed it to pandas/sklearn directly.
    return preds["predict"].as_data_frame()
| 122 | + |
| 123 | + |
def evaluate_lama(train_df: pd.DataFrame, test_df: pd.DataFrame) -> np.ndarray:
    """
    Fit LightAutoML's NLP preset on the training data and predict test labels.

    The model outputs one score per class; the highest-scoring column is taken
    as the prediction. If the fitted reader exposes a class mapping (original
    label -> column index), the column index is translated back to the
    original label.

    Args:
        train_df (pd.DataFrame): Training data with ``utterance`` and ``label`` columns.
        test_df (pd.DataFrame): Test data with an ``utterance`` column.

    Returns:
        np.ndarray: One predicted label per test row.
    """
    # !pip install lightautoml[nlp]
    from lightautoml.automl.presets.text_presets import TabularNLPAutoML
    from lightautoml.tasks import Task
    # pytorch<2.7.0
    # https://github.com/sb-ai-lab/LightAutoML/issues/173

    automl = TabularNLPAutoML(task=Task(name="multiclass", metric="f1_macro"))
    # Declare the utterance column as text explicitly instead of relying on
    # role inference. NOTE(review): confirm inference does not already do this.
    automl.fit_predict(train_df, roles={"target": "label", "text": ["utterance"]})
    class_scores = automl.predict(test_df).data
    class_indices = np.argmax(class_scores, axis=-1)

    # NOTE(review): LightAutoML readers are expected to expose ``class_mapping``
    # when they re-encode multiclass targets; looked up defensively so a
    # missing attribute falls back to returning the raw column indices.
    class_mapping = getattr(getattr(automl, "reader", None), "class_mapping", None)
    if not class_mapping:
        return class_indices
    index_to_label = {index: label for label, index in class_mapping.items()}
    return np.array([index_to_label[index] for index in class_indices])
| 142 | + |
| 143 | + |
def evaluate_gama(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for GAMA evaluation; always raises.

    An earlier attempt fitted ``gama.GamaClassifier(max_total_time=180,
    store="nothing")`` on the ``utterance`` column and failed with
    ``ValueError: population must be at least size 3 for a pair to be selected``.
    Until that is resolved, this function is intentionally not implemented.

    Args:
        train_df (pd.DataFrame): The training data (unused).
        test_df (pd.DataFrame): The testing data (unused).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("GAMA is not working yet.")
| 160 | + |
| 161 | + |
def evaluate_glueon(train_df: pd.DataFrame, test_df: pd.DataFrame, time_limit: int = 180) -> pd.DataFrame:
    """
    Fit an AutoGluon ``MultiModalPredictor`` on the training data and predict the test data.

    The predictor is configured for a multiclass problem on the ``label`` column
    with ``acc`` as the evaluation metric. Model artifacts go to a fresh,
    uniquely named directory under the system temporary directory.

    Args:
        train_df (pd.DataFrame): Training data including the ``label`` column.
        test_df (pd.DataFrame): Test data to predict.
        time_limit (int): Forwarded to ``predictor.fit(time_limit=...)``.

    Returns:
        The object returned by ``MultiModalPredictor.predict`` for ``test_df``.
    """
    # !pip install autogluon
    import tempfile
    import uuid
    from pathlib import Path

    from autogluon.multimodal import MultiModalPredictor

    # Resolve the temp directory from the platform rather than hard-coding /tmp;
    # the random prefix keeps separate runs from sharing a model directory.
    model_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}-automm_sst")
    predictor = MultiModalPredictor(label="label", problem_type="multiclass", eval_metric="acc", path=model_path)
    predictor.fit(train_df, time_limit=time_limit)
    return predictor.predict(test_df)
| 179 | + |
| 180 | + |
def main():
    """
    Command-line entry point: evaluate one AutoML framework on one dataset.

    Parses ``--dataset`` and ``--framework`` (both required), opens a Weights &
    Biases run, loads the data, runs the chosen evaluator, and logs the
    predictions table and the sklearn classification report to the run.

    The run is used as a context manager so it is closed even when loading or
    evaluation raises.
    """
    evaluators = {
        "fedot": evalute_fedot,
        "h2o": evaluate_h2o,
        "lama": evaluate_lama,
        "gama": evaluate_gama,
        "glueon": evaluate_glueon,
    }

    parser = argparse.ArgumentParser(description="Evaluate AutoML models on a dataset.")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="The name of the dataset to evaluate.",
    )
    parser.add_argument(
        "--framework",
        type=str,
        required=True,
        choices=list(evaluators),
        help="The name of the model to evaluate.",
    )
    args = parser.parse_args()
    dataset_name = args.dataset
    framework = args.framework

    with wandb.init(
        project="AutoML-Eval",
        name=f"eval-{dataset_name}-{framework}",
        tags=[dataset_name, framework],
        config={
            "dataset": dataset_name,
            "framework": framework,
        },
    ) as run:
        train_df, test_df = load_data(dataset_name)

        # argparse has already restricted ``framework`` to the table's keys.
        predictions = evaluators[framework](train_df, test_df)

        run.log({"predictions": wandb.Table(dataframe=pd.DataFrame(predictions))})
        report = classification_report(test_df["label"], predictions, output_dict=True)
        run.log(report)


if __name__ == "__main__":
    main()
0 commit comments