Skip to content

Commit f81b400

Browse files
Implement training evolution eval
1 parent 8f7dec0 commit f81b400

File tree

2 files changed

+48
-28
lines changed

2 files changed

+48
-28
lines changed

finetuning/generalists/evaluate_training_evolution.py

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,77 @@
11
import argparse
22
import os
3+
34
from glob import glob
5+
from subprocess import run
46

57
import pandas as pd
68
from util import evaluate_checkpoint_for_datasets, get_generalist_predictor
79

8-
# Cluster locations: where the generalist checkpoints live and where
# evaluation results are written.
CHECKPOINT_ROOT = "/scratch/projects/nim00007/sam/models/LM/generalist/v2"
EXPERIMENT_ROOT = "/scratch/projects/nim00007/sam/experiments/training-evolution"

# We evaluate these three datasets for the training evolution.
# These are chosen based on observations from preliminary experiments.
# - covid-if: out-of-domain dataset that shows the expected improvement (over vanilla).
# - deepbacs: in domain dataset where we see the biggest gap to the specialist.
# - lizard: out-of-domain that is furthest from the training data.
EVAL_DATASETS = ("covid-if", "deepbacs", "lizard")
1718

18-
def evaluate_training_evolution(model_type):
19-
checkpoints = sorted(glob(
20-
os.path.join(CHECKPOINT_ROOT, model_type, "*.pt")
21-
))
22-
assert len(checkpoints) > 0
2319

24-
epochs, results = [], []
25-
for checkpoint in checkpoints:
20+
def evaluate_checkpoint_slurm(model_type, job_id, checkpoints):
    """Evaluate one training checkpoint, selected by the slurm array task id.

    Loads the generalist predictor for ``checkpoints[job_id]`` and runs the
    default evaluation (AMG disabled) on all datasets in EVAL_DATASETS,
    writing into a per-epoch experiment directory.

    Returns the result table with an 'epoch' column prepended.
    """
    ckpt = checkpoints[job_id]

    predictor, state = get_generalist_predictor(
        ckpt, model_type, is_custom_model=True, return_state=True
    )
    # +1: report epochs starting at 1 (state presumably stores the
    # zero-based epoch index — confirm against the training code).
    epoch = state["epoch"] + 1

    print("Run evaluation for", model_type, "epoch", epoch)
    exp_root = os.path.join(EXPERIMENT_ROOT, f"{model_type}-epoch-{epoch}")
    res = evaluate_checkpoint_for_datasets(
        None, None, exp_root, EVAL_DATASETS,
        run_default_evaluation=True, run_amg=False,
        is_custom_model=True, predictor=predictor,
    )

    # Tag every row with the epoch so results from different checkpoints
    # can be concatenated later.
    res.insert(0, "epoch", [epoch] * res.shape[0])
    return res
4439

40+
def evaluate_training_evolution(model_type, checkpoints):
    """Evaluate all checkpoints sequentially and save the combined results.

    Runs the per-checkpoint evaluation for every checkpoint in order and
    writes the concatenated table to ``EXPERIMENT_ROOT/<model_type>.csv``.
    """
    all_results = [
        evaluate_checkpoint_slurm(model_type, idx, checkpoints)
        for idx in range(len(checkpoints))
    ]
    combined = pd.concat(all_results)
    save_path = os.path.join(EXPERIMENT_ROOT, f"{model_type}.csv")
    combined.to_csv(save_path, index=False)
4848

4949

50+
def submit_array_job(model_type, checkpoints):
    """Submit a slurm array job with one task per checkpoint.

    Each array task re-invokes this script (via the sbatch file) with
    SLURM_ARRAY_TASK_ID selecting the checkpoint to evaluate.
    """
    last = len(checkpoints) - 1
    run([
        "sbatch", "-a", f"0-{last}",
        "evaluate_training_evolution.sbatch", model_type,
    ])
54+
55+
5056
def main():
    """Dispatch entry point for the training-evolution evaluation.

    Modes:
    - ``-e/--evaluate``: evaluate all checkpoints sequentially in this
      process and write the combined CSV.
    - default, outside slurm (no SLURM_ARRAY_TASK_ID): submit a slurm
      array job covering all checkpoints.
    - default, inside a slurm array task: evaluate only the checkpoint
      selected by SLURM_ARRAY_TASK_ID.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model_type")
    parser.add_argument("-e", "--evaluate", action="store_true")
    args = parser.parse_args()

    checkpoints = sorted(glob(os.path.join(CHECKPOINT_ROOT, args.model_type, "epoch-*.pt")))
    # Fail loudly on a wrong model type / path. (A bare assert would be
    # silently stripped when running under `python -O`.)
    if not checkpoints:
        raise RuntimeError(
            f"No checkpoints found for model type '{args.model_type}' under {CHECKPOINT_ROOT}."
        )

    if args.evaluate:
        evaluate_training_evolution(args.model_type, checkpoints)
        return

    job_id = os.environ.get("SLURM_ARRAY_TASK_ID", None)
    if job_id is None:  # this is the main script that submits slurm jobs
        submit_array_job(args.model_type, checkpoints)
    else:  # we're in a slurm job
        evaluate_checkpoint_slurm(args.model_type, int(job_id), checkpoints)
5575

5676

5777
if __name__ == "__main__":

finetuning/generalists/evaluate_training_evolution.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#! /bin/bash
22
#SBATCH -c 4
33
#SBATCH --mem 96G
4-
#SBATCH -t 2880
4+
#SBATCH -t 240
55
#SBATCH -p grete:shared
66
#SBATCH -G A100:1
77
#SBATCH -A nim00007

0 commit comments

Comments
 (0)