
Commit 930eca7

Update generalist evaluation scripts
1 parent fc6d8db commit 930eca7

5 files changed: +149 additions, -90 deletions
finetuning/generalists/evaluate_generalist.py
Lines changed: 75 additions & 14 deletions

@@ -1,18 +1,79 @@
-from util import evaluate_checkpoint_for_datasets
+import argparse
+import os
+from subprocess import run

+from util import evaluate_checkpoint_for_dataset, ALL_DATASETS, EM_DATASETS, LM_DATASETS

-# TODO extend this to run the full evaluation protocol for a generalist.
+EXPERIMENT_ROOT = "/scratch/projects/nim00007/sam/experiments/generalists"
+CHECKPOINTS = {
+    "vit_b": "/home/nimcpape/.sam_models/sam_vit_b_01ec64.pth",
+    "vit_h": "/home/nimcpape/.sam_models/sam_vit_h_4b8939.pth",
+}

-checkpoint = "/scratch-grete/projects/nim00007/sam/LM/generalist/vit_b/epoch-30.pt"
-root = "/scratch-grete/projects/nim00007/sam/experiments/generalists/lm/test"
-datasets = ["covid-if"]

-evaluate_checkpoint_for_datasets(
-    checkpoint=checkpoint,
-    model_type="vit_b",
-    experiment_root=root,
-    datasets=datasets,
-    run_default_evaluation=True,
-    run_amg=True,
-    max_num_val_images=10,
-)
+def submit_array_job(model_name, datasets, amg):
+    n_datasets = len(datasets)
+    cmd = ["sbatch", "-a", f"0-{n_datasets-1}", "evaluate_generalist.sbatch", model_name, "--datasets"]
+    cmd.extend(datasets)
+    if amg:
+        cmd.append("--amg")
+    run(cmd)
+
+
+def evaluate_dataset_slurm(model_name, dataset, run_amg):
+    max_num_val_images = None
+    if run_amg:
+        if dataset in EM_DATASETS:
+            run_amg = False
+        else:
+            run_amg = True
+            max_num_val_images = 100
+
+    is_custom_model = model_name not in ("vit_h", "vit_b")
+    checkpoint = CHECKPOINTS[model_name]
+    model_type = model_name[:5]
+
+    experiment_folder = os.path.join(EXPERIMENT_ROOT, model_name, dataset)
+    evaluate_checkpoint_for_dataset(
+        checkpoint, model_type, dataset, experiment_folder,
+        run_default_evaluation=True, run_amg=run_amg,
+        is_custom_model=is_custom_model,
+        max_num_val_images=max_num_val_images,
+    )
+
+
+def _get_datasets(lm, em):
+    assert lm or em
+    datasets = []
+    if lm:
+        datasets.extend(LM_DATASETS)
+    if em:
+        datasets.extend(EM_DATASETS)
+    return datasets
+
+
+# evaluation on slurm
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name")
+    parser.add_argument("--lm", action="store_true")
+    parser.add_argument("--em", action="store_true")
+    parser.add_argument("--amg", action="store_true")
+    parser.add_argument("--datasets", nargs="+")
+    args = parser.parse_args()
+
+    datasets = args.datasets
+    if datasets is None or len(datasets) == 0:
+        datasets = _get_datasets(args.lm, args.em)
+    assert all(ds in ALL_DATASETS for ds in datasets)
+
+    job_id = os.environ.get("SLURM_ARRAY_TASK_ID", None)
+    if job_id is None:  # this is the main script that submits slurm jobs
+        submit_array_job(args.model_name, datasets, args.amg)
+    else:  # we're in a slurm job and precompute a setting
+        job_id = int(job_id)
+        evaluate_dataset_slurm(args.model_name, datasets[job_id], args.amg)
+
+
+if __name__ == "__main__":
+    main()
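
For orientation, a minimal usage sketch of the new command-line interface; the invocations are only illustrative and are inferred from the argparse flags defined above, and the dataset name "covid-if" is taken from the old hard-coded example.

# Minimal usage sketch (illustrative, not part of the commit); assumes the "sam" conda
# environment and the cluster paths hard-coded above are available.
from subprocess import run

# submit one slurm array task per light-microscopy dataset, with AMG evaluation enabled
run(["python", "evaluate_generalist.py", "vit_b", "--lm", "--amg"], check=True)

# evaluate the vit_h checkpoint on a hand-picked dataset only
run(["python", "evaluate_generalist.py", "vit_h", "--datasets", "covid-if"], check=True)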
finetuning/generalists/evaluate_generalist.sbatch
Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+#! /bin/bash
+#SBATCH -c 4
+#SBATCH --mem 48G
+#SBATCH -t 720
+#SBATCH -p grete:shared
+#SBATCH -G A100:1
+#SBATCH -A nim00007
+
+source activate sam
+python evaluate_generalist.py $@

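Together, evaluate_generalist.py and this batch file implement the usual Slurm array-job dispatch: the same Python entry point first runs outside of Slurm, where it submits itself via sbatch -a 0-{N-1}, and then once per array task, where it picks its work item from SLURM_ARRAY_TASK_ID. A self-contained sketch of that pattern, with a made-up work list and batch file name:

# Minimal sketch of the Slurm array-job dispatch pattern used above;
# the work list and the batch file name are hypothetical.
import os
from subprocess import run

WORK_ITEMS = ["dataset-a", "dataset-b", "dataset-c"]  # hypothetical work items

task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
if task_id is None:
    # not running under Slurm yet: submit one array task per work item (indices 0..N-1)
    run(["sbatch", "-a", f"0-{len(WORK_ITEMS) - 1}", "my_job.sbatch"], check=True)
else:
    # inside array task i: process exactly WORK_ITEMS[i]
    print("processing", WORK_ITEMS[int(task_id)])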
finetuning/generalists/precompute_prompts.py
Lines changed: 51 additions & 3 deletions

@@ -1,10 +1,12 @@
 import argparse
 import os
+import pickle

 from subprocess import run

 import micro_sam.evaluation as evaluation
-from util import get_data_paths, ALL_DATASETS
+from util import get_data_paths, ALL_DATASETS, LM_DATASETS
+from tqdm import tqdm

 PROMPT_ROOT = "/scratch/projects/nim00007/sam/experiments/prompts"

@@ -32,17 +34,63 @@ def submit_array_job():
     run(cmd)


+def _check_prompts(dataset, settings, expected_len):
+    prompt_folder = os.path.join(PROMPT_ROOT, dataset)
+
+    def check_prompt_file(prompt_file):
+        assert os.path.exists(prompt_file), prompt_file
+        with open(prompt_file, "rb") as f:
+            prompts = pickle.load(f)
+        assert len(prompts) == expected_len, f"{len(prompts)}, {expected_len}"
+
+    for setting in settings:
+        pos, neg = setting["n_positives"], setting["n_negatives"]
+        prompt_file = os.path.join(prompt_folder, f"points-p{pos}-n{neg}.pkl")
+        if pos == 0 and neg == 0:
+            prompt_file = os.path.join(prompt_folder, "boxes.pkl")
+        check_prompt_file(prompt_file)
+
+
+def check_prompts_and_datasets():
+
+    def check_dataset(dataset):
+        try:
+            images, _ = get_data_paths(dataset, "test")
+        except AssertionError as e:
+            print("Checking test split failed for datasset", dataset, "due to", e)
+
+        if dataset not in LM_DATASETS:
+            return len(images)
+
+        try:
+            get_data_paths(dataset, "val")
+        except AssertionError as e:
+            print("Checking val split failed for datasset", dataset, "due to", e)
+
+        return len(images)
+
+    settings = evaluation.default_experiment_settings()
+    for ds in tqdm(ALL_DATASETS, desc="Checking datasets"):
+        n_images = check_dataset(ds)
+        _check_prompts(ds, settings, n_images)
+    print("All checks done!")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-d", "--dataset")
+    parser.add_argument("--check", "-c", action="store_true")
     args = parser.parse_args()
+
+    if args.check:
+        check_prompts_and_datasets()
+        return
+
     if args.dataset is not None:
         precompute_prompts(args.dataset)
         return

-    # this will fail if the dataset is invalid
     job_id = os.environ.get("SLURM_ARRAY_TASK_ID", None)
-
     if job_id is None:  # this is the main script that submits slurm jobs
         submit_array_job()
     else:  # we're in a slurm job and precompute a setting

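The new --check flag relies on the file layout that precompute_prompts.py writes below PROMPT_ROOT: one pickled list per dataset and prompt setting, named points-p{n_positives}-n{n_negatives}.pkl for point prompts and boxes.pkl when both counts are zero, with one entry per test image. A small sketch of inspecting such a file; the folder and the p1-n0 setting are placeholders:

# Sketch for inspecting a single precomputed prompt file; paths and setting are placeholders.
import os
import pickle

prompt_folder = "/path/to/prompts/some-dataset"  # in the repo this would be PROMPT_ROOT/<dataset>
prompt_file = os.path.join(prompt_folder, "points-p1-n0.pkl")  # one positive point, no negatives

with open(prompt_file, "rb") as f:
    prompts = pickle.load(f)

# _check_prompts asserts that this matches the number of test images of the dataset
print(len(prompts), "prompt entries")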
finetuning/generalists/precompute_prompts.sbatch
Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 #! /bin/bash
 #SBATCH -c 4
 #SBATCH --mem 48G
-#SBATCH -t 720
+#SBATCH -t 2000
 #SBATCH -p grete:shared
 #SBATCH -G A100:1
 #SBATCH -A nim00007

finetuning/generalists/util.py
Lines changed: 12 additions & 72 deletions

@@ -1,12 +1,9 @@
-import argparse
 import json
 import os
-import pickle
 import warnings

 from glob import glob
 from pathlib import Path
-from tqdm import tqdm

 import pandas as pd
 from micro_sam.evaluation import (
@@ -67,68 +64,25 @@ def get_data_paths(dataset, split, max_num_images=None):
     return image_paths, gt_paths


-def _check_prompts(dataset, settings, expected_len):
-    prompt_folder = os.path.join(PROMPT_ROOT, dataset)
-
-    def check_prompt_file(prompt_file):
-        assert os.path.exists(prompt_file), prompt_file
-        with open(prompt_file, "rb") as f:
-            prompts = pickle.load(f)
-        assert len(prompts) == expected_len, f"{len(prompts)}, {expected_len}"
-
-    for setting in settings:
-        pos, neg = setting["n_positives"], setting["n_negatives"]
-        prompt_file = os.path.join(prompt_folder, f"points-p{pos}-n{neg}.pkl")
-        if pos == 0 and neg == 0:
-            prompt_file = os.path.join(prompt_folder, "boxes.pkl")
-        check_prompt_file(prompt_file)
-
-    print("All files checked!")
-
-
-def check_all_datasets(check_prompts=False):
-
-    def check_dataset(dataset):
-        try:
-            images, _ = get_data_paths(dataset, "test")
-        except AssertionError as e:
-            print("Checking test split failed for datasset", dataset, "due to", e)
-
-        if dataset not in LM_DATASETS:
-            return len(images)
-
-        try:
-            get_data_paths(dataset, "val")
-        except AssertionError as e:
-            print("Checking val split failed for datasset", dataset, "due to", e)
-
-        return len(images)
-
-    settings = default_experiment_settings()
-    for ds in tqdm(ALL_DATASETS, desc="Checking datasets"):
-        n_images = check_dataset(ds)
-        if check_prompts:
-            _check_prompts(ds, settings, n_images)
-    print("All checks done!")
-
-
 ###
 # Evaluation functionality
 ###


-def get_generalist_predictor(checkpoint, model_type, return_state=False):
+def get_generalist_predictor(checkpoint, model_type, is_custom_model, return_state=False):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         return inference.get_predictor(
-            checkpoint, model_type=model_type, return_state=return_state, is_custom_model=True
+            checkpoint, model_type=model_type,
+            return_state=return_state, is_custom_model=is_custom_model
         )


+# TODO use model comparison func to generate the image data for qualitative comp
 def evaluate_checkpoint_for_dataset(
     checkpoint, model_type, dataset, experiment_folder,
-    run_default_evaluation, run_amg, predictor=None,
-    max_num_val_images=None,
+    run_default_evaluation, run_amg, is_custom_model,
+    predictor=None, max_num_val_images=None,
 ):
     """Evaluate a generalist checkpoint for a given dataset.
     """
@@ -137,7 +91,7 @@ def evaluate_checkpoint_for_dataset(
     prompt_dir = os.path.join(PROMPT_ROOT, dataset)

     if predictor is None:
-        predictor = get_generalist_predictor(checkpoint, model_type)
+        predictor = get_generalist_predictor(checkpoint, model_type, is_custom_model)
     test_image_paths, test_gt_paths = get_data_paths(dataset, "test")

     embedding_dir = os.path.join(experiment_folder, "test", "embeddings")
@@ -208,11 +162,11 @@ def evaluate_checkpoint_for_dataset(

 def evaluate_checkpoint_for_datasets(
     checkpoint, model_type, experiment_root, datasets,
-    run_default_evaluation, run_amg, predictor=None,
-    max_num_val_images=None,
+    run_default_evaluation, run_amg, is_custom_model,
+    predictor=None, max_num_val_images=None,
 ):
     if predictor is None:
-        predictor = get_generalist_predictor(checkpoint, model_type)
+        predictor = get_generalist_predictor(checkpoint, model_type, is_custom_model)

     results = []
     for dataset in datasets:
@@ -221,23 +175,9 @@ def evaluate_checkpoint_for_datasets(
         result = evaluate_checkpoint_for_dataset(
             None, None, dataset, experiment_folder,
             run_default_evaluation=run_default_evaluation,
-            run_amg=run_amg, predictor=predictor,
-            max_num_val_images=max_num_val_images,
+            run_amg=run_amg, is_custom_model=is_custom_model,
+            predictor=predictor, max_num_val_images=max_num_val_images,
         )
         results.append(result)

     return pd.concat(results)
-
-
-def evaluate_checkpoint_for_datasets_slurm(
-    checkpoint, model_type, experiment_root, datasets,
-    run_default_evaluation, run_amg,
-):
-    raise NotImplementedError
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--check_prompts", "-c", action="store_true")
-    args = parser.parse_args()
-    check_all_datasets(args.check_prompts)

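Since is_custom_model is now a required argument of get_generalist_predictor and both evaluate_* helpers, callers have to pass it explicitly. A hypothetical call of the updated evaluate_checkpoint_for_datasets; the checkpoint and experiment paths are placeholders:

# Hypothetical caller of the updated API; the paths below are placeholders.
from util import evaluate_checkpoint_for_datasets

results = evaluate_checkpoint_for_datasets(
    checkpoint="/path/to/generalist_checkpoint.pt",  # placeholder
    model_type="vit_b",
    experiment_root="/path/to/experiments",          # placeholder
    datasets=["covid-if"],
    run_default_evaluation=True,
    run_amg=False,
    is_custom_model=True,  # False when evaluating the vanilla SAM checkpoints
    max_num_val_images=100,
)
print(results)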