Fix eval + add smoke test (axolotl-ai-cloud#2586)

djsaunde · web-flow · commit c7d07de6b47b · 2025-04-29T12:58:54.000-04:00
* fix evaluate CLI

* add smoke test

* fix naming

* lint
diff --git a/src/axolotl/cli/evaluate.py b/src/axolotl/cli/evaluate.py
@@ -1,6 +1,7 @@
 """CLI to run evaluation on a model."""
 
 import logging
+import os
 from pathlib import Path
 from typing import Union
 
@@ -14,6 +15,7 @@
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault
 
 LOG = logging.getLogger(__name__)
@@ -29,10 +31,14 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
         cli_args: CLI arguments.
     """
+    # Enable expandable segments for cuda allocation to improve VRAM usage
+    set_pytorch_cuda_alloc_conf()
+
     # pylint: disable=duplicate-code
     print_axolotl_text_art()
     check_accelerate_default_config()
-    check_user_token()
+    if int(os.getenv("LOCAL_RANK", "0")) == 0:
+        check_user_token()
 
     if cfg.rl:
         dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
diff --git a/src/axolotl/evaluate.py b/src/axolotl/evaluate.py
@@ -12,19 +12,20 @@
 from transformers.trainer import Trainer
 
 from axolotl.logging_config import configure_logging
-from axolotl.train import TrainDatasetMeta
-from axolotl.utils import set_pytorch_cuda_alloc_conf
+from axolotl.train import (
+    TrainDatasetMeta,
+    setup_model_and_tokenizer,
+)
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
-from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
 
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 
 configure_logging()
-LOG = get_logger("axolotl.evaluate")
+LOG = get_logger(__name__)
 
 
 def evaluate_dataset(
@@ -75,37 +76,22 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
     Returns:
         Dictionary mapping metric names to their values.
     """
-    # pylint: disable=duplicate-code
-    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
-
-    # Load tokenizer
-    LOG.debug(
-        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
-        main_process_only=True,
-    )
-    tokenizer = load_tokenizer(cfg)
-
-    # Load processor for multimodal models if needed
-    processor = None
-    if cfg.is_multimodal:
-        processor = load_processor(cfg, tokenizer)
+    # Load tokenizer, processor and model
+    LOG.debug("loading model for evaluation...")
+    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
 
     # Get datasets
+    # pylint: disable=duplicate-code
     train_dataset = dataset_meta.train_dataset
     eval_dataset = dataset_meta.eval_dataset
     total_num_steps = dataset_meta.total_num_steps
 
-    # Load model
-    LOG.debug("loading model for evaluation...")
-    model, _ = load_model(cfg, tokenizer, processor=processor)
-
     # Set up trainer
     trainer = setup_trainer(
-        cfg,
+        cfg=cfg,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        model=(model, None, None),  # No need for model_ref or peft_config
+        model=model,
         tokenizer=tokenizer,
         processor=processor,
         total_num_steps=total_num_steps,
diff --git a/tests/e2e/test_evaluate.py b/tests/e2e/test_evaluate.py
@@ -0,0 +1,65 @@
+"""E2E smoke test for evaluate CLI command"""
+
+import os
+from pathlib import Path
+
+import yaml
+from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
+
+from axolotl.utils.dict import DictDefault
+
+os.environ["WANDB_DISABLED"] = "true"
+
+
+class TestE2eEvaluate:
+    """Test cases for evaluate CLI"""
+
+    def test_evaluate(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
+                "sequence_len": 1024,
+                "val_set_size": 0.02,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "micro_batch_size": 8,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "max_steps": 20,
+            }
+        )
+
+        # Write the config to a YAML file so the launched CLI subprocess can load it
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
+                "-m",
+                "axolotl.cli.evaluate",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )