Commit e95aa9d
add option to run mmlu with 5 shots (#6146)
Summary:
Pull Request resolved: #6146
This PR makes the following changes:
- add a `--num_fewshot` option, which is required for running the MMLU task with 5 shots
- set the default value of `--limit` to `None` so that all examples are actually run
- update `eval_llama` to call `simple_evaluate`, a wrapper around `evaluate` that does some extra work for us, such as building the task dict
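The first two changes can be sketched as a plain `argparse` fragment. This is illustrative only: the flag names match the PR description, but the actual `eval_llama` parser wiring is not shown here.

```python
import argparse

# Hypothetical sketch of the CLI options described above; the flag names
# follow the PR description, the parser itself is illustrative.
def build_parser():
    parser = argparse.ArgumentParser(description="eval_llama options (sketch)")
    parser.add_argument(
        "-f", "--num_fewshot", type=int, default=None,
        help="Number of few-shot examples, e.g. 5 for MMLU 5-shot.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Max number of examples per task; None runs all examples.",
    )
    return parser

args = build_parser().parse_args(["-f", "5"])
print(args.num_fewshot, args.limit)  # 5 None
```

With `--limit` defaulting to `None`, omitting the flag evaluates the full task rather than a truncated subset.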
Test Plan:
- Make sure WikiText perplexity for Llama 3.2 1B stays the same before and after the change.
Before the change, run eval_llama for Llama 3.2 1B with `--limit` set to `None`:
```
wikitext: {'word_perplexity,none': 12.78246428138387, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.610432252171856, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6874479705552373, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
```
After the change, run eval_llama for Llama 3.2 1B:
```
wikitext: {'word_perplexity,none': 12.78246428138387, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.610432252171856, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6874479705552373, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
```
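The before/after check above can be made mechanical by comparing the metric dicts directly. The values below are copied verbatim from the two outputs; the comparison script itself is just a convenience sketch.

```python
# Compare the wikitext metrics printed before and after the change.
# The numeric values are copied from the two runs shown above.
before = {
    "word_perplexity,none": 12.78246428138387,
    "byte_perplexity,none": 1.610432252171856,
    "bits_per_byte,none": 0.6874479705552373,
}
after = {
    "word_perplexity,none": 12.78246428138387,
    "byte_perplexity,none": 1.610432252171856,
    "bits_per_byte,none": 0.6874479705552373,
}
for key in before:
    # Allow a tiny tolerance in case of float-printing differences.
    assert abs(before[key] - after[key]) < 1e-9, key
print("metrics unchanged")
```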
- Make sure that lm_eval (v0.4.2, the version used by eval_llama) and eval_llama report similar numbers for Llama 3.2 1B and 3B BF16 on the MMLU task with 5 shots.
Example command for lm_eval:
```
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \
--tasks mmlu \
--device cuda \
-f 5 \
--batch_size auto
```
Example command for eval_llama:
```
python -m examples.models.llama2.eval_llama \
-c /home/lunwenh/models/1B_Instruct/consolidated.00.pth \
-p /home/lunwenh/models/1B_Instruct/params.json \
-t /home/lunwenh/models/1B_Instruct/tokenizer.model \
-kv \
-d bf16 \
--tasks mmlu \
-f 5 \
--max_seq_length 2048
```
imported-using-ghimport
Reviewed By: mergennachin
Differential Revision: D64215268
Pulled By: helunwencser
fbshipit-source-id: 606dd279201c4165cf8d218da50cef1457288ed61
parent: 61c501c
3 files changed: +22 −50 lines (under examples/models/llama2 and evaluate)