From 634d8785ef512c7ce54b6f4a3da07de9e23e597c Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Wed, 9 Oct 2024 17:24:03 -0700
Subject: [PATCH] add option to run mmlu with 5 shots

---
 examples/models/llama2/eval_llama_lib.py      | 26 +++++++++----
 examples/models/llama2/evaluate/eager_eval.py | 39 +------------------
 2 files changed, 19 insertions(+), 46 deletions(-)

diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py
index 3061d290bdc..4cf469a4acf 100644
--- a/examples/models/llama2/eval_llama_lib.py
+++ b/examples/models/llama2/eval_llama_lib.py
@@ -21,8 +21,9 @@
 )
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from lm_eval.api.model import LM
+from lm_eval.evaluator import simple_evaluate
 
-from .evaluate.eager_eval import EagerEvalWrapper, evaluate_model
+from .evaluate.eager_eval import EagerEvalWrapper
 
 from .export_llama_lib import (
     _prepare_for_llama_export,
@@ -246,9 +247,16 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2",
     )
     parser.add_argument(
-        "--limit", type=int, default=5, help="number of samples to evalulate"
+        "--limit", type=int, default=None, help="number of samples to evaluate"
+    )
+    parser.add_argument(
+        "-f",
+        "--num_fewshot",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
     )
-
     # Add additional args specific to eval via an ET Runner
     # Note: For initial integration, the tokenizer.model is also required
     parser.add_argument(
@@ -281,11 +289,13 @@ def eval_llama(
     eval_wrapper = gen_eval_wrapper(model_name, args)
 
     # Evaluate the model
-    eval_results = evaluate_model(
-        eval_wrapper,
-        args.tasks,  # pyre-ignore
-        args.limit,  # pyre-ignore
-    )
+    with torch.no_grad():
+        eval_results = simple_evaluate(
+            model=eval_wrapper,
+            tasks=args.tasks,
+            num_fewshot=args.num_fewshot,
+            limit=args.limit,
+        )
 
     for task, res in eval_results["results"].items():
         print(f"{task}: {res}")
diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama2/evaluate/eager_eval.py
index cd845f577ef..c403242c9fe 100644
--- a/examples/models/llama2/evaluate/eager_eval.py
+++ b/examples/models/llama2/evaluate/eager_eval.py
@@ -15,9 +15,8 @@
 )
 
 from lm_eval.api.model import LM
-from lm_eval.evaluator import evaluate
+from lm_eval.evaluator import simple_evaluate
 from lm_eval.models.huggingface import HFLM as eval_wrapper
-from lm_eval.tasks import get_task_dict
 
 from torch import nn
 
@@ -79,39 +78,3 @@ def _model_call(self, inps):
 
     def _model_generate(self, context, max_length, eos_token_id):
         raise Exception("unimplemented")
-
-
-@torch.no_grad()
-def evaluate_model(
-    eval_wrapper: LM,
-    tasks: Optional[list] = None,
-    limit: Optional[int] = None,
-) -> dict:
-    """
-    Evaluates a language model on a specified task using the lm-evaluation-harness library.
-
-    Args:
-        eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
-        tasks: Optional[list]: The names of the evaluation tasks to perform.
-        limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
-
-    Returns:
-        eval_results (dict): A dictionary of evaluation results for the specified task(s).
-    """
-
-    if tasks is None:
-        tasks = ["wikitext"]
-
-    if "hendrycks_test" in tasks:
-        tasks.remove("hendrycks_test")
-        tasks += list(
-            lm_eval.tasks.hendrycks_test.create_all_tasks().keys()  # pyre-ignore
-        )
-    task_dict = get_task_dict(tasks)
-
-    eval_results = evaluate(
-        eval_wrapper,
-        task_dict,
-        limit=limit,
-    )
-    return eval_results
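
For context, a minimal sketch of how the new flags reach lm-eval for the 5-shot MMLU run named in the subject line. It mirrors the updated eval_llama() path; run_mmlu_5shot is a hypothetical helper, not part of the patch, the wrapper argument stands in for whatever gen_eval_wrapper() returns, and the "mmlu" task name is assumed to be registered in the installed lm_eval version.

    import torch
    from lm_eval.api.model import LM
    from lm_eval.evaluator import simple_evaluate


    def run_mmlu_5shot(eval_wrapper: LM, limit=None) -> dict:
        # Sketch only: eval_wrapper is assumed to be an LM instance, such as the
        # EagerEvalWrapper produced by gen_eval_wrapper(model_name, args).
        with torch.no_grad():
            return simple_evaluate(
                model=eval_wrapper,
                tasks=["mmlu"],   # assumes the installed lm_eval ships an "mmlu" task
                num_fewshot=5,    # same effect as passing --num_fewshot 5
                limit=limit,      # None evaluates all samples (the new --limit default)
            )

Invoking eval_llama() with --tasks mmlu --num_fewshot 5 and the default --limit exercises the same code path.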