diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py
index 3061d290bdc..57258fdbc11 100644
--- a/examples/models/llama2/eval_llama_lib.py
+++ b/examples/models/llama2/eval_llama_lib.py
@@ -21,8 +21,9 @@
 )
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from lm_eval.api.model import LM
+from lm_eval.evaluator import simple_evaluate
 
-from .evaluate.eager_eval import EagerEvalWrapper, evaluate_model
+from .evaluate.eager_eval import EagerEvalWrapper
 
 from .export_llama_lib import (
     _prepare_for_llama_export,
@@ -246,9 +247,19 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2",
     )
     parser.add_argument(
-        "--limit", type=int, default=5, help="number of samples to evalulate"
+        "--limit",
+        type=int,
+        default=None,
+        help="number of samples to evaluate. If not set, evaluate all samples",
+    )
+    parser.add_argument(
+        "-f",
+        "--num_fewshot",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
     )
-
     # Add additional args specific to eval via an ET Runner
     # Note: For initial integration, the tokenizer.model is also required
     parser.add_argument(
@@ -281,11 +292,13 @@ def eval_llama(
     eval_wrapper = gen_eval_wrapper(model_name, args)
 
     # Evaluate the model
-    eval_results = evaluate_model(
-        eval_wrapper,
-        args.tasks,  # pyre-ignore
-        args.limit,  # pyre-ignore
-    )
+    with torch.no_grad():
+        eval_results = simple_evaluate(
+            model=eval_wrapper,
+            tasks=args.tasks,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `tasks`
+            num_fewshot=args.num_fewshot,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `num_fewshot`
+            limit=args.limit,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `limit`
+        )
 
     for task, res in eval_results["results"].items():
         print(f"{task}: {res}")
diff --git a/examples/models/llama2/evaluate/__init__.py b/examples/models/llama2/evaluate/__init__.py
index c7f7d2be6e5..f1ba33ccbbe 100644
--- a/examples/models/llama2/evaluate/__init__.py
+++ b/examples/models/llama2/evaluate/__init__.py
@@ -4,9 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .eager_eval import EagerEvalWrapper, evaluate_model
+from .eager_eval import EagerEvalWrapper
 
 __all__ = [
-    "evaluate_model",
     "EagerEvalWrapper",
 ]
diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama2/evaluate/eager_eval.py
index cd845f577ef..784112e052b 100644
--- a/examples/models/llama2/evaluate/eager_eval.py
+++ b/examples/models/llama2/evaluate/eager_eval.py
@@ -7,17 +7,13 @@
 
 from typing import Optional, Union
 
-import lm_eval
 import torch
 
 from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
 from executorch.extension.llm.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
 )
 
-from lm_eval.api.model import LM
-from lm_eval.evaluator import evaluate
 from lm_eval.models.huggingface import HFLM as eval_wrapper
-from lm_eval.tasks import get_task_dict
 from torch import nn
 
@@ -79,39 +75,3 @@ def _model_call(self, inps):
 
     def _model_generate(self, context, max_length, eos_token_id):
         raise Exception("unimplemented")
-
-
-@torch.no_grad()
-def evaluate_model(
-    eval_wrapper: LM,
-    tasks: Optional[list] = None,
-    limit: Optional[int] = None,
-) -> dict:
-    """
-    Evaluates a language model on a specified task using the lm-evaluation-harness library.
-
-    Args:
-        eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
-        tasks: Optional[list]: The names of the evaluation tasks to perform.
-        limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
-
-    Returns:
-        eval_results (dict): A dictionary of evaluation results for the specified task(s).
-    """
-
-    if tasks is None:
-        tasks = ["wikitext"]
-
-    if "hendrycks_test" in tasks:
-        tasks.remove("hendrycks_test")
-        tasks += list(
-            lm_eval.tasks.hendrycks_test.create_all_tasks().keys()  # pyre-ignore
-        )
-    task_dict = get_task_dict(tasks)
-
-    eval_results = evaluate(
-        eval_wrapper,
-        task_dict,
-        limit=limit,
-    )
-    return eval_results
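
Note (not part of the diff above): a minimal sketch of driving lm_eval's simple_evaluate directly with the same knobs the new --limit and --num_fewshot flags expose, assuming lm-eval-harness 0.4.x is installed. The Hugging Face checkpoint "gpt2" is a placeholder; "wikitext" matches the default task of the removed evaluate_model helper.

# Sketch only: mirrors the simple_evaluate call introduced in eval_llama above.
import torch
from lm_eval.evaluator import simple_evaluate
from lm_eval.models.huggingface import HFLM

with torch.no_grad():
    results = simple_evaluate(
        model=HFLM(pretrained="gpt2"),  # any HF checkpoint; "gpt2" is illustrative
        tasks=["wikitext"],             # default task of the removed evaluate_model
        num_fewshot=None,               # --num_fewshot: None keeps each task's default
        limit=None,                     # --limit: None evaluates every sample
    )

for task, res in results["results"].items():
    print(f"{task}: {res}")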