Commit 7c0f083

add option to run mmlu with 5 shots
1 parent 866b40c

2 files changed: +19 -10 lines changed

examples/models/llama2/eval_llama_lib.py

Lines changed: 12 additions & 4 deletions
@@ -248,7 +248,14 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--limit", type=int, default=5, help="number of samples to evalulate"
     )
-
+    parser.add_argument(
+        "-f",
+        "--num_fewshot",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
+    )
     # Add additional args specific to eval via an ET Runner
     # Note: For initial integration, the tokenizer.model is also required
     parser.add_argument(
@@ -282,9 +289,10 @@ def eval_llama(
 
     # Evaluate the model
     eval_results = evaluate_model(
-        eval_wrapper,
-        args.tasks,  # pyre-ignore
-        args.limit,  # pyre-ignore
+        eval_wrapper=eval_wrapper,
+        tasks=args.tasks,
+        num_fewshot=args.num_fewshot,
+        limit=args.limit,
     )
 
     for task, res in eval_results["results"].items():
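Read together, the two hunks wire the new flag from the CLI through to evaluate_model, and the call site switches to keyword arguments, so inserting num_fewshot between tasks and limit cannot silently shift positional parameters. A minimal sketch of just the options this commit touches, assuming nothing about the rest of build_args_parser():

import argparse

# Sketch of only the flags visible in this diff; the real
# build_args_parser() in eval_llama_lib.py defines many more options.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--limit", type=int, default=5, help="number of samples to evaluate"
)
parser.add_argument(
    "-f",
    "--num_fewshot",
    type=int,
    default=None,  # None defers to the evaluator/task default shot count
    metavar="N",
    help="Number of examples in few-shot context",
)

# The "mmlu with 5 shots" of the commit message corresponds to:
args = parser.parse_args(["--num_fewshot", "5"])
assert args.num_fewshot == 5  # forwarded as num_fewshot=args.num_fewshot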

examples/models/llama2/evaluate/eager_eval.py

Lines changed: 7 additions & 6 deletions
@@ -15,9 +15,8 @@
 )
 
 from lm_eval.api.model import LM
-from lm_eval.evaluator import evaluate
+from lm_eval.evaluator import simple_evaluate
 from lm_eval.models.huggingface import HFLM as eval_wrapper
-from lm_eval.tasks import get_task_dict
 
 from torch import nn
 
@@ -85,6 +84,7 @@ def _model_generate(self, context, max_length, eos_token_id):
 def evaluate_model(
     eval_wrapper: LM,
     tasks: Optional[list] = None,
+    num_fewshot: Optional[int] = None,
     limit: Optional[int] = None,
 ) -> dict:
     """
@@ -93,6 +93,7 @@ def evaluate_model(
     Args:
         eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
         tasks: Optional[list]: The names of the evaluation tasks to perform.
+        num_fewshot: Optional[int]: Number of examples in few-shot context.
         limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
 
     Returns:
@@ -107,11 +108,11 @@ def evaluate_model(
         tasks += list(
             lm_eval.tasks.hendrycks_test.create_all_tasks().keys()  # pyre-ignore
         )
-    task_dict = get_task_dict(tasks)
 
-    eval_results = evaluate(
-        eval_wrapper,
-        task_dict,
+    eval_results = simple_evaluate(
+        model=eval_wrapper,
+        tasks=tasks,
+        num_fewshot=num_fewshot,
         limit=limit,
     )
     return eval_results
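
For context on the harness change: lm-eval's lower-level evaluate() consumes already-constructed task objects, which is why the old path needed get_task_dict(tasks) first, whereas simple_evaluate() accepts plain task names and threads num_fewshot through to each task's prompt construction. A sketch of the resulting call path, under the assumption that MMLU subtasks are named hendrycks_test-* in the lm-eval version this code targets (newer releases rename them mmlu_*):

from lm_eval.api.model import LM
from lm_eval.evaluator import simple_evaluate

def run_mmlu_five_shot(eval_wrapper: LM, limit: int = 5) -> dict:
    # Post-commit call path: no get_task_dict() step, task names passed
    # directly, few-shot count forwarded to every task in the list.
    return simple_evaluate(
        model=eval_wrapper,  # any LM subclass, e.g. the eager wrapper above
        tasks=["hendrycks_test-abstract_algebra"],  # illustrative MMLU subtask
        num_fewshot=5,  # the "5 shots" from the commit message
        limit=limit,  # samples per task, mirroring --limit's default of 5
    )

The reporting loop in eval_llama() is untouched (note the unchanged for task, res in eval_results["results"].items() context line above), consistent with simple_evaluate() returning the same results-dict shape as evaluate().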
