Commit d691843

fixed commit and add nltk download
1 parent 19dd9dc commit d691843

2 files changed (+10, -4 lines)


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,7 @@ Please install our lm-evaluation-harness and llama-recipe repo by following:
 ```
 git clone git@github.com:EleutherAI/lm-evaluation-harness.git
 cd lm-evaluation-harness
+git checkout a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622
 pip install -e .[math,ifeval,sentencepiece,vllm]
 cd ../
 git clone git@github.com:meta-llama/llama-recipes.git
@@ -203,6 +204,8 @@ Here is the comparison between our reported numbers and the reproduced numbers i
 
 From the table above, we can see that most of our reproduced results are very close to our reported numbers on the [Meta Llama website](https://llama.meta.com/).
 
+**NOTE**: We used the average of `inst_level_strict_acc,none` and `prompt_level_strict_acc,none` to get the final number for `IFeval`, as stated [here](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about#task-evaluations-and-parameters).
+
 **NOTE**: On the [Meta Llama website](https://llama.meta.com/), we reported the `macro_avg` metric, which is the average of all subtask average scores, for the `MMLU-Pro` task, but here we reproduce the `micro_avg` metric, which is the average score over all individual samples; those `micro_avg` numbers can be found in [eval_details.md](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/eval_details.md#mmlu-pro).
 
 **NOTE**: The reproduced numbers may be slightly different: we observed differences of around ±0.01 between reproduce runs because the latest vLLM inference is not fully deterministic even with temperature=0. This behavior may be related to [this issue](https://github.com/vllm-project/vllm/issues/5404).
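As a rough illustration of the two metric NOTEs above (not part of this commit): the IFeval number is simply the mean of the two strict-accuracy metrics, and `macro_avg` vs `micro_avg` differ only in whether you average per-subtask means or pool every individual sample. The function and field names in this sketch are illustrative assumptions, not the actual lm-evaluation-harness output schema.

```python
# Minimal sketch of the two aggregations described above.
# Field and variable names are illustrative, not the harness's real schema.

def ifeval_final_score(results: dict) -> float:
    """Mean of the two strict-accuracy metrics, as used for the IFeval number."""
    return (
        results["inst_level_strict_acc,none"]
        + results["prompt_level_strict_acc,none"]
    ) / 2

def macro_avg(per_subtask_scores: dict[str, list[float]]) -> float:
    """Average of per-subtask averages (the number reported on the Meta Llama website)."""
    subtask_means = [sum(s) / len(s) for s in per_subtask_scores.values()]
    return sum(subtask_means) / len(subtask_means)

def micro_avg(per_subtask_scores: dict[str, list[float]]) -> float:
    """Average over all individual samples (the number reproduced here)."""
    all_scores = [x for s in per_subtask_scores.values() for x in s]
    return sum(all_scores) / len(all_scores)

# Subtasks of different sizes make the two averages diverge:
scores = {"law": [1.0, 0.0], "math": [1.0, 1.0, 1.0, 1.0]}
print(macro_avg(scores))  # 0.75   -> (0.5 + 1.0) / 2
print(micro_avg(scores))  # ~0.833 -> 5 correct out of 6 samples
```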

tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py

Lines changed: 7 additions & 4 deletions
@@ -2,11 +2,12 @@
 # This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
 
 import argparse
-import errno, shutil
+import errno
+import shutil
 import glob
 import os
 from pathlib import Path
-
+import nltk
 import yaml
 from datasets import Dataset, load_dataset
 
@@ -51,7 +52,7 @@ def get_ifeval_data(model_name, output_dir):
         ]
     )
     joined.rename_column("output_prediction_text", "previous_output_prediction_text")
-    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
+    joined.to_parquet(output_dir + "/joined_ifeval.parquet")
 
 
 # get the math_hard data from the evals dataset and join it with the original math_hard dataset
@@ -94,7 +95,7 @@ def get_math_data(model_name, output_dir):
         "output_prediction_text", "previous_output_prediction_text"
     )
 
-    joined.to_parquet(output_dir + f"/joined_math.parquet")
+    joined.to_parquet(output_dir + "/joined_math.parquet")
 
 
 # get the question from the ifeval dataset
@@ -137,6 +138,8 @@ def change_yaml(args, base_name):
 
 # copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
+    # nltk punkt_tab package is needed
+    nltk.download('punkt_tab')
     if not os.path.exists(args.work_dir):
         # Copy the all files, including yaml files and python files, from template folder to the work folder
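The new `nltk.download('punkt_tab')` call above runs unconditionally every time `copy_and_prepare` is invoked. If that becomes noisy or slow, a common alternative is to probe the NLTK data path first and only download on a miss; this is a sketch of that pattern (an assumption, not part of this commit):

```python
import nltk

def ensure_punkt_tab() -> None:
    """Download the punkt_tab tokenizer data only if it is not already installed."""
    try:
        # nltk.data.find raises LookupError when the resource is missing.
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab")
```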
