changed readme and added more comments

wukaixingxp · wukaixingxp · commit eef8b88899ef · 2024-08-16T10:41:18.000-07:00
diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md
@@ -6,7 +6,7 @@ As Meta Llama models gain popularity, evaluating these models has become increas
 ## Important Notes
 
 1. **This tutorial is not the official implementation** of Meta Llama evaluation. It is based on public third-party libraries, and the implementation may differ slightly from our internal evaluation, leading to minor differences in the reproduced numbers.
-2. **Model Compatibility**: This tutorial is specifically for Llama 3 based models, as our prompts include Meta Llama 3 special tokens, e.g. `<|start_header_id|>user<|end_header_id|`. It will not work with models that are not based on Llama 3.
+2. **Model Compatibility**: This tutorial is specifically for Llama 3 based models, as our prompts include Meta Llama 3 special tokens, e.g. `<|start_header_id|>user<|end_header_id|>`. It will not work with models that are not based on Llama 3.
 
 
 ### Hugging Face setups
@@ -39,12 +39,12 @@ Here, we aim to reproduce the Meta reported benchmark numbers on the aforementio
 
 There are 4 major differences in terms of the eval configurations and prompts between this tutorial implementation and Hugging Face [leaderboard implementation](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard).
 
-- **Prompts**: We use Chain-of-Thought(COT) prompts while Hugging Face leaderboard does not. The prompts that define the output format are also sometime different.
+- **Prompts**: We use Chain-of-Thought(COT) prompts while Hugging Face leaderboard does not. The prompts that define the output format are also different.
 - **Task type**: For MMLU-Pro, BBH, GPQA tasks, we ask the model to generate response and score the parsed answer from generated response, while Hugging Face leaderboard evaluation is comparing log likelihood of all label words, such as [ (A),(B),(C),(D) ].
-- **Parsers**: For generative tasks, where the final answer needs to be parsed before scoring, the parser functions can be different between ours and Hugging Face leaderboard evaluation, as our prompts that define the model output format are sometime designed differently.
+- **Parsers**: For generative tasks, where the final answer needs to be parsed before scoring, the parser functions can be different between ours and Hugging Face leaderboard evaluation, as our prompts that define the model output format are designed differently.
 - **Inference**: We use internal LLM inference solution that loads pytorch checkpoints and do not use padding, while Hugging Face leaderboard uses Hugging Face format model and sometimes will use padding depending on the tasks type and batch size.
 
-Given those differences, our reproduced number can not be apple to apple compared to the numbers in the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard), even if the task names are the same.
+Given those differences, our reproduced number can not be compared to the numbers in the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard), even if the task names are the same.
 
 ### Create task config
 
@@ -61,6 +61,8 @@ dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
 test_split: latest
 ```
 
+If you want to run evaluation on 70B-Instruct, then it is recommended to change the `dataset_path` and  `dataset_name` from 8B to 70B, even though 70B-instruct and 8B-instruct share the same prompts, the `is_correct` column, which can be used to get the difference between current reproduced result and the reported results for each sample, is different.
+
 **Note**: Config files for Meta-Llama-3.1-8B-Instruct are already provided in each task subfolder under [meta_template folder](./meta_template/). Remember to change the eval dataset name according to the model type and DO NOT use pretrained evals dataset on instruct models or vice versa.
 
 **2.Configure preprocessing, prompts and ground truth**
diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py
@@ -1,22 +1,15 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
 
 import argparse
-import json
-import logging
 import os
-import re
-import sys
 from pathlib import Path
 import glob
-import numpy as np
-import lm_eval
-from lm_eval import tasks
-from lm_eval.utils import make_table
 import shutil, errno
 import yaml
 from datasets import load_dataset,Dataset
 
+# get the ifeval  from the evals dataset and join it with the original ifeval datasets
 def get_ifeval_data(model_name,output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
     if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
@@ -36,16 +29,16 @@ def get_ifeval_data(model_name,output_dir):
     meta_df = meta_data.to_pandas()
     ifeval_df = ifeval_data.to_pandas()
     ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
-
+    # join the two datasets on the input_question column
     joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
     joined = joined.rename(columns={"input_final_prompts": "prompt"})
     joined = joined.rename(columns={"is_correct": "previous_is_correct"})
     joined = Dataset.from_pandas(joined)
     joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
     joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
     joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
+
+# get the math_hard data from the evals dataset and join it with the original math_hard dataset
 def get_math_data(model_name,output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
@@ -64,15 +57,16 @@ def get_math_data(model_name,output_dir):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
     math_df = math_df.rename(columns={"problem": "input_question"})
-
+    # join the two datasets on the input_question column
     joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
     joined = Dataset.from_pandas(joined)
     joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
     joined = joined.rename_column("is_correct","previous_is_correct")
     joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
+
     joined.to_parquet(output_dir + f"/joined_math.parquet")
+
+# get the question from the ifeval dataset 
 def get_question(example):
     try:
         example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
@@ -81,15 +75,8 @@ def get_question(example):
     except:
         print(example["input_question"])
         return
-def check_sample(example):
-    if "kwargs" in example and not example["kwargs"]:
-        print(example)
-        raise ValueError("This example did not got joined for IFeval")
-    if "solution" in example and not example["solution"]:
-        print(example)
-        raise ValueError("This example did not got joined for MATH_hard")
-
 
+# change the yaml file to use the correct model name
 def change_yaml(args, base_name):
     for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):       
         with open(yaml_file, "r") as sources:
@@ -102,6 +89,7 @@ def change_yaml(args, base_name):
             for line in lines:
                 output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
 
+# copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
     if not os.path.exists(args.work_dir):
         # Copy the all files, including yaml files and python files, from template folder to the work folder
@@ -137,14 +125,15 @@ def prepare_datasets(args):
             get_ifeval_data(model_name,args.work_dir)
         if "meta_math_hard" in task_list:
             get_math_data(model_name,args.work_dir)
-    
+# copy the files from src to dst
 def copy_dir(src, dst):
     try:
         shutil.copytree(src, dst)
     except OSError as exc: # python >2.5
         if exc.errno in (errno.ENOTDIR, errno.EINVAL):
             shutil.copy(src, dst)
         else: raise
+# load the config yaml file
 def load_config(config_path: str = "./config.yaml"):
     # Read the YAML configuration file
     with open(config_path, "r") as file:
@@ -163,14 +152,15 @@ def load_config(config_path: str = "./config.yaml"):
     args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
     # Copy the all files from template folder to the work folder
     copy_and_prepare(args)
+    # Prepare the datasets for the IFeval and MATH_Hard tasks as we need to join the original dataset
     prepare_datasets(args)
     print(f"prepration for the {args.model_name} using {args.evals_dataset} is done, all saved the work_dir: {args.work_dir}")
     command_str = f"lm_eval --model vllm   --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path { args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42 "
     if args.limit:
         command_str += f" --limit {args.limit}"
     if args.log_samples:
-        command_str += f" --log_samples "
+        command_str += " --log_samples "
     if args.show_config:
-        command_str += f" --show_config "
+        command_str += " --show_config "
     print("please use the following command to run the meta reproduce evals:")
     print(command_str)