diff --git a/examples/benchmark/ceval/README.md b/examples/benchmark/ceval/README.md
index 67b803ed81dc..2bf4fc78c946 100644
--- a/examples/benchmark/ceval/README.md
+++ b/examples/benchmark/ceval/README.md
@@ -10,14 +10,12 @@
 wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
 unzip ceval-exam.zip -d data
 ```
-Place the data folder under the scripts/ceval directory of this project.
 
 ## Run the Prediction Script
 
-Run the following script:
+Run the following script in the current directory:
 
 ```
-cd scripts/ceval
 python eval.py \
     --model_name_or_path /path/to/your/model \
     --cot False \
diff --git a/examples/benchmark/mmlu/README.md b/examples/benchmark/mmlu/README.md
new file mode 100644
index 000000000000..21386c1c620a
--- /dev/null
+++ b/examples/benchmark/mmlu/README.md
@@ -0,0 +1,60 @@
+# MMLU English Evaluation Benchmark
+MMLU ([Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300v3.pdf)) measures the accuracy of text models across a wide range of tasks and is currently one of the mainstream English benchmarks for LLMs. It covers 57 tasks, including elementary mathematics, US history, computer science, law, and more.
+
+This MMLU evaluation script is adapted from the [hendrycks/test](https://github.com/hendrycks/test) project.
+
+## Data Preparation
+
+Download and extract the evaluation dataset:
+
+```
+wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
+tar xf data.tar
+```
+
+## Run the Evaluation Script
+
+Run one of the following commands in the current directory:
+
+- Single GPU
+```
+export CUDA_VISIBLE_DEVICES=0
+python eval.py \
+    --model_name_or_path /path/to/your/model \
+    --temperature 0.2 \
+    --ntrain 5 \
+    --output_dir ${output_path} \
+    --dtype 'float16'
+```
+- Multi-GPU (tensor parallel)
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.fleet.launch eval.py \
+    --model_name_or_path /path/to/your/model \
+    --temperature 0.2 \
+    --ntrain 5 \
+    --output_dir ${output_path} \
+    --dtype 'float16' \
+    --tensor_parallel_degree 4
+```
+
+Arguments
+
+- model_name_or_path: directory of the model to be evaluated
+- ntrain: number of few-shot examples (5-shot: ntrain=5)
+- temperature: decoding temperature
+- data_dir: path to the extracted MMLU data (default: data)
+- output_dir: output path for the evaluation results (default: results)
+- dtype: data type used for inference, e.g. float16 or float32
+- tensor_parallel_degree: number of tensor-parallel shards for multi-GPU runs
+
+## Evaluation Output
+After prediction completes, the per-question results for the 57 tasks are saved as CSV files under the output path, and `summary.json` records the accuracy on each of the 17 subcategories together with the overall average. For example, the `All` field at the end of the JSON file shows the overall result:
+
+```
+  "All": {
+    "acc": 0.36701337295690933,
+    "correct": 494,
+    "num": 1346
+  }
+```
+
+Here `acc` is the accuracy, `correct` is the number of correctly answered questions, and `num` is the total number of test samples.
diff --git a/examples/benchmark/mmlu/categories.py b/examples/benchmark/mmlu/categories.py
new file mode 100644
index 000000000000..6b902adfc154
--- /dev/null
+++ b/examples/benchmark/mmlu/categories.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
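+#
+# This module defines the two-level taxonomy used to aggregate MMLU results:
+# each of the 57 subjects maps to a fine-grained subcategory ("subcategories"),
+# and every subcategory belongs to one of four coarse groups ("categories"),
+# e.g. "abstract_algebra" -> "math" -> "STEM".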
+# Adapted from https://github.com/hendrycks/test + +subcategories = { + "abstract_algebra": ["math"], + "anatomy": ["health"], + "astronomy": ["physics"], + "business_ethics": ["business"], + "clinical_knowledge": ["health"], + "college_biology": ["biology"], + "college_chemistry": ["chemistry"], + "college_computer_science": ["computer science"], + "college_mathematics": ["math"], + "college_medicine": ["health"], + "college_physics": ["physics"], + "computer_security": ["computer science"], + "conceptual_physics": ["physics"], + "econometrics": ["economics"], + "electrical_engineering": ["engineering"], + "elementary_mathematics": ["math"], + "formal_logic": ["philosophy"], + "global_facts": ["other"], + "high_school_biology": ["biology"], + "high_school_chemistry": ["chemistry"], + "high_school_computer_science": ["computer science"], + "high_school_european_history": ["history"], + "high_school_geography": ["geography"], + "high_school_government_and_politics": ["politics"], + "high_school_macroeconomics": ["economics"], + "high_school_mathematics": ["math"], + "high_school_microeconomics": ["economics"], + "high_school_physics": ["physics"], + "high_school_psychology": ["psychology"], + "high_school_statistics": ["math"], + "high_school_us_history": ["history"], + "high_school_world_history": ["history"], + "human_aging": ["health"], + "human_sexuality": ["culture"], + "international_law": ["law"], + "jurisprudence": ["law"], + "logical_fallacies": ["philosophy"], + "machine_learning": ["computer science"], + "management": ["business"], + "marketing": ["business"], + "medical_genetics": ["health"], + "miscellaneous": ["other"], + "moral_disputes": ["philosophy"], + "moral_scenarios": ["philosophy"], + "nutrition": ["health"], + "philosophy": ["philosophy"], + "prehistory": ["history"], + "professional_accounting": ["other"], + "professional_law": ["law"], + "professional_medicine": ["health"], + "professional_psychology": ["psychology"], + "public_relations": ["politics"], + "security_studies": ["politics"], + "sociology": ["culture"], + "us_foreign_policy": ["politics"], + "virology": ["health"], + "world_religions": ["philosophy"], +} + +categories = { + "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"], + "humanities": ["history", "philosophy", "law"], + "social sciences": ["politics", "culture", "economics", "geography", "psychology"], + "other (business, health, misc.)": ["other", "business", "health"], +} diff --git a/examples/benchmark/mmlu/eval.py b/examples/benchmark/mmlu/eval.py new file mode 100644 index 000000000000..957e9a9ba3a4 --- /dev/null +++ b/examples/benchmark/mmlu/eval.py @@ -0,0 +1,120 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
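+#
+# This script drives the MMLU run: for every subject it builds an ntrain-shot
+# prompt from the dev split, scores every test question with the evaluator,
+# aggregates accuracy per subcategory, per category, and overall, and writes
+# per-subject CSVs plus a summary.json under the output directory.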
+# Adapted from https://github.com/hendrycks/test
+import argparse
+import json
+import os
+
+import numpy as np
+import paddle
+import pandas as pd
+from categories import categories, subcategories
+from evaluator import ModelEvaluator
+
+choices = ["A", "B", "C", "D"]
+
+
+def main(args, evaluator):
+    # Every *_test.csv under data/test is one MMLU subject.
+    subjects = sorted(
+        [f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f]
+    )
+    os.makedirs(args.output_dir, exist_ok=True)
+    results_dir = os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path))
+    os.makedirs(results_dir, exist_ok=True)
+
+    all_cors = []
+    subcat_cors = {subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists}
+    cat_cors = {cat: [] for cat in categories}
+    summary = {}
+    for subject in subjects:
+        dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[: args.ntrain]
+        test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
+
+        cors, acc, probs = evaluator.eval(subject, dev_df, test_df)
+        # Bucket the per-question correctness into this subject's subcategory and category.
+        subcats = subcategories[subject]
+        for subcat in subcats:
+            subcat_cors[subcat].append(cors)
+            for key in categories.keys():
+                if subcat in categories[key]:
+                    cat_cors[key].append(cors)
+        all_cors.append(cors)
+
+        # Save the per-question results for this subject, including the logits of each choice.
+        test_df["{}_correct".format(args.model_name_or_path)] = cors
+        for j in range(probs.shape[1]):
+            choice = choices[j]
+            test_df["{}_choice{}_probs".format(args.model_name_or_path, choice)] = probs[:, j]
+        test_df.to_csv(os.path.join(results_dir, "{}.csv".format(subject)), index=None)
+
+    for subcat in subcat_cors:
+        subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
+        print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
+        summary[subcat] = {
+            "acc": subcat_acc,
+            "correct": int(np.sum(np.concatenate(subcat_cors[subcat]))),
+            "num": int(np.concatenate(subcat_cors[subcat]).size),
+        }
+
+    for cat in cat_cors:
+        cat_acc = np.mean(np.concatenate(cat_cors[cat]))
+        print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
+    weighted_acc = np.mean(np.concatenate(all_cors))
+    print("Average accuracy: {:.3f}".format(weighted_acc))
+    print("Model:", args.model_name_or_path)
+    summary["All"] = {
+        "acc": weighted_acc,
+        "correct": int(np.sum(np.concatenate(all_cors))),
+        "num": int(np.concatenate(all_cors).size),
+    }
+    with open(os.path.join(results_dir, "summary.json"), "w") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name_or_path", type=str)
+    parser.add_argument("--ntrain", "-k", type=int, default=5)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--data_dir", "-d", type=str, default="data")
+    parser.add_argument("--output_dir", type=str, default="results")
+    parser.add_argument("--dtype", default="float32", type=str)
+    parser.add_argument("--tensor_parallel_degree", default=1, type=int)
+
+    args = parser.parse_args()
+    print(args)
+
+    if args.tensor_parallel_degree > 1:
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.hybrid_configs = {
+            "mp_degree": args.tensor_parallel_degree,
+        }
+        # Fix the seed for tensor-parallel weight initialization so all ranks agree.
+        strategy.tensor_parallel_configs = {"tensor_init_seed": 1234}
+
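+        # Bring up the collective communication group (one process per GPU)
+        # so that each rank can load its shard of the tensor-parallel weights.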
+        paddle.distributed.fleet.init(is_collective=True, strategy=strategy)
+    evaluator = ModelEvaluator(
+        model_name_or_path=args.model_name_or_path,
+        ntrain=args.ntrain,
+        temperature=args.temperature,
+        dtype=args.dtype,
+        tensor_parallel_degree=args.tensor_parallel_degree,
+    )
+
+    main(args, evaluator=evaluator)
diff --git a/examples/benchmark/mmlu/evaluator.py b/examples/benchmark/mmlu/evaluator.py
new file mode 100644
index 000000000000..ccac4bf612e6
--- /dev/null
+++ b/examples/benchmark/mmlu/evaluator.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/hendrycks/test
+
+import numpy as np
+import paddle
+from tqdm import tqdm
+
+from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer
+
+choices = ["A", "B", "C", "D"]
+
+
+class ModelEvaluator(object):
+    def __init__(self, model_name_or_path, ntrain, temperature=0.2, dtype="float32", tensor_parallel_degree=1):
+        self.model_name_or_path = model_name_or_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        self.tensor_parallel_degree = tensor_parallel_degree
+        if self.tensor_parallel_degree > 1:
+            # Each rank loads only its shard of the weights.
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name_or_path,
+                dtype=dtype,
+                low_cpu_mem_usage=True,
+                tensor_parallel_output=False,
+                tensor_parallel_degree=self.tensor_parallel_degree,
+                tensor_parallel_rank=paddle.distributed.get_rank(),
+            )
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype=dtype, low_cpu_mem_usage=True)
+        self.model.eval()
+        # Decoding settings, kept for generation-based use; the multiple-choice
+        # evaluation below reads answer logits directly and does not sample.
+        self.generation_config = dict(
+            temperature=temperature,
+            top_k=40,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=1,
+            repetition_penalty=1.1,
+            max_new_tokens=20,
+        )
+
+        # Token ids of the four answer letters, used to read off their logits.
+        self.A_id = self.tokenizer.encode("A", add_special_tokens=False)["input_ids"][0]
+        self.B_id = self.tokenizer.encode("B", add_special_tokens=False)["input_ids"][0]
+        self.C_id = self.tokenizer.encode("C", add_special_tokens=False)["input_ids"][0]
+        self.D_id = self.tokenizer.encode("D", add_special_tokens=False)["input_ids"][0]
+        self.ntrain = ntrain
+
+    def format_subject(self, subject):
+        # "high_school_biology" -> "high school biology"
+        return " ".join(subject.split("_"))
+
+    def gen_prompt(self, train_df, subject, k=-1):
+        # A k-shot prompt: a task description followed by k solved dev examples.
+        prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
+            self.format_subject(subject)
+        )
+        if k == -1:
+            k = train_df.shape[0]
+        for i in range(k):
+            prompt += self.format_example(train_df, i)
+        return prompt
+
+    def format_example(self, df, idx, include_answer=True):
+        # MMLU csv columns: question, choices A-D, answer.
+        prompt = df.iloc[idx, 0]
+        k = df.shape[1] - 2
+        for j in range(k):
+            prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+        prompt += "\nAnswer:"
+        if include_answer:
+            prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+        return prompt
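+
+    # Scoring scheme: instead of free-form generation, each question gets a
+    # single forward pass and the logits of the answer tokens "A"/"B"/"C"/"D"
+    # at the final position are compared; the argmax is the model's answer.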
{}".format(choices[j], df.iloc[idx, j + 1]) + prompt += "\nAnswer:" + if include_answer: + prompt += " {}\n\n".format(df.iloc[idx, k + 1]) + return prompt + + def eval(self, subject, dev_df, test_df, do_ptq=False): + cors = [] + all_probs = [] + for i in tqdm(range(test_df.shape[0]), total=test_df.shape[0]): + # for i in range(test_df.shape[0]): + # get prompt and make sure it fits + k = self.ntrain + prompt_end = self.format_example(test_df, i, include_answer=False) + train_prompt = self.gen_prompt(dev_df, subject, k) + prompt = train_prompt + prompt_end + + inputs = self.tokenizer(prompt, return_tensors="pd") + label = test_df.iloc[i, test_df.shape[1] - 1] + + with paddle.no_grad(): + logits = self.model(**inputs)[0][0, -1, :] + choices_logits = logits[[self.A_id, self.B_id, self.C_id, self.D_id]].numpy() + assert not (np.any(np.isinf(choices_logits)) or np.any(np.isnan(choices_logits))) + ans = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(choices_logits)] + + cor = ans == label + cors.append(cor) + all_probs.append(choices_logits) + + print(f"\n=======begin {str(i)}=======") + print("prompt: ", prompt) + print("ans: ", ans) + print("ground truth: ", label, "\n") + print(f"=======end {str(i)}=======") + + acc = np.mean(cors) + cors = np.array(cors) + + all_probs = np.array(all_probs) + print("Average accuracy {:.3f} - {}".format(acc, subject)) + + return cors, acc, all_probs