4 changes: 1 addition & 3 deletions examples/benchmark/ceval/README.md
@@ -10,14 +10,12 @@
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
unzip ceval-exam.zip -d data
```
Place the data folder under the scripts/ceval directory of this project.

## Run the Prediction Script

Run the following script
Run the following script in the current directory

```
cd scripts/ceval
python eval.py \
--model_name_or_path /path/to/your/model \
--cot False \
47 changes: 47 additions & 0 deletions examples/benchmark/mmlu/README.md
@@ -0,0 +1,47 @@
# MMLU English Evaluation Dataset
MMLU ([Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300v3.pdf)) measures a text model's accuracy across a wide range of tasks and is currently one of the mainstream English evaluation datasets for LLMs. It covers 57 tasks, including elementary mathematics, US history, computer science, law, and more.

This MMLU evaluation script is adapted from the [hendrycks/test](https://github.com/hendrycks/test) project.

## Data Preparation

Download the evaluation dataset from the following link:

```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```

## Run the Evaluation Script

Run the following script in the current directory:

```
python eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16'
```

Argument descriptions:

- model_name_or_path: directory of the model to evaluate
- ntrain: number of few-shot examples (5-shot: ntrain=5)
- temperature: decoding temperature of the model
- data_dir: directory containing the unpacked MMLU data (default: data)
- output_dir: directory where the evaluation results are written
- dtype: data type used to load the model, e.g. float16 or float32 (default: float32)

## Evaluation Output
After prediction finishes, the model's answers for the 57 tasks are saved as CSV files (one per task) under the output path, and `summary.json` records the per-topic results together with the overall average. For example, the final `All` field of the JSON file shows the overall result:

```
"All": {
"score": 0.36701337295690933,
"num": 1346,
"correct": 494.0
}
```

Here `acc` is the accuracy, `num` is the total number of test samples, and `correct` is the number of correctly answered questions.
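
To inspect the results programmatically, the summary file can be read back with a few lines of Python. This is a minimal sketch; the `results` directory and `your-model` name below are placeholders for your own `--output_dir` and model directory:

```python
import json
import os

# Placeholder values; substitute your own output_dir and model name/path.
output_dir = "results"
model_name_or_path = "your-model"

summary_path = os.path.join(output_dir, "results_{}".format(model_name_or_path), "summary.json")
with open(summary_path) as f:
    summary = json.load(f)

# Overall accuracy followed by the per-topic breakdown.
print("All: {:.3f} ({}/{})".format(summary["All"]["acc"], summary["All"]["correct"], summary["All"]["num"]))
for topic, stats in summary.items():
    if topic != "All":
        print("{:40s} {:.3f}".format(topic, stats["acc"]))
```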
81 changes: 81 additions & 0 deletions examples/benchmark/mmlu/categories.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

subcategories = {
"abstract_algebra": ["math"],
"anatomy": ["health"],
"astronomy": ["physics"],
"business_ethics": ["business"],
"clinical_knowledge": ["health"],
"college_biology": ["biology"],
"college_chemistry": ["chemistry"],
"college_computer_science": ["computer science"],
"college_mathematics": ["math"],
"college_medicine": ["health"],
"college_physics": ["physics"],
"computer_security": ["computer science"],
"conceptual_physics": ["physics"],
"econometrics": ["economics"],
"electrical_engineering": ["engineering"],
"elementary_mathematics": ["math"],
"formal_logic": ["philosophy"],
"global_facts": ["other"],
"high_school_biology": ["biology"],
"high_school_chemistry": ["chemistry"],
"high_school_computer_science": ["computer science"],
"high_school_european_history": ["history"],
"high_school_geography": ["geography"],
"high_school_government_and_politics": ["politics"],
"high_school_macroeconomics": ["economics"],
"high_school_mathematics": ["math"],
"high_school_microeconomics": ["economics"],
"high_school_physics": ["physics"],
"high_school_psychology": ["psychology"],
"high_school_statistics": ["math"],
"high_school_us_history": ["history"],
"high_school_world_history": ["history"],
"human_aging": ["health"],
"human_sexuality": ["culture"],
"international_law": ["law"],
"jurisprudence": ["law"],
"logical_fallacies": ["philosophy"],
"machine_learning": ["computer science"],
"management": ["business"],
"marketing": ["business"],
"medical_genetics": ["health"],
"miscellaneous": ["other"],
"moral_disputes": ["philosophy"],
"moral_scenarios": ["philosophy"],
"nutrition": ["health"],
"philosophy": ["philosophy"],
"prehistory": ["history"],
"professional_accounting": ["other"],
"professional_law": ["law"],
"professional_medicine": ["health"],
"professional_psychology": ["psychology"],
"public_relations": ["politics"],
"security_studies": ["politics"],
"sociology": ["culture"],
"us_foreign_policy": ["politics"],
"virology": ["health"],
"world_religions": ["philosophy"],
}

categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other (business, health, misc.)": ["other", "business", "health"],
}
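
At evaluation time these two mappings are chained: each test subject is first mapped to its subcategory label(s) via `subcategories`, and those labels determine the coarse group in `categories`, which is how eval.py below accumulates results. A small standalone sketch of that lookup; the `category_of` helper is only illustrative and not part of the scripts:

```python
from categories import categories, subcategories


def category_of(subject):
    """Map an MMLU subject to its coarse categor(ies), e.g. 'astronomy' -> ['STEM']."""
    labels = subcategories[subject]  # e.g. ['physics']
    return [cat for cat, members in categories.items() if any(label in members for label in labels)]


print(category_of("astronomy"))        # ['STEM']
print(category_of("human_sexuality"))  # ['social sciences']
```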
109 changes: 109 additions & 0 deletions examples/benchmark/mmlu/eval.py
@@ -0,0 +1,109 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test
import argparse
import json
import os

import numpy as np
import pandas as pd
from categories import categories, subcategories
from evaluator import ModelEvaluator

choices = ["A", "B", "C", "D"]


def main(args, evaluator):
subjects = sorted(
[f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f]
)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if not os.path.exists(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path))):
os.makedirs(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path)))

all_cors = []
subcat_cors = {subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists}
cat_cors = {cat: [] for cat in categories}
summary = {}
for subject in subjects:
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[: args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

        cors, acc, probs = evaluator.eval(subject, dev_df, test_df)
subcats = subcategories[subject]
for subcat in subcats:
subcat_cors[subcat].append(cors)
for key in categories.keys():
if subcat in categories[key]:
cat_cors[key].append(cors)
all_cors.append(cors)

test_df["{}_correct".format(args.model_name_or_path)] = cors
for j in range(probs.shape[1]):
choice = choices[j]
test_df["{}_choice{}_probs".format(args.model_name_or_path, choice)] = probs[:, j]
test_df.to_csv(
os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "{}.csv".format(subject)),
index=None,
)

    for subcat in subcat_cors:
        subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
        print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
        summary[subcat] = {
            "acc": subcat_acc,
            "correct": int(np.sum(np.concatenate(subcat_cors[subcat]))),
            "num": int(np.concatenate(subcat_cors[subcat]).size),
        }

    for cat in cat_cors:
        cat_acc = np.mean(np.concatenate(cat_cors[cat]))
        print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
        summary[cat] = {
            "acc": cat_acc,
            "correct": int(np.sum(np.concatenate(cat_cors[cat]))),
            "num": int(np.concatenate(cat_cors[cat]).size),
        }
    weighted_acc = np.mean(np.concatenate(all_cors))
    print("Average accuracy: {:.3f}".format(weighted_acc))
    print("Model:", args.model_name_or_path)
    summary["All"] = {
        "acc": weighted_acc,
        "correct": int(np.sum(np.concatenate(all_cors))),
        "num": int(np.concatenate(all_cors).size),
    }
    with open(
        os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "summary.json"), "w"
    ) as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--output_dir", type=str, default="results")
parser.add_argument("--dtype", default="float32", type=str)

args = parser.parse_args()
print(args)

evaluator = ModelEvaluator(
model_name_or_path=args.model_name_or_path,
ntrain=args.ntrain,
temperature=args.temperature,
dtype=args.dtype,
)

main(args, evaluator=evaluator)
110 changes: 110 additions & 0 deletions examples/benchmark/mmlu/evaluator.py
@@ -0,0 +1,110 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

import numpy as np
import paddle
from tqdm import tqdm

from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

choices = ["A", "B", "C", "D"]


class ModelEvaluator(object):
def __init__(self, model_name_or_path, ntrain, temperature=0.2, dtype="float32"):
self.model_name_or_path = model_name_or_path
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype=dtype, low_cpu_mem_usage=True)
self.model.eval()
self.generation_config = dict(
temperature=temperature,
top_k=40,
top_p=0.9,
do_sample=True,
num_beams=1,
repetition_penalty=1.1,
max_new_tokens=20,
)

self.A_id = self.tokenizer.encode("A", add_special_tokens=False)["input_ids"][0]
self.B_id = self.tokenizer.encode("B", add_special_tokens=False)["input_ids"][0]
self.C_id = self.tokenizer.encode("C", add_special_tokens=False)["input_ids"][0]
self.D_id = self.tokenizer.encode("D", add_special_tokens=False)["input_ids"][0]
self.ntrain = ntrain

def format_subject(self, subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s

def gen_prompt(self, train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
self.format_subject(subject)
)
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += self.format_example(train_df, i)
return prompt

def format_example(self, df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt

def eval(self, subject, dev_df, test_df, do_ptq=False):
cors = []
all_probs = []
for i in tqdm(range(test_df.shape[0]), total=test_df.shape[0]):
            # build the k-shot prompt: k dev examples followed by the current test question
k = self.ntrain
prompt_end = self.format_example(test_df, i, include_answer=False)
train_prompt = self.gen_prompt(dev_df, subject, k)
prompt = train_prompt + prompt_end

inputs = self.tokenizer(prompt, return_tensors="pd")
label = test_df.iloc[i, test_df.shape[1] - 1]

            with paddle.no_grad():
                logits = self.model(**inputs)[0][0, -1, :]
            # pick the answer by comparing the logits of the four option tokens at the final position
            choices_logits = logits[[self.A_id, self.B_id, self.C_id, self.D_id]].numpy()
            assert not (np.any(np.isinf(choices_logits)) or np.any(np.isnan(choices_logits)))
            ans = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(choices_logits)]

cor = ans == label
cors.append(cor)
all_probs.append(choices_logits)

print(f"\n=======begin {str(i)}=======")
print("prompt: ", prompt)
print("ans: ", ans)
print("ground truth: ", label, "\n")
print(f"=======end {str(i)}=======")

acc = np.mean(cors)
cors = np.array(cors)

all_probs = np.array(all_probs)
print("Average accuracy {:.3f} - {}".format(acc, subject))

return cors, acc, all_probs
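
For debugging a single subject without going through eval.py, the evaluator can also be driven directly. A minimal sketch; the model path and data directory below are placeholders and assume the MMLU data has been unpacked as described in the README:

```python
import os

import pandas as pd

from evaluator import ModelEvaluator

# Placeholder paths; point these at your own model and the unpacked MMLU data directory.
model_name_or_path = "/path/to/your/model"
data_dir = "data"
subject = "astronomy"
ntrain = 5

evaluator = ModelEvaluator(model_name_or_path=model_name_or_path, ntrain=ntrain, temperature=0.2, dtype="float16")

# Few-shot examples come from the dev split; questions come from the test split.
dev_df = pd.read_csv(os.path.join(data_dir, "dev", subject + "_dev.csv"), header=None)[:ntrain]
test_df = pd.read_csv(os.path.join(data_dir, "test", subject + "_test.csv"), header=None)

cors, acc, probs = evaluator.eval(subject, dev_df, test_df)
print("{}: accuracy {:.3f} over {} questions".format(subject, acc, len(cors)))
```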