4 changes: 1 addition & 3 deletions examples/benchmark/ceval/README.md
@@ -10,14 +10,12 @@
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
unzip ceval-exam.zip -d data
```
Place the data folder under the scripts/ceval directory of this project.

## Run the Prediction Script

Run the following script
Run the following script in the current directory

```
cd scripts/ceval
python eval.py \
--model_name_or_path /path/to/your/model \
--cot False \
60 changes: 60 additions & 0 deletions examples/benchmark/mmlu/README.md
@@ -0,0 +1,60 @@
# MMLU English Evaluation Dataset
MMLU ([Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300v3.pdf)) measures a text model's accuracy across a wide range of tasks and is one of the mainstream English evaluation datasets for LLMs. It covers 57 tasks, including elementary mathematics, US history, computer science, law, and more.

This MMLU evaluation script is adapted from the [hendrycks/test](https://github.com/hendrycks/test) project.

## Data Preparation

Download the evaluation dataset:

```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```
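
Each `*_test.csv` file has no header row and contains six columns: the question, the four options A–D, and the answer letter (which is why `eval.py` reads it with `header=None`). As a quick sanity check, the following minimal sketch mirrors how `eval.py` discovers the subjects, assuming the archive was extracted to `data` as above:

```
import os

# Subjects are inferred from the *_test.csv filenames, exactly as eval.py does.
test_dir = os.path.join("data", "test")
subjects = sorted(f.split("_test.csv")[0] for f in os.listdir(test_dir) if "_test.csv" in f)
print(len(subjects), "subjects found")  # expected: 57
```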

## Run the Evaluation Script

Run one of the following commands in the current directory:

- Single-GPU run
```
export CUDA_VISIBLE_DEVICES=0
python eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16'
```
- Multi-GPU run
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.fleet.launch eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16' \
--tensor_parallel_degree 4
```
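
In the multi-GPU example above, `tensor_parallel_degree` is set to 4 to match the number of GPUs made visible through `CUDA_VISIBLE_DEVICES`.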

Parameter descriptions:

- model_name_or_path: directory of the model to evaluate
- data_dir: directory containing the evaluation data (default: `data`)
- ntrain: number of few-shot examples (ntrain=5 for 5-shot evaluation)
- temperature: decoding temperature of the model
- output_dir: output path for the evaluation results
- dtype: computation dtype, e.g. `float16` or `float32` (default: `float32`)
- tensor_parallel_degree: tensor parallel degree for multi-GPU runs (default: 1)

## Evaluation Output
After prediction finishes, the model's per-question results for the 57 tasks are saved as CSV files under the output path, and `summary.json` contains the per-subcategory results and the overall average. For example, the final `All` field of the JSON file reports the overall average:

```
"All": {
  "acc": 0.36701337295690933,
  "correct": 494,
  "num": 1346
}
```

Here `acc` is the accuracy, `num` is the total number of evaluated samples, and `correct` is the number of correctly answered questions.
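
The overall accuracy can be read back from `summary.json` with a short snippet like the one below (a minimal sketch; the paths mirror the defaults in `eval.py`):

```
import json
import os

output_dir = "results"  # default --output_dir
model_name_or_path = "/path/to/your/model"  # the same value passed to eval.py
summary_path = os.path.join(output_dir, "results_{}".format(model_name_or_path), "summary.json")
with open(summary_path) as f:
    summary = json.load(f)
print("overall accuracy: {:.3f}".format(summary["All"]["acc"]))
```
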
81 changes: 81 additions & 0 deletions examples/benchmark/mmlu/categories.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

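# Maps each of the 57 MMLU subjects to one or more fine-grained subcategories;
# eval.py aggregates per-question correctness at the subcategory level using this table.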
subcategories = {
"abstract_algebra": ["math"],
"anatomy": ["health"],
"astronomy": ["physics"],
"business_ethics": ["business"],
"clinical_knowledge": ["health"],
"college_biology": ["biology"],
"college_chemistry": ["chemistry"],
"college_computer_science": ["computer science"],
"college_mathematics": ["math"],
"college_medicine": ["health"],
"college_physics": ["physics"],
"computer_security": ["computer science"],
"conceptual_physics": ["physics"],
"econometrics": ["economics"],
"electrical_engineering": ["engineering"],
"elementary_mathematics": ["math"],
"formal_logic": ["philosophy"],
"global_facts": ["other"],
"high_school_biology": ["biology"],
"high_school_chemistry": ["chemistry"],
"high_school_computer_science": ["computer science"],
"high_school_european_history": ["history"],
"high_school_geography": ["geography"],
"high_school_government_and_politics": ["politics"],
"high_school_macroeconomics": ["economics"],
"high_school_mathematics": ["math"],
"high_school_microeconomics": ["economics"],
"high_school_physics": ["physics"],
"high_school_psychology": ["psychology"],
"high_school_statistics": ["math"],
"high_school_us_history": ["history"],
"high_school_world_history": ["history"],
"human_aging": ["health"],
"human_sexuality": ["culture"],
"international_law": ["law"],
"jurisprudence": ["law"],
"logical_fallacies": ["philosophy"],
"machine_learning": ["computer science"],
"management": ["business"],
"marketing": ["business"],
"medical_genetics": ["health"],
"miscellaneous": ["other"],
"moral_disputes": ["philosophy"],
"moral_scenarios": ["philosophy"],
"nutrition": ["health"],
"philosophy": ["philosophy"],
"prehistory": ["history"],
"professional_accounting": ["other"],
"professional_law": ["law"],
"professional_medicine": ["health"],
"professional_psychology": ["psychology"],
"public_relations": ["politics"],
"security_studies": ["politics"],
"sociology": ["culture"],
"us_foreign_policy": ["politics"],
"virology": ["health"],
"world_religions": ["philosophy"],
}

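# Groups the subcategories above into four coarse categories, used by eval.py
# for the per-category accuracy printout.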
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other (business, health, misc.)": ["other", "business", "health"],
}
120 changes: 120 additions & 0 deletions examples/benchmark/mmlu/eval.py
@@ -0,0 +1,120 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test
import argparse
import json
import os

import numpy as np
import paddle
import pandas as pd
from categories import categories, subcategories
from evaluator import ModelEvaluator

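# The four answer options of every MMLU multiple-choice question.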
choices = ["A", "B", "C", "D"]


def main(args, evaluator):
subjects = sorted(
[f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f]
)
    # exist_ok=True already handles pre-existing directories, so no explicit checks are needed.
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path)), exist_ok=True)

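    # Accumulate per-question correctness arrays at three levels of granularity:
    # overall, per subcategory, and per coarse category.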
all_cors = []
subcat_cors = {subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists}
cat_cors = {cat: [] for cat in categories}
summary = {}
for subject in subjects:
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[: args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

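        # evaluator.eval is expected to return a per-question correctness array (cors),
        # the subject-level accuracy (acc), and per-choice probabilities (probs).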
cors, acc, probs = evaluator.eval(args, subject, dev_df, test_df)
subcats = subcategories[subject]
for subcat in subcats:
subcat_cors[subcat].append(cors)
for key in categories.keys():
if subcat in categories[key]:
cat_cors[key].append(cors)
all_cors.append(cors)

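        # Attach per-question correctness and per-choice probabilities to the test
        # dataframe and save it as this subject's result CSV.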
test_df["{}_correct".format(args.model_name_or_path)] = cors
for j in range(probs.shape[1]):
choice = choices[j]
test_df["{}_choice{}_probs".format(args.model_name_or_path, choice)] = probs[:, j]
test_df.to_csv(
os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "{}.csv".format(subject)),
index=None,
)

for subcat in subcat_cors:
subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
        summary[subcat] = {
            "acc": subcat_acc,
            "correct": int(np.sum(np.concatenate(subcat_cors[subcat]))),
            "num": int(np.concatenate(subcat_cors[subcat]).size),
        }

for cat in cat_cors:
cat_acc = np.mean(np.concatenate(cat_cors[cat]))
print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
print("Model:", args.model_name_or_path)
    summary["All"] = {
        "acc": weighted_acc,
        "correct": int(np.sum(np.concatenate(all_cors))),
        "num": int(np.concatenate(all_cors).size),
    }
    # Write the per-subcategory and overall results next to the per-subject CSVs.
    with open(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "summary.json"), "w") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--output_dir", type=str, default="results")
parser.add_argument("--dtype", default="float32", type=str)
parser.add_argument("--tensor_parallel_degree", default=1, type=int)

args = parser.parse_args()
print(args)

if args.tensor_parallel_degree > 1:
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.hybrid_configs = {
"mp_degree": args.tensor_parallel_degree,
}
# Set control in tensor parallel
strategy.tensor_parallel_configs = {"tensor_init_seed": 1234}
paddle.distributed.fleet.init(is_collective=True, strategy=strategy)
evaluator = ModelEvaluator(
model_name_or_path=args.model_name_or_path,
ntrain=args.ntrain,
temperature=args.temperature,
dtype=args.dtype,
tensor_parallel_degree=args.tensor_parallel_degree,
)

main(args, evaluator=evaluator)