4 changes: 1 addition & 3 deletions examples/benchmark/ceval/README.md
@@ -10,14 +10,12 @@
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
unzip ceval-exam.zip -d data
```
Place the data folder under the scripts/ceval directory of this project.

## Run the Prediction Script

Run the following script
Run the following script in the current directory

```
cd scripts/ceval
python eval.py \
--model_name_or_path /path/to/your/model \
--cot False \
60 changes: 60 additions & 0 deletions examples/benchmark/mmlu/README.md
@@ -0,0 +1,60 @@
# MMLU English Evaluation Dataset
MMLU ([Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300v3.pdf)) measures a text model's accuracy across a wide range of tasks and is one of the mainstream English evaluation datasets for LLMs. It covers 57 tasks, including elementary mathematics, US history, computer science, law, and more.

This MMLU evaluation script is adapted from the [hendrycks/test](https://github.com/hendrycks/test) project.

## Data Preparation

Download the evaluation dataset:

```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```
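
Each `*_test.csv` file has no header row and contains six columns: the question, the four options A–D, and the answer letter (which is why `eval.py` reads it with `header=None`). As a quick sanity check, the following minimal sketch mirrors how `eval.py` discovers the subjects, assuming the archive was extracted to `data` as above:

```
import os

# Subjects are inferred from the *_test.csv filenames, exactly as eval.py does.
test_dir = os.path.join("data", "test")
subjects = sorted(f.split("_test.csv")[0] for f in os.listdir(test_dir) if "_test.csv" in f)
print(len(subjects), "subjects found")  # expected: 57
```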

## Run the Evaluation Script

Run one of the following commands in the current directory:

- Single-GPU run
```
export CUDA_VISIBLE_DEVICES=0
python eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16'
```
- Multi-GPU run
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.fleet.launch eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16' \
--tensor_parallel_degree 4
```
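
In the multi-GPU example above, `tensor_parallel_degree` is set to 4 to match the number of GPUs made visible through `CUDA_VISIBLE_DEVICES`.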

Parameter descriptions:

- model_name_or_path: directory of the model to evaluate
- data_dir: directory containing the evaluation data (default: `data`)
- ntrain: number of few-shot examples (ntrain=5 for 5-shot evaluation)
- temperature: decoding temperature of the model
- output_dir: output path for the evaluation results
- dtype: computation dtype, e.g. `float16` or `float32` (default: `float32`)
- tensor_parallel_degree: tensor parallel degree for multi-GPU runs (default: 1)

## Evaluation Output
After prediction finishes, the model's per-question results for the 57 tasks are saved as CSV files under the output path, and `summary.json` contains the per-subcategory results and the overall average. For example, the final `All` field of the JSON file reports the overall average:

```
"All": {
  "acc": 0.36701337295690933,
  "correct": 494,
  "num": 1346
}
```

Here `acc` is the accuracy, `num` is the total number of evaluated samples, and `correct` is the number of correctly answered questions.
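
The overall accuracy can be read back from `summary.json` with a short snippet like the one below (a minimal sketch; the paths mirror the defaults in `eval.py`):

```
import json
import os

output_dir = "results"  # default --output_dir
model_name_or_path = "/path/to/your/model"  # the same value passed to eval.py
summary_path = os.path.join(output_dir, "results_{}".format(model_name_or_path), "summary.json")
with open(summary_path) as f:
    summary = json.load(f)
print("overall accuracy: {:.3f}".format(summary["All"]["acc"]))
```
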
81 changes: 81 additions & 0 deletions examples/benchmark/mmlu/categories.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

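# Maps each of the 57 MMLU subjects to one or more fine-grained subcategories;
# eval.py aggregates per-question correctness at the subcategory level using this table.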
subcategories = {
"abstract_algebra": ["math"],
"anatomy": ["health"],
"astronomy": ["physics"],
"business_ethics": ["business"],
"clinical_knowledge": ["health"],
"college_biology": ["biology"],
"college_chemistry": ["chemistry"],
"college_computer_science": ["computer science"],
"college_mathematics": ["math"],
"college_medicine": ["health"],
"college_physics": ["physics"],
"computer_security": ["computer science"],
"conceptual_physics": ["physics"],
"econometrics": ["economics"],
"electrical_engineering": ["engineering"],
"elementary_mathematics": ["math"],
"formal_logic": ["philosophy"],
"global_facts": ["other"],
"high_school_biology": ["biology"],
"high_school_chemistry": ["chemistry"],
"high_school_computer_science": ["computer science"],
"high_school_european_history": ["history"],
"high_school_geography": ["geography"],
"high_school_government_and_politics": ["politics"],
"high_school_macroeconomics": ["economics"],
"high_school_mathematics": ["math"],
"high_school_microeconomics": ["economics"],
"high_school_physics": ["physics"],
"high_school_psychology": ["psychology"],
"high_school_statistics": ["math"],
"high_school_us_history": ["history"],
"high_school_world_history": ["history"],
"human_aging": ["health"],
"human_sexuality": ["culture"],
"international_law": ["law"],
"jurisprudence": ["law"],
"logical_fallacies": ["philosophy"],
"machine_learning": ["computer science"],
"management": ["business"],
"marketing": ["business"],
"medical_genetics": ["health"],
"miscellaneous": ["other"],
"moral_disputes": ["philosophy"],
"moral_scenarios": ["philosophy"],
"nutrition": ["health"],
"philosophy": ["philosophy"],
"prehistory": ["history"],
"professional_accounting": ["other"],
"professional_law": ["law"],
"professional_medicine": ["health"],
"professional_psychology": ["psychology"],
"public_relations": ["politics"],
"security_studies": ["politics"],
"sociology": ["culture"],
"us_foreign_policy": ["politics"],
"virology": ["health"],
"world_religions": ["philosophy"],
}

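# Groups the subcategories above into four coarse categories, used by eval.py
# for the per-category accuracy printout.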
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other (business, health, misc.)": ["other", "business", "health"],
}
120 changes: 120 additions & 0 deletions examples/benchmark/mmlu/eval.py
@@ -0,0 +1,120 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test
import argparse
import json
import os

import numpy as np
import paddle
import pandas as pd
from categories import categories, subcategories
from evaluator import ModelEvaluator

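# The four answer options of every MMLU multiple-choice question.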
choices = ["A", "B", "C", "D"]


def main(args, evaluator):
subjects = sorted(
[f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f]
)
    # exist_ok=True already handles pre-existing directories, so no explicit checks are needed.
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path)), exist_ok=True)

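    # Accumulate per-question correctness arrays at three levels of granularity:
    # overall, per subcategory, and per coarse category.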
all_cors = []
subcat_cors = {subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists}
cat_cors = {cat: [] for cat in categories}
summary = {}
for subject in subjects:
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[: args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

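        # evaluator.eval is expected to return a per-question correctness array (cors),
        # the subject-level accuracy (acc), and per-choice probabilities (probs).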
cors, acc, probs = evaluator.eval(args, subject, dev_df, test_df)
subcats = subcategories[subject]
for subcat in subcats:
subcat_cors[subcat].append(cors)
for key in categories.keys():
if subcat in categories[key]:
cat_cors[key].append(cors)
all_cors.append(cors)

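        # Attach per-question correctness and per-choice probabilities to the test
        # dataframe and save it as this subject's result CSV.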
test_df["{}_correct".format(args.model_name_or_path)] = cors
for j in range(probs.shape[1]):
choice = choices[j]
test_df["{}_choice{}_probs".format(args.model_name_or_path, choice)] = probs[:, j]
test_df.to_csv(
os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "{}.csv".format(subject)),
index=None,
)

for subcat in subcat_cors:
subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
        summary[subcat] = {
            "acc": subcat_acc,
            "correct": int(np.sum(np.concatenate(subcat_cors[subcat]))),
            "num": int(np.concatenate(subcat_cors[subcat]).size),
        }

for cat in cat_cors:
cat_acc = np.mean(np.concatenate(cat_cors[cat]))
print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
print("Model:", args.model_name_or_path)
    summary["All"] = {
        "acc": weighted_acc,
        "correct": int(np.sum(np.concatenate(all_cors))),
        "num": int(np.concatenate(all_cors).size),
    }
    # Write the per-subcategory and overall results next to the per-subject CSVs.
    with open(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "summary.json"), "w") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--output_dir", type=str, default="results")
parser.add_argument("--dtype", default="float32", type=str)
parser.add_argument("--tensor_parallel_degree", default=1, type=int)

args = parser.parse_args()
print(args)

if args.tensor_parallel_degree > 1:
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.hybrid_configs = {
"mp_degree": args.tensor_parallel_degree,
}
# Set control in tensor parallel
strategy.tensor_parallel_configs = {"tensor_init_seed": 1234}
paddle.distributed.fleet.init(is_collective=True, strategy=strategy)
evaluator = ModelEvaluator(
model_name_or_path=args.model_name_or_path,
ntrain=args.ntrain,
temperature=args.temperature,
dtype=args.dtype,
tensor_parallel_degree=args.tensor_parallel_degree,
)

main(args, evaluator=evaluator)