4 changes: 1 addition & 3 deletions examples/benchmark/ceval/README.md
@@ -10,14 +10,12 @@
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
unzip ceval-exam.zip -d data
```
Place the data folder under the scripts/ceval directory of this project.

## Run the Prediction Script

Run the following script
Run the following script in the current directory

```
cd scripts/ceval
python eval.py \
--model_name_or_path /path/to/your/model \
--cot False \
47 changes: 47 additions & 0 deletions examples/benchmark/mmlu/README.md
@@ -0,0 +1,47 @@
# MMLU English Evaluation Dataset
MMLU ([Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300v3.pdf)) measures a text model's accuracy across a wide range of tasks and is currently one of the mainstream English evaluation datasets for LLMs. It covers 57 tasks, including elementary mathematics, US history, computer science, law, and more.

This MMLU evaluation script is adapted from the [hendrycks/test](https://github.com/hendrycks/test) project.

## Data Preparation

Download the evaluation dataset from the following link:

```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```

## Run the Evaluation Script

Run the following script in the current directory:

```
python eval.py \
--model_name_or_path /path/to/your/model \
--temperature 0.2 \
--ntrain 5 \
--output_dir ${output_path} \
--dtype 'float16'
```

Argument descriptions:

- model_name_or_path: directory of the model to evaluate
- ntrain: number of few-shot examples (5-shot: ntrain=5)
- temperature: decoding temperature of the model
- data_dir: directory containing the unpacked MMLU data (default: data)
- output_dir: directory where the evaluation results are written
- dtype: data type used to load the model, e.g. float16 or float32 (default: float32)

## Evaluation Output
After prediction finishes, the model's answers for the 57 tasks are saved as CSV files (one per task) under the output path, and `summary.json` records the per-topic results together with the overall average. For example, the final `All` field of the JSON file shows the overall result:

```
"All": {
"score": 0.36701337295690933,
"num": 1346,
"correct": 494.0
}
```

Here `acc` is the accuracy, `num` is the total number of test samples, and `correct` is the number of correctly answered questions.
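
To inspect the results programmatically, the summary file can be read back with a few lines of Python. This is a minimal sketch; the `results` directory and `your-model` name below are placeholders for your own `--output_dir` and model directory:

```python
import json
import os

# Placeholder values; substitute your own output_dir and model name/path.
output_dir = "results"
model_name_or_path = "your-model"

summary_path = os.path.join(output_dir, "results_{}".format(model_name_or_path), "summary.json")
with open(summary_path) as f:
    summary = json.load(f)

# Overall accuracy followed by the per-topic breakdown.
print("All: {:.3f} ({}/{})".format(summary["All"]["acc"], summary["All"]["correct"], summary["All"]["num"]))
for topic, stats in summary.items():
    if topic != "All":
        print("{:40s} {:.3f}".format(topic, stats["acc"]))
```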
81 changes: 81 additions & 0 deletions examples/benchmark/mmlu/categories.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

subcategories = {
"abstract_algebra": ["math"],
"anatomy": ["health"],
"astronomy": ["physics"],
"business_ethics": ["business"],
"clinical_knowledge": ["health"],
"college_biology": ["biology"],
"college_chemistry": ["chemistry"],
"college_computer_science": ["computer science"],
"college_mathematics": ["math"],
"college_medicine": ["health"],
"college_physics": ["physics"],
"computer_security": ["computer science"],
"conceptual_physics": ["physics"],
"econometrics": ["economics"],
"electrical_engineering": ["engineering"],
"elementary_mathematics": ["math"],
"formal_logic": ["philosophy"],
"global_facts": ["other"],
"high_school_biology": ["biology"],
"high_school_chemistry": ["chemistry"],
"high_school_computer_science": ["computer science"],
"high_school_european_history": ["history"],
"high_school_geography": ["geography"],
"high_school_government_and_politics": ["politics"],
"high_school_macroeconomics": ["economics"],
"high_school_mathematics": ["math"],
"high_school_microeconomics": ["economics"],
"high_school_physics": ["physics"],
"high_school_psychology": ["psychology"],
"high_school_statistics": ["math"],
"high_school_us_history": ["history"],
"high_school_world_history": ["history"],
"human_aging": ["health"],
"human_sexuality": ["culture"],
"international_law": ["law"],
"jurisprudence": ["law"],
"logical_fallacies": ["philosophy"],
"machine_learning": ["computer science"],
"management": ["business"],
"marketing": ["business"],
"medical_genetics": ["health"],
"miscellaneous": ["other"],
"moral_disputes": ["philosophy"],
"moral_scenarios": ["philosophy"],
"nutrition": ["health"],
"philosophy": ["philosophy"],
"prehistory": ["history"],
"professional_accounting": ["other"],
"professional_law": ["law"],
"professional_medicine": ["health"],
"professional_psychology": ["psychology"],
"public_relations": ["politics"],
"security_studies": ["politics"],
"sociology": ["culture"],
"us_foreign_policy": ["politics"],
"virology": ["health"],
"world_religions": ["philosophy"],
}

categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other (business, health, misc.)": ["other", "business", "health"],
}
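
At evaluation time these two mappings are chained: each test subject is first mapped to its subcategory label(s) via `subcategories`, and those labels determine the coarse group in `categories`, which is how eval.py below accumulates results. A small standalone sketch of that lookup; the `category_of` helper is only illustrative and not part of the scripts:

```python
from categories import categories, subcategories


def category_of(subject):
    """Map an MMLU subject to its coarse categor(ies), e.g. 'astronomy' -> ['STEM']."""
    labels = subcategories[subject]  # e.g. ['physics']
    return [cat for cat, members in categories.items() if any(label in members for label in labels)]


print(category_of("astronomy"))        # ['STEM']
print(category_of("human_sexuality"))  # ['social sciences']
```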
109 changes: 109 additions & 0 deletions examples/benchmark/mmlu/eval.py
@@ -0,0 +1,109 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test
import argparse
import json
import os

import numpy as np
import pandas as pd
from categories import categories, subcategories
from evaluator import ModelEvaluator

choices = ["A", "B", "C", "D"]


def main(args, evaluator):
subjects = sorted(
[f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f]
)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if not os.path.exists(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path))):
os.makedirs(os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path)))

all_cors = []
subcat_cors = {subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists}
cat_cors = {cat: [] for cat in categories}
summary = {}
for subject in subjects:
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[: args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

        cors, acc, probs = evaluator.eval(subject, dev_df, test_df)
subcats = subcategories[subject]
for subcat in subcats:
subcat_cors[subcat].append(cors)
for key in categories.keys():
if subcat in categories[key]:
cat_cors[key].append(cors)
all_cors.append(cors)

test_df["{}_correct".format(args.model_name_or_path)] = cors
for j in range(probs.shape[1]):
choice = choices[j]
test_df["{}_choice{}_probs".format(args.model_name_or_path, choice)] = probs[:, j]
test_df.to_csv(
os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "{}.csv".format(subject)),
index=None,
)

    for subcat in subcat_cors:
        subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
        print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
        summary[subcat] = {
            "acc": subcat_acc,
            "correct": int(np.sum(np.concatenate(subcat_cors[subcat]))),
            "num": int(np.concatenate(subcat_cors[subcat]).size),
        }

    for cat in cat_cors:
        cat_acc = np.mean(np.concatenate(cat_cors[cat]))
        print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
        summary[cat] = {
            "acc": cat_acc,
            "correct": int(np.sum(np.concatenate(cat_cors[cat]))),
            "num": int(np.concatenate(cat_cors[cat]).size),
        }
    weighted_acc = np.mean(np.concatenate(all_cors))
    print("Average accuracy: {:.3f}".format(weighted_acc))
    print("Model:", args.model_name_or_path)
    summary["All"] = {
        "acc": weighted_acc,
        "correct": int(np.sum(np.concatenate(all_cors))),
        "num": int(np.concatenate(all_cors).size),
    }
    with open(
        os.path.join(args.output_dir, "results_{}".format(args.model_name_or_path), "summary.json"), "w"
    ) as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--output_dir", type=str, default="results")
parser.add_argument("--dtype", default="float32", type=str)

args = parser.parse_args()
print(args)

evaluator = ModelEvaluator(
model_name_or_path=args.model_name_or_path,
ntrain=args.ntrain,
temperature=args.temperature,
dtype=args.dtype,
)

main(args, evaluator=evaluator)
110 changes: 110 additions & 0 deletions examples/benchmark/mmlu/evaluator.py
@@ -0,0 +1,110 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/hendrycks/test

import numpy as np
import paddle
from tqdm import tqdm

from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

choices = ["A", "B", "C", "D"]


class ModelEvaluator(object):
def __init__(self, model_name_or_path, ntrain, temperature=0.2, dtype="float32"):
self.model_name_or_path = model_name_or_path
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype=dtype, low_cpu_mem_usage=True)
self.model.eval()
self.generation_config = dict(
temperature=temperature,
top_k=40,
top_p=0.9,
do_sample=True,
num_beams=1,
repetition_penalty=1.1,
max_new_tokens=20,
)

self.A_id = self.tokenizer.encode("A", add_special_tokens=False)["input_ids"][0]
self.B_id = self.tokenizer.encode("B", add_special_tokens=False)["input_ids"][0]
self.C_id = self.tokenizer.encode("C", add_special_tokens=False)["input_ids"][0]
self.D_id = self.tokenizer.encode("D", add_special_tokens=False)["input_ids"][0]
self.ntrain = ntrain

def format_subject(self, subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s

def gen_prompt(self, train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
self.format_subject(subject)
)
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += self.format_example(train_df, i)
return prompt

def format_example(self, df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt

def eval(self, subject, dev_df, test_df, do_ptq=False):
cors = []
all_probs = []
for i in tqdm(range(test_df.shape[0]), total=test_df.shape[0]):
            # build the k-shot prompt: k dev examples followed by the current test question
k = self.ntrain
prompt_end = self.format_example(test_df, i, include_answer=False)
train_prompt = self.gen_prompt(dev_df, subject, k)
prompt = train_prompt + prompt_end

inputs = self.tokenizer(prompt, return_tensors="pd")
label = test_df.iloc[i, test_df.shape[1] - 1]

            with paddle.no_grad():
                logits = self.model(**inputs)[0][0, -1, :]
            # pick the answer by comparing the logits of the four option tokens at the final position
            choices_logits = logits[[self.A_id, self.B_id, self.C_id, self.D_id]].numpy()
            assert not (np.any(np.isinf(choices_logits)) or np.any(np.isnan(choices_logits)))
            ans = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(choices_logits)]

cor = ans == label
cors.append(cor)
all_probs.append(choices_logits)

print(f"\n=======begin {str(i)}=======")
print("prompt: ", prompt)
print("ans: ", ans)
print("ground truth: ", label, "\n")
print(f"=======end {str(i)}=======")

acc = np.mean(cors)
cors = np.array(cors)

all_probs = np.array(all_probs)
print("Average accuracy {:.3f} - {}".format(acc, subject))

return cors, acc, all_probs
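
For debugging a single subject without going through eval.py, the evaluator can also be driven directly. A minimal sketch; the model path and data directory below are placeholders and assume the MMLU data has been unpacked as described in the README:

```python
import os

import pandas as pd

from evaluator import ModelEvaluator

# Placeholder paths; point these at your own model and the unpacked MMLU data directory.
model_name_or_path = "/path/to/your/model"
data_dir = "data"
subject = "astronomy"
ntrain = 5

evaluator = ModelEvaluator(model_name_or_path=model_name_or_path, ntrain=ntrain, temperature=0.2, dtype="float16")

# Few-shot examples come from the dev split; questions come from the test split.
dev_df = pd.read_csv(os.path.join(data_dir, "dev", subject + "_dev.csv"), header=None)[:ntrain]
test_df = pd.read_csv(os.path.join(data_dir, "test", subject + "_test.csv"), header=None)

cors, acc, probs = evaluator.eval(subject, dev_df, test_df)
print("{}: accuracy {:.3f} over {} questions".format(subject, acc, len(cors)))
```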