
Commit 868e7a2

LiuChiachi and tianxin authored
Add PP-MiniLM (#1403)
* This is a combination of 2 commits. The first commit's message: update inference. The 2nd commit's message: update.
* solve conflicts; fix infer perf; remove useless comments
* update readme
* update readme
* delete useless char
* update readme; update code and readme; add serial number to readme; add a catalog; fix catalog bugs
* update readme
* update readme, add general readme, remove 'ofa'
* remove infe_perf
* Update README
* Update README
* remove blank space between Chinese characters and numbers
* add blank space; fix directory
* update data in readme
* add blank space
* update readme

Co-authored-by: tianxin <[email protected]>
1 parent cc58a23 commit 868e7a2

File tree

24 files changed: +2933, -3 lines changed


examples/model_compression/PP-MiniLM/README.md

Lines changed: 389 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from paddle.metric import Metric, Accuracy
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer

MODEL_CLASSES = {
    "ernie": (ErnieForSequenceClassification, ErnieTokenizer),
    "bert": (BertForSequenceClassification, BertTokenizer)
}

METRIC_CLASSES = {
    "afqmc": Accuracy,
    "tnews": Accuracy,
    "iflytek": Accuracy,
    "ocnli": Accuracy,
    "cmnli": Accuracy,
    "cluewsc2020": Accuracy,
    "csl": Accuracy,
}


def convert_example(example,
                    tokenizer,
                    label_list,
                    max_seq_length=512,
                    is_test=False):
    """convert a glue example into necessary features"""
    if not is_test:
        # `label_list == None` is for regression task
        label_dtype = "int64" if label_list else "float32"
        # Get the label
        label = example['label']
        label = np.array([label], dtype=label_dtype)
    # Convert raw text to feature
    if 'sentence' in example:
        example = tokenizer(example['sentence'], max_seq_len=max_seq_length)
    elif 'sentence1' in example:
        example = tokenizer(
            example['sentence1'],
            text_pair=example['sentence2'],
            max_seq_len=max_seq_length)
    elif 'keyword' in example:  # CSL
        sentence1 = " ".join(example['keyword'])
        example = tokenizer(
            sentence1, text_pair=example['abst'], max_seq_len=max_seq_length)
    elif 'target' in example:  # wsc
        text, query, pronoun, query_idx, pronoun_idx = example['text'], example[
            'target']['span1_text'], example['target']['span2_text'], example[
                'target']['span1_index'], example['target']['span2_index']
        text_list = list(text)
        assert text[pronoun_idx:(pronoun_idx + len(pronoun)
                                 )] == pronoun, "pronoun: {}".format(pronoun)
        assert text[query_idx:(query_idx + len(query)
                               )] == query, "query: {}".format(query)
        if pronoun_idx > query_idx:
            text_list.insert(query_idx, "_")
            text_list.insert(query_idx + len(query) + 1, "_")
            text_list.insert(pronoun_idx + 2, "[")
            text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]")
        else:
            text_list.insert(pronoun_idx, "[")
            text_list.insert(pronoun_idx + len(pronoun) + 1, "]")
            text_list.insert(query_idx + 2, "_")
            text_list.insert(query_idx + len(query) + 2 + 1, "_")
        text = "".join(text_list)
        example = tokenizer(text, max_seq_len=max_seq_length)

    if not is_test:
        return example['input_ids'], example['token_type_ids'], label
    else:
        return example['input_ids'], example['token_type_ids']
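
For reference, a minimal usage sketch (not part of this commit) showing how convert_example might be applied to a single record; the pretrained tokenizer name "ernie-1.0", the label list and the sample sentence are illustrative assumptions:

from functools import partial

from paddlenlp.transformers import ErnieTokenizer

# Illustrative values only: the pretrained name, label list and sample record
# are assumptions, not taken from this commit.
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
sample = {"sentence": "这个产品的质量怎么样", "label": 0}

trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    label_list=["0", "1"],
    max_seq_length=128)
input_ids, token_type_ids, label = trans_func(sample)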
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

import paddle

from run_clue import MODEL_CLASSES


def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()), )
    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=True,
        help="Path of the trained model to be exported.", )
    parser.add_argument(
        "--output_path",
        default=None,
        type=str,
        required=True,
        help="The output file prefix used to save the exported inference model.",
    )
    args = parser.parse_args()
    return args


def main():
    args = parse_args()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Build the model and load the trained parameters.
    model = model_class.from_pretrained(args.model_path)
    # Switch to eval mode.
    model.eval()
    # Convert to a static graph with a specific input description.
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")  # segment_ids
        ])
    # Save the converted static-graph model.
    paddle.jit.save(model, args.output_path)
    # Also save the tokenizer for inference usage.
    tokenizer = tokenizer_class.from_pretrained(args.model_path)
    tokenizer.save_pretrained(os.path.dirname(args.output_path))


if __name__ == "__main__":
    main()
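
For orientation, a minimal sketch (not part of this commit) of loading the exported static-graph model back for a forward pass; the path prefix "./inference/float" and the dummy token ids are assumptions:

import paddle

# Illustrative only: the prefix must match the --output_path passed to the
# export script above, and the token ids below are dummies.
model = paddle.jit.load("./inference/float")
model.eval()

input_ids = paddle.to_tensor([[1, 647, 986, 2]], dtype="int64")
token_type_ids = paddle.zeros_like(input_ids)  # segment ids, all zeros
logits = model(input_ids, token_type_ids)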
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# $1 means GENERAL_DIR

# The penultimate argument is the GPU card id; change this script if necessary.
bash run_one_search.sh $1 afqmc 0 &
bash run_one_search.sh $1 tnews 1 &
bash run_one_search.sh $1 ifly 2 &
bash run_one_search.sh $1 ocnli 3 &
bash run_one_search.sh $1 csl 4 &
bash run_one_search.sh $1 wsc 5 &

# Because the CMNLI dataset is significantly larger than the other datasets,
# its runs are placed on separate cards.
lr=1e-4
bs=16
sh run_clue.sh CMNLI $lr $bs 3 128 0 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=32
sh run_clue.sh CMNLI $lr $bs 3 128 1 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=64
sh run_clue.sh CMNLI $lr $bs 3 128 2 $1 > $1/cmnli/${lr}_${bs}_3_128.log &

lr=5e-5
bs=16
sh run_clue.sh CMNLI $lr $bs 3 128 3 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=32
sh run_clue.sh CMNLI $lr $bs 3 128 4 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=64
sh run_clue.sh CMNLI $lr $bs 3 128 5 $1 > $1/cmnli/${lr}_${bs}_3_128.log &

lr=3e-5
bs=16
sh run_clue.sh CMNLI $lr $bs 3 128 6 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=32
sh run_clue.sh CMNLI $lr $bs 3 128 5 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
bs=64
sh run_clue.sh CMNLI $lr $bs 3 128 7 $1 > $1/cmnli/${lr}_${bs}_3_128.log &
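
A hypothetical way to launch this grid search, assuming the script above is saved as run_all_search.sh (its file name is not shown in this diff) and that GENERAL_DIR already contains a cmnli subdirectory for the redirected logs:

# Illustrative only: the script name and output directory are assumptions.
mkdir -p ./general_search/cmnli
bash run_all_search.sh ./general_search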
