Commit 86489d8

Paddle benchmark (#1827)
Parent: ac6fc3a

105 files changed: 3422 additions, 236 deletions

examples/language_model/bert/run_pretrain.py

Lines changed: 14 additions & 0 deletions
@@ -32,6 +32,7 @@
 from paddle.io import DataLoader, Dataset

 from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.utils import profiler
 from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion

@@ -162,6 +163,14 @@ def parse_args():
         type=distutils.util.strtobool,
         default=False,
         help="Enable training under @to_static.")
+
+    # For benchmark.
+    parser.add_argument(
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
     args = parser.parse_args()
     return args

@@ -439,6 +448,11 @@ def do_train(args):
             total_samples += args.batch_size
             train_run_cost = time.time() - batch_start
             train_cost_avg.record(train_run_cost)
+
+            # Profile for model benchmark
+            if args.profiler_options is not None:
+                profiler.add_profiler_step(args.profiler_options)
+
             if global_step % args.logging_steps == 0:
                 if paddle.distributed.get_rank() == 0:
                     logger.info(
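
The same benchmark hook is threaded through several training scripts in this commit: an optional --profiler_options flag plus a per-step call to profiler.add_profiler_step. Below is a minimal standalone sketch of that pattern, assuming paddlenlp is installed; the toy loop only stands in for the real training loop.

import argparse
import time

from paddlenlp.utils import profiler

parser = argparse.ArgumentParser()
parser.add_argument(
    '--profiler_options',
    type=str,
    default=None,
    help='The option of profiler, which should be in format "key1=value1;key2=value2;key3=value3".')
args = parser.parse_args()

for step in range(10):   # stand-in for the training loop
    time.sleep(0.01)     # stand-in for one training step
    # Profile for model benchmark: skipped entirely unless --profiler_options is passed.
    if args.profiler_options is not None:
        profiler.add_profiler_step(args.profiler_options)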

examples/language_model/bert/static/run_pretrain.py

Lines changed: 14 additions & 0 deletions
@@ -29,6 +29,7 @@
 import paddle.distributed.fleet as fleet
 from paddle.io import DataLoader, Dataset

+from paddlenlp.utils import profiler
 from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import BertTokenizer

@@ -154,6 +155,14 @@ def parse_args():
         default=1,
         help="Number of merge steps before gradient update."
         "global_batch_size = gradient_merge_steps * batch_size.")
+
+    # For benchmark.
+    parser.add_argument(
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
     args = parser.parse_args()
     return args

@@ -385,6 +394,11 @@ def do_train(args):
             lr_scheduler.step()
             train_run_cost = time.time() - batch_start
             train_cost_avg.record(train_run_cost)
+
+            # Profile for model benchmark
+            if args.profiler_options is not None:
+                profiler.add_profiler_step(args.profiler_options)
+
             if global_step % args.logging_steps == 0:
                 print(
                     "tobal step: %d, epoch: %d, batch: %d, loss: %f, "

examples/language_model/gpt/dataset.py

Lines changed: 3 additions & 0 deletions
@@ -272,6 +272,9 @@ def create_pretrained_dataset(
     if local_rank == 0:
         start_time = time.time()
         print('> compiling dataset index builder ...')
+        sys.path.append(
+            os.path.abspath(
+                os.path.join(os.path.dirname(__file__), os.pardir)))
         from data_tools.dataset_utils import compile_helper
         compile_helper()
         print(
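
The three added lines put the parent directory of dataset.py on sys.path so the sibling data_tools package can be imported before compile_helper runs. A minimal sketch of the same path arithmetic (the directory layout is hypothetical; only the os.path calls matter):

import os
import sys

# Resolve the directory one level above this file and add it to sys.path,
# so a sibling package such as data_tools becomes importable.
parent_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append(parent_dir)
print(parent_dir)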
examples/language_model/rnnlm/reader.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import numpy as np
+
+import paddle
+
+from paddlenlp.datasets import load_dataset
+from paddlenlp.data import Vocab
+
+
+def create_data_loader(batch_size, num_steps, data_path=None):
+    train_ds, valid_ds, test_ds = load_dataset(
+        'ptb', splits=('train', 'valid', 'test'))
+
+    train_examples = [
+        train_ds[i]['sentence'].split() for i in range(len(train_ds))
+    ]
+    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')
+
+    # Because the sentences in PTB dataset might be consecutive, we need to concatenate
+    # all texts from our dataset and fold them into chunks while the number of rows is
+    # equal to batch size. For example:
+    #
+    # Sentence1: we're talking about years ago before anyone heard of asbestos having
+    # any questionable properties.
+    # Sentence2: there is no asbestos in our products now.
+    # Batch_size: 5
+    # Grouped_text: [["we're", "talking", "about", "years"],
+    #                ["ago", "before", "anyone", "heard"],
+    #                ["of", "asbestos", "having", "any"],
+    #                ["questionable", "properties", "there", "is"],
+    #                ["no", "asbestos", "in", "our"]]
+    #
+    def group_texts(examples):
+        concat_examples = []
+        for example in examples:
+            concat_examples += example['sentence'].split() + ['</eos>']
+
+        concat_examples = vocab.to_indices(concat_examples)
+
+        max_seq_len = len(concat_examples) // batch_size
+        reshaped_examples = np.asarray(
+            concat_examples[0:batch_size * max_seq_len], dtype='int64').reshape(
+                (batch_size, max_seq_len))
+        encoded_examples = []
+        for i in range(max_seq_len // num_steps):
+            encoded_examples.append(
+                (np.copy(reshaped_examples[:, i * num_steps:(i + 1) *
+                                           num_steps]),
+                 np.copy(reshaped_examples[:, i * num_steps + 1:(i + 1) *
+                                           num_steps + 1])))
+
+        return encoded_examples
+
+    train_ds.map(group_texts, batched=True)
+    valid_ds.map(group_texts, batched=True)
+    test_ds.map(group_texts, batched=True)
+
+    train_loader = paddle.io.DataLoader(
+        train_ds, return_list=True, batch_size=None)
+    valid_loader = paddle.io.DataLoader(
+        valid_ds, return_list=True, batch_size=None)
+    test_loader = paddle.io.DataLoader(
+        test_ds, return_list=True, batch_size=None)
+    return train_loader, valid_loader, test_loader, len(vocab)
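
A minimal sketch of consuming the new reader from the rnnlm example directory (assumes paddlenlp is installed and the PTB dataset can be downloaded; the batch_size and num_steps values below are only illustrative):

from reader import create_data_loader

train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
    batch_size=20, num_steps=35)

print('vocab size:', vocab_size)
for inputs, labels in train_loader:
    # Each batch pairs a (batch_size, num_steps) chunk of token ids with its
    # one-step-shifted targets, exactly as group_texts builds them above.
    print(inputs.shape, labels.shape)
    break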

examples/language_model/rnnlm/train.py

Lines changed: 1 addition & 59 deletions
@@ -5,71 +5,13 @@

 from model import RnnLm, CrossEntropyLossForLm, UpdateModel
 from args import parse_args
+from reader import create_data_loader

-from paddlenlp.datasets import load_dataset
 from paddlenlp.metrics import Perplexity
-from paddlenlp.data import Vocab

 paddle.seed(102)


-def create_data_loader(batch_size, num_steps, data_path):
-    train_ds, valid_ds, test_ds = load_dataset(
-        'ptb', splits=('train', 'valid', 'test'))
-
-    train_examples = [
-        train_ds[i]['sentence'].split() for i in range(len(train_ds))
-    ]
-    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')
-
-    # Because the sentences in PTB dataset might be consecutive, we need to concatenate
-    # all texts from our dataset and fold them into chunks while the number of rows is
-    # equal to batch size. For example:
-    #
-    # Sentence1: we're talking about years ago before anyone heard of asbestos having
-    # any questionable properties.
-    # Sentence2: there is no asbestos in our products now.
-    # Batch_size: 5
-    # Grouped_text: [["we're", "talking", "about", "years"],
-    #                ["ago", "before", "anyone", "heard"],
-    #                ["of", "asbestos", "having", "any"],
-    #                ["questionable", "properties", "there", "is"],
-    #                ["no", "asbestos", "in", "our"]]
-    #
-    def group_texts(examples):
-        concat_examples = []
-        for example in examples:
-            concat_examples += example['sentence'].split() + ['</eos>']
-
-        concat_examples = vocab.to_indices(concat_examples)
-
-        max_seq_len = len(concat_examples) // batch_size
-        reshaped_examples = np.asarray(
-            concat_examples[0:batch_size * max_seq_len], dtype='int64').reshape(
-                (batch_size, max_seq_len))
-        encoded_examples = []
-        for i in range(max_seq_len // num_steps):
-            encoded_examples.append(
-                (np.copy(reshaped_examples[:, i * num_steps:(i + 1) *
-                                           num_steps]),
-                 np.copy(reshaped_examples[:, i * num_steps + 1:(i + 1) *
-                                           num_steps + 1])))
-
-        return encoded_examples
-
-    train_ds.map(group_texts, batched=True)
-    valid_ds.map(group_texts, batched=True)
-    test_ds.map(group_texts, batched=True)
-
-    train_loader = paddle.io.DataLoader(
-        train_ds, return_list=True, batch_size=None)
-    valid_loader = paddle.io.DataLoader(
-        valid_ds, return_list=True, batch_size=None)
-    test_loader = paddle.io.DataLoader(
-        test_ds, return_list=True, batch_size=None)
-    return train_loader, valid_loader, test_loader, len(vocab)
-
-
 def train(args):
     paddle.set_device(args.device)
     data_path = args.data_path

examples/language_model/xlnet/run_glue.py

Lines changed: 21 additions & 13 deletions
@@ -153,20 +153,8 @@ def convert_example(example,
         'attention_mask']


-def do_train(args):
-    paddle.set_device(args.device)
-    if paddle.distributed.get_world_size() > 1:
-        paddle.distributed.init_parallel_env()
-
-    set_seed(args)
-    global final_res
-
-    args.task_name = args.task_name.lower()
-    metric_class = METRIC_CLASSES[args.task_name]
-    model_class, tokenizer_class = XLNetForSequenceClassification, XLNetTokenizer
-
+def create_data_loader(args, tokenizer):
     train_ds = load_dataset('glue', args.task_name, splits="train")
-    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

     trans_func = partial(
         convert_example,

@@ -226,6 +214,26 @@ def do_train(args):
         num_workers=0,
         return_list=True)

+    return train_data_loader, dev_data_loader, train_ds, dev_ds
+
+
+def do_train(args):
+    paddle.set_device(args.device)
+    if paddle.distributed.get_world_size() > 1:
+        paddle.distributed.init_parallel_env()
+
+    set_seed(args)
+    global final_res
+
+    args.task_name = args.task_name.lower()
+    metric_class = METRIC_CLASSES[args.task_name]
+    model_class, tokenizer_class = XLNetForSequenceClassification, XLNetTokenizer
+
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+
+    train_data_loader, dev_data_loader, train_ds, dev_ds = create_data_loader(
+        args, tokenizer)
+
     num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
     model = XLNetForSequenceClassification.from_pretrained(
         args.model_name_or_path, num_classes=num_classes)

examples/machine_translation/seq2seq/args.py

Lines changed: 1 addition & 7 deletions
@@ -18,12 +18,6 @@
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)

-    parser.add_argument(
-        "--optimizer",
-        type=str,
-        default='adam',
-        help="optimizer to use, only supprt[sgd|adam]")
-
     parser.add_argument(
         "--learning_rate",
         type=float,

@@ -55,7 +49,7 @@ def parse_args():
         help="max length for source and target sentence")

     parser.add_argument(
-        "--dropout", type=float, default=0.0, help="drop probability")
+        "--dropout", type=float, default=0.2, help="drop probability")

     parser.add_argument(
         "--init_scale",

examples/machine_translation/transformer/static/train.py

Lines changed: 14 additions & 0 deletions
@@ -13,6 +13,7 @@
 import paddle.distributed.fleet as fleet
 import paddle.distributed as dist

+from paddlenlp.utils import profiler
 from paddlenlp.transformers import TransformerModel, CrossEntropyCriterion

 sys.path.append(

@@ -82,6 +83,14 @@ def parse_args():
         default=None,
         type=str,
         help="The eos token. It should be provided when use custom vocab_file. ")
+
+    # For benchmark.
+    parser.add_argument(
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
     args = parser.parse_args()
     return args

@@ -257,6 +266,10 @@ def do_train(args):
             reader_cost_avg.record(train_reader_cost)
             batch_cost_avg.record(train_batch_cost)

+            # Profile for model benchmark
+            if args.profiler_options is not None:
+                profiler.add_profiler_step(args.profiler_options)
+
             if step_idx % args.print_step == 0 and (args.benchmark or (
                     args.is_distributed and dist.get_rank() == 0) or
                     not args.is_distributed):

@@ -330,5 +343,6 @@ def do_train(args):
     args.bos_token = ARGS.bos_token
     args.eos_token = ARGS.eos_token
     pprint(args)
+    args.profiler_options = ARGS.profiler_options

     do_train(args)
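
The last hunk follows this script's existing pattern of copying selected command-line flags (ARGS) onto the loaded training config (args) before do_train runs; profiler_options now joins bos_token and eos_token. A minimal sketch of that pattern, with SimpleNamespace standing in for the script's real config object, which is not shown in this diff:

import argparse
from types import SimpleNamespace

config = SimpleNamespace(profiler_options=None)   # stand-in for the loaded config

parser = argparse.ArgumentParser()
parser.add_argument('--profiler_options', type=str, default=None)
cli_args = parser.parse_args()

# The command-line value overrides whatever the config carried.
config.profiler_options = cli_args.profiler_options
print(config)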
