
Commit 1b4821b

Add training benchmark for gpt / gpt-3. (#1091)
1 parent baec4d7 commit 1b4821b

11 files changed: +488 −32 lines

examples/language_model/gpt-3/dygraph/dataset.py

Lines changed: 66 additions & 11 deletions
@@ -74,15 +74,30 @@ def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,
                 separate_last_epoch = (
                     last_epoch_num_samples < int(0.80 * num_samples_per_epoch))
             # Note. len(doc_idx) = num_epochs * len(doc)
+            start_time = time.time()
             doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
                                      separate_last_epoch)
             np.save(doc_idx_filename, doc_idx, allow_pickle=True)
-
+            print(' > elasped time to build and save doc-idx mapping '
+                  '(seconds): {:4f}'.format(time.time() - start_time))
             # sample-idx. pos of each seq_len of data.
+            start_time = time.time()
             assert doc_idx.dtype == np.int32
-            sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
-                                           num_epochs, tokens_per_epoch)
+            assert sizes.dtype == np.int32
+
+            import data_tools.helpers as helpers
+
+            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
+                                                  num_epochs, tokens_per_epoch)
+            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
+            #                                num_epochs, tokens_per_epoch)
+
             np.save(sample_idx_filename, sample_idx, allow_pickle=True)
+            print(' > elasped time to build and save sample-idx mapping '
+                  '(seconds): {:4f}'.format(time.time() - start_time))
+
+            # shuffle-idx.
+            start_time = time.time()

             if separate_last_epoch:
                 num_samples_ = num_samples_from_epochs_minus_one
@@ -93,14 +108,25 @@ def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,
             shuffle_idx = _build_shuffle_idx(num_samples_,
                                              sample_idx.shape[0] - 1, np_rng)
             np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
+            print(' > elasped time to build and save shuffle-idx mapping'
+                  ' (seconds): {:4f}'.format(time.time() - start_time))
+
     else:
         while True:
             if (not os.path.isfile(doc_idx_filename)) or \
                (not os.path.isfile(sample_idx_filename)) or \
               (not os.path.isfile(shuffle_idx_filename)):
                time.sleep(3)
            else:
-               break
+               try:
+                   np.load(
+                       shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
+                   break
+               except Exception as e:
+                   print(
+                       "%s file is still writing or damaged, please wait a moment."
+                       % shuffle_idx_filename)
+                   time.sleep(3)

     # Restore random state
     np_rng.set_state(savedState)
@@ -241,19 +267,48 @@ def create_pretrained_dataset(args,
                               max_seq_len=1024,
                               places=None,
                               data_holders=None):
+    if local_rank == 0:
+        start_time = time.time()
+        print('> compiling dataset index builder ...')
+        from data_tools.dataset_utils import compile_helper
+        compile_helper()
+        print(
+            '>>> done with dataset index builder. Compilation time: {:.3f} '
+            'seconds'.format(time.time() - start_time),
+            flush=True)
+
     device_world_size = paddle.distributed.get_world_size()
     device_world_rank = paddle.distributed.get_rank()

     logger.info(
         "The distributed run, total device num:{}, distinct dataflow num:{}.".
         format(device_world_size, data_world_size))

-    process_datas = np.load(input_path, mmap_mode="r+", allow_pickle=True)
-    # All documment ids, extend as 1-D array.
-    sample_ids = process_datas["ids"]
-    # The len(sample_lens) num of docs
-    # The sum(sample_lens) should equal len(sample_ids)
-    sample_lens = process_datas["lens"]
+    assert len(input_path) == 1, "GPT only support one dataset for now."
+
+    input_prefix = input_path[0]
+
+    if os.path.isfile(input_prefix + "_ids.npz"):
+        logger.warning(
+            "You are using compatible dataset, please make new dataset as the readme!"
+        )
+        process_datas = np.load(
+            input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True)
+        sample_ids = process_datas["ids"]
+        sample_lens = process_datas["lens"].astype("int32")
+    else:
+        for suffix in ["_ids.npy", "_idx.npz"]:
+            if not os.path.isfile(input_prefix + suffix):
+                raise ValueError("File Not found, %s" % (path + suffix))
+
+        sample_ids = np.load(
+            input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True)
+        # All documment ids, extend as 1-D array.
+
+        process_datas = np.load(input_prefix + "_idx.npz")
+        # The len(sample_lens) num of docs
+        # The sum(sample_lens) should equal len(sample_ids)
+        sample_lens = process_datas["lens"]

     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[
@@ -262,7 +317,7 @@ def create_pretrained_dataset(args,

     def build_dataset(index, name, num_samples):
         dataset = GPTDataset(
-            file_path=input_path,
+            file_path=input_prefix,
             build_data_file=local_rank == 0,
             name="gpt_" + name,
             max_seq_len=max_seq_len,
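Note on the else branch above: worker ranks that do not build the index files poll until all three files exist, then confirm the shuffle-idx .npy is fully written by attempting a memory-mapped load before breaking out. A minimal standalone sketch of that wait-and-validate pattern (the filename below is a placeholder, not a path shipped with the example):

import os
import time

import numpy as np

shuffle_idx_filename = "gpt_train_shuffle_idx.npy"  # hypothetical path

while True:
    if not os.path.isfile(shuffle_idx_filename):
        time.sleep(3)
        continue
    try:
        # A successful mmap open means the writer has finished the file.
        np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r")
        break
    except Exception:
        print("%s file is still writing or damaged, please wait a moment." %
              shuffle_idx_filename)
        time.sleep(3)

The mmap_mode="r" open is cheap, so retrying every few seconds costs little while rank 0 finishes writing the mappings.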

examples/language_model/gpt-3/dygraph/run_pretrain.py

Lines changed: 31 additions & 7 deletions
@@ -17,6 +17,7 @@
 import os
 import random
 import time
+import sys

 import numpy as np
 import paddle
@@ -25,6 +26,10 @@
 from paddlenlp.transformers import GPTTokenizer, GPTChineseTokenizer
 from paddlenlp.utils.log import logger

+# to import data_tools
+filepath = os.path.abspath(os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(filepath, "../../"))
+
 from dataset import create_pretrained_dataset
 from args import parse_args
 import lr
@@ -87,6 +92,30 @@ def run_evaluate(args,
     model.train()


+def get_train_data_file(args):
+    files = [
+        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
+        if (os.path.isfile(os.path.join(args.input_dir, f)) and str(f).endswith(
+            "_idx.npz"))
+    ]
+    files = [x.replace("_idx.npz", "") for x in files]
+    if len(files) == 0:
+        logger.warning(
+            "Not found dataset with name of xxx_ids.npy and xxx_idx.npz! Try to found old compatible xxx_ids.npz file."
+        )
+    else:
+        return files
+
+    files = [
+        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
+        if (os.path.isfile(os.path.join(args.input_dir, f)) and str(f).endswith(
+            "_ids.npz"))
+    ]
+
+    files = [x.replace("_ids.npz", "") for x in files]
+    return files
+
+
 def do_train(args):
     paddle.set_device(args.device)
     strategy = fleet.DistributedStrategy()
@@ -240,18 +269,13 @@ def do_train(args):
     global_step = 0
     tic_train = time.time()
     for epoch in range(args.num_train_epochs):
-        files = [
-            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
-            if (os.path.isfile(os.path.join(args.input_dir, f)) and "npz_"
-                not in str(f))
-        ]
+        files = get_train_data_file(args)
         files.sort()
         num_files = len(files)
         for f_id in range(num_files):
             data_file = files[f_id]
             train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
-                args,
-                data_file,
+                args, [data_file],
                 local_rank=local_rank,
                 data_world_size=data_world_size,
                 data_world_rank=data_world_rank,
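For context, the new loader addresses each dataset by a filename prefix: a new-format dataset is the pair <prefix>_ids.npy plus <prefix>_idx.npz, while a legacy single-file <prefix>_ids.npz is only used as a compatibility fallback (with a warning). A rough sketch of the prefix discovery that get_train_data_file performs, under that assumption (the directory name is illustrative):

import os

input_dir = "./data"  # hypothetical --input_dir
# Collect every prefix that has a new-format index file next to it.
prefixes = [
    os.path.join(input_dir, f[:-len("_idx.npz")])
    for f in os.listdir(input_dir) if f.endswith("_idx.npz")
]
# Each prefix is then passed as a one-element list, matching the new call
# site: create_pretrained_dataset(args, [prefix], ...).
print(sorted(prefixes))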

examples/language_model/gpt-3/static/run_pretrain_static.py

Lines changed: 3 additions & 2 deletions
@@ -36,7 +36,8 @@
 from visualdl import LogWriter

 # Used to load the data_tools path, should import before dataset
-sys.path.insert(0, "../../")
+filepath = os.path.abspath(os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(filepath, "../../"))
 from dataset import create_pretrained_dataset
 from args import parse_args
 import lr
@@ -448,7 +449,7 @@ def do_train(args):
                    save_persistables(exe,
                                      os.path.join(output_dir, "static_vars"),
                                      main_program)
-                   if global_step == args.save_steps:
+                   if global_step <= args.save_steps:
                        model.init_config["init_args"][0].init_config.pop("topo",
                                                                          None)
                        model.save_pretrained(output_dir)

examples/language_model/gpt/args.py

Lines changed: 8 additions & 1 deletion
@@ -211,7 +211,7 @@ def parse_args(MODEL_CLASSES):
     parser.add_argument(
         "--scale_loss",
         type=float,
-        default=128,
+        default=32768,
         help="The value of scale_loss for fp16. This is only used for AMP training."
     )
     parser.add_argument(
@@ -245,6 +245,13 @@
         default="cosine",
         choices=["cosine", "none"],
         help="Learning rate decay style.")
+    parser.add_argument(
+        '-p',
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
     args = parser.parse_args()
     args.test_iters = args.eval_iters * 10

examples/language_model/gpt/run_pretrain.py

Lines changed: 24 additions & 7 deletions
@@ -29,6 +29,7 @@
 from dataset import create_pretrained_dataset
 from args import parse_args
 import lr
+from paddle.distributed import fleet

 MODEL_CLASSES = {
     "gpt": (GPTForPretraining, GPTTokenizer),
@@ -183,6 +184,9 @@ def do_train(args):
         grad_clip=clip,
         apply_decay_param_fun=lambda x: x in decay_params)

+    if args.use_amp:
+        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
+
     if args.model_name_or_path not in pretrained_models_list:
         logger.info("Try to load checkpoint from %s " % args.model_name_or_path)
         opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
@@ -218,9 +222,27 @@
             tokens, loss_mask, attention_mask, position_ids, labels = batch
             loss_mask.stop_gradient = True
             attention_mask.stop_gradient = True
+            with paddle.amp.auto_cast(
+                    args.use_amp,
+                    custom_white_list=["layer_norm", "softmax", "gelu"],
+                    custom_black_list=[
+                        "reduce_sum", "c_softmax_with_cross_entropy",
+                        "c_embedding"
+                    ]):
+
+                preds = model(tokens, position_ids, attention_mask)
+                loss = criterion(preds, labels, loss_mask)
+
+            if args.use_amp:
+                scaler.scale(loss).backward()
+                scaler.minimize(optimizer, loss)
+            else:
+                loss.backward()
+                optimizer.step()

-            preds = model(tokens, position_ids, attention_mask)
-            loss = criterion(preds, labels, loss_mask)
+            if lr_scheduler is not None:
+                lr_scheduler.step()
+            optimizer.clear_grad()

             if global_step % args.logging_freq == 0:
                 speed = args.logging_freq / (time.time() - tic_train)
@@ -233,11 +255,6 @@
                                       optimizer.get_lr(), global_step)

                 tic_train = time.time()
-            loss.backward()
-            optimizer.step()
-            if lr_scheduler is not None:
-                lr_scheduler.step()
-            optimizer.clear_grad()

             if args.check_accuracy:
                 if global_step >= args.max_steps:
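The dygraph changes above follow Paddle's standard mixed-precision recipe: run the forward pass and loss under paddle.amp.auto_cast, scale the loss through a GradScaler before backward, and let scaler.minimize unscale and apply the update. A self-contained sketch of one such step on a dummy model (a toy linear layer stands in for GPTForPretraining and its criterion):

import paddle

model = paddle.nn.Linear(16, 16)
optimizer = paddle.optimizer.AdamW(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)  # mirrors --scale_loss

x = paddle.randn([4, 16])
with paddle.amp.auto_cast(
        True,
        custom_white_list=["layer_norm", "softmax", "gelu"],
        custom_black_list=["reduce_sum"]):
    loss = paddle.nn.functional.mse_loss(model(x), x)

scaler.scale(loss).backward()     # backward on the scaled loss
scaler.minimize(optimizer, loss)  # unscale, step, and adjust the loss scale
optimizer.clear_grad()

When --use_amp is off, the script falls back to the plain loss.backward() / optimizer.step() path shown in the diff.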

examples/language_model/gpt/run_pretrain_static.py

Lines changed: 6 additions & 3 deletions
@@ -31,6 +31,7 @@
 from paddlenlp.transformers import GPTTokenizer, GPTChineseTokenizer
 from paddlenlp.ops import Topology, get_rng_state_tracker
 from paddlenlp.utils.log import logger
+from paddlenlp.utils import profiler
 import paddlenlp.ops as ops
 from visualdl import LogWriter

@@ -92,7 +93,7 @@ def dist_optimizer(args, topo):
                 'gelu',
             ],
             "custom_black_list": ['c_softmax_with_cross_entropy'],
-            "init_loss_scaling": 32768,
+            "init_loss_scaling": args.scale_loss,
             "use_dynamic_loss_scaling": True,
         }
     if args.use_sharding:
@@ -173,7 +174,7 @@ def run_evaluate(data_loader,
             break
     average_loss = sum(all_loss) / len(all_loss)
     logger.info(
-        "%s step %d, epoch: %d, batch: %d, loss: %f, speed: %.0f tokens/s"
+        "%s step %d, epoch: %d, batch: %d, loss: %f, eval_ips: %.0f tokens/s"
        % (task_name, global_step, epoch, eval_step, average_loss,
           iter_steps * args.micro_batch_size * args.max_seq_len /
           (time.time() - local_time)))
@@ -407,6 +408,7 @@ def do_train(args):
                          use_program_cache=True)
            # In the new 2.0 api, must call this function to change the learning_rate
            lr_scheduler.step()
+           profiler.add_profiler_step(args.profiler_options)

            if global_step % args.logging_freq == 0:
                if topo.is_last:
@@ -446,7 +448,8 @@
                    save_persistables(exe,
                                      os.path.join(output_dir, "static_vars"),
                                      main_program)
-                   if global_step == args.save_steps:
+
+                   if global_step <= args.save_steps:
                        model.init_config["init_args"][0].init_config.pop("topo",
                                                                          None)
                        model.save_pretrained(output_dir)

examples/language_model/gpt/scripts/run.sh

Lines changed: 3 additions & 1 deletion
@@ -18,4 +18,6 @@ python -u run_pretrain.py \
    --grad_clip 1.0\
    --logging_freq 1\
    --eval_freq 1000\
-   --device "gpu"
+   --device "gpu" \
+
+   # --use_amp true
