
Commit 83743cc

[BUGFIX] fix gpt benchmark. (#1787)

Authored by ZHUI and ZeyuChen
Co-authored-by: Zeyu Chen <[email protected]>
1 parent 36a8f50, commit 83743cc

File tree: 5 files changed, +16 -10 lines

examples/language_model/gpt-3/dygraph/run_pretrain.py

Lines changed: 3 additions & 1 deletion
@@ -122,6 +122,7 @@ def get_train_data_file(args):
 
 def do_train(args):
     paddle.set_device(args.device)
+    nranks = paddle.distributed.get_world_size()
     strategy = fleet.DistributedStrategy()
     strategy.hybrid_configs = {
         "dp_degree": args.dp_degree,
@@ -393,9 +394,10 @@ def do_train(args):
                 avg_reader_cost = train_reader_cost / args.logging_freq
 
                 logger.info(
-                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
+                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, ips_per_card: %.0f tokens/s, learning rate: %.5e"
                     % (global_step, epoch, step, avg_loss, avg_reader_cost,
                        1. / speed, speed, speed * default_global_tokens_num,
+                       speed * default_global_tokens_num / nranks,
                        optimizer.get_lr()))
                 log_writer.add_scalar("loss", float(loss), global_step)
                 log_writer.add_scalar("learning_rate",

examples/language_model/gpt-3/static/run_pretrain_static.py

Lines changed: 3 additions & 2 deletions
@@ -435,11 +435,12 @@ def do_train(args):
                                        train_reader_cost + train_run_cost)
                 avg_reader_cost = train_reader_cost / args.logging_freq
                 logger.info(
-                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e"
+                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f steps/s, ips: %.0f tokens/s, ips_per_card: %.0f tokens/s, learning rate: %.5e"
                     % (global_step, epoch, step, loss_return[0],
                        avg_reader_cost, 1. / speed, speed,
                        speed * args.global_batch_size * args.max_seq_len,
-                       lr_return[0]))
+                       speed * args.global_batch_size * args.max_seq_len /
+                       worker_num, lr_return[0]))
                 log_writer.add_scalar("loss", loss_return[0], global_step)
                 log_writer.add_scalar("learning_rate", lr_return[0],
                                       global_step)
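The static-graph scripts have no precomputed default_global_tokens_num, so the same per-card figure is built inline as speed * args.global_batch_size * args.max_seq_len / worker_num, where worker_num presumably holds the trainer count of the distributed job.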

examples/language_model/gpt/run_pretrain.py

Lines changed: 6 additions & 4 deletions
@@ -263,10 +263,12 @@ def do_train(args):
                                        train_reader_cost + train_run_cost)
                 avg_reader_cost = train_reader_cost / args.logging_freq
                 logger.info(
-                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
-                    % (global_step, epoch, step, loss_numpy,
-                       avg_reader_cost, 1. / speed, speed, speed *
-                       default_global_tokens_num, optimizer.get_lr()))
+                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, ips_per_card: %.0f tokens/s, learning rate: %.5e"
+                    %
+                    (global_step, epoch, step, loss_numpy, avg_reader_cost,
+                     1. / speed, speed, speed * default_global_tokens_num,
+                     speed * default_global_tokens_num / worker_num,
+                     optimizer.get_lr()))
                 log_writer.add_scalar("loss", loss_numpy, global_step)
                 log_writer.add_scalar("learning_rate",
                                       optimizer.get_lr(), global_step)

examples/language_model/gpt/run_pretrain_static.py

Lines changed: 3 additions & 2 deletions
@@ -430,11 +430,12 @@ def do_train(args):
                 avg_reader_cost = train_reader_cost / args.logging_freq
 
                 logger.info(
-                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e"
+                    "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f steps/s, ips: %.0f tokens/s, ips_per_card: %.0f tokens/s, learning rate: %.5e"
                     % (global_step, epoch, step, loss_return[0],
                        avg_reader_cost, 1. / speed, speed,
                        speed * args.global_batch_size * args.max_seq_len,
-                       lr_return[0]))
+                       speed * args.global_batch_size * args.max_seq_len /
+                       worker_num, lr_return[0]))
                 log_writer.add_scalar("loss", loss_return[0], global_step)
                 log_writer.add_scalar("learning_rate", lr_return[0],
                                       global_step)

tests/benchmark/run_benchmark.sh

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ function _set_params(){
 
     log_with_profiler=$log_file
     profiler_path=$log_profile
-    keyword="ips:"
+    keyword="ips_per_card:"
    keyword_loss="loss:"
    skip_steps=20
    model_mode=-1
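The harness extracts its throughput metric by scanning the training log for this keyword, so it must match the field the patched scripts now emit; pointing it at ips_per_card: also makes the benchmark number comparable across runs with different card counts. A rough sketch of this style of keyword extraction, using a hypothetical log line and a regex of my own rather than the harness's actual parsing logic:

import re

# Hypothetical log line in the format emitted by the patched scripts.
line = ("global step 40, epoch: 0, batch: 39, loss: 7.123456789, "
        "avg_reader_cost: 0.00100 sec, avg_batch_cost: 0.40000 sec, "
        "speed: 2.50 step/s, ips: 1310720 tokens/s, "
        "ips_per_card: 163840 tokens/s, learning rate: 1.00000e-04")

keyword = "ips_per_card:"
match = re.search(re.escape(keyword) + r"\s*([\d.]+)", line)
if match:
    print(float(match.group(1)))  # -> 163840.0 tokens/s per card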
