Commit 8fecd6b

Authored by gongenlei
[BUGFIX] Fix hang when training and evaluating in multi-GPU mode (#1681)
* fix: fix hang in multi-gpus
* rm del dataloader
1 parent: eb5cf37 · commit: 8fecd6b
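
The underlying failure mode is the classic one in data-parallel training: every process must reach every collective operation (the gradient all-reduce issued by `paddle.DataParallel`, any `paddle.distributed` call made during evaluation), so a code path taken only by rank 0 must not contain or starve a collective, otherwise the other ranks block forever. Below is a minimal sketch of the safe ordering; it is illustrative and not taken from the commit, and the script name and tensor are invented:

```python
# Toy demonstration of rank-guarded work in multi-GPU Paddle (illustrative,
# not from this commit). Launch with:
#   python -m paddle.distributed.launch --gpus "0,1" demo.py
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
x = paddle.ones([1])

# Hangs if guarded by rank: were `all_reduce` placed under
# `if dist.get_rank() == 0:`, ranks 1..n-1 would never enter the
# collective and rank 0 would wait forever.
dist.all_reduce(x)  # every rank participates

# Per-rank side effects (logging, checkpointing) are safe behind the
# guard because no other process has to cooperate.
if dist.get_rank() == 0:
    print("sum across ranks:", float(x))
```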

1 file changed (+11, -11 lines)


examples/language_model/ernie-m/run_classifier.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -345,17 +345,17 @@ def do_train(args):
                     evaluate(model, loss_fct, metric, test_data_loader,
                              language)
                     print("eval done total : %s s" % (time.time() - tic_eval))
-            if paddle.distributed.get_rank() == 0:
-                output_dir = os.path.join(
-                    args.output_dir,
-                    "ernie_m_ft_model_%d.pdparams" % (global_step))
-                if not os.path.exists(output_dir):
-                    os.makedirs(output_dir)
-                # Need better way to get inner model of DataParallel
-                model_to_save = model._layers if isinstance(
-                    model, paddle.DataParallel) else model
-                model_to_save.save_pretrained(output_dir)
-                tokenizer.save_pretrained(output_dir)
+                if paddle.distributed.get_rank() == 0:
+                    output_dir = os.path.join(args.output_dir,
+                                              "ernie_m_ft_model_%d.pdparams" %
+                                              (global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    # Need better way to get inner model of DataParallel
+                    model_to_save = model._layers if isinstance(
+                        model, paddle.DataParallel) else model
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
             if global_step >= num_training_steps:
                 break
         if global_step >= num_training_steps:
```
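
The `+` side of the hunk doubles as a reusable checkpointing pattern: only rank 0 touches the filesystem, and the inner network is unwrapped from `paddle.DataParallel` before saving. A condensed sketch of that pattern follows; the helper name and the `exist_ok=True` shortcut are mine, not the script's:

```python
import os

import paddle


def save_checkpoint_rank0(model, tokenizer, output_dir):
    """Write model and tokenizer from rank 0 only; no-op elsewhere.

    Illustrative helper mirroring run_classifier.py, not part of the commit.
    """
    if paddle.distributed.get_rank() != 0:
        return
    os.makedirs(output_dir, exist_ok=True)  # replaces the exists() check
    # As the script's own comment notes, the inner model of a
    # paddle.DataParallel wrapper is reached through `_layers`.
    inner = model._layers if isinstance(model, paddle.DataParallel) else model
    inner.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
```

Unlike a collective, this block is safe to gate on rank: nothing inside it requires the other processes to participate.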
