From 06918131367e2a75a63b80ef16c936ece4afdc6d Mon Sep 17 00:00:00 2001 From: Zhicheng Chen Date: Mon, 30 Nov 2020 01:36:24 +0800 Subject: [PATCH] fix last ckpt failure --- pycls/core/checkpoint.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pycls/core/checkpoint.py b/pycls/core/checkpoint.py index 49ae5a5..49ec5d6 100644 --- a/pycls/core/checkpoint.py +++ b/pycls/core/checkpoint.py @@ -59,6 +59,7 @@ def save_checkpoint(model, optimizer, epoch, best): """Saves a checkpoint.""" # Save checkpoints only from the master process if not dist.is_master_proc(): + torch.distributed.barrier() return # Ensure that the checkpoint dir exists os.makedirs(get_checkpoint_dir(), exist_ok=True) @@ -75,6 +76,7 @@ def save_checkpoint(model, optimizer, epoch, best): # If best copy checkpoint to the best checkpoint if best: copyfile(checkpoint_file, get_checkpoint_best()) + torch.distributed.barrier() return checkpoint_file