Skip to content

Commit d17d3c8

Browse files
author
Jeffrey
committed
fix sleep behavior only for last iteration, also address the parent dir issue
1 parent 27c5f3a commit d17d3c8

File tree

2 files changed

+4
-3
lines changed

2 files changed

+4
-3
lines changed

open_lm/file_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,10 @@ def remote_sync(local_dir, remote_dir, protocol):
7272

7373
def remote_sync_with_expon_backoff(sync_every, local_dir, remote_dir, protocol, max_retries=6):
7474
for i in range(max_retries):
75+
time.sleep(sync_every * 2**i)
7576
success = remote_sync(local_dir, remote_dir, protocol)
7677
if success:
7778
return True
78-
time.sleep(sync_every * 2**i)
7979
return False
8080

8181

open_lm/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def main(args):
381381
args.tensorboard = "tensorboard" in args.report_to or "all" in args.report_to
382382
args.checkpoint_path = os.path.join(log_base_path, "checkpoints")
383383
args.failed_checkpoint_path = os.path.join(log_base_path, "checkpoints_failed")
384-
if is_master(args):
384+
if is_master(args, local=args.log_local):
385385
args.tensorboard_path = os.path.join(log_base_path, "tensorboard") if args.tensorboard else ""
386386
for dirname in [args.tensorboard_path, args.checkpoint_path, args.failed_checkpoint_path]:
387387
if dirname:
@@ -932,8 +932,9 @@ def main(args):
932932
if remote_sync_process is not None:
933933
logging.info("Final remote sync.")
934934
terminate_sync_process(remote_sync_process)
935+
# Can just pass in sync_every=0 for the last sync; otherwise it will unnecessarily sleep.
935936
result = remote_sync_with_expon_backoff(
936-
args.remote_sync_frequency,
937+
0,
937938
os.path.join(args.logs, args.name),
938939
os.path.join(args.remote_sync, args.name),
939940
args.remote_sync_protocol,

0 commit comments

Comments
 (0)