Skip to content

Commit f2be2f7

Browse files
committed
fix: ckpt
1 parent fdb1acd commit f2be2f7

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

open_diloco/train_fsdp.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ def check_checkpoint_path_access(checkpoint_path: str, rank: int, world_rank_hv:
77 77
)
78 78
else:
79 79
dummy_file_path = os.path.join(checkpoint_path, f"dummy_file_{rank}.txt")
80 +
80 81
with fsspec.open(dummy_file_path, "w") as f:
81 82
f.write("This is a dummy file for testing access.")
82 83
gfs = GenericFileSystem()
@@ -470,11 +471,14 @@ def scheduler_fn(opt):
470 471
log(f"saving at step {real_step}, step {step+1}")
471 472
ckpt_path = os.path.join(config.checkpoint_path, f"model_step_{int(real_step)}")
472 473

474 +
if config.hv:
475 +
ckpt_path = os.path.join(ckpt_path, get_diloco_rank_dir_name(config.hv.world_rank))
476 +
473 477
if world_messenger_hv:
474 478
assert isinstance(optimizer, DiLoCoOptimizer)
475 479
with optimizer.tracker.pause_updates():
476 480
save_checkpoint(
477 -
checkpoint_path=os.path.join(ckpt_path, get_diloco_rank_dir_name(config.hv.world_rank)),
481 +
checkpoint_path=ckpt_path,
478 482
model=model,
479 483
optimizer=optimizer.inner_optimizer,
480 484
scheduler=scheduler,

0 commit comments

Comments
 (0)