
Commit dacd086

[Feature] Option not to save optimizer states to save disk space (#1333)

1 parent a685e5e

2 files changed: +15 −4 lines changed


slime/backends/fsdp_utils/checkpoint.py

Lines changed: 5 additions & 4 deletions
@@ -214,14 +214,15 @@ def save(actor: Any, iteration: int) -> None:
     state_dict = {"model_state": model_state}
     dcp.save(state_dict, checkpoint_id=str(model_dir))
 
-    # Save optimizer state
-    if hasattr(actor, "optimizer") and actor.optimizer is not None:
+    # Save optimizer state (skip if --no-save-optim is set)
+    save_optimizer_state = not getattr(actor.args, "no_save_optim", False)
+    if save_optimizer_state and hasattr(actor, "optimizer") and actor.optimizer is not None:
         optimizer_state = OptimizerState(actor.model, actor.optimizer)
         optim_state_dict = {"optim_state": optimizer_state}
         dcp.save(optim_state_dict, checkpoint_id=str(optimizer_dir))
 
-    # Save LR scheduler state
-    if hasattr(actor, "lr_scheduler") and actor.lr_scheduler is not None:
+    # Save LR scheduler state (skip if --no-save-optim is set)
+    if save_optimizer_state and hasattr(actor, "lr_scheduler") and actor.lr_scheduler is not None:
         lr_scheduler_state = LRSchedulerState(actor.lr_scheduler)
         lr_scheduler_state_dict = {"lr_scheduler_state": lr_scheduler_state}
         dcp.save(lr_scheduler_state_dict, checkpoint_id=str(lr_scheduler_dir))
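
For intuition, here is a minimal, self-contained sketch of the new gate that runs without a distributed setup. DummyActor and fake_dcp_save are hypothetical stand-ins for the real actor and for dcp.save (torch.distributed.checkpoint), which requires an initialized process group; only the gating logic mirrors the patch above.

from argparse import Namespace

saved = []

def fake_dcp_save(state_dict, checkpoint_id):
    # Hypothetical stand-in for dcp.save(...): records the checkpoint id
    # instead of writing a distributed checkpoint.
    saved.append(checkpoint_id)

class DummyActor:
    # Hypothetical stand-in for the FSDP actor; only the attributes the
    # gate inspects are modeled.
    def __init__(self, no_save_optim):
        self.args = Namespace(no_save_optim=no_save_optim)
        self.optimizer = object()
        self.lr_scheduler = object()

def save_optim_parts(actor):
    # Same gating logic as the patched save() above.
    save_optimizer_state = not getattr(actor.args, "no_save_optim", False)
    if save_optimizer_state and hasattr(actor, "optimizer") and actor.optimizer is not None:
        fake_dcp_save({"optim_state": "..."}, checkpoint_id="ckpt/optimizer")
    if save_optimizer_state and hasattr(actor, "lr_scheduler") and actor.lr_scheduler is not None:
        fake_dcp_save({"lr_scheduler_state": "..."}, checkpoint_id="ckpt/lr_scheduler")

save_optim_parts(DummyActor(no_save_optim=False))
assert saved == ["ckpt/optimizer", "ckpt/lr_scheduler"]  # default: both saved

saved.clear()
save_optim_parts(DummyActor(no_save_optim=True))
assert saved == []  # with --no-save-optim: both skipped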

slime/utils/arguments.py

Lines changed: 10 additions & 0 deletions
@@ -695,6 +695,16 @@ def add_algo_arguments(parser):
     reset_arg(parser, "--save", type=str, default=None)
     reset_arg(parser, "--save-interval", type=int, default=None)
     reset_arg(parser, "--async-save", action="store_true")
+    reset_arg(
+        parser,
+        "--no-save-optim",
+        action="store_true",
+        default=False,
+        help=(
+            "If set, do not save the optimizer state when saving checkpoints. "
+            "This reduces checkpoint size but disables training resumption from the saved checkpoint."
+        ),
+    )
     parser.add_argument(
         "--save-hf",
         type=str,
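
Because the flag is registered with action="store_true" and default=False, a run that never passes --no-save-optim keeps the previous behavior. A minimal parsing sketch using plain argparse (reset_arg is slime's own helper; the sketch assumes it behaves like add_argument for a newly introduced flag):

import argparse

# Minimal sketch: plain argparse stands in for slime's reset_arg helper.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--no-save-optim",
    action="store_true",
    default=False,
    help="If set, do not save optimizer state when saving checkpoints.",
)

# argparse converts dashes to underscores: --no-save-optim -> no_save_optim,
# which is the attribute checkpoint.py reads via getattr(actor.args, ...).
args = parser.parse_args(["--no-save-optim"])
assert args.no_save_optim is True   # optimizer/scheduler saves skipped

args = parser.parse_args([])
assert args.no_save_optim is False  # default: full checkpoint is written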
