2 files changed: +15 -4 lines changed
@@ -214,14 +214,15 @@ def save(actor: Any, iteration: int) -> None:
214214 state_dict = {"model_state" : model_state }
215215 dcp .save (state_dict , checkpoint_id = str (model_dir ))
216216
217- # Save optimizer state
218- if hasattr (actor , "optimizer" ) and actor .optimizer is not None :
217+ # Save optimizer state (skip if --no-save-optim is set)
218+ save_optimizer_state = not getattr (actor .args , "no_save_optim" , False )
219+ if save_optimizer_state and hasattr (actor , "optimizer" ) and actor .optimizer is not None :
219220 optimizer_state = OptimizerState (actor .model , actor .optimizer )
220221 optim_state_dict = {"optim_state" : optimizer_state }
221222 dcp .save (optim_state_dict , checkpoint_id = str (optimizer_dir ))
222223
223- # Save LR scheduler state
224- if hasattr (actor , "lr_scheduler" ) and actor .lr_scheduler is not None :
224+ # Save LR scheduler state (skip if --no-save-optim is set)
225+ if save_optimizer_state and hasattr (actor , "lr_scheduler" ) and actor .lr_scheduler is not None :
225226 lr_scheduler_state = LRSchedulerState (actor .lr_scheduler )
226227 lr_scheduler_state_dict = {"lr_scheduler_state" : lr_scheduler_state }
227228 dcp .save (lr_scheduler_state_dict , checkpoint_id = str (lr_scheduler_dir ))
@@ -695,6 +695,16 @@ def add_algo_arguments(parser):
695695 reset_arg (parser , "--save" , type = str , default = None )
696696 reset_arg (parser , "--save-interval" , type = int , default = None )
697697 reset_arg (parser , "--async-save" , action = "store_true" )
698+ reset_arg (
699+ parser ,
700+ "--no-save-optim" ,
701+ action = "store_true" ,
702+ default = False ,
703+ help = (
704+ "If set, do not save the optimizer state when saving checkpoints. "
705+ "This reduces checkpoint size but disables training resumption from the saved checkpoint."
706+ ),
707+ )
698708 parser .add_argument (
699709 "--save-hf" ,
700710 type = str ,
You can’t perform that action at this time.
0 commit comments