Skip to content

Commit a5f69e4

Browse files
authored
add checkpoint_done to last model (#8223)
1 parent ae7dc15 commit a5f69e4

File tree

1 file changed

+5
-0
lines changed

1 file changed

+5
-0
lines changed

paddlenlp/trainer/trainer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,6 +2092,11 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op
20922092
# recover unified_checkpoint_config for non-train stage
20932093
if not self.is_in_train:
20942094
self.args.unified_checkpoint_config = unified_checkpoint_config_backup
2095+
if strtobool(os.getenv("FLAG_LLM_PDC", "False")):
2096+
# save checkpoint_done file to ensure checkpoint is complete
2097+
if self.args.should_save_model_state and self.args.should_save:
2098+
# For ckpt integrity
2099+
paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done"))
20952100

20962101
def _save_checkpoint(self, model, metrics=None):
20972102
# assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"

0 commit comments

Comments
 (0)