TrainerConfig.save_checkpoint_upon_crash

emailweixu · emailweixu · commit f5d8bd15c2cb · 2025-10-09T13:17:37.000-07:00
diff --git a/alf/algorithms/config.py b/alf/algorithms/config.py
@@ -45,6 +45,7 @@ def __init__(self,
                  sync_progress_to_envs=False,
                  num_checkpoints=10,
                  confirm_checkpoint_upon_crash=True,
+                 save_checkpoint_upon_crash=False,
                  no_thread_env_for_conf=False,
                  evaluate=False,
                  num_evals=None,
@@ -207,6 +208,8 @@ def __init__(self,
             num_checkpoints (int): how many checkpoints to save for the training
             confirm_checkpoint_upon_crash (bool): whether to prompt for whether
                 do checkpointing after crash.
+            save_checkpoint_upon_crash (bool): whether to save a checkpoint
+                after a crash without prompting for confirmation.
             no_thread_env_for_conf (bool): not to create an unwrapped env for
                 the purpose of showing operative configurations. If True, no
                 ``ThreadEnvironment`` will ever be created, regardless of the
@@ -401,6 +404,7 @@ def __init__(self,
         self.sync_progress_to_envs = sync_progress_to_envs
         self.num_checkpoints = num_checkpoints
         self.confirm_checkpoint_upon_crash = confirm_checkpoint_upon_crash
+        self.save_checkpoint_upon_crash = save_checkpoint_upon_crash
         self.no_thread_env_for_conf = no_thread_env_for_conf
         self.evaluate = evaluate
         self.num_evals = num_evals
diff --git a/alf/trainers/policy_trainer.py b/alf/trainers/policy_trainer.py
@@ -377,8 +377,11 @@ def train(self):
             self._save_checkpoint()
             checkpoint_saved = True
         finally:
-            if (self._config.confirm_checkpoint_upon_crash
+            if (self._config.save_checkpoint_upon_crash
                     and not checkpoint_saved and self._rank <= 0):
+                self._save_checkpoint()
+            elif (self._config.confirm_checkpoint_upon_crash
+                  and not checkpoint_saved and self._rank <= 0):
                 # Prompts for checkpoint only when running single process
                 # training (rank is -1) or master process of DDP training (rank
                 # is 0).