
Commit 63ce3c7

make sure temp checkpoints are cleaned up on failed tuning
1 parent 3d56296 commit 63ce3c7

2 files changed: +52 −44 lines

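For context, the two functions touched by this commit sit behind the Tuner entry points. The sketch below is not part of the commit; it is a minimal usage example assuming the Lightning 2.x `Tuner` API, with `TinyModel` as a hypothetical stand-in module.

import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning.pytorch as pl
from lightning.pytorch.tuner import Tuner


class TinyModel(pl.LightningModule):
    """Hypothetical toy module with just enough state for both tuners to run."""

    def __init__(self, lr: float = 1e-3, batch_size: int = 8):
        super().__init__()
        self.lr = lr                  # read/updated by lr_find
        self.batch_size = batch_size  # read/updated by scale_batch_size
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.lr)

    def train_dataloader(self):
        ds = TensorDataset(torch.randn(256, 32), torch.randn(256, 1))
        return DataLoader(ds, batch_size=self.batch_size)


if __name__ == "__main__":
    trainer = pl.Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
    tuner = Tuner(trainer)
    # Both calls save a temporary checkpoint before searching; the changes in
    # this commit ensure it is restored and removed even if the search raises.
    tuner.scale_batch_size(TinyModel(), mode="power")
    tuner.lr_find(TinyModel())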

src/lightning/pytorch/tuner/batch_size_scaling.py

Lines changed: 16 additions & 13 deletions
@@ -76,24 +76,27 @@ def _scale_batch_size(
     if trainer.progress_bar_callback:
         trainer.progress_bar_callback.disable()

-    new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=init_val)
-
-    if mode == "power":
-        new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params)
-    elif mode == "binsearch":
-        new_size = _run_binary_scaling(trainer, new_size, batch_arg_name, max_trials, params)
+    try:
+        new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=init_val)

-    garbage_collection_cuda()
+        if mode == "power":
+            new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params)
+        elif mode == "binsearch":
+            new_size = _run_binary_scaling(trainer, new_size, batch_arg_name, max_trials, params)

-    log.info(f"Finished batch size finder, will continue with full run using batch size {new_size}")
+        garbage_collection_cuda()

-    __scale_batch_restore_params(trainer, params)
+        log.info(f"Finished batch size finder, will continue with full run using batch size {new_size}")
+    except Exception as e:
+        raise e
+    finally:
+        __scale_batch_restore_params(trainer, params)

-    if trainer.progress_bar_callback:
-        trainer.progress_bar_callback.enable()
+        if trainer.progress_bar_callback:
+            trainer.progress_bar_callback.enable()

-    trainer._checkpoint_connector.restore(ckpt_path)
-    trainer.strategy.remove_checkpoint(ckpt_path)
+        trainer._checkpoint_connector.restore(ckpt_path)
+        trainer.strategy.remove_checkpoint(ckpt_path)

     return new_size
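
The restore-and-remove calls now run in a finally block, so the temporary checkpoint is deleted and the saved parameters come back even when the scaling run raises partway through. A minimal, self-contained sketch of the same pattern, with hypothetical names and no Lightning dependency:

import json
import os
import tempfile


def tune_with_cleanup(state: dict, tune_step) -> dict:
    """Snapshot state to a temp checkpoint, run a step that may raise,
    and always restore the snapshot and delete the temp file."""
    fd, ckpt_path = tempfile.mkstemp(suffix=".ckpt")
    os.close(fd)
    with open(ckpt_path, "w") as f:
        json.dump(state, f)  # stand-in for trainer.save_checkpoint(ckpt_path)

    try:
        tune_step(state)  # may raise, e.g. while probing larger batch sizes
    finally:
        # Runs on success and on failure alike: restore in place, then clean up.
        with open(ckpt_path) as f:
            state.clear()
            state.update(json.load(f))  # stand-in for _checkpoint_connector.restore(...)
        os.remove(ckpt_path)  # stand-in for strategy.remove_checkpoint(ckpt_path)
    return state


if __name__ == "__main__":
    def failing_step(s):
        s["batch_size"] *= 2
        raise RuntimeError("simulated OOM")

    try:
        tune_with_cleanup({"batch_size": 32}, failing_step)
    except RuntimeError:
        pass  # the temp checkpoint is gone and the original state was restored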

src/lightning/pytorch/tuner/lr_finder.py

Lines changed: 36 additions & 31 deletions
@@ -257,40 +257,45 @@ def _lr_find(
     # Initialize lr finder object (stores results)
     lr_finder = _LRFinder(mode, min_lr, max_lr, num_training)

-    # Configure optimizer and scheduler
-    lr_finder._exchange_scheduler(trainer)
-
-    # Fit, lr & loss logged in callback
-    _try_loop_run(trainer, params)
-
-    # Prompt if we stopped early
-    if trainer.global_step != num_training + start_steps:
-        log.info(f"LR finder stopped early after {trainer.global_step} steps due to diverging loss.")
-
-    # Transfer results from callback to lr finder object
-    lr_finder.results.update({"lr": trainer.callbacks[0].lrs, "loss": trainer.callbacks[0].losses})
-    lr_finder._total_batch_idx = trainer.fit_loop.total_batch_idx  # for debug purpose
-
-    __lr_finder_restore_params(trainer, params)
-
-    if trainer.progress_bar_callback:
-        trainer.progress_bar_callback.enable()
-
-    # Update results across ranks
-    lr_finder.results = trainer.strategy.broadcast(lr_finder.results)
-
-    # Restore initial state of model (this will also restore the original optimizer state)
-    trainer._checkpoint_connector.restore(ckpt_path)
-    trainer.strategy.remove_checkpoint(ckpt_path)
-    trainer.fit_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
-    trainer.fit_loop.epoch_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
-    trainer.fit_loop.epoch_loop.val_loop._combined_loader = None
-    trainer.fit_loop._combined_loader = None  # reset data fetcher to avoid issues with the next fit
-    trainer.fit_loop.setup_data()
+    lr_finder_finished = False
+    try:
+        # Configure optimizer and scheduler
+        lr_finder._exchange_scheduler(trainer)
+
+        # Fit, lr & loss logged in callback
+        _try_loop_run(trainer, params)
+
+        # Prompt if we stopped early
+        if trainer.global_step != num_training + start_steps:
+            log.info(f"LR finder stopped early after {trainer.global_step} steps due to diverging loss.")
+
+        # Transfer results from callback to lr finder object
+        lr_finder.results.update({"lr": trainer.callbacks[0].lrs, "loss": trainer.callbacks[0].losses})
+        lr_finder._total_batch_idx = trainer.fit_loop.total_batch_idx  # for debug purpose
+
+        __lr_finder_restore_params(trainer, params)
+
+        if trainer.progress_bar_callback:
+            trainer.progress_bar_callback.enable()
+
+        # Update results across ranks
+        lr_finder.results = trainer.strategy.broadcast(lr_finder.results)
+        lr_finder_finished = True
+    except Exception as e:
+        raise e
+    finally:
+        # Restore initial state of model (this will also restore the original optimizer state)
+        trainer._checkpoint_connector.restore(ckpt_path)
+        trainer.strategy.remove_checkpoint(ckpt_path)
+        trainer.fit_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
+        trainer.fit_loop.epoch_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
+        trainer.fit_loop.epoch_loop.val_loop._combined_loader = None
+        trainer.fit_loop._combined_loader = None  # reset data fetcher to avoid issues with the next fit
+        trainer.fit_loop.setup_data()

     # Apply LR suggestion after restoring so it persists for the real training run
     # When used as a callback, the suggestion would otherwise be lost due to checkpoint restore
-    if update_attr:
+    if update_attr and lr_finder_finished:
         lr = lr_finder.suggestion()
         if lr is not None:
             # update the attribute on the LightningModule (e.g., lr or learning_rate)
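
Here the restore/cleanup also moves into a finally block, and the new lr_finder_finished flag ensures the learning-rate suggestion is only written back to the model when the search actually completed. A short sketch of that flag pattern, with hypothetical names and no Lightning dependency (unlike the Lightning code above, the sketch swallows the error so the flag path is visible):

from typing import Callable, Optional


def find_lr_with_cleanup(model: dict, run_search: Callable[[dict], dict],
                         suggest: Callable[[dict], Optional[float]]) -> Optional[float]:
    """Restore the model whether or not the search succeeds, but only apply
    the suggested value when the search ran to completion."""
    snapshot = dict(model)  # stand-in for the temporary checkpoint
    finished = False
    results: dict = {}
    try:
        results = run_search(model)  # may raise partway through
        finished = True
    except Exception:
        # Swallowed here for illustration only; the Lightning code re-raises.
        pass
    finally:
        model.clear()
        model.update(snapshot)  # restore, success or failure

    if not finished:
        return None
    lr = suggest(results)
    if lr is not None:
        model["lr"] = lr  # persist the suggestion after the restore
    return lr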
