
Commit d5431d4

Merge branch 'master' into weights-only-compatibility
2 parents 0685799 + 6989e15

File tree

15 files changed: +666 −27 lines changed

docs/source-pytorch/common/early_stopping.rst

Lines changed: 33 additions & 1 deletion
@@ -1,6 +1,7 @@
 .. testsetup:: *
 
-    from lightning.pytorch.callbacks.early_stopping import EarlyStopping
+    from lightning.pytorch.callbacks.early_stopping import EarlyStopping, EarlyStoppingReason
+    from lightning.pytorch import Trainer, LightningModule
 
 
 .. _early_stopping:
@@ -71,6 +72,37 @@ Additional parameters that stop training at extreme points:
 - ``check_on_train_epoch_end``: When turned on, it checks the metric at the end of a training epoch. Use this only when you are monitoring any metric logged within
   training-specific hooks on epoch-level.
 
+After training completes, you can programmatically check why early stopping occurred using the ``stopping_reason``
+attribute, which returns an ``EarlyStoppingReason`` enum value.
+
+.. code-block:: python
+
+    from lightning.pytorch.callbacks import EarlyStopping
+    from lightning.pytorch.callbacks.early_stopping import EarlyStoppingReason
+
+    early_stopping = EarlyStopping(monitor="val_loss", patience=3)
+    trainer = Trainer(callbacks=[early_stopping])
+    trainer.fit(model)
+
+    # Check why training stopped
+    if early_stopping.stopping_reason == EarlyStoppingReason.PATIENCE_EXHAUSTED:
+        print("Training stopped due to patience exhaustion")
+    elif early_stopping.stopping_reason == EarlyStoppingReason.STOPPING_THRESHOLD:
+        print("Training stopped due to reaching the stopping threshold")
+    elif early_stopping.stopping_reason == EarlyStoppingReason.NOT_STOPPED:
+        print("Training completed normally without early stopping")
+
+    # Access the human-readable message
+    if early_stopping.stopping_reason_message:
+        print(f"Details: {early_stopping.stopping_reason_message}")
+
+The available stopping reasons are:
+
+- ``NOT_STOPPED``: Training completed normally without early stopping
+- ``STOPPING_THRESHOLD``: Training stopped because the monitored metric reached the stopping threshold
+- ``DIVERGENCE_THRESHOLD``: Training stopped because the monitored metric exceeded the divergence threshold
+- ``PATIENCE_EXHAUSTED``: Training stopped because the metric didn't improve for the specified patience
+- ``NON_FINITE_METRIC``: Training stopped because the monitored metric became NaN or infinite
 
 In case you need early stopping in a different part of training, subclass :class:`~lightning.pytorch.callbacks.early_stopping.EarlyStopping`
 and change where it is called:
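
The enum makes run outcomes machine-readable. As a quick illustration of how downstream code might consume it, here is a minimal sketch in which `summarize_early_stopping` is a hypothetical helper (not part of Lightning) that collapses the reason into a coarse status string, e.g. for tagging runs in a hyperparameter sweep:

from lightning.pytorch.callbacks.early_stopping import EarlyStopping, EarlyStoppingReason


def summarize_early_stopping(callback: EarlyStopping) -> str:
    """Hypothetical helper: collapse the stopping reason into a coarse run status."""
    reason = callback.stopping_reason
    if reason in (EarlyStoppingReason.NON_FINITE_METRIC, EarlyStoppingReason.DIVERGENCE_THRESHOLD):
        status = "failed"  # the metric blew up or went non-finite
    elif reason is EarlyStoppingReason.NOT_STOPPED:
        status = "ran-to-completion"  # early stopping never triggered
    else:
        status = "early-stopped"  # stopping threshold reached or patience exhausted
    detail = callback.stopping_reason_message or "no early-stopping message recorded"
    return f"{status}: {detail}"

Because both attributes are also persisted in the callback's state_dict (see the early_stopping.py diff below), the same check works after restoring from a checkpoint.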

src/lightning/pytorch/CHANGELOG.md

Lines changed: 17 additions & 2 deletions
@@ -19,12 +19,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added time-based validation support through `val_check_interval` ([#21071](https://github.com/Lightning-AI/pytorch-lightning/pull/21071))
 
 
+- Added attributes to access the stopping reason in the `EarlyStopping` callback ([#21188](https://github.com/Lightning-AI/pytorch-lightning/pull/21188))
+
+
+- Added support for variable batch size in `ThroughputMonitor` ([#20236](https://github.com/Lightning-AI/pytorch-lightning/pull/20236))
+
+
 ### Changed
 
 - Default to `weights_only=True` for `torch>=2.6` when loading checkpoints. ([#21072](https://github.com/Lightning-AI/pytorch-lightning/pull/21072))
 
 
--
+- Default to `RichProgressBar` and `RichModelSummary` if the `rich` package is available, and fall back to `TQDMProgressBar` and `ModelSummary` otherwise ([#20896](https://github.com/Lightning-AI/pytorch-lightning/pull/20896))
+
+
+- Prevented recursive symlink creation when `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))
 
 
 ### Removed
@@ -34,7 +43,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed an edge case when `max_trials` is reached in `Tuner.scale_batch_size` ([#21187](https://github.com/Lightning-AI/pytorch-lightning/pull/21187))
+
+
+- Fixed a case where `LightningCLI` could not be initialized with `trainer_default` containing callbacks ([#21192](https://github.com/Lightning-AI/pytorch-lightning/pull/21192))
+
+
+- Fixed missing reset when `ModelPruning` is applied with the lottery ticket hypothesis ([#21191](https://github.com/Lightning-AI/pytorch-lightning/pull/21191))
 
 
 ---

src/lightning/pytorch/callbacks/early_stopping.py

Lines changed: 32 additions & 0 deletions
@@ -20,6 +20,7 @@
 """
 
 import logging
+from enum import Enum
 from typing import Any, Callable, Optional
 
 import torch
@@ -34,6 +35,16 @@
 log = logging.getLogger(__name__)
 
 
+class EarlyStoppingReason(Enum):
+    """Enum for early stopping reasons."""
+
+    NOT_STOPPED = 0
+    STOPPING_THRESHOLD = 1
+    DIVERGENCE_THRESHOLD = 2
+    PATIENCE_EXHAUSTED = 3
+    NON_FINITE_METRIC = 4
+
+
 class EarlyStopping(Callback):
     r"""Monitor a metric and stop training when it stops improving.
 
@@ -65,6 +76,11 @@ class EarlyStopping(Callback):
             If this is ``False``, then the check runs at the end of the validation.
         log_rank_zero_only: When set ``True``, logs the status of the early stopping callback only for rank 0 process.
 
+    Attributes:
+        stopped_epoch: The epoch at which training was stopped. 0 if training was not stopped.
+        stopping_reason: An ``EarlyStoppingReason`` enum indicating why training was stopped.
+        stopping_reason_message: A human-readable message explaining why training was stopped.
+
     Raises:
         MisconfigurationException:
             If ``mode`` is none of ``"min"`` or ``"max"``.
@@ -75,8 +91,12 @@ class EarlyStopping(Callback):
 
         >>> from lightning.pytorch import Trainer
         >>> from lightning.pytorch.callbacks import EarlyStopping
+        >>> from lightning.pytorch.callbacks.early_stopping import EarlyStoppingReason
         >>> early_stopping = EarlyStopping('val_loss')
         >>> trainer = Trainer(callbacks=[early_stopping])
+        >>> # After training...
+        >>> if early_stopping.stopping_reason == EarlyStoppingReason.PATIENCE_EXHAUSTED:
+        ...     print("Training stopped due to patience exhaustion")
 
     .. tip:: Saving and restoring multiple early stopping callbacks at the same time is supported under variation in the
         following arguments:
@@ -117,6 +137,8 @@ def __init__(
         self.divergence_threshold = divergence_threshold
         self.wait_count = 0
         self.stopped_epoch = 0
+        self.stopping_reason = EarlyStoppingReason.NOT_STOPPED
+        self.stopping_reason_message: Optional[str] = None
         self._check_on_train_epoch_end = check_on_train_epoch_end
         self.log_rank_zero_only = log_rank_zero_only
 
@@ -169,6 +191,8 @@ def state_dict(self) -> dict[str, Any]:
             "stopped_epoch": self.stopped_epoch,
             "best_score": self.best_score,
             "patience": self.patience,
+            "stopping_reason": self.stopping_reason.value,
+            "stopping_reason_message": self.stopping_reason_message,
         }
 
     @override
@@ -177,6 +201,9 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         self.stopped_epoch = state_dict["stopped_epoch"]
        self.best_score = state_dict["best_score"]
         self.patience = state_dict["patience"]
+        stopping_reason_value = state_dict.get("stopping_reason", EarlyStoppingReason.NOT_STOPPED.value)
+        self.stopping_reason = EarlyStoppingReason(stopping_reason_value)
+        self.stopping_reason_message = state_dict.get("stopping_reason_message")
 
     def _should_skip_check(self, trainer: "pl.Trainer") -> bool:
         from lightning.pytorch.trainer.states import TrainerFn
@@ -212,6 +239,7 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None:
         trainer.should_stop = trainer.should_stop or should_stop
         if should_stop:
             self.stopped_epoch = trainer.current_epoch
+            self.stopping_reason_message = reason
         if reason and self.verbose:
             self._log_info(trainer, reason, self.log_rank_zero_only)
 
@@ -220,19 +248,22 @@ def _evaluate_stopping_criteria(self, current: Tensor) -> tuple[bool, Optional[str]]:
         reason = None
         if self.check_finite and not torch.isfinite(current):
             should_stop = True
+            self.stopping_reason = EarlyStoppingReason.NON_FINITE_METRIC
             reason = (
                 f"Monitored metric {self.monitor} = {current} is not finite."
                 f" Previous best value was {self.best_score:.3f}. Signaling Trainer to stop."
             )
         elif self.stopping_threshold is not None and self.monitor_op(current, self.stopping_threshold):
             should_stop = True
+            self.stopping_reason = EarlyStoppingReason.STOPPING_THRESHOLD
             reason = (
                 "Stopping threshold reached:"
                 f" {self.monitor} = {current} {self.order_dict[self.mode]} {self.stopping_threshold}."
                 " Signaling Trainer to stop."
             )
         elif self.divergence_threshold is not None and self.monitor_op(-current, -self.divergence_threshold):
             should_stop = True
+            self.stopping_reason = EarlyStoppingReason.DIVERGENCE_THRESHOLD
             reason = (
                 "Divergence threshold reached:"
                 f" {self.monitor} = {current} {self.order_dict[self.mode]} {self.divergence_threshold}."
@@ -247,6 +278,7 @@ def _evaluate_stopping_criteria(self, current: Tensor) -> tuple[bool, Optional[str]]:
             self.wait_count += 1
             if self.wait_count >= self.patience:
                 should_stop = True
+                self.stopping_reason = EarlyStoppingReason.PATIENCE_EXHAUSTED
                 reason = (
                     f"Monitored metric {self.monitor} did not improve in the last {self.wait_count} records."
                     f" Best score: {self.best_score:.3f}. Signaling Trainer to stop."

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 1 addition & 1 deletion
@@ -484,7 +484,7 @@ def _save_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
 
     @staticmethod
     def _link_checkpoint(trainer: "pl.Trainer", filepath: str, linkpath: str) -> None:
-        if trainer.is_global_zero:
+        if trainer.is_global_zero and os.path.abspath(filepath) != os.path.abspath(linkpath):
             if os.path.islink(linkpath) or os.path.isfile(linkpath):
                 os.remove(linkpath)
             elif os.path.isdir(linkpath):
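
The extra `os.path.abspath` comparison addresses the recursion reported with `save_last='link'` and `save_top_k=-1`: when the "last" link would point at the checkpoint path itself, the method now does nothing instead of removing the file and risking a self-referential symlink. A standalone, simplified sketch of the guarded behavior using plain os calls (not Lightning's internals, and omitting the directory branch):

import os
import tempfile

def link_last(filepath: str, linkpath: str) -> None:
    """Sketch of the guarded linking logic above (standalone, simplified)."""
    if os.path.abspath(filepath) == os.path.abspath(linkpath):
        return  # the new guard: nothing to do when source and destination are the same path
    if os.path.islink(linkpath) or os.path.isfile(linkpath):
        os.remove(linkpath)
    os.symlink(filepath, linkpath)

with tempfile.TemporaryDirectory() as tmp:
    ckpt = os.path.join(tmp, "last.ckpt")
    open(ckpt, "w").close()
    link_last(ckpt, ckpt)        # no-op instead of deleting the file and linking it to itself
    assert os.path.isfile(ckpt)  # the checkpoint is still there
    link_last(ckpt, os.path.join(tmp, "last-link.ckpt"))  # the normal case still creates a link
    assert os.path.islink(os.path.join(tmp, "last-link.ckpt"))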

src/lightning/pytorch/callbacks/pruning.py

Lines changed: 2 additions & 1 deletion
@@ -277,7 +277,8 @@ def make_pruning_permanent(self, module: nn.Module) -> None:
 
     @staticmethod
     def _copy_param(new: nn.Module, old: nn.Module, name: str) -> None:
-        dst = getattr(new, name)
+        # Check if the parameter has been pruned (has _orig suffix)
+        dst = getattr(new, name + "_orig") if hasattr(new, name + "_orig") else getattr(new, name)
         src = getattr(old, name)
         if dst is None or src is None or not isinstance(dst, Tensor) or not isinstance(src, Tensor):
             return
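
The `_orig` lookup reflects how `torch.nn.utils.prune` rewires a module: after pruning, the original tensor is kept as `<name>_orig`, a `<name>_mask` buffer is added, and `<name>` itself is no longer a plain Parameter. A short demonstration with stock PyTorch:

import torch
from torch import nn
from torch.nn.utils import prune

linear = nn.Linear(4, 2)
prune.l1_unstructured(linear, name="weight", amount=0.5)

params = dict(linear.named_parameters())
print("weight" in params)        # False: `weight` is now a recomputed, masked attribute
print("weight_orig" in params)   # True: the original, unpruned parameter lives here
print(linear.weight_mask.shape)  # the binary mask combined with weight_orig to produce `weight`

# This is presumably why the lottery-ticket reset above must write into `weight_orig` when it
# exists; a value copied into `weight` would be overwritten the next time the pruning hook runs.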

src/lightning/pytorch/callbacks/throughput_monitor.py

Lines changed: 14 additions & 5 deletions
@@ -87,6 +87,8 @@ def __init__(
         self._throughputs: dict[RunningStage, Throughput] = {}
         self._t0s: dict[RunningStage, float] = {}
         self._lengths: dict[RunningStage, int] = {}
+        self._samples: dict[RunningStage, int] = {}
+        self._batches: dict[RunningStage, int] = {}
 
     @override
     def setup(self, trainer: "Trainer", pl_module: "LightningModule", stage: str) -> None:
@@ -106,8 +108,13 @@ def setup(self, trainer: "Trainer", pl_module: "LightningModule", stage: str) -> None:
     def _start(self, trainer: "Trainer") -> None:
         stage = trainer.state.stage
         assert stage is not None
-        self._throughputs[stage].reset()
-        self._lengths[stage] = 0
+
+        if stage not in self._samples:
+            self._throughputs[stage].reset()
+            self._lengths[stage] = 0
+            self._samples[stage] = 0
+            self._batches[stage] = 0
+
         self._t0s[stage] = time.perf_counter()
 
     @torch.inference_mode()  # in case `length_fn` or `batch_size_fn` computes grads
@@ -133,12 +140,14 @@ def _update(self, trainer: "Trainer", pl_module: "LightningModule", batch: Any, iter_num: int) -> None:
             )
             flops_per_batch = None
 
-        batch_size = self.batch_size_fn(batch)
+        self._samples[stage] += self.batch_size_fn(batch)
+        self._batches[stage] += 1
+
         throughput.update(
             time=elapsed,
-            batches=iter_num,
+            batches=self._batches[stage],
             # this assumes that all iterations used the same batch size
-            samples=iter_num * batch_size,
+            samples=self._samples[stage],
             lengths=None if self.length_fn is None else self._lengths[stage],
             flops=flops_per_batch,  # type: ignore[arg-type]
         )
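
The change replaces the old `samples = iter_num * batch_size` estimate, which silently assumed a constant batch size, with running `_samples`/`_batches` counters. A tiny illustration of the difference, outside Lightning:

# Three iterations where the last batch is smaller (e.g. the tail of a dataset).
batch_sizes = [32, 32, 17]

samples_seen = 0
batches_seen = 0
for size in batch_sizes:
    batches_seen += 1
    samples_seen += size  # what `self._samples[stage] += self.batch_size_fn(batch)` accumulates

print(batches_seen, samples_seen)          # 3 81 -> correct totals
print(len(batch_sizes) * batch_sizes[-1])  # 3 * 17 = 51, the old `iter_num * batch_size` estimate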

src/lightning/pytorch/trainer/connectors/callback_connector.py

Lines changed: 1 addition & 1 deletion
@@ -240,7 +240,7 @@ def _reorder_callbacks(callbacks: list[Callback]) -> list[Callback]:
 
 
 def _validate_callbacks_list(callbacks: list[Callback]) -> None:
-    stateful_callbacks = [cb for cb in callbacks if is_overridden("state_dict", instance=cb)]
+    stateful_callbacks = [cb for cb in callbacks if is_overridden("state_dict", instance=cb, parent=Callback)]
     seen_callbacks = set()
     for callback in stateful_callbacks:
         if callback.state_key in seen_callbacks:
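
Passing `parent=Callback` pins the base class that `is_overridden` compares against instead of letting it be inferred from the instance. The underlying idea is a method-identity check; a generic sketch of that idea (not Lightning's actual `is_overridden` implementation):

from lightning.pytorch.callbacks import Callback

class StatelessCallback(Callback):
    pass

class StatefulCallback(Callback):
    def state_dict(self):
        return {"counter": 1}

def overrides_state_dict(cb: Callback) -> bool:
    # Compare the function the instance's class resolves against the one defined on Callback.
    return type(cb).state_dict is not Callback.state_dict

print(overrides_state_dict(StatelessCallback()))  # False -> not treated as stateful
print(overrides_state_dict(StatefulCallback()))   # True  -> must have a unique state_key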

src/lightning/pytorch/tuner/batch_size_scaling.py

Lines changed: 16 additions & 2 deletions
@@ -178,14 +178,22 @@ def _run_power_scaling(
     # this flag is used to determine whether the previously scaled batch size, right before OOM, was a success or not
     # if it was we exit, else we continue downscaling in case we haven't encountered a single optimal batch size
     any_success = False
-    for _ in range(max_trials):
+    last_successful_size = new_size
+    for i in range(max_trials):
         garbage_collection_cuda()
 
         # reset after each try
         _reset_progress(trainer)
 
         try:
             _try_loop_run(trainer, params)
+            last_successful_size = new_size  # Store the current size before doubling
+
+            # Check if this is the last trial before trying to double
+            if i + 1 >= max_trials:
+                new_size = last_successful_size
+                break
+
             new_size, changed = _adjust_batch_size(trainer, batch_arg_name, factor=2.0, desc="succeeded")
 
             if not changed:
@@ -224,6 +232,7 @@ def _run_binary_scaling(
     low = 1
     high = None
     count = 0
+    last_successful_size = new_size
     while True:
         garbage_collection_cuda()
 
@@ -233,9 +242,14 @@
         try:
             # run loop
             _try_loop_run(trainer, params)
+            last_successful_size = new_size  # Store the current size before doubling
             count += 1
-            if count > max_trials:
+
+            # Check if we've reached max_trials before trying to adjust batch size
+            if count >= max_trials:
+                new_size = last_successful_size
                 break
+
             # Double in size
             low = new_size
             if high:
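
With this fix, hitting `max_trials` in either scaling mode returns the last batch size that actually ran rather than the next, untried doubling. A hedged end-to-end sketch of how the tuner is driven; `TinyModel` below is a made-up minimal module, while `Tuner.scale_batch_size` and its `mode`/`max_trials` arguments come from Lightning:

import torch
from torch.utils.data import DataLoader, TensorDataset
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.tuner import Tuner


class TinyModel(LightningModule):
    def __init__(self, batch_size: int = 2):
        super().__init__()
        self.batch_size = batch_size  # the attribute the tuner rewrites between trials
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        dataset = TensorDataset(torch.randn(512, 32), torch.randint(0, 2, (512,)))
        return DataLoader(dataset, batch_size=self.batch_size)


trainer = Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
new_size = Tuner(trainer).scale_batch_size(TinyModel(), mode="power", max_trials=5)
print(new_size)  # the last size that actually completed a trial, not a blindly doubled value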
