"""
import logging
+from enum import Enum
from typing import Any, Callable, Optional

import torch

log = logging.getLogger(__name__)


+class EarlyStoppingReason(Enum):
+    """Enum for early stopping reasons."""
+
+    NOT_STOPPED = 0
+    STOPPING_THRESHOLD = 1
+    DIVERGENCE_THRESHOLD = 2
+    PATIENCE_EXHAUSTED = 3
+    NON_FINITE_METRIC = 4
+
+
class EarlyStopping(Callback):
    r"""Monitor a metric and stop training when it stops improving.
@@ -65,6 +76,11 @@ class EarlyStopping(Callback):
            If this is ``False``, then the check runs at the end of the validation.
        log_rank_zero_only: When set ``True``, logs the status of the early stopping callback only for rank 0 process.

+    Attributes:
+        stopped_epoch: The epoch at which training was stopped. 0 if training was not stopped.
+        stopping_reason: An ``EarlyStoppingReason`` enum indicating why training was stopped.
+        stopping_reason_message: A human-readable message explaining why training was stopped.
+
    Raises:
        MisconfigurationException:
            If ``mode`` is none of ``"min"`` or ``"max"``.
@@ -74,9 +90,12 @@ class EarlyStopping(Callback):
    Example::

        >>> from lightning.pytorch import Trainer
-        >>> from lightning.pytorch.callbacks import EarlyStopping
+        >>> from lightning.pytorch.callbacks import EarlyStopping, EarlyStoppingReason
        >>> early_stopping = EarlyStopping('val_loss')
        >>> trainer = Trainer(callbacks=[early_stopping])
+        >>> # After training...
+        >>> if early_stopping.stopping_reason == EarlyStoppingReason.PATIENCE_EXHAUSTED:
+        ...     print("Training stopped due to patience exhaustion")

    .. tip:: Saving and restoring multiple early stopping callbacks at the same time is supported under variation in the
        following arguments:
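A rough sketch of how the new attributes might be consumed after a full run (not part of the diff): `model`, the Trainer arguments, and the branching logic are illustrative placeholders, and it assumes `EarlyStoppingReason` is exported from `lightning.pytorch.callbacks` as the updated docstring example suggests.

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import EarlyStopping, EarlyStoppingReason

    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    trainer = Trainer(callbacks=[early_stopping], max_epochs=100)
    trainer.fit(model)  # `model` is a LightningModule defined elsewhere

    if early_stopping.stopping_reason is EarlyStoppingReason.NON_FINITE_METRIC:
        print("val_loss became NaN/Inf:", early_stopping.stopping_reason_message)
    elif early_stopping.stopping_reason is EarlyStoppingReason.PATIENCE_EXHAUSTED:
        print(f"No improvement for {early_stopping.patience} checks; stopped at epoch {early_stopping.stopped_epoch}")
    elif early_stopping.stopping_reason is EarlyStoppingReason.NOT_STOPPED:
        print("Training ran to completion without early stopping")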
@@ -117,6 +136,8 @@ def __init__(
        self.divergence_threshold = divergence_threshold
        self.wait_count = 0
        self.stopped_epoch = 0
+        self.stopping_reason = EarlyStoppingReason.NOT_STOPPED
+        self.stopping_reason_message = None
        self._check_on_train_epoch_end = check_on_train_epoch_end
        self.log_rank_zero_only = log_rank_zero_only

@@ -169,6 +190,8 @@ def state_dict(self) -> dict[str, Any]:
            "stopped_epoch": self.stopped_epoch,
            "best_score": self.best_score,
            "patience": self.patience,
+            "stopping_reason": self.stopping_reason,
+            "stopping_reason_message": self.stopping_reason_message,
        }

    @override
@@ -177,6 +200,9 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        self.stopped_epoch = state_dict["stopped_epoch"]
        self.best_score = state_dict["best_score"]
        self.patience = state_dict["patience"]
+        # For backward compatibility, set defaults if not present
+        self.stopping_reason = state_dict.get("stopping_reason", EarlyStoppingReason.NOT_STOPPED)
+        self.stopping_reason_message = state_dict.get("stopping_reason_message")

    def _should_skip_check(self, trainer: "pl.Trainer") -> bool:
        from lightning.pytorch.trainer.states import TrainerFn
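A minimal sketch of the backward-compatibility path (illustrative values, not part of the diff): a state dict produced before this change, which lacks the new keys, simply falls back to the defaults.

    from lightning.pytorch.callbacks import EarlyStopping, EarlyStoppingReason

    cb = EarlyStopping(monitor="val_loss", patience=3)
    # An old-format checkpoint entry without the new keys
    old_state = {"wait_count": 1, "stopped_epoch": 0, "best_score": 0.42, "patience": 3}
    cb.load_state_dict(old_state)
    assert cb.stopping_reason is EarlyStoppingReason.NOT_STOPPED
    assert cb.stopping_reason_message is None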
@@ -212,6 +238,8 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None:
        trainer.should_stop = trainer.should_stop or should_stop
        if should_stop:
            self.stopped_epoch = trainer.current_epoch
+            # Store the stopping reason message
+            self.stopping_reason_message = reason
        if reason and self.verbose:
            self._log_info(trainer, reason, self.log_rank_zero_only)

@@ -220,19 +248,22 @@ def _evaluate_stopping_criteria(self, current: Tensor) -> tuple[bool, Optional[str]]:
        reason = None
        if self.check_finite and not torch.isfinite(current):
            should_stop = True
+            self.stopping_reason = EarlyStoppingReason.NON_FINITE_METRIC
            reason = (
                f"Monitored metric {self.monitor} = {current} is not finite."
                f" Previous best value was {self.best_score:.3f}. Signaling Trainer to stop."
            )
        elif self.stopping_threshold is not None and self.monitor_op(current, self.stopping_threshold):
            should_stop = True
+            self.stopping_reason = EarlyStoppingReason.STOPPING_THRESHOLD
            reason = (
                "Stopping threshold reached:"
                f" {self.monitor} = {current} {self.order_dict[self.mode]} {self.stopping_threshold}."
                " Signaling Trainer to stop."
            )
        elif self.divergence_threshold is not None and self.monitor_op(-current, -self.divergence_threshold):
            should_stop = True
+            self.stopping_reason = EarlyStoppingReason.DIVERGENCE_THRESHOLD
            reason = (
                "Divergence threshold reached:"
                f" {self.monitor} = {current} {self.order_dict[self.mode]} {self.divergence_threshold}."
@@ -247,6 +278,7 @@ def _evaluate_stopping_criteria(self, current: Tensor) -> tuple[bool, Optional[str]]:
            self.wait_count += 1
            if self.wait_count >= self.patience:
                should_stop = True
+                self.stopping_reason = EarlyStoppingReason.PATIENCE_EXHAUSTED
                reason = (
                    f"Monitored metric {self.monitor} did not improve in the last {self.wait_count} records."
                    f" Best score: {self.best_score:.3f}. Signaling Trainer to stop."