Warn when early stop checker returns True (#994)

diego-urgell · facebook-github-bot · commit 1918819bea52 · 2025-05-02T01:56:42.000-07:00
Summary:
Pull Request resolved: #994

Reviewed By: JKSenthil

Differential Revision: D73955323

fbshipit-source-id: fb967c02cd684b9989cf16cd6f608518e03ad502
diff --git a/torchtnt/framework/callbacks/early_stopping.py b/torchtnt/framework/callbacks/early_stopping.py
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import logging
 from typing import Literal
 
 from torchtnt.framework.callback import Callback
@@ -14,6 +15,8 @@
 from torchtnt.utils.distributed import get_global_rank, sync_bool
 from torchtnt.utils.early_stop_checker import EarlyStopChecker
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 class EarlyStopping(Callback):
     """
@@ -102,4 +105,5 @@ def _maybe_stop(self, state: State, unit: AppStateMixin) -> None:
 
         should_stop = sync_bool(should_stop, coherence_mode="rank_zero")
         if should_stop:
+            logger.warning("Stopping training early due to early stopping criteria.")
             state.stop()
diff --git a/torchtnt/utils/early_stop_checker.py b/torchtnt/utils/early_stop_checker.py
@@ -189,15 +189,15 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
 
         # Check finite
         if self.check_finite and not torch.isfinite(val):
-            _log.debug(
+            _log.warning(
                 f"Metric is not finite: {val}."
                 f" Previous best value was {self._best_value}."
             )
             return True
 
         # Check if reached stopping threshold
         if stopping_threshold is not None and self._mode_func(val, stopping_threshold):
-            _log.debug(
+            _log.warning(
                 "Stopping threshold reached:"
                 f" {val} {self._mode_char} {stopping_threshold}."
             )
@@ -207,7 +207,7 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
         if divergence_threshold is not None and self._mode_func(
             -val, -divergence_threshold
         ):
-            _log.debug(
+            _log.warning(
                 "Divergence threshold reached:"
                 f" {val} {self._mode_char} {divergence_threshold}."
             )
@@ -222,6 +222,8 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
             message = self._improvement_message(val)
             self._best_value = val
             self._patience_count = 0
+            _log.debug(message)
+
         else:
             # Not improving
             self._patience_count += 1
@@ -241,7 +243,8 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
                     f" {self.patience - self._patience_count} checks of patience remaining."
                 )
 
-        _log.debug(message)
+            _log.warning(message)
+
         return should_stop
 
     @property