Add eval step/epoch early stopping to TNT callback (#920)

rehm-g · facebook-github-bot · commit f9838b8e351d · 2024-10-09T12:18:30.000-07:00
Summary: Pull Request resolved: #920. Add eval phase support (a `phase` argument plus eval step/epoch hooks), with tests, to the early stopping TNT callback. Reviewed By: JKSenthil Differential Revision: D64004301 fbshipit-source-id: 81c23bde273cca140647693960dfb82224c95bef
diff --git a/tests/framework/callbacks/test_early_stopping.py b/tests/framework/callbacks/test_early_stopping.py
@@ -10,11 +10,15 @@
 from typing import cast, Literal
 from unittest.mock import MagicMock, patch
 
-from torchtnt.framework._test_utils import Batch, get_dummy_train_state
+from torchtnt.framework._test_utils import (
+    Batch,
+    get_dummy_eval_state,
+    get_dummy_train_state,
+)
 
 from torchtnt.framework.callbacks.early_stopping import EarlyStopping
 from torchtnt.framework.state import State
-from torchtnt.framework.unit import TrainUnit
+from torchtnt.framework.unit import EvalUnit, TrainUnit
 
 from torchtnt.utils.early_stop_checker import EarlyStopChecker
 
@@ -131,6 +135,48 @@ def test_interval_freq(self, _maybe_stop: MagicMock) -> None:
         esc.on_train_step_end(state, unit)
         _maybe_stop.assert_called_once()
 
+    @patch("torchtnt.framework.callbacks.early_stopping.EarlyStopping._maybe_stop")
+    def test_phase(self, _maybe_stop: MagicMock) -> None:
+        early_stop_checker = EarlyStopChecker(
+            mode="min",
+            patience=2,
+            min_delta=0.0,
+        )
+        esc = EarlyStopping(
+            monitored_attr="eval_loss",
+            early_stop_checker=early_stop_checker,
+            interval="epoch",
+            interval_freq=2,
+            phase="eval",
+        )
+
+        state = get_dummy_eval_state()
+        unit = MyEvalLossUnit()
+
+        unit.eval_progress.increment_epoch()
+        esc.on_eval_epoch_end(state, unit)
+        _maybe_stop.assert_not_called()
+        unit.eval_progress.increment_epoch()
+        esc.on_eval_epoch_end(state, unit)
+        _maybe_stop.assert_called_once()
+
+        _maybe_stop.reset_mock()
+
+        esc = EarlyStopping(
+            monitored_attr="eval_loss",
+            early_stop_checker=early_stop_checker,
+            interval="step",
+            interval_freq=2,
+            phase="eval",
+        )
+
+        unit.eval_progress.increment_step()
+        esc.on_eval_step_end(state, unit)
+        _maybe_stop.assert_not_called()
+        unit.eval_progress.increment_step()
+        esc.on_eval_step_end(state, unit)
+        _maybe_stop.assert_called_once()
+
 
 class MyTrainLossUnit(TrainUnit[Batch]):
     def __init__(self) -> None:
@@ -139,3 +185,12 @@ def __init__(self) -> None:
 
     def train_step(self, state: State, data: Batch) -> None:
         return None
+
+
+class MyEvalLossUnit(EvalUnit[Batch]):
+    def __init__(self) -> None:
+        super().__init__()
+        self.eval_loss = 0.01
+
+    def eval_step(self, state: State, data: Batch) -> None:
+        return None
diff --git a/torchtnt/framework/callbacks/early_stopping.py b/torchtnt/framework/callbacks/early_stopping.py
@@ -10,7 +10,7 @@
 
 from torchtnt.framework.callback import Callback
 from torchtnt.framework.state import State
-from torchtnt.framework.unit import AppStateMixin, TTrainUnit
+from torchtnt.framework.unit import AppStateMixin, TEvalUnit, TTrainUnit
 from torchtnt.utils.distributed import get_global_rank, sync_bool
 from torchtnt.utils.early_stop_checker import EarlyStopChecker
 
@@ -23,6 +23,7 @@ class EarlyStopping(Callback):
         monitored_attr: The attribute to monitor on the unit. Must be a float or tensor attribute on the unit.
         early_stop_checker: a :class:`~torchtnt.utils.early_stop_checker.EarlyStopChecker` to use for checking whether to stop early.
         interval: The interval to check the monitored attribute. Must be one of "step" or "epoch".
+        phase: The phase in which to check the monitored attribute. Must be one of "train" or "eval".
 
     Note:
         If doing distributed training, this callback checks the metric value only on rank 0
@@ -33,29 +34,49 @@ def __init__(
         monitored_attr: str,
         early_stop_checker: EarlyStopChecker,
         interval: Literal["step", "epoch"] = "epoch",
+        phase: Literal["train", "eval"] = "train",
         interval_freq: int = 1,
     ) -> None:
         self._monitored_attr = monitored_attr
         self._esc = early_stop_checker
         self._interval = interval
         self._interval_freq = interval_freq
+        self._phase = phase
 
         self._rank: int = get_global_rank()
 
     def on_train_step_end(self, state: State, unit: TTrainUnit) -> None:
         if (
-            self._interval == "step"
+            self._phase == "train"
+            and self._interval == "step"
             and unit.train_progress.num_steps_completed % self._interval_freq == 0
         ):
             self._maybe_stop(state, unit)
 
     def on_train_epoch_end(self, state: State, unit: TTrainUnit) -> None:
         if (
-            self._interval == "epoch"
+            self._phase == "train"
+            and self._interval == "epoch"
             and unit.train_progress.num_epochs_completed % self._interval_freq == 0
         ):
             self._maybe_stop(state, unit)
 
+    def on_eval_step_end(self, state: State, unit: TEvalUnit) -> None:
+        if (
+            self._phase == "eval"
+            and self._interval == "step"
+            and unit.eval_progress.num_steps_completed % self._interval_freq == 0
+        ):
+            self._maybe_stop(state, unit)
+
+    def on_eval_epoch_end(self, state: State, unit: TEvalUnit) -> None:
+        if (
+            self._phase == "eval"
+            and self._interval == "epoch"
+            and unit.eval_progress.num_epochs_completed % self._interval_freq == 0
+        ):
+            self._maybe_stop(state, unit)
+
     def _maybe_stop(self, state: State, unit: AppStateMixin) -> None:
         """
         Checks whether to stop early based on the monitored attribute.