Early stopper inconsistent devices fix (#949)

Pavel Levin · facebook-github-bot · commit 6e6824cb0849 · 2025-01-08T09:48:30.000-08:00
Summary: Pull Request resolved: #949. `val` and `self._best_value` can be on inconsistent devices in multi-GPU training, which causes the early stopper checks to fail. Reviewed By: JKSenthil Differential Revision: D66160768 fbshipit-source-id: 7f80900343bbdb80b118052452156a4fd5b67b73
diff --git a/torchtnt/utils/early_stop_checker.py b/torchtnt/utils/early_stop_checker.py
@@ -14,6 +14,7 @@
 from typing_extensions import final, Literal
 
 _log: logging.Logger = logging.getLogger(__name__)
+_log.setLevel(logging.DEBUG)
 
 
 @final
@@ -179,11 +180,13 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
             divergence_threshold = divergence_threshold.to(val.device)
         improvement_threshold = self.min_delta
         if self._threshold_mode == "rel":
-            base_val = self._best_value if torch.isfinite(self._best_value) else 0.0
+            base_val = (
+                self._best_value.to(val.device)
+                if torch.isfinite(self._best_value)
+                else 0.0
+            )
             improvement_threshold = self.min_delta.to(val.device) * base_val
 
-        improvement_threshold = improvement_threshold.to(val.device)
-
         # Check finite
         if self.check_finite and not torch.isfinite(val):
             _log.debug(
@@ -212,7 +215,7 @@ def check(self, val: Union[torch.Tensor, float, int]) -> bool:
 
         # Check if improvement is happening
         if self._mode_func(
-            val - improvement_threshold, self._best_value.to(val.device)
+            val - improvement_threshold.to(val.device), self._best_value.to(val.device)
         ):
             # Still improving
             should_stop = False
@@ -259,9 +262,12 @@ def _improvement_message(self, val: torch.Tensor) -> str:
         """Formats a log message that informs the user about an improvement in the monitored score."""
         if torch.isfinite(self._best_value):
             improvement = (
-                torch.abs(self._best_value - val)
+                torch.abs(self._best_value.to(val.device) - val)
                 if self.threshold_mode == "abs"
-                else torch.abs((self._best_value - val) / (1.0 * self._best_value))
+                else torch.abs(
+                    (self._best_value.to(val.device) - val)
+                    / (1.0 * self._best_value.to(val.device))
+                )
             )
             msg = (
                 f"Metric improved by {self.threshold_mode} {improvement} >="