Commit c988b86

galrotem authored and facebook-github-bot committed
slow rank detector callback (#764)
Summary: Pull Request resolved: #764

Reviewed By: JKSenthil

Differential Revision: D55383095

fbshipit-source-id: fa42d0cf664c78016c9c95634788cca1d001a3cd
1 parent 244a3e9 commit c988b86

4 files changed, +258 −0 lines changed

docs/source/framework/callbacks.rst

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ We offer several pre-written callbacks which are ready to be used out of the box
     MemorySnapshot
     ModuleSummary
     PyTorchProfiler
+    SlowRankDetector
     SystemResourcesMonitor
     TensorBoardParameterMonitor
     TimeLimitInterrupter
Lines changed: 126 additions & 0 deletions

@@ -0,0 +1,126 @@

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import unittest
from unittest import mock
from unittest.mock import MagicMock

from torchtnt.framework.callbacks.slow_rank_detector import (
    _get_min_max_indices,
    SlowRankDetector,
)

from torchtnt.framework.state import State
from torchtnt.framework.unit import TrainUnit
from torchtnt.utils.distributed import get_global_rank, spawn_multi_process
from torchtnt.utils.loggers.logger import MetricLogger
from torchtnt.utils.progress import Progress
from torchtnt.utils.test_utils import skip_if_not_distributed, skip_if_not_gpu


class SlowRankDetectorTest(unittest.TestCase):

    @skip_if_not_distributed
    @skip_if_not_gpu
    def test_sync_times(self) -> None:
        spawn_multi_process(2, "nccl", self._test_sync_times)

    @staticmethod
    def _test_sync_times() -> None:
        tc = unittest.TestCase()
        rank = get_global_rank()
        logger = MagicMock(spec=MetricLogger)

        with mock.patch("time.perf_counter", return_value=rank + 1), tc.assertLogs(
            level="INFO"
        ) as log:
            slow_rank_detector = SlowRankDetector(logger=logger)
            slow_rank_detector._sync_times(1, 1)
        tc.assertEqual(
            log.output,
            [
                "INFO:torchtnt.framework.callbacks.slow_rank_detector:Time difference between fastest rank (0: 1.0 sec) and slowest rank (1: 2.0 sec) is 1.0 seconds after 1 epochs and 1 steps."
            ],
        )
        if rank == 0:
            logger.log.assert_called_once_with(
                "Difference between fastest/slowest rank (seconds)", 1.0, 1
            )
        else:
            logger.log.assert_not_called()

    def test_get_min_max_indices(self) -> None:
        min_index, max_index = _get_min_max_indices([5.0, 2.0, 3.5])
        self.assertEqual(min_index, 1)
        self.assertEqual(max_index, 0)

        min_index, max_index = _get_min_max_indices([1.0])
        self.assertEqual(min_index, 0)
        self.assertEqual(max_index, 0)

        min_index, max_index = _get_min_max_indices([2.0, 3.0, 2.0])
        self.assertEqual(min_index, 0)
        self.assertEqual(max_index, 1)

    def test_invalid_initialization_params(self) -> None:
        with self.assertRaisesRegex(
            ValueError,
            "At least one of check_every_n_steps or check_every_n_epochs must be specified.",
        ):
            SlowRankDetector(check_every_n_steps=None, check_every_n_epochs=None)

        with self.assertRaisesRegex(
            ValueError,
            "check_every_n_steps must be a positive integer. Value passed is 0",
        ):
            SlowRankDetector(check_every_n_steps=0)

        with self.assertRaisesRegex(
            ValueError,
            "check_every_n_epochs must be a positive integer. Value passed is 0",
        ):
            SlowRankDetector(check_every_n_epochs=0)

    def test_sync_times_frequency(self) -> None:
        slow_rank_detector = SlowRankDetector(
            check_every_n_steps=2, check_every_n_epochs=2
        )
        unit = MagicMock(spec=TrainUnit)
        unit.train_progress = Progress(num_epochs_completed=1, num_steps_completed=1)
        state = MagicMock(spec=State)
        with mock.patch.object(slow_rank_detector, "_sync_times") as sync_times_mock:
            # first step shouldn't trigger time sync
            slow_rank_detector.on_train_step_end(state, unit)
            sync_times_mock.assert_not_called()

            # second step should trigger time sync
            unit.train_progress.increment_step()
            slow_rank_detector.on_train_step_end(state, unit)
            sync_times_mock.assert_called_once()

            # third step shouldn't trigger time sync
            unit.train_progress.increment_step()
            sync_times_mock.reset_mock()
            slow_rank_detector.on_train_step_end(state, unit)
            sync_times_mock.assert_not_called()

            # first epoch shouldn't trigger time sync
            slow_rank_detector.on_train_epoch_end(state, unit)
            sync_times_mock.assert_not_called()

            # second epoch should trigger time sync
            unit.train_progress.increment_epoch()
            slow_rank_detector.on_train_epoch_end(state, unit)
            sync_times_mock.assert_called_once()

            # third epoch shouldn't trigger time sync
            unit.train_progress.increment_epoch()
            sync_times_mock.reset_mock()
            slow_rank_detector.on_train_epoch_end(state, unit)
            sync_times_mock.assert_not_called()
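
The distributed test above leans on a small trick worth spelling out: time.perf_counter is patched so each rank's reported time is its rank plus one, which makes the fastest/slowest gap in the asserted log line exactly 1.0 seconds. The following standalone sketch (not part of this commit) reproduces that arithmetic with a plain Python list standing in for the multi-process NCCL all-gather:

# Sketch only: why the asserted log message is deterministic. With
# time.perf_counter patched to return rank + 1, rank 0 reports 1.0 s and
# rank 1 reports 2.0 s, so the fastest/slowest gap is exactly 1.0 s.
import time
from unittest import mock

gathered_times = []
for rank in (0, 1):  # stand-in for the two spawned processes
    with mock.patch("time.perf_counter", return_value=rank + 1):
        gathered_times.append(time.perf_counter())

fastest, slowest = min(gathered_times), max(gathered_times)
assert (fastest, slowest) == (1.0, 2.0)
assert slowest - fastest == 1.0  # the "1.0 seconds" in the expected log output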

torchtnt/framework/callbacks/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -16,6 +16,7 @@
 from .memory_snapshot import MemorySnapshot
 from .module_summary import ModuleSummary
 from .pytorch_profiler import PyTorchProfiler
+from .slow_rank_detector import SlowRankDetector
 from .system_resources_monitor import SystemResourcesMonitor
 from .tensorboard_parameter_monitor import TensorBoardParameterMonitor
 from .time_limit_interrupter import TimeLimitInterrupter
@@ -35,6 +36,7 @@
     "MemorySnapshot",
     "ModuleSummary",
     "PyTorchProfiler",
+    "SlowRankDetector",
     "SystemResourcesMonitor",
     "TensorBoardParameterMonitor",
     "TimeLimitInterrupter",
torchtnt/framework/callbacks/slow_rank_detector.py

Lines changed: 129 additions & 0 deletions

@@ -0,0 +1,129 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging
import time
from typing import List, Optional, Tuple

import torch
from torch import distributed as dist
from torchtnt.framework.callback import Callback
from torchtnt.framework.state import State
from torchtnt.framework.unit import TTrainUnit
from torchtnt.utils.distributed import all_gather_tensors, get_global_rank
from torchtnt.utils.env import init_from_env
from torchtnt.utils.loggers.logger import MetricLogger

logger: logging.Logger = logging.getLogger(__name__)


class SlowRankDetector(Callback):
    """
    A callback which detects slow ranks every N steps/epochs by comparing the time on each process.
    This is useful to debug ranks which are lagging behind and are likely to cause an NCCL timeout.
    If a logger is passed, the difference between the fastest rank and the slowest rank is also reported.

    Args:
        check_every_n_steps: frequency of steps at which to check for slow ranks.
        check_every_n_epochs: frequency of epochs at which to check for slow ranks.
        pg: the process group to use for all_gather_tensors. If None, the default process group will be used.
        logger: an optional logger to log the time difference.
        device: the device that will be used to store the time as a tensor. If None, the device will be inferred from the environment.

    Note:
        It is recommended to use this callback after you detect a timeout, and to make sure this callback runs before
        the logic triggering the timeout (another callback, train_step, etc.).
    """

    def __init__(
        self,
        *,
        check_every_n_steps: Optional[int] = 100,
        check_every_n_epochs: Optional[int] = 1,
        pg: Optional[dist.ProcessGroup] = None,
        logger: Optional[MetricLogger] = None,
        device: Optional[torch.device] = None,
    ) -> None:
        if not (check_every_n_steps or check_every_n_epochs):
            raise ValueError(
                "At least one of check_every_n_steps or check_every_n_epochs must be specified."
            )

        if check_every_n_steps is not None and check_every_n_steps <= 0:
            raise ValueError(
                f"check_every_n_steps must be a positive integer. Value passed is {check_every_n_steps}"
            )

        if check_every_n_epochs is not None and check_every_n_epochs <= 0:
            raise ValueError(
                f"check_every_n_epochs must be a positive integer. Value passed is {check_every_n_epochs}"
            )

        self._check_every_n_steps = check_every_n_steps
        self._check_every_n_epochs = check_every_n_epochs
        self._pg = pg
        self._logger = logger
        self._device: torch.device = device or init_from_env()
        self._rank: int = get_global_rank()

    def on_train_step_end(self, state: State, unit: TTrainUnit) -> None:
        if (
            self._check_every_n_steps is not None
            and unit.train_progress.num_steps_completed % self._check_every_n_steps == 0
        ):
            self._sync_times(
                unit.train_progress.num_epochs_completed,
                unit.train_progress.num_steps_completed,
            )

    def on_train_epoch_end(self, state: State, unit: TTrainUnit) -> None:
        if (
            self._check_every_n_epochs is not None
            and unit.train_progress.num_epochs_completed % self._check_every_n_epochs
            == 0
        ):
            self._sync_times(
                unit.train_progress.num_epochs_completed,
                unit.train_progress.num_steps_completed,
            )

    def _sync_times(self, epochs: int, steps: int) -> None:
        curr_time = time.perf_counter()
        curr_time_tensor = torch.Tensor([curr_time]).to(self._device)
        timings_as_tensor_list = all_gather_tensors(curr_time_tensor, self._pg)
        timings_as_list: List[float] = [
            tensor.item() for tensor in timings_as_tensor_list
        ]
        fastest_rank, slowest_rank = _get_min_max_indices(timings_as_list)
        time_on_fastest_rank = timings_as_list[fastest_rank]
        time_on_slowest_rank = timings_as_list[slowest_rank]
        time_difference = time_on_slowest_rank - time_on_fastest_rank
        logger.info(
            f"""Time difference between fastest rank ({fastest_rank}: {time_on_fastest_rank} sec) and slowest rank ({slowest_rank}: {time_on_slowest_rank} sec) is {time_difference} seconds after {epochs} epochs and {steps} steps."""
        )
        if self._logger and self._rank == 0:
            self._logger.log(
                "Difference between fastest/slowest rank (seconds)",
                time_difference,
                steps,
            )


# instead of taking a dependency on numpy
def _get_min_max_indices(input_list: List[float]) -> Tuple[int, int]:
    min_index = -1
    max_index = -1
    min_value = float("inf")
    max_value = float("-inf")
    for rank, curr_value in enumerate(input_list):
        if curr_value < min_value:
            min_value = curr_value
            min_index = rank
        if curr_value > max_value:
            max_value = curr_value
            max_index = rank

    return min_index, max_index
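
For context, a minimal usage sketch of the new callback is shown below. It is not part of this commit: MyTrainUnit and dataloader are hypothetical placeholders, and the train(...) entry point with its callbacks= argument is assumed from torchtnt's framework API. The callback itself is registered via the public export added in __init__.py above.

# Usage sketch (not part of this commit). MyTrainUnit and dataloader are
# hypothetical; train(...) and its callbacks= argument are assumed from
# torchtnt's framework API.
from torchtnt.framework.callbacks import SlowRankDetector
from torchtnt.framework.train import train

slow_rank_detector = SlowRankDetector(
    check_every_n_steps=100,  # compare rank times every 100 train steps
    check_every_n_epochs=1,   # and at the end of every epoch
)

train(
    MyTrainUnit(),            # hypothetical TrainUnit subclass defined elsewhere
    dataloader,               # hypothetical training dataloader
    max_epochs=2,
    callbacks=[slow_rank_detector],
)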
