Skip to content

Commit a8750ae

Browse files
galrotem authored and facebook-github-bot committed
periodic distributed sync (#843)
Summary: Pull Request resolved: #843 Reviewed By: diego-urgell Differential Revision: D58118753 fbshipit-source-id: 42d69c285ca36738a86020018b4137c3a9d20e1d
1 parent b21deb6 commit a8750ae

File tree

4 files changed

+72
-0
lines changed

4 files changed

+72
-0
lines changed

docs/source/framework/callbacks.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ We offer several pre-written callbacks which are ready to be used out of the box
2727
LearningRateMonitor
2828
MemorySnapshot
2929
ModuleSummary
30+
PeriodicDistributedSync
3031
ProgressReporter
3132
PyTorchProfiler
3233
SlowRankDetector
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
# pyre-strict
9+
10+
import unittest
11+
from unittest.mock import MagicMock, patch
12+
13+
from torchtnt.framework._test_utils import DummyPredictUnit
14+
15+
from torchtnt.framework.callbacks.periodic_distributed_sync import (
16+
PeriodicDistributedSync,
17+
)
18+
from torchtnt.framework.state import EntryPoint, State
19+
20+
21+
class PeriodicDistributedSyncTest(unittest.TestCase):
    @patch("torchtnt.framework.callbacks.periodic_distributed_sync.barrier")
    def test_frequency(self, barrier_mock: MagicMock) -> None:
        """The barrier fires only when the completed-step count is a multiple of ``sync_every_n_steps``."""
        callback = PeriodicDistributedSync(sync_every_n_steps=2)
        unit = DummyPredictUnit(2)
        state = State(entry_point=EntryPoint.PREDICT)

        # After one completed step (not a multiple of 2) no barrier is expected.
        unit.predict_progress.increment_step()
        callback.on_predict_step_end(state, unit)
        barrier_mock.assert_not_called()

        # After the second completed step the barrier should fire exactly once.
        unit.predict_progress.increment_step()
        callback.on_predict_step_end(state, unit)
        barrier_mock.assert_called_once()

torchtnt/framework/callbacks/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .learning_rate_monitor import LearningRateMonitor
1717
from .memory_snapshot import MemorySnapshot
1818
from .module_summary import ModuleSummary
19+
from .periodic_distributed_sync import PeriodicDistributedSync
1920
from .progress_reporter import ProgressReporter
2021
from .pytorch_profiler import PyTorchProfiler
2122
from .slow_rank_detector import SlowRankDetector
@@ -39,6 +40,7 @@
3940
"LearningRateMonitor",
4041
"MemorySnapshot",
4142
"ModuleSummary",
43+
"PeriodicDistributedSync",
4244
"ProgressReporter",
4345
"PyTorchProfiler",
4446
"SlowRankDetector",
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-strict
8+
9+
import logging
10+
11+
from torchtnt.framework.callback import Callback
12+
from torchtnt.framework.state import State
13+
from torchtnt.framework.unit import TPredictUnit
14+
from torchtnt.utils.distributed import barrier
15+
16+
logger: logging.Logger = logging.getLogger(__name__)
17+
18+
19+
class PeriodicDistributedSync(Callback):
    """
    A callback to sync all distributed workers at a given frequency.
    Helpful when using distributed without DDP/FSDP but would still like to ensure that the workers are in sync with each other, for example large predict jobs.
    Note that only predict is supported at the moment.

    Args:
        sync_every_n_steps: the frequency at which to sync the workers.

    Raises:
        ValueError: if ``sync_every_n_steps`` is not a positive integer.
    """

    def __init__(self, sync_every_n_steps: int = 1000) -> None:
        # Validate eagerly: a value of 0 would otherwise surface as a
        # ZeroDivisionError on the first step end, and a negative value would
        # silently trigger barriers via Python's sign-of-divisor modulo.
        if sync_every_n_steps < 1:
            raise ValueError(
                f"sync_every_n_steps must be a positive integer, got {sync_every_n_steps}"
            )
        self.sync_every_n_steps = sync_every_n_steps

    def on_predict_step_end(self, state: State, unit: TPredictUnit) -> None:
        num_steps = unit.predict_progress.num_steps_completed
        if num_steps % self.sync_every_n_steps == 0:
            # Lazy %-style args: this runs on every sync step, so avoid
            # building the message string when INFO logging is disabled.
            logger.info("Barrier at step %d", num_steps)
            barrier()

0 commit comments

Comments
 (0)