Make base_checkpointer NCCL test run on CPU (#761)

diego-urgell · facebook-github-bot · commit 8e93a51f2283 · 2024-03-27T12:58:59.000-07:00
Summary: Pull Request resolved: #761 Reviewed By: galrotem Differential Revision: D55346092 fbshipit-source-id: da53c356ec50db2c90d7117b42e111061c4e6b7d
diff --git a/tests/framework/callbacks/test_base_checkpointer.py b/tests/framework/callbacks/test_base_checkpointer.py
@@ -28,7 +28,9 @@
     get_dummy_fit_state,
     get_dummy_train_state,
 )
-from torchtnt.framework.callbacks.base_checkpointer import BaseCheckpointer
+from torchtnt.framework.callbacks.base_checkpointer import (
+    BaseCheckpointer as BaseCheckpointer,
+)
 from torchtnt.framework.callbacks.checkpointer_types import (
     BestCheckpointConfig,
     RestoreOptions,
@@ -41,7 +43,7 @@
 from torchtnt.framework.unit import AppStateMixin, TrainUnit, TTrainData
 from torchtnt.utils.distributed import get_global_rank, spawn_multi_process
 from torchtnt.utils.env import init_from_env
-from torchtnt.utils.test_utils import skip_if_not_distributed, skip_if_not_gpu
+from torchtnt.utils.test_utils import skip_if_not_distributed
 
 
 class BaseCheckpointSaver(BaseCheckpointer):
@@ -411,24 +413,20 @@ def test_invalid_args(self) -> None:
                 BaseCheckpointSaver(temp_dir, save_every_n_epochs=0)
 
     @skip_if_not_distributed
-    @skip_if_not_gpu
     def test_process_group_plumbing(self) -> None:
-        """
-        Creates a new process group and verifies GLOO group is created accordingly
-        """
         spawn_multi_process(
             2,
-            "nccl",
-            self._test_process_group_plumbing,
+            "gloo",
+            self._test_process_group_plumbing_gloo,
         )
         spawn_multi_process(
             2,
-            "gloo",
-            self._test_process_group_plumbing,
+            "gloo",  # inner test mocks nccl backend
+            self._test_process_group_plumbing_nccl,
         )
 
     @staticmethod
-    def _test_process_group_plumbing() -> None:
+    def _test_process_group_plumbing_gloo() -> None:
         checkpoint_cb = BaseCheckpointSaver(
             "foo",
             process_group=None,
@@ -441,6 +439,23 @@ def _test_process_group_plumbing() -> None:
             # verify no new process group was created
             tc.assertEqual(checkpoint_cb._process_group, dist.group.WORLD)
 
+    @staticmethod
+    @patch("torch.cuda.nccl.version", return_value=(1, 0, 0))
+    def _test_process_group_plumbing_nccl(_: MagicMock) -> None:
+        with patch("torch.distributed.get_backend", return_value=dist.Backend.NCCL):
+            checkpoint_cb = BaseCheckpointSaver(
+                "foo",
+                process_group=None,
+            )
+
+        tc = unittest.TestCase()
+        tc.assertIsNotNone(checkpoint_cb._process_group)
+        tc.assertEqual(
+            checkpoint_cb._process_group._get_backend_name(), dist.Backend.GLOO
+        )
+        # check that a new process group was created
+        tc.assertNotEqual(checkpoint_cb._process_group, dist.group.WORLD)
+
     @patch(
         "torchtnt.framework.callbacks.base_checkpointer.get_checkpoint_dirpaths",
         return_value=["epoch_1_step_10", "epoch_2_step_20"],
diff --git a/torchtnt/framework/callbacks/base_checkpointer.py b/torchtnt/framework/callbacks/base_checkpointer.py
@@ -439,7 +439,7 @@ def restore_from_latest(
         **kwargs: Any,
     ) -> bool:
         """
-        Given a parent directory where checkpoints are saved, restore the checkppoint state from the latest checkpoint in the directory.
+        Given a parent directory where checkpoints are saved, restore the checkpoint state from the latest checkpoint in the directory.
 
         There are additional flags offered should the user want to skip loading the train and eval progress.
         By default, the train and eval progress are restored, if applicable.