Move torchsnapshot_saver GPU test to dedicated file (#760)

diego-urgell · facebook-github-bot · commit b27d916920d1 · 2024-03-26T12:07:51.000-07:00
Summary: Pull Request resolved: #760 Reviewed By: galrotem Differential Revision: D55327868 fbshipit-source-id: 45e1dae2dc7ee1304a01cfe3a5b9a102dec02e15
diff --git a/tests/framework/callbacks/test_torchsnapshot_saver.py b/tests/framework/callbacks/test_torchsnapshot_saver.py
@@ -36,7 +36,7 @@
 from torchtnt.framework.train import train
 from torchtnt.utils.distributed import get_global_rank, spawn_multi_process
 from torchtnt.utils.env import seed
-from torchtnt.utils.test_utils import skip_if_not_distributed, skip_if_not_gpu
+from torchtnt.utils.test_utils import skip_if_not_distributed
 
 
 class TorchSnapshotSaverTest(unittest.TestCase):
@@ -227,56 +227,6 @@ def test_save_restore_no_lr_scheduler_restore(
         app_state = mock_torchsnapshot.Snapshot().restore.call_args.args[0]
         self.assertIn("lr_scheduler", app_state)
 
-    @skip_if_not_distributed
-    @skip_if_not_gpu
-    def test_save_restore_fsdp(self) -> None:
-        spawn_multi_process(
-            2,
-            "nccl",
-            self._save_restore_fsdp,
-        )
-
-    @staticmethod
-    def _save_restore_fsdp() -> None:
-        input_dim = 2
-        dataset_len = 10
-        batch_size = 2
-        max_epochs = 2
-        save_every_n_epochs = 1
-
-        my_unit = DummyAutoUnit(module=torch.nn.Linear(input_dim, 2), strategy="fsdp")
-        dataloader = generate_random_dataloader(dataset_len, input_dim, batch_size)
-        if get_global_rank() == 0:
-            temp_dir = tempfile.mkdtemp()
-        else:
-            temp_dir = ""
-
-        snapshot_cb = TorchSnapshotSaver(
-            temp_dir,
-            save_every_n_epochs=save_every_n_epochs,
-            replicated=["**"],
-        )
-        temp_dir = snapshot_cb.dirpath
-        train(my_unit, dataloader, max_epochs=max_epochs, callbacks=[snapshot_cb])
-
-        tc = unittest.TestCase()
-        try:
-            my_new_unit = DummyAutoUnit(
-                module=torch.nn.Linear(input_dim, 2), strategy="fsdp"
-            )
-            tc.assertNotEqual(
-                my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
-            )
-            # get latest checkpoint
-            ckpt_path = os.path.join(temp_dir, f"epoch_{max_epochs}_step_10")
-            snapshot_cb.restore(ckpt_path, my_new_unit)
-            tc.assertEqual(
-                my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
-            )
-        finally:
-            if get_global_rank() == 0:
-                shutil.rmtree(temp_dir)  # delete temp directory
-
     @skip_if_not_distributed
     def test_save_restore_ddp(self) -> None:
         spawn_multi_process(
diff --git a/tests/framework/callbacks/test_torchsnapshot_saver_gpu.py b/tests/framework/callbacks/test_torchsnapshot_saver_gpu.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+from torchtnt.framework._test_utils import DummyAutoUnit, generate_random_dataloader
+from torchtnt.framework.callbacks.torchsnapshot_saver import TorchSnapshotSaver
+from torchtnt.framework.train import train
+from torchtnt.utils.distributed import get_global_rank, spawn_multi_process
+from torchtnt.utils.test_utils import skip_if_not_distributed, skip_if_not_gpu
+
+
+class TorchSnapshotSaverGPUTest(unittest.TestCase):  # TorchSnapshotSaver save/restore test using strategy="fsdp" over the "nccl" backend
+    @skip_if_not_distributed
+    @skip_if_not_gpu
+    def test_save_restore_fsdp(self) -> None:
+        spawn_multi_process(
+            2,  # presumably the number of processes to spawn; confirm against spawn_multi_process
+            "nccl",
+            self._save_restore_fsdp,
+        )
+
+    @staticmethod
+    def _save_restore_fsdp() -> None:
+        input_dim = 2
+        dataset_len = 10
+        batch_size = 2
+        max_epochs = 2
+        save_every_n_epochs = 1
+
+        my_unit = DummyAutoUnit(module=torch.nn.Linear(input_dim, 2), strategy="fsdp")
+        dataloader = generate_random_dataloader(dataset_len, input_dim, batch_size)
+        if get_global_rank() == 0:  # only rank 0 creates a real temp directory
+            temp_dir = tempfile.mkdtemp()
+        else:
+            temp_dir = ""  # placeholder path on every other rank
+
+        snapshot_cb = TorchSnapshotSaver(
+            temp_dir,
+            save_every_n_epochs=save_every_n_epochs,
+            replicated=["**"],
+        )
+        temp_dir = snapshot_cb.dirpath  # NOTE(review): presumably the callback syncs rank 0's path to all ranks; confirm
+        train(my_unit, dataloader, max_epochs=max_epochs, callbacks=[snapshot_cb])
+
+        tc = unittest.TestCase()  # standalone instance: this staticmethod has no self to assert with
+        try:
+            my_new_unit = DummyAutoUnit(
+                module=torch.nn.Linear(input_dim, 2), strategy="fsdp"
+            )
+            tc.assertNotEqual(
+                my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
+            )
+            # get latest checkpoint; step 10 is presumably max_epochs * (dataset_len / batch_size) = 2 * 5 (TODO confirm)
+            ckpt_path = os.path.join(temp_dir, f"epoch_{max_epochs}_step_10")
+            snapshot_cb.restore(ckpt_path, my_new_unit)
+            tc.assertEqual(
+                my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
+            )
+        finally:
+            if get_global_rank() == 0:  # only the rank that called mkdtemp removes the directory
+                shutil.rmtree(temp_dir)  # delete temp directory