
Commit 7272dbd

support meta weight loading in DDP (#966)

JKSenthil authored and facebook-github-bot committed

Summary:
Pull Request resolved: #966
Reviewed By: galrotem
Differential Revision: D68837358
fbshipit-source-id: e3fcb6adf89e6ae5265a1cb0ccb2ad86a0b2c4e4
1 parent 46d6cee commit 7272dbd

File tree: 3 files changed (+72, -2 lines)
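For context, PyTorch's "meta" device holds only parameter metadata (shapes and dtypes) with no backing storage, which lets large models be constructed without allocating memory; such parameters must be materialized on a real device before the module can be used or wrapped in DDP. A minimal sketch of that starting point, not part of the commit and assuming PyTorch 2.x:

```python
# Minimal sketch: constructing a module on the meta device allocates no storage.
import torch

with torch.device("meta"):
    model = torch.nn.Linear(4096, 4096)

print(model.weight.is_meta)  # True -- metadata only, no data
# The weights must be materialized (e.g. via Module.to_empty()) on a real
# device before the module can be moved, trained, or wrapped in DDP.
```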

tests/utils/test_prepare_module.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -17,7 +17,9 @@
 from torchtnt.utils.prepare_module import (
     DDPStrategy,
     FSDPStrategy,
+    materialize_meta_params,
     NOOPStrategy,
+    on_meta_device,
     prepare_module,
     TorchCompileParams,
 )
@@ -214,3 +216,29 @@ def test_prepare_module_compile_module_state_dict(self) -> None:
                 torch.allclose(my_module_state_dict[k], compiled_state_dict[k])
             )
         self.assertIsNotNone(compiled_module._compiled_call_impl)
+
+    @unittest.skipUnless(
+        torch_version_geq_2_1_0,
+        reason="Must be on torch 2.1.0+ to run test",
+    )
+    def test_materialize_meta_params(self) -> None:
+        # Create a simple module with parameters on the meta device
+        class SimpleModule(torch.nn.Module):
+            def __init__(self):
+                super(SimpleModule, self).__init__()
+                self.linear1 = torch.nn.Linear(10, 10, device="meta")
+                self.linear2 = torch.nn.Linear(10, 10, device="cpu")
+
+        module = SimpleModule()
+        device = torch.device("cpu")
+
+        self.assertFalse(on_meta_device(module))  # top level module has no params
+        self.assertTrue(on_meta_device(module.linear1))
+        self.assertFalse(on_meta_device(module.linear2))
+
+        # Call the function to test
+        materialize_meta_params(module, device)
+
+        # Check if the parameters are moved to the specified device
+        for param in module.parameters():
+            self.assertEqual(param.device, device)
```
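As the test hints, `materialize_meta_params` allocates storage on the target device but does not initialize values; real weights are expected to be loaded afterwards. A hedged usage sketch, where a random state dict stands in for an actual checkpoint:

```python
# Sketch of the intended flow after this change: materialize, then load real weights.
import torch
from torchtnt.utils.prepare_module import materialize_meta_params

module = torch.nn.Linear(10, 10, device="meta")
materialize_meta_params(module, torch.device("cpu"))  # storage allocated, values uninitialized

# In practice these values would come from a checkpoint; random tensors here for illustration.
state_dict = {"weight": torch.randn(10, 10), "bias": torch.randn(10)}
module.load_state_dict(state_dict)
assert not module.weight.is_meta
```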

tests/utils/test_prepare_module_gpu.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -41,6 +41,11 @@ def test_prepare_ddp(self) -> None:
             "nccl",
             self._test_prepare_ddp,
         )
+        spawn_multi_process(
+            2,
+            "nccl",
+            self._test_prepare_ddp_meta_device,
+        )
 
     @staticmethod
     def _test_prepare_ddp() -> None:
@@ -54,6 +59,18 @@ def _test_prepare_ddp() -> None:
         tc = unittest.TestCase()
         tc.assertTrue(isinstance(ddp_module, DDP))
 
+    @staticmethod
+    def _test_prepare_ddp_meta_device() -> None:
+        module = torch.nn.Linear(2, 2, device="meta")
+        device = init_from_env()
+        ddp_module = prepare_ddp(
+            module,
+            device,
+            DDPStrategy(find_unused_parameters=True, gradient_as_bucket_view=True),
+        )
+        tc = unittest.TestCase()
+        tc.assertTrue(isinstance(ddp_module, DDP))
+
     @skip_if_not_gpu
     @skip_if_not_distributed
     def test_prepare_fsdp(self) -> None:
```
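Outside the test harness, the same flow would look roughly like the sketch below. This is a hedged example, assuming a distributed process group has already been set up by the launcher (e.g. torchrun) and that `init_from_env` is importable from `torchtnt.utils.env` as used by the tests:

```python
# Hedged end-to-end sketch mirroring the new GPU test (not part of the commit).
import torch
from torchtnt.utils.env import init_from_env  # assumed import path
from torchtnt.utils.prepare_module import DDPStrategy, prepare_ddp

device = init_from_env()                       # picks the device for this rank
module = torch.nn.Linear(2, 2, device="meta")  # no storage allocated yet
ddp_module = prepare_ddp(module, device, DDPStrategy())
# prepare_ddp now materializes the meta parameters on `device` before wrapping;
# real weights (e.g. from a checkpoint) would typically be loaded afterwards.
```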

torchtnt/utils/prepare_module.py

Lines changed: 27 additions & 2 deletions
```diff
@@ -50,7 +50,7 @@
     StateDictType,
 )
 
-from torchtnt.utils.rank_zero_log import rank_zero_warn
+from torchtnt.utils.rank_zero_log import rank_zero_info, rank_zero_warn
 from torchtnt.utils.version import is_torch_version_geq
 
 
@@ -188,7 +188,7 @@ def prepare_ddp(
     Utility to move a module to device and wrap in `DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`_.
 
     Args:
-        module: module to be wrapped in DDP
+        module: module to be wrapped in DDP. If the module has parameters on the meta device, they will be materialized on the target device prior to DDP wrapping.
         device: device to which module will be moved
         strategy: an instance of :class:`~torchtnt.utils.prepare_module.DDPStrategy` which defines the settings of DDP APIs
 
@@ -207,6 +207,10 @@ def prepare_ddp(
         # remove ddp comm hook variables from params dict
         del params_dict["comm_state"]
         del params_dict["comm_hook"]
+
+    materialize_meta_params(module, device)
+
+    # now move the rest of the module to device
     module = module.to(device)
 
     # remove sync batch norm from params dict before converting module
@@ -424,3 +428,24 @@ def convert_str_to_strategy(
         f"Strategy {strategy} not supported. Please use one of {list(string_to_strategy_mapping.keys())}"
     )
     return string_to_strategy_mapping[strategy]
+
+
+def on_meta_device(module: torch.nn.Module) -> bool:
+    try:
+        return next(module.parameters(recurse=False)).device.type == "meta"
+    except StopIteration:
+        return False
+
+
+def materialize_meta_params(module: torch.nn.Module, device: torch.device) -> None:
+    """
+    Materialize meta-device parameters on the given device.
+
+    Args:
+        module: module to be used.
+        device: device to which module will be moved.
+    """
+    for name, submodule in module.named_modules():
+        if on_meta_device(submodule):
+            rank_zero_info(f"{name} is on meta device, initializing on device {device}")
+            submodule.to_empty(device=device, recurse=False)
```
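The implementation uses `Module.to_empty()` rather than `Module.to()` because a meta tensor has no data to copy: `to()` typically fails on meta parameters, while `to_empty()` allocates uninitialized storage of the right shape and dtype on the target device. A small illustration, not from the commit:

```python
# Contrast between .to() and .to_empty() for meta-device modules.
import torch

m = torch.nn.Linear(8, 8, device="meta")

try:
    m.to(torch.device("cpu"))            # usually raises: cannot copy out of a meta tensor
except (NotImplementedError, RuntimeError) as e:
    print(f"to() failed as expected: {e}")

m.to_empty(device=torch.device("cpu"))   # allocates uninitialized storage instead
print(m.weight.device)                   # cpu
```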
