
Commit 234ded8

Avoid moving the model to device if move_to_device=False (#19152)
1 parent 5a03612 commit 234ded8

6 files changed: 52 additions & 32 deletions
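
To illustrate the fix, here is a minimal usage sketch (not part of the commit, and assuming a machine with at least one CUDA device): with `move_to_device=False`, Fabric now records the device of the module's first parameter instead of calling `.to()` on the model, so manually placed parameters stay where they are.

import torch
from torch import nn
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda", devices=1)

# Parameters are placed manually and should stay put
model = nn.Sequential(
    nn.Linear(1, 2, device=torch.device("cpu")),
    nn.Linear(2, 1, device=torch.device("cuda", 0)),
)
fabric_model = fabric.setup_module(model, move_to_device=False)

print(fabric_model.device)     # cpu -- taken from the first parameter, not from fabric.device
print(model[0].weight.device)  # cpu -- the weights were not moved
print(model[1].weight.device)  # cuda:0

With the default `move_to_device=True`, Fabric still moves the whole model to `fabric.device` as before.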

src/lightning/fabric/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -44,6 +44,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed broadcast at initialization in `MPIEnvironment` ([#19074](https://github.com/Lightning-AI/lightning/pull/19074))


+- Avoid moving the model to device if `move_to_device=False` is passed ([#19152](https://github.com/Lightning-AI/lightning/pull/19152))
+
+
 - Fixed issue where the `precision="transformer-engine"` argument would not replace layers by default ([#19082](https://github.com/Lightning-AI/lightning/pull/19082))


src/lightning/fabric/fabric.py

Lines changed: 11 additions & 6 deletions
@@ -64,6 +64,7 @@
     _update_dataloader,
     has_iterable_dataset,
 )
+from lightning.fabric.utilities.device_dtype_mixin import _update_properties
 from lightning.fabric.utilities.distributed import DistributedSamplerWrapper
 from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
 from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn
@@ -243,9 +244,11 @@ def setup(

         module = _FabricModule(module, self._precision, original_module=original_module)

-        if not isinstance(self._strategy, (FSDPStrategy, XLAFSDPStrategy)):
-            # Update the _DeviceDtypeModuleMixin's device parameter
-            module.to(self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device)
+        # Update the _DeviceDtypeModuleMixin's device parameter
+        # NOTE: for sharded strategies or manual device placement, there's no single root device
+        _update_properties(
+            module, device=self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device
+        )

         optimizers = [
             _FabricOptimizer(optimizer=optimizer, strategy=self._strategy, callbacks=self._callbacks)
@@ -295,9 +298,11 @@ def setup_module(self, module: nn.Module, move_to_device: bool = True) -> _Fabri
         module = self._strategy.setup_module(module)
         module = _FabricModule(module, self._precision, original_module=original_module)

-        if not isinstance(self._strategy, (FSDPStrategy, XLAFSDPStrategy)):
-            # Update the _DeviceDtypeModuleMixin's device parameter
-            module.to(self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device)
+        # Update the _DeviceDtypeModuleMixin's device parameter
+        # NOTE: for sharded strategies or manual device placement, there's no single root device
+        _update_properties(
+            module, device=self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device
+        )

         if hasattr(original_module, "_fabric"):  # this is probably a LightningModule
             original_module._fabric = self  # type: ignore[assignment]
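
A standalone sketch (not part of the commit) of the fallback expression used in both `setup()` and `setup_module()` above: when `move_to_device=False`, the recorded device comes from the module's first parameter, and the throwaway `torch.tensor(0)` (a CPU scalar) provides a default for modules that have no parameters.

import torch
from torch import nn

linear = nn.Linear(1, 2)  # freshly created parameters live on the CPU
relu = nn.ReLU()          # has no parameters at all

# Same pattern as `next(module.parameters(), torch.tensor(0)).device` in the diff above
print(next(linear.parameters(), torch.tensor(0)).device)  # cpu -- device of the first parameter
print(next(relu.parameters(), torch.tensor(0)).device)    # cpu -- falls back to the default CPU scalar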

src/lightning/fabric/utilities/device_dtype_mixin.py

Lines changed: 22 additions & 19 deletions
@@ -50,7 +50,7 @@ def to(self, *args: Any, **kwargs: Any) -> Self:
         """See :meth:`torch.nn.Module.to`."""
         # this converts `str` device to `torch.device`
         device, dtype = torch._C._nn._parse_to(*args, **kwargs)[:2]
-        self.__update_properties(device=device, dtype=dtype)
+        _update_properties(self, device=device, dtype=dtype)
         return super().to(*args, **kwargs)

     def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self:
@@ -70,43 +70,46 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self:
             device = torch.device("cuda", torch.cuda.current_device())
         elif isinstance(device, int):
             device = torch.device("cuda", index=device)
-        self.__update_properties(device=device)
+        _update_properties(self, device=device)
         return super().cuda(device=device)

     def cpu(self) -> Self:
         """See :meth:`torch.nn.Module.cpu`."""
-        self.__update_properties(device=torch.device("cpu"))
+        _update_properties(self, device=torch.device("cpu"))
         return super().cpu()

     def type(self, dst_type: Union[str, torch.dtype]) -> Self:
         """See :meth:`torch.nn.Module.type`."""
-        self.__update_properties(dtype=dst_type)
+        _update_properties(self, dtype=dst_type)
         return super().type(dst_type=dst_type)

     def float(self) -> Self:
         """See :meth:`torch.nn.Module.float`."""
-        self.__update_properties(dtype=torch.float)
+        _update_properties(self, dtype=torch.float)
         return super().float()

     def double(self) -> Self:
         """See :meth:`torch.nn.Module.double`."""
-        self.__update_properties(dtype=torch.double)
+        _update_properties(self, dtype=torch.double)
         return super().double()

     def half(self) -> Self:
         """See :meth:`torch.nn.Module.half`."""
-        self.__update_properties(dtype=torch.half)
+        _update_properties(self, dtype=torch.half)
         return super().half()

-    def __update_properties(
-        self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None
-    ) -> None:
-        def apply_fn(module: Union[_DeviceDtypeModuleMixin, Module]) -> None:
-            if not isinstance(module, _DeviceDtypeModuleMixin):
-                return
-            if device is not None:
-                module._device = device
-            if dtype is not None:
-                module._dtype = dtype
-
-        self.apply(apply_fn)
+
+def _update_properties(
+    root: torch.nn.Module, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None
+) -> None:
+    def apply_fn(module: Union[_DeviceDtypeModuleMixin, Module]) -> None:
+        if not isinstance(module, _DeviceDtypeModuleMixin):
+            return
+        # cannot use `module.to()` because we don't actually want to move the model in case there are multiple
+        # devices types (such as partial meta parameters)
+        if device is not None:
+            module._device = device
+        if dtype is not None:
+            module._dtype = dtype
+
+    root.apply(apply_fn)
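
A minimal sketch (not part of the commit; `TinyModel` is a hypothetical example) of how the now module-level `_update_properties` helper behaves: it walks the module tree with `Module.apply` and rewrites only the `_device`/`_dtype` bookkeeping on `_DeviceDtypeModuleMixin` instances, so no tensors are moved.

import torch
from torch import nn
from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin, _update_properties


class TinyModel(_DeviceDtypeModuleMixin):
    def __init__(self) -> None:
        super().__init__()
        self.layer = nn.Linear(2, 2)  # parameters are created on the CPU and never moved


model = TinyModel()
_update_properties(model, device=torch.device("cuda", 0))  # no CUDA required: nothing is allocated there

print(model.device)               # cuda:0 -- only the bookkeeping on the mixin changed
print(model.layer.weight.device)  # cpu    -- `.to()` was never called, so the weights stayed put

Turning the former `__update_properties` method into a free function that takes `root` explicitly is what lets `Fabric.setup()` and `Fabric.setup_module()` update the recorded device without triggering an actual `module.to()` call.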

tests/tests_fabric/strategies/test_fsdp_integration.py

Lines changed: 3 additions & 2 deletions
@@ -304,8 +304,9 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device):
     else:
         assert isinstance(next(fabric_model.parameters()), FlatParameter)

-    # The _DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for sharded models
-    assert fabric_model.device == torch.device("cpu")
+    # The _DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for models with pieces on
+    # different devices
+    assert fabric_model.device == torch.device("cuda", fabric.local_rank)
     assert fabric.device == torch.device("cuda", fabric.local_rank)


tests/tests_fabric/strategies/test_xla_fsdp_integration.py

Lines changed: 3 additions & 2 deletions
@@ -190,8 +190,9 @@ def _test_setup_module_move_to_device(fabric, move_to_device):
     fabric_model = fabric.setup_module(model, move_to_device=move_to_device)
     fabric_module_mock.assert_not_called()

-    # The _DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for sharded models
-    assert fabric_model.device == torch.device("cpu")
+    # The _DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for models with pieces on
+    # different devices
+    assert fabric_model.device.type == "xla"
     assert fabric.device.type == "xla"


tests/tests_fabric/test_fabric.py

Lines changed: 10 additions & 3 deletions
@@ -157,8 +157,8 @@ def test_setup_module_parameters_on_different_devices(setup_method, move_to_devi

     fabric = Fabric(accelerator="cuda", devices=1)

-    module0 = nn.Linear(1, 2).to(device0)
-    module1 = nn.Linear(1, 2).to(device1)
+    module0 = nn.Linear(1, 2, device=device0)
+    module1 = nn.Linear(1, 2, device=device1)
     model = nn.Sequential(module0, module1)

     setup_method = getattr(fabric, setup_method)
@@ -174,7 +174,14 @@ def test_setup_module_parameters_on_different_devices(setup_method, move_to_devi
         assert module1.weight.device == module1.bias.device == device1
     else:
         with no_warning_call(expected_warning=PossibleUserWarning, match=match):
-            setup_method(model, move_to_device=move_to_device)
+            fabric_model = setup_method(model, move_to_device=move_to_device)
+
+        # the first device is set at the root
+        assert fabric_model.device == device0
+        assert fabric_model._device == device0
+        # the weights were not moved
+        assert module0.weight.device == module0.bias.device == device0
+        assert module1.weight.device == module1.bias.device == device1


 def test_setup_module_and_optimizers():
