Cast to fp16 before moving to device with deepspeed (#14000)

awaelchli · lexierule · commit b438fa5e23c2 · 2022-08-09T14:48:08.000-04:00
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
@@ -9,8 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed
 
 - Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983))
-
-
+- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
 - Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988))
 - Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262))
 - Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992))
diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py
@@ -33,6 +33,7 @@
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.precision import PrecisionPlugin
 from pytorch_lightning.strategies.ddp import DDPStrategy
+from pytorch_lightning.strategies.utils import _fp_to_half
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities import GradClipAlgorithmType
 from pytorch_lightning.utilities.apply_func import apply_to_collection
@@ -46,10 +47,10 @@
 from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.optimizer import optimizers_to_device
-from pytorch_lightning.utilities.rank_zero import rank_zero_info
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.seed import reset_seed
 from pytorch_lightning.utilities.types import _LRScheduler, _PATH, LRSchedulerConfig, ReduceLROnPlateau, STEP_OUTPUT
-from pytorch_lightning.utilities.warnings import rank_zero_warn, WarningCache
+from pytorch_lightning.utilities.warnings import WarningCache
 
 warning_cache = WarningCache()
 
@@ -70,9 +71,15 @@ def remove_module_hooks(model: torch.nn.Module) -> None:
 
 
 class LightningDeepSpeedModule(_LightningModuleWrapperBase):
+    """
+    .. deprecated:: v1.7.1
+        ``LightningDeepSpeedModule`` has been deprecated in v1.7.1 and will be removed in v1.9.0.
+    """
+
     def __init__(
         self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int]
     ) -> None:
+        rank_zero_deprecation("`LightningDeepSpeedModule` has been deprecated in v1.7.1 and will be removed in v1.9.0")
         super().__init__(pl_module)
         self.precision = precision
 
@@ -477,7 +484,7 @@ def init_deepspeed(self) -> None:
             )
 
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
-        model = LightningDeepSpeedModule(pl_module=self.model, precision=self.precision_plugin.precision)
+        model = _LightningModuleWrapperBase(pl_module=self.model)
 
         if self.lightning_module.trainer and self.lightning_module.trainer.training:
             self._initialize_deepspeed_train(model)
@@ -605,9 +612,9 @@ def _initialize_deepspeed_inference(self, model: Module) -> None:
 
     @property
     def lightning_module(self) -> Optional["pl.LightningModule"]:
-        # the model may not be wrapped with DeepEngine & LightningDeepSpeedModule if calling this too early
+        # the model may not be wrapped with DeepEngine & _LightningModuleWrapperBase if calling this too early
         module = getattr(self.model, "module", self.model)
-        module = module.module if isinstance(module, LightningDeepSpeedModule) else module
+        module = module.module if isinstance(module, _LightningModuleWrapperBase) else module
         assert isinstance(module, pl.LightningModule) or module is None
         return module
 
@@ -943,6 +950,10 @@ def register_strategies(cls, strategy_registry: Dict) -> None:
             offload_optimizer_device="nvme",
         )
 
+    def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any:
+        batch = apply_to_collection(batch, Tensor, function=_fp_to_half, precision=self.precision_plugin.precision)
+        return super().batch_to_device(batch, device, dataloader_idx)
+
     def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
         assert self.model is not None
         with self.precision_plugin.val_step_context():
diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py
@@ -25,6 +25,7 @@
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.precision import PrecisionPlugin
 from pytorch_lightning.strategies.parallel import ParallelStrategy
+from pytorch_lightning.strategies.utils import _fp_to_half
 from pytorch_lightning.trainer.states import RunningStage, TrainerFn
 from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn
 from pytorch_lightning.utilities.apply_func import apply_to_collection
@@ -43,6 +44,11 @@
 
 
 class LightningIPUModule(_LightningModuleWrapperBase):
+    """
+    .. deprecated:: v1.7.0
+        ``LightningIPUModule`` has been deprecated in v1.7.0 and will be removed in v1.9.0.
+    """
+
     def __init__(
         self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int]
     ) -> None:
@@ -274,13 +280,7 @@ def to_tensor(x):
     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any:
         # This override is necessary because the cast must occur before the data
         # is moved to the device to prevent wasteful host->device copies.
-        def fp_to_half(tensor: Tensor) -> Tensor:
-            if torch.is_floating_point(tensor):
-                return tensor.half()
-            return tensor
-
-        if self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF):
-            batch = apply_to_collection(batch, Tensor, function=fp_to_half)
+        batch = apply_to_collection(batch, Tensor, function=_fp_to_half, precision=self.precision_plugin.precision)
         # We don't call `super().batch_to_device` because `data.to(device)` is not
         # currently necessary for IPUs. The movement of data from host<->IPU is
         # currently handled by PopTorch.
diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py
@@ -13,6 +13,20 @@
 # limitations under the License.
 import os
 
+import torch
+
+from pytorch_lightning.utilities.enums import PrecisionType
+
 
 def on_colab_kaggle() -> bool:
     return bool(os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE"))
+
+
+def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor:
+    if torch.is_floating_point(tensor):
+        if precision in (PrecisionType.MIXED, PrecisionType.HALF):
+            return tensor.half()
+        if precision == PrecisionType.BFLOAT:
+            return tensor.bfloat16()
+
+    return tensor
diff --git a/src/pytorch_lightning/utilities/deepspeed.py b/src/pytorch_lightning/utilities/deepspeed.py
@@ -98,7 +98,7 @@ def convert_zero_checkpoint_to_fp32_state_dict(
     model_file = get_model_state_file(checkpoint_dir, zero_stage)
     client_state = torch.load(model_file, map_location=CPU_DEVICE)
     client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states}
-    # State dict keys will include reference to wrapper LightningDeepSpeedModule
+    # State dict keys will include reference to wrapper _LightningModuleWrapperBase
     # Delete `module` prefix before saving.
     state_dict = {k.partition("module.")[2]: state_dict[k] for k in state_dict.keys()}
     client_state["state_dict"] = state_dict
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py
@@ -1007,9 +1007,7 @@ def test_trainer_config_ipus(monkeypatch, trainer_kwargs, expected_ipus):
         trainer.ipus == expected_ipus
 
 
-@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
-def test_v1_8_0_deprecated_lightning_ipu_module(_, monkeypatch):
-    monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", True)
+def test_v1_8_0_deprecated_lightning_ipu_module():
     with pytest.deprecated_call(match=r"has been deprecated in v1.7.0 and will be removed in v1.8."):
         _ = LightningIPUModule(BoringModel(), 32)
 
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py
@@ -30,6 +30,7 @@
 from pytorch_lightning.profiler.pytorch import PyTorchProfiler, RegisterRecordFunction, ScheduleWrapper
 from pytorch_lightning.profiler.simple import SimpleProfiler
 from pytorch_lightning.profiler.xla import XLAProfiler
+from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE
 from pytorch_lightning.utilities.rank_zero import rank_zero_only
 from tests_pytorch.helpers.runif import RunIf
@@ -217,3 +218,8 @@ def test_gpu_accelerator_deprecation_warning():
         )
     ):
         GPUAccelerator()
+
+
+def test_v1_9_0_deprecated_lightning_deepspeed_module():
+    with pytest.deprecated_call(match=r"has been deprecated in v1.7.1 and will be removed in v1.9."):
+        _ = LightningDeepSpeedModule(BoringModel(), 32)
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -85,11 +85,12 @@ def automatic_optimization(self) -> bool:
         return False
 
 
-def test_deepspeed_lightning_module(tmpdir):
+def test_deepspeed_lightning_module():
     """Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly."""
 
     model = BoringModel()
-    module = LightningDeepSpeedModule(model, precision=16)
+    with pytest.deprecated_call(match="`LightningDeepSpeedModule` has been deprecated in v1.7.1"):
+        module = LightningDeepSpeedModule(model, precision=16)
 
     module.half()
     assert module.dtype == torch.half
@@ -101,12 +102,13 @@ def test_deepspeed_lightning_module(tmpdir):
 
 
 @RunIf(min_cuda_gpus=1)
-def test_deepspeed_lightning_module_precision(tmpdir):
+def test_deepspeed_lightning_module_precision():
     """Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision
     16."""
 
     model = BoringModel()
-    module = LightningDeepSpeedModule(model, precision=16)
+    with pytest.deprecated_call(match="`LightningDeepSpeedModule` has been deprecated in v1.7.1"):
+        module = LightningDeepSpeedModule(model, precision=16)
 
     module.cuda().half()
     assert module.dtype == torch.half
@@ -1306,6 +1308,7 @@ def test_deepspeed_with_bfloat16_precision(tmpdir):
     assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecisionPlugin)
     assert trainer.strategy.precision_plugin.precision == "bf16"
     assert trainer.strategy.config["zero_optimization"]["stage"] == 3
+    assert trainer.strategy.config["bf16"]["enabled"]
     assert model.layer.weight.dtype == torch.bfloat16
 
 
@@ -1344,3 +1347,19 @@ def configure_optimizers(self):
     )
     with pytest.raises(SystemExit):
         trainer.fit(model)
+
+
+@RunIf(min_cuda_gpus=1, deepspeed=True)
+def test_deepspeed_tensors_cast_to_fp16_before_hosted_on_device():
+    class CustomBoringModel(BoringModel):
+        def transfer_batch_to_device(self, batch, *args, **kwargs):
+            assert batch.dtype is torch.float16
+            return super().transfer_batch_to_device(batch, *args, **kwargs)
+
+    model = CustomBoringModel()
+    trainer = Trainer(strategy="deepspeed", devices=1, accelerator="cuda", precision=16)
+    trainer.strategy.connect(model)
+    batch = torch.zeros((1), dtype=torch.float32)
+    batch = trainer.strategy.batch_to_device(batch)
+    assert batch.is_cuda
+    assert batch.dtype is torch.float16