
Commit 773001a

Merge branch 'master' into deepspeed_mics_init

2 parents: 6ca2bac + 9709c64

10 files changed: +343 −6 lines changed

docs/source-pytorch/common/checkpointing_basic.rst

Lines changed: 23 additions & 1 deletion
@@ -20,6 +20,13 @@ PyTorch Lightning checkpoints are fully usable in plain PyTorch.

 ----

+.. important::
+
+    **Important Update: Deprecated Method**
+
+    Starting from PyTorch Lightning v1.0.0, the `resume_from_checkpoint` argument has been deprecated. To resume training from a checkpoint, use the `ckpt_path` argument in the `fit()` method.
+    Please update your code accordingly to avoid potential compatibility issues.
+
 ************************
 Contents of a checkpoint
 ************************
@@ -197,16 +204,31 @@ You can disable checkpointing by passing:

 ----

+
 *********************
 Resume training state
 *********************

 If you don't just want to load weights, but instead restore the full training, do the following:

+Correct usage:
+
 .. code-block:: python

     model = LitModel()
     trainer = Trainer()

     # automatically restores model, epoch, step, LR schedulers, etc...
-    trainer.fit(model, ckpt_path="some/path/to/my_checkpoint.ckpt")
+    trainer.fit(model, ckpt_path="path/to/your/checkpoint.ckpt")
+
+.. warning::
+
+    The argument `resume_from_checkpoint` has been deprecated in versions of PyTorch Lightning >= 1.0.0.
+    To resume training from a checkpoint, use the `ckpt_path` argument in the `fit()` method instead.
+
+Incorrect (deprecated) usage:
+
+.. code-block:: python
+
+    trainer = Trainer(resume_from_checkpoint="path/to/your/checkpoint.ckpt")
+    trainer.fit(model)
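
For readers following along, a minimal end-to-end sketch of the recommended flow: `LitModel` is the placeholder module from the docs snippet above, "example.ckpt" is a hypothetical path, and `Trainer.save_checkpoint` is the standard Lightning API for writing a checkpoint manually.

    from lightning.pytorch import Trainer

    model = LitModel()  # hypothetical LightningModule, as in the docs snippet above
    trainer = Trainer(max_epochs=1)
    trainer.fit(model)
    trainer.save_checkpoint("example.ckpt")  # hypothetical path

    # Resume the full training state via `ckpt_path`; the old
    # `resume_from_checkpoint` Trainer argument is deprecated.
    trainer = Trainer(max_epochs=2)
    trainer.fit(model, ckpt_path="example.ckpt")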

src/lightning/fabric/plugins/precision/fsdp.py

Lines changed: 6 additions & 0 deletions
@@ -74,6 +74,12 @@ def __init__(self, precision: _PRECISION_INPUT, scaler: Optional["ShardedGradSca
         }
         self._desired_input_dtype = precision_to_type[self.precision]

+    @override
+    def convert_module(self, module: Module) -> Module:
+        if "true" in self.precision:
+            return module.to(dtype=self._desired_input_dtype)
+        return module
+
     @property
     def mixed_precision_config(self) -> "TorchMixedPrecision":
         from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision
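
A brief usage sketch of the new `convert_module` hook (the import path follows the file above; the expected dtypes mirror the new test added further down in this commit):

    import torch
    from lightning.fabric.plugins.precision.fsdp import FSDPPrecision

    module = torch.nn.Linear(2, 2)                   # parameters start in float32
    true_prec = FSDPPrecision(precision="bf16-true")
    module = true_prec.convert_module(module)        # "-true" precisions cast eagerly
    assert module.weight.dtype == torch.bfloat16

    mixed_prec = FSDPPrecision(precision="bf16-mixed")
    untouched = mixed_prec.convert_module(torch.nn.Linear(2, 2))
    assert untouched.weight.dtype == torch.float32   # "-mixed" keeps float32 params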

src/lightning/pytorch/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `_LoggerConnector`'s `_ResultMetric` to move all registered keys to the device of the logged value if needed ([#19814](https://github.com/Lightning-AI/pytorch-lightning/issues/19814))
 - Fixed `_optimizer_to_device` logic for special 'step' key in optimizer state causing performance regression ([#20019](https://github.com/Lightning-AI/lightning/pull/20019))
 - Fixed parameter counts in `ModelSummary` when model has distributed parameters (DTensor) ([#20163](https://github.com/Lightning-AI/pytorch-lightning/pull/20163))
+- Fixed PyTorch Lightning FSDP takes more memory than PyTorch FSDP ([#20323](https://github.com/Lightning-AI/pytorch-lightning/pull/20323))


 ## [2.3.0] - 2024-06-13

src/lightning/pytorch/core/datamodule.py

Lines changed: 74 additions & 1 deletion
@@ -14,7 +14,8 @@
 """LightningDataModule for loading DataLoaders with ease."""

 import inspect
-from collections.abc import Iterable
+import os
+from collections.abc import Iterable, Sized
 from typing import IO, Any, Optional, Union, cast

 from lightning_utilities import apply_to_collection
@@ -244,3 +245,75 @@ def load_from_checkpoint(
             **kwargs,
         )
         return cast(Self, loaded)
+
+    def __str__(self) -> str:
+        """Return a string representation of the datasets that are set up.
+
+        Returns:
+            A string representation of the datasets that are setup.
+
+        """
+
+        class dataset_info:
+            def __init__(self, available: bool, length: str) -> None:
+                self.available = available
+                self.length = length
+
+        def retrieve_dataset_info(loader: DataLoader) -> dataset_info:
+            """Helper function to compute dataset information."""
+            dataset = loader.dataset
+            size: str = str(len(dataset)) if isinstance(dataset, Sized) else "NA"
+
+            return dataset_info(True, size)
+
+        def loader_info(
+            loader: Union[DataLoader, Iterable[DataLoader]],
+        ) -> Union[dataset_info, Iterable[dataset_info]]:
+            """Helper function to compute dataset information."""
+            return apply_to_collection(loader, DataLoader, retrieve_dataset_info)
+
+        def extract_loader_info(methods: list[tuple[str, str]]) -> dict:
+            """Helper function to extract information for each dataloader method."""
+            info: dict[str, Union[dataset_info, Iterable[dataset_info]]] = {}
+            for loader_name, func_name in methods:
+                loader_method = getattr(self, func_name, None)
+
+                try:
+                    loader = loader_method()  # type: ignore
+                    info[loader_name] = loader_info(loader)
+                except Exception:
+                    info[loader_name] = dataset_info(False, "")
+
+            return info
+
+        def format_loader_info(info: dict[str, Union[dataset_info, Iterable[dataset_info]]]) -> str:
+            """Helper function to format loader information."""
+            output = []
+            for loader_name, loader_info in info.items():
+                # Single dataset
+                if isinstance(loader_info, dataset_info):
+                    loader_info_formatted = "None" if not loader_info.available else f"size={loader_info.length}"
+                # Iterable of datasets
+                else:
+                    loader_info_formatted = " ; ".join(
+                        "None" if not loader_info_i.available else f"{i}. size={loader_info_i.length}"
+                        for i, loader_info_i in enumerate(loader_info, start=1)
+                    )
+
+                output.append(f"{{{loader_name}: {loader_info_formatted}}}")
+
+            return os.linesep.join(output)
+
+        # Available dataloader methods
+        datamodule_loader_methods: list[tuple[str, str]] = [
+            ("Train dataloader", "train_dataloader"),
+            ("Validation dataloader", "val_dataloader"),
+            ("Test dataloader", "test_dataloader"),
+            ("Predict dataloader", "predict_dataloader"),
+        ]
+
+        # Retrieve information for each dataloader method
+        dataloader_info = extract_loader_info(datamodule_loader_methods)
+        # Format the information
+        dataloader_str = format_loader_info(dataloader_info)
+        return dataloader_str
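
With this addition, printing a `LightningDataModule` summarizes the datasets returned by each dataloader hook. A minimal sketch of what the new `__str__` reports, assuming this commit is applied (`ToyDataModule` and the sizes below are hypothetical, chosen only for illustration):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from lightning.pytorch import LightningDataModule


    class ToyDataModule(LightningDataModule):
        def setup(self, stage: str) -> None:
            self.train_set = TensorDataset(torch.randn(64, 4))

        def train_dataloader(self) -> DataLoader:
            return DataLoader(self.train_set, batch_size=8)


    dm = ToyDataModule()
    dm.setup("fit")
    print(dm)
    # Roughly expected output, one line per dataloader hook; hooks that are missing
    # or raise are reported as "None", and datasets without __len__ show "size=NA":
    # {Train dataloader: size=64}
    # {Validation dataloader: None}
    # {Test dataloader: None}
    # {Predict dataloader: None}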

src/lightning/pytorch/demos/boring_classes.py

Lines changed: 82 additions & 1 deletion
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from collections.abc import Iterator
+from collections.abc import Iterable, Iterator
 from typing import Any, Optional

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from lightning_utilities import apply_to_collection
 from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
@@ -188,6 +189,86 @@ def predict_dataloader(self) -> DataLoader:
         return DataLoader(self.random_predict)


+class BoringDataModuleNoLen(LightningDataModule):
+    """
+    .. warning:: This is meant for testing/debugging and is experimental.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def setup(self, stage: str) -> None:
+        if stage == "fit":
+            self.random_train = RandomIterableDataset(32, 512)
+
+        if stage in ("fit", "validate"):
+            self.random_val = RandomIterableDataset(32, 128)
+
+        if stage == "test":
+            self.random_test = RandomIterableDataset(32, 256)
+
+        if stage == "predict":
+            self.random_predict = RandomIterableDataset(32, 64)
+
+    def train_dataloader(self) -> DataLoader:
+        return DataLoader(self.random_train)
+
+    def val_dataloader(self) -> DataLoader:
+        return DataLoader(self.random_val)
+
+    def test_dataloader(self) -> DataLoader:
+        return DataLoader(self.random_test)
+
+    def predict_dataloader(self) -> DataLoader:
+        return DataLoader(self.random_predict)
+
+
+class IterableBoringDataModule(LightningDataModule):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def setup(self, stage: str) -> None:
+        if stage == "fit":
+            self.train_datasets = [
+                RandomDataset(4, 16),
+                RandomIterableDataset(4, 16),
+            ]
+
+        if stage in ("fit", "validate"):
+            self.val_datasets = [
+                RandomDataset(4, 32),
+                RandomIterableDataset(4, 32),
+            ]
+
+        if stage == "test":
+            self.test_datasets = [
+                RandomDataset(4, 64),
+                RandomIterableDataset(4, 64),
+            ]
+
+        if stage == "predict":
+            self.predict_datasets = [
+                RandomDataset(4, 128),
+                RandomIterableDataset(4, 128),
+            ]
+
+    def train_dataloader(self) -> Iterable[DataLoader]:
+        combined_train = apply_to_collection(self.train_datasets, Dataset, lambda x: DataLoader(x))
+        return combined_train
+
+    def val_dataloader(self) -> DataLoader:
+        combined_val = apply_to_collection(self.val_datasets, Dataset, lambda x: DataLoader(x))
+        return combined_val
+
+    def test_dataloader(self) -> DataLoader:
+        combined_test = apply_to_collection(self.test_datasets, Dataset, lambda x: DataLoader(x))
+        return combined_test
+
+    def predict_dataloader(self) -> DataLoader:
+        combined_predict = apply_to_collection(self.predict_datasets, Dataset, lambda x: DataLoader(x))
+        return combined_predict
+
+
 class ManualOptimBoringModel(BoringModel):
     """
     .. warning:: This is meant for testing/debugging and is experimental.
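
The list-valued dataloaders above rely on `apply_to_collection` from `lightning_utilities` to map each dataset in a collection to its own `DataLoader`. A small standalone sketch of that pattern (the datasets here are hypothetical stand-ins):

    import torch
    from lightning_utilities import apply_to_collection
    from torch.utils.data import DataLoader, Dataset, TensorDataset

    datasets = [TensorDataset(torch.randn(16, 4)), TensorDataset(torch.randn(32, 4))]
    # Applies the function to every `Dataset` found in the collection while
    # preserving its structure: a list of datasets becomes a list of loaders.
    loaders = apply_to_collection(datasets, Dataset, lambda ds: DataLoader(ds, batch_size=4))
    assert all(isinstance(dl, DataLoader) for dl in loaders)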

src/lightning/pytorch/loops/prediction_loop.py

Lines changed: 3 additions & 2 deletions
@@ -233,8 +233,9 @@ def _predict_step(

         self.batch_progress.increment_ready()

-        if not using_dataloader_iter:
-            any_on_epoch = self._store_data_for_prediction_writer(batch_idx, dataloader_idx)
+        any_on_epoch = (
+            self._store_data_for_prediction_writer(batch_idx, dataloader_idx) if not using_dataloader_iter else False
+        )

         # the `_step` methods don't take a batch_idx when `dataloader_iter` is used, but all other hooks still do,
         # so we need different kwargs

src/lightning/pytorch/plugins/precision/fsdp.py

Lines changed: 7 additions & 0 deletions
@@ -17,6 +17,7 @@
 import torch
 from lightning_utilities import apply_to_collection
 from torch import Tensor
+from torch.nn import Module
 from typing_extensions import get_args, override

 import lightning.pytorch as pl
@@ -73,6 +74,12 @@ def __init__(self, precision: _PRECISION_INPUT, scaler: Optional["ShardedGradSca
         }
         self._desired_input_dtype = precision_to_type[self.precision]

+    @override
+    def convert_module(self, module: Module) -> Module:
+        if "true" in self.precision:
+            return module.to(dtype=self._desired_input_dtype)
+        return module
+
     @override
     def clip_grad_by_norm(self, *_: Any, **__: Any) -> None:
         # see https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_

tests/tests_fabric/plugins/precision/test_fsdp.py

Lines changed: 18 additions & 0 deletions
@@ -127,3 +127,21 @@ def test_invalid_precision_with_fsdp_precision():

     with pytest.raises(ValueError, match="is not supported in FSDP. `precision` must be one of"):
         FSDPPrecision(precision="64-true")
+
+
+@pytest.mark.parametrize(
+    ("precision", "expected_dtype"),
+    [
+        ("32-true", torch.float32),
+        ("bf16-mixed", torch.float32),
+        ("16-mixed", torch.float32),
+        ("bf16-true", torch.bfloat16),
+        ("16-true", torch.float16),
+    ],
+)
+def test_convert_module(precision, expected_dtype):
+    precision = FSDPPrecision(precision=precision)
+    module = torch.nn.Linear(2, 2)
+    assert module.weight.dtype == module.bias.dtype == torch.float32
+    module = precision.convert_module(module)
+    assert module.weight.dtype == module.bias.dtype == expected_dtype
