Commit c08b2c1

Merge branch 'master' into improve-docs-resume-checkpoints
2 parents 30a8f0c + 9983f3a

File tree: 16 files changed, +164 −30 lines

.actions/assistant.py

Lines changed: 15 additions & 0 deletions

@@ -483,6 +483,21 @@ def convert_version2nightly(ver_file: str = "src/version.info") -> None:


 if __name__ == "__main__":
+    import sys
+
     import jsonargparse
+    from jsonargparse import ArgumentParser
+
+    def patch_jsonargparse_python_3_12_8():
+        if sys.version_info < (3, 12, 8):
+            return
+
+        def _parse_known_args_patch(self: ArgumentParser, args: Any = None, namespace: Any = None) -> tuple[Any, Any]:
+            namespace, args = super(ArgumentParser, self)._parse_known_args(args, namespace, intermixed=False)  # type: ignore
+            return namespace, args
+
+        setattr(ArgumentParser, "_parse_known_args", _parse_known_args_patch)
+
+    patch_jsonargparse_python_3_12_8()  # Required until fix https://github.com/omni-us/jsonargparse/issues/641

     jsonargparse.CLI(AssistantCLI, as_positional=False)
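
The monkey patch above pins intermixed=False when jsonargparse delegates to argparse, working around a behaviour change in CPython 3.12.8. As a rough standalone illustration (assuming only jsonargparse is installed; the --name option is made up), the same workaround could be applied in any small CLI script:

# Standalone sketch of the workaround added above: force jsonargparse's
# ArgumentParser to call argparse's _parse_known_args with intermixed=False
# on CPython >= 3.12.8. The CLI option below is purely illustrative.
import sys
from typing import Any

from jsonargparse import ArgumentParser


def patch_jsonargparse_python_3_12_8() -> None:
    if sys.version_info < (3, 12, 8):
        return  # earlier interpreters are unaffected

    def _parse_known_args_patch(self: ArgumentParser, args: Any = None, namespace: Any = None) -> tuple[Any, Any]:
        # Delegate to argparse, pinning intermixed=False to restore the pre-3.12.8 behaviour.
        namespace, args = super(ArgumentParser, self)._parse_known_args(args, namespace, intermixed=False)  # type: ignore
        return namespace, args

    setattr(ArgumentParser, "_parse_known_args", _parse_known_args_patch)


if __name__ == "__main__":
    patch_jsonargparse_python_3_12_8()  # apply before any parsing happens
    parser = ArgumentParser()
    parser.add_argument("--name", type=str, default="world")
    print(parser.parse_args())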

.github/workflows/ci-tests-fabric.yml

Lines changed: 9 additions & 9 deletions

@@ -49,16 +49,16 @@ jobs:
   - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
   - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
   - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
-  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
+  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
   # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues
-  - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" }
+  - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.5.1" }
   # "oldest" versions tests, only on minimum Python
   - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
   - {

.github/workflows/ci-tests-pytorch.yml

Lines changed: 9 additions & 9 deletions

@@ -53,16 +53,16 @@ jobs:
   - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
   - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
   - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
-  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.4.1" }
-  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" }
+  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" }
+  - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" }
   # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues
-  - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" }
-  - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" }
+  - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
+  - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.5.1" }
   # "oldest" versions tests, only on minimum Python
   - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" }
   - {

dockers/base-cuda/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ RUN \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get install -y \
         python${PYTHON_VERSION} \
-        python${PYTHON_VERSION}-distutils \
+        python3-setuptools \
         python${PYTHON_VERSION}-dev \
     && \
     update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \

dockers/docs/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ RUN \
     dvipng \
     texlive-pictures \
     python3 \
-    python3-distutils \
+    python3-setuptools \
     python3-dev \
     && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \

docs/source-pytorch/common/index.rst

Lines changed: 7 additions & 0 deletions

@@ -202,6 +202,13 @@ How-to Guides
    :col_css: col-md-4
    :height: 180

+.. displayitem::
+   :header: Truncated Back-Propagation Through Time
+   :description: Efficiently step through time when training recurrent models
+   :button_link: ../common/tbtt.html
+   :col_css: col-md-4
+   :height: 180
+
 .. raw:: html

    </div>
docs/source-pytorch/common/tbtt.rst (new file)

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+##############################################
+Truncated Backpropagation Through Time (TBPTT)
+##############################################
+
+Truncated Backpropagation Through Time (TBPTT) performs backpropagation every k steps of
+a much longer sequence. This is made possible by passing training batches
+split along the time dimension into splits of size k to the
+``training_step``. In order to keep the same forward propagation behavior, all
+hidden states should be kept in-between each time-dimension split.
+
+
+.. code-block:: python
+
+    import torch
+    import torch.optim as optim
+    import pytorch_lightning as pl
+    from pytorch_lightning import LightningModule
+
+    class LitModel(LightningModule):
+
+        def __init__(self):
+            super().__init__()
+
+            # 1. Switch to manual optimization
+            self.automatic_optimization = False
+
+            self.truncated_bptt_steps = 10
+            self.my_rnn = ParityModuleRNN()  # Define RNN model using ParityModuleRNN
+
+        # 2. Remove the `hiddens` argument
+        def training_step(self, batch, batch_idx):
+
+            # 3. Split the batch in chunks along the time dimension
+            split_batches = split_batch(batch, self.truncated_bptt_steps)
+
+            batch_size = 10
+            hidden_dim = 20
+            hiddens = torch.zeros(1, batch_size, hidden_dim, device=self.device)
+            for split_batch in range(split_batches):
+                # 4. Perform the optimization in a loop
+                loss, hiddens = self.my_rnn(split_batch, hiddens)
+                self.backward(loss)
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+
+                # 5. "Truncate"
+                hiddens = hiddens.detach()
+
+            # 6. Remove the return of `hiddens`
+            # Returning loss in manual optimization is not needed
+            return None
+
+        def configure_optimizers(self):
+            return optim.Adam(self.my_rnn.parameters(), lr=0.001)
+
+    if __name__ == "__main__":
+        model = LitModel()
+        trainer = pl.Trainer(max_epochs=5)
+        trainer.fit(model, train_dataloader)  # Define your own dataloader
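
The snippet in the new doc page is schematic: ParityModuleRNN, split_batch, and train_dataloader are placeholders, and with manual optimization one would normally step through self.optimizers() and self.manual_backward(). A self-contained sketch of the same manual-optimization TBPTT pattern, assuming a plain GRU and synthetic data, could look like this:

# Hedged sketch only: TBPTT with manual optimization, substituting a GRU and
# random tensors for the placeholders (ParityModuleRNN, split_batch,
# train_dataloader) referenced in the documentation snippet above.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TBPTTModule(pl.LightningModule):
    def __init__(self, input_dim=8, hidden_dim=20, tbptt_steps=10):
        super().__init__()
        self.automatic_optimization = False  # manual optimization drives the inner loop
        self.tbptt_steps = tbptt_steps
        self.rnn = nn.GRU(input_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch  # x: (batch, time, features), y: (batch, time, 1)
        opt = self.optimizers()
        hiddens = None
        # Split the full sequence into chunks of `tbptt_steps` along the time dimension.
        for x_t, y_t in zip(x.split(self.tbptt_steps, dim=1), y.split(self.tbptt_steps, dim=1)):
            out, hiddens = self.rnn(x_t, hiddens)
            loss = nn.functional.mse_loss(self.head(out), y_t)
            opt.zero_grad()
            self.manual_backward(loss)  # backward per chunk
            opt.step()
            hiddens = hiddens.detach()  # "truncate": no gradient flows across chunk boundaries

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


if __name__ == "__main__":
    x = torch.randn(64, 100, 8)  # 64 sequences, 100 time steps, 8 features
    y = torch.randn(64, 100, 1)
    loader = DataLoader(TensorDataset(x, y), batch_size=16)
    trainer = pl.Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
    trainer.fit(TBPTTModule(), loader)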

docs/source-pytorch/conf.py

Lines changed: 3 additions & 1 deletion

@@ -462,7 +462,9 @@ def _load_py_module(name: str, location: str) -> ModuleType:
     ("py:obj", "lightning.pytorch.utilities.memory.is_out_of_cpu_memory"),
     ("py:func", "lightning.pytorch.utilities.rank_zero.rank_zero_only"),
     ("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfig"),
-    ("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfig"),
+    ("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfigType"),
+    ("py:class", "lightning.pytorch.utilities.types.OptimizerConfigType"),
+    ("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfigType"),
     ("py:class", "lightning_habana.pytorch.plugins.precision.HPUPrecisionPlugin"),
     ("py:class", "lightning_habana.pytorch.strategies.HPUDDPStrategy"),
     ("py:class", "lightning_habana.pytorch.strategies.HPUParallelStrategy"),

examples/fabric/reinforcement_learning/rl/utils.py

Lines changed: 17 additions & 1 deletion

@@ -1,7 +1,6 @@
 import argparse
 import math
 import os
-from distutils.util import strtobool
 from typing import TYPE_CHECKING, Optional, Union

 import gymnasium as gym
@@ -12,6 +11,23 @@
 from rl.agent import PPOAgent, PPOLightningAgent


+def strtobool(val):
+    """Convert a string representation of truth to true (1) or false (0).
+
+    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values are 'n', 'no', 'f', 'false', 'off', and '0'.
+    Raises ValueError if 'val' is anything else.
+
+    Note: taken from distutils after its deprecation.
+
+    """
+    val = val.lower()
+    if val in ("y", "yes", "t", "true", "on", "1"):
+        return 1
+    if val in ("n", "no", "f", "false", "off", "0"):
+        return 0
+    raise ValueError(f"invalid truth value {val!r}")
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--exp-name", type=str, default="default", help="the name of this experiment")
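
A helper like this is typically wired into argparse as a type converter for boolean flags. A hypothetical, self-contained usage sketch (the --capture-video flag name is illustrative, not necessarily the one the example defines):

# Hypothetical usage sketch: parsing a boolean CLI flag with a strtobool helper
# like the one added above (redefined here so the sketch runs on its own).
import argparse


def strtobool(val):
    """Return 1 for 'y', 'yes', 't', 'true', 'on', '1'; 0 for their negative counterparts."""
    val = val.lower()
    if val in ("y", "yes", "t", "true", "on", "1"):
        return 1
    if val in ("n", "no", "f", "false", "off", "0"):
        return 0
    raise ValueError(f"invalid truth value {val!r}")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--capture-video",                    # illustrative flag name
    type=lambda x: bool(strtobool(x)),    # accepts true/false, yes/no, 1/0, on/off
    default=False,
    nargs="?",
    const=True,
    help="whether to record videos of the agent",
)
print(parser.parse_args(["--capture-video", "true"]))  # -> Namespace(capture_video=True)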

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 7 additions & 1 deletion

@@ -18,6 +18,7 @@
 import platform
 from collections.abc import Mapping
 from contextlib import AbstractContextManager, ExitStack
+from datetime import timedelta
 from itertools import chain
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
@@ -29,6 +30,7 @@
 from typing_extensions import override

 from lightning.fabric.accelerators import Accelerator, CUDAAccelerator
+from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout
 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning.fabric.plugins.precision import Precision
 from lightning.fabric.strategies.ddp import DDPStrategy
@@ -97,6 +99,7 @@ def __init__(
         load_full_weights: bool = False,
         precision: Optional[Precision] = None,
         process_group_backend: Optional[str] = None,
+        timeout: Optional[timedelta] = default_pg_timeout,
     ) -> None:
         """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
         billion parameter models. `For more information: https://pytorch-
@@ -241,6 +244,7 @@ def __init__(
             process_group_backend=process_group_backend,
         )
         self._backward_sync_control = None  # DeepSpeed handles gradient accumulation internally
+        self._timeout: Optional[timedelta] = timeout

         self.config = self._load_config(config)
         if self.config is None:
@@ -648,7 +652,9 @@ def _init_deepspeed_distributed(self) -> None:
             f"MEMBER: {self.global_rank + 1}/{self.world_size}"
         )
         self._process_group_backend = self._get_process_group_backend()
-        deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port)
+        deepspeed.init_distributed(
+            self._process_group_backend, distributed_port=self.cluster_environment.main_port, timeout=self._timeout
+        )

     def _set_node_environment_variables(self) -> None:
         assert self.cluster_environment is not None
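
The new timeout argument is forwarded to deepspeed.init_distributed when the process group is set up. A hedged sketch of how it might be set when constructing the strategy through Fabric (the 90-minute value and the device count are arbitrary choices, not recommendations):

# Illustrative sketch: passing the new `timeout` argument to DeepSpeedStrategy
# via Fabric. If omitted, it falls back to `default_pg_timeout` from the torch
# collective plugin, as in the diff above.
from datetime import timedelta

from lightning.fabric import Fabric
from lightning.fabric.strategies import DeepSpeedStrategy

strategy = DeepSpeedStrategy(
    stage=2,                        # ZeRO stage 2: partition optimizer states and gradients
    timeout=timedelta(minutes=90),  # forwarded to deepspeed.init_distributed(...)
)
fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy)
# fabric.launch() would then initialize the distributed process group with this timeout.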
