
Commit eb24220

Merge branch 'master' into feature/19743-tensorboard-histograms
2 parents 230b71f + ff64a92

File tree

13 files changed (+70, -49 lines)


.actions/assistant.py

Lines changed: 0 additions & 27 deletions
@@ -341,33 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> N


 class AssistantCLI:
-    @staticmethod
-    def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
-        """Remove some packages from given requirement files."""
-        if isinstance(req_files, str):
-            req_files = [req_files]
-        for req in req_files:
-            AssistantCLI._prune_packages(req, packages)
-
-    @staticmethod
-    def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
-        """Remove some packages from given requirement files."""
-        path = Path(req_file)
-        assert path.exists()
-        text = path.read_text()
-        lines = text.splitlines()
-        final = []
-        for line in lines:
-            ln_ = line.strip()
-            if not ln_ or ln_.startswith("#"):
-                final.append(line)
-                continue
-            req = list(_parse_requirements([ln_]))[0]
-            if req.name not in packages:
-                final.append(line)
-        print(final)
-        path.write_text("\n".join(final) + "\n")
-
     @staticmethod
     def copy_replace_imports(
         source_dir: str,
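The pruning helper deleted here is replaced by the lightning-utilities CLI (`python -m lightning_utilities.cli requirements prune-pkgs`, used in the pipeline changes below). For reference, a minimal standalone sketch of the same pruning idea, assuming the `packaging` library in place of the repo's deleted `_parse_requirements` helper:

import re  # not needed here; see the Dockerfile note below for the regex check
from pathlib import Path

from packaging.requirements import Requirement


def prune_packages(req_file: str, packages: set[str]) -> None:
    """Drop the given packages from a requirements file, keeping comments and blanks."""
    path = Path(req_file)
    kept = []
    for line in path.read_text().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            kept.append(line)  # preserve comments and blank lines verbatim
            continue
        # drop any inline comment (e.g. "... # strict") before parsing the specifier
        name = Requirement(stripped.split("#", 1)[0].strip()).name
        if name not in packages:
            kept.append(line)
    path.write_text("\n".join(kept) + "\n")


# e.g. prune_packages("requirements/fabric/strategies.txt", {"deepspeed"})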

.azure/gpu-tests-fabric.yml

Lines changed: 8 additions & 2 deletions
@@ -99,10 +99,16 @@ jobs:
     displayName: "Image info & NVIDIA"

   - bash: |
-      cd requirements/fabric
+      set -ex
+      pip install "cython<3.0" wheel # for compatibility
       pip install -U "lightning-utilities[cli]"
+      cd requirements/fabric
+      # replace range by pin minimal requirements
       python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
-      pip install "cython<3.0" wheel # for compatibility
+      # drop deepspeed since it is not supported by our minimal Torch requirements
+      python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
+      # uninstall deepspeed since some older docker images have it pre-installed
+      pip uninstall -y deepspeed
     condition: contains(variables['Agent.JobName'], 'oldest')
     displayName: "setting oldest dependencies"

.azure/gpu-tests-pytorch.yml

Lines changed: 8 additions & 2 deletions
@@ -103,10 +103,16 @@ jobs:
     displayName: "Image info & NVIDIA"

   - bash: |
-      cd requirements/pytorch
+      set -ex
+      pip install "cython<3.0" wheel # for compatibility
       pip install -U "lightning-utilities[cli]"
+      cd requirements/pytorch
+      # replace range by pin minimal requirements
       python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
-      pip install "cython<3.0" wheel # for compatibility
+      # drop deepspeed since it is not supported by our minimal Torch requirements
+      python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
+      # uninstall deepspeed since some older docker images have it pre-installed
+      pip uninstall -y deepspeed
     condition: contains(variables['Agent.JobName'], 'oldest')
     displayName: "setting oldest dependencies"
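For context, the two CLI steps in these "oldest" jobs rewrite the requirement files in place, per the inline comments ("replace range by pin minimal requirements", then prune deepspeed). A sketch of the intended effect on a single requirement line, with illustrative versions not taken from the diff:

# before `requirements set-oldest`: a ranged specifier
torch >=2.1.0, <2.8.0
# after `requirements set-oldest`: pinned to the minimum supported version
torch ==2.1.0
# after `requirements prune-pkgs --packages deepspeed`: the deepspeed line is removed entirely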

dockers/release/Dockerfile

Lines changed: 9 additions & 1 deletion
@@ -21,6 +21,7 @@ FROM pytorchlightning/pytorch_lightning:base-cuda${CUDA_VERSION}-py${PYTHON_VERS
 LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"

 ARG LIGHTNING_VERSION=""
+ARG PYTORCH_VERSION

 COPY ./ /home/pytorch-lightning/

@@ -39,7 +40,14 @@ RUN \
     fi && \
     # otherwise there is collision with folder name and pkg name on Pypi
     cd pytorch-lightning && \
-    pip install setuptools==75.6.0 && \
+    # pip install setuptools==75.6.0 && \
+    pip install -U "lightning-utilities[cli]" && \
+    # drop deepspeed since it is not supported by our minimal Torch requirements \
+    echo "PYTORCH_VERSION is: '$PYTORCH_VERSION'" && \
+    if [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]; then \
+        python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/fabric/strategies.txt ; \
+        python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/pytorch/strategies.txt ; \
+    fi && \
     PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     cd .. && \
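The version guard fires only for exact major.minor strings, since the alternation is anchored on both ends. A quick standalone check of the same pattern in Python (illustrative, not part of the image build):

import re

# the same extended-regex alternation the Dockerfile passes to bash's =~ operator
pattern = re.compile(r"^(2\.1|2\.2|2\.3|2\.4)$")

assert pattern.match("2.3")        # exact major.minor matches -> deepspeed gets pruned
assert not pattern.match("2.3.1")  # a patch suffix does not match the anchored pattern
assert not pattern.match("2.5")    # newer versions keep deepspeed installed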

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ markers = [
     "cloud: Run the cloud tests for example",
 ]
 filterwarnings = [
+    # "error::DeprecationWarning",
     "error::FutureWarning",
     "ignore::FutureWarning:onnxscript", # Temporary ignore until onnxscript is updated
 ]

requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@

 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@

 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
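Both files move deepspeed from the strict `0.9.3` pin to the `>=0.14.1,<=0.15.0` window. A quick sanity check of the new specifier with `packaging` (a standalone sketch, not part of the diff):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=0.14.1,<=0.15.0")
assert "0.14.1" in spec and "0.15.0" in spec  # both bounds are inclusive
assert "0.9.3" not in spec                    # the previously pinned version falls outside the range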

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 1 addition & 5 deletions
@@ -47,7 +47,6 @@
 from torch.optim.lr_scheduler import _LRScheduler

 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
-_DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1")


 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -503,10 +502,7 @@ def load_checkpoint(
         )
         engine = engines[0]

-        if _DEEPSPEED_GREATER_EQUAL_0_14_1:
-            from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
-        else:
-            from deepspeed.runtime import DeepSpeedOptimizer
+        from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer

         optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values())

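With the requirement floor raised to `deepspeed >=0.14.1` above, the version-gated import becomes dead code, so its `RequirementCache` probe is dropped as well. For context, `RequirementCache` checks the installed distribution against a requirement string and caches the boolean; a minimal usage sketch (illustrative):

from lightning_utilities.core.imports import RequirementCache

_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")

if _DEEPSPEED_AVAILABLE:
    # truthiness confirms the distribution is installed and satisfies the spec
    from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer  # noqa: F401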

src/lightning/pytorch/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -34,8 +34,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `AsyncCheckpointIO` snapshots tensors to avoid race with parameter mutation ([#21079](https://github.com/Lightning-AI/pytorch-lightning/pull/21079))


+- Fixed `AsyncCheckpointIO` threadpool exception if calling fit or validate more than once ([#20952](https://github.com/Lightning-AI/pytorch-lightning/pull/20952))
+
+
 - Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068))

+
 ---

 ## [2.5.3] - 2025-08-13

src/lightning/pytorch/plugins/io/async_plugin.py

Lines changed: 31 additions & 9 deletions
@@ -13,15 +13,17 @@
 # limitations under the License.

 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional

 import torch
 from lightning_utilities.core.apply_func import apply_to_collection
 from typing_extensions import override

-from lightning.fabric.plugins import CheckpointIO
 from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO

+if TYPE_CHECKING:
+    from lightning.fabric.plugins import CheckpointIO
+

 class AsyncCheckpointIO(_WrappingCheckpointIO):
     """``AsyncCheckpointIO`` enables saving the checkpoints asynchronously in a thread.
@@ -33,20 +35,30 @@ class AsyncCheckpointIO(_WrappingCheckpointIO):

     """

+    _executor: Optional[ThreadPoolExecutor]
+    _error: Optional[BaseException]
+
     def __init__(self, checkpoint_io: Optional["CheckpointIO"] = None) -> None:
         super().__init__(checkpoint_io)
+        self._executor = None
+        self._error = None
+
+    # CheckpointIO doesn't have a setup method so we have to do something like.
+    def _ensure_setup(self) -> None:
+        """Ensures that the executor is setup.

-        self._executor = ThreadPoolExecutor(max_workers=1)
-        self._error: Optional[BaseException] = None
+        We can't do setup in __init__ because if train or validate is called more than once, the teardown method deletes
+        the executor.
+
+        """
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=1)

     @override
     def save_checkpoint(self, *args: Any, **kwargs: Any) -> None:
         """Uses the ``ThreadPoolExecutor`` to save the checkpoints using the base ``checkpoint_io``."""

-        # snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
-        def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
-            # detach to avoid autograd history and clone to take a point-in-time copy
-            return t.detach().clone()
+        self._ensure_setup()

         # rebuild args/kwargs with a cloned checkpoint (supports positional or kw form)
         if "checkpoint" in kwargs:
@@ -61,6 +73,7 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None:
         except BaseException as ex:
             self._error = ex

+        assert self._executor is not None
         self._executor.submit(_save_checkpoint, *args, **kwargs)

         # if an error was raised between the previous time `save_checkpoint`` was called and now,
@@ -71,8 +84,17 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None:
     @override
     def teardown(self) -> None:
         """This method is called to close the threads."""
-        self._executor.shutdown(wait=True)
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
+            self._executor = None

         # if an error was raised anytime in any of the `executor.submit` calls
         if self._error:
             raise self._error
+
+
+# snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
+def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
+    """Clones a tensor on the caller thread."""
+    # detach to avoid autograd history and clone to take a point-in-time copy
+    return t.detach().clone()
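Taken together, the diff moves executor creation out of `__init__` into a lazy `_ensure_setup()`, lets `teardown()` shut the executor down and reset it to `None`, and hoists the caller-thread `_clone_tensor` snapshot to module level. A minimal end-to-end sketch of the fixed behavior; the tiny module and data below are illustrative, not from the diff:

import torch
from torch.utils.data import DataLoader, TensorDataset

from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.plugins.io import AsyncCheckpointIO


class TinyModule(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", torch.nn.functional.mse_loss(self.layer(x), y))

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


data = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=8)
trainer = Trainer(plugins=[AsyncCheckpointIO()], max_epochs=1, logger=False)

trainer.fit(TinyModule(), data, data)  # teardown() shuts the executor down and resets it to None
trainer.validate(TinyModule(), data)   # previously raised from the shut-down executor; now recreated lazily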
