
Commit eb24220

Merge branch 'master' into feature/19743-tensorboard-histograms
2 parents 230b71f + ff64a92

File tree

13 files changed (+70, -49 lines)


.actions/assistant.py

Lines changed: 0 additions & 27 deletions
@@ -341,33 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> N


 class AssistantCLI:
-    @staticmethod
-    def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
-        """Remove some packages from given requirement files."""
-        if isinstance(req_files, str):
-            req_files = [req_files]
-        for req in req_files:
-            AssistantCLI._prune_packages(req, packages)
-
-    @staticmethod
-    def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
-        """Remove some packages from given requirement files."""
-        path = Path(req_file)
-        assert path.exists()
-        text = path.read_text()
-        lines = text.splitlines()
-        final = []
-        for line in lines:
-            ln_ = line.strip()
-            if not ln_ or ln_.startswith("#"):
-                final.append(line)
-                continue
-            req = list(_parse_requirements([ln_]))[0]
-            if req.name not in packages:
-                final.append(line)
-        print(final)
-        path.write_text("\n".join(final) + "\n")
-
     @staticmethod
     def copy_replace_imports(
         source_dir: str,
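The pruning helper deleted here is replaced by the lightning-utilities CLI (`python -m lightning_utilities.cli requirements prune-pkgs`, used in the pipeline changes below). For reference, a minimal standalone sketch of the same pruning idea, assuming the `packaging` library in place of the repo's deleted `_parse_requirements` helper:

import re  # not needed here; see the Dockerfile note below for the regex check
from pathlib import Path

from packaging.requirements import Requirement


def prune_packages(req_file: str, packages: set[str]) -> None:
    """Drop the given packages from a requirements file, keeping comments and blanks."""
    path = Path(req_file)
    kept = []
    for line in path.read_text().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            kept.append(line)  # preserve comments and blank lines verbatim
            continue
        # drop any inline comment (e.g. "... # strict") before parsing the specifier
        name = Requirement(stripped.split("#", 1)[0].strip()).name
        if name not in packages:
            kept.append(line)
    path.write_text("\n".join(kept) + "\n")


# e.g. prune_packages("requirements/fabric/strategies.txt", {"deepspeed"})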

.azure/gpu-tests-fabric.yml

Lines changed: 8 additions & 2 deletions
@@ -99,10 +99,16 @@ jobs:
     displayName: "Image info & NVIDIA"

   - bash: |
-      cd requirements/fabric
+      set -ex
+      pip install "cython<3.0" wheel # for compatibility
       pip install -U "lightning-utilities[cli]"
+      cd requirements/fabric
+      # replace range by pin minimal requirements
       python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
-      pip install "cython<3.0" wheel # for compatibility
+      # drop deepspeed since it is not supported by our minimal Torch requirements
+      python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
+      # uninstall deepspeed since some older docker images have it pre-installed
+      pip uninstall -y deepspeed
     condition: contains(variables['Agent.JobName'], 'oldest')
     displayName: "setting oldest dependencies"

.azure/gpu-tests-pytorch.yml

Lines changed: 8 additions & 2 deletions
@@ -103,10 +103,16 @@ jobs:
     displayName: "Image info & NVIDIA"

   - bash: |
-      cd requirements/pytorch
+      set -ex
+      pip install "cython<3.0" wheel # for compatibility
       pip install -U "lightning-utilities[cli]"
+      cd requirements/pytorch
+      # replace range by pin minimal requirements
       python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
-      pip install "cython<3.0" wheel # for compatibility
+      # drop deepspeed since it is not supported by our minimal Torch requirements
+      python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
+      # uninstall deepspeed since some older docker images have it pre-installed
+      pip uninstall -y deepspeed
     condition: contains(variables['Agent.JobName'], 'oldest')
     displayName: "setting oldest dependencies"
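For context, the two CLI steps in these "oldest" jobs rewrite the requirement files in place, per the inline comments ("replace range by pin minimal requirements", then prune deepspeed). A sketch of the intended effect on a single requirement line, with illustrative versions not taken from the diff:

# before `requirements set-oldest`: a ranged specifier
torch >=2.1.0, <2.8.0
# after `requirements set-oldest`: pinned to the minimum supported version
torch ==2.1.0
# after `requirements prune-pkgs --packages deepspeed`: the deepspeed line is removed entirely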

dockers/release/Dockerfile

Lines changed: 9 additions & 1 deletion
@@ -21,6 +21,7 @@ FROM pytorchlightning/pytorch_lightning:base-cuda${CUDA_VERSION}-py${PYTHON_VERS
 LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"

 ARG LIGHTNING_VERSION=""
+ARG PYTORCH_VERSION

 COPY ./ /home/pytorch-lightning/

@@ -39,7 +40,14 @@ RUN \
     fi && \
     # otherwise there is collision with folder name and pkg name on Pypi
     cd pytorch-lightning && \
-    pip install setuptools==75.6.0 && \
+    # pip install setuptools==75.6.0 && \
+    pip install -U "lightning-utilities[cli]" && \
+    # drop deepspeed since it is not supported by our minimal Torch requirements \
+    echo "PYTORCH_VERSION is: '$PYTORCH_VERSION'" && \
+    if [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]; then \
+        python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/fabric/strategies.txt ; \
+        python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/pytorch/strategies.txt ; \
+    fi && \
     PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     cd .. && \
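The version guard fires only for exact major.minor strings, since the alternation is anchored on both ends. A quick standalone check of the same pattern in Python (illustrative, not part of the image build):

import re

# the same extended-regex alternation the Dockerfile passes to bash's =~ operator
pattern = re.compile(r"^(2\.1|2\.2|2\.3|2\.4)$")

assert pattern.match("2.3")        # exact major.minor matches -> deepspeed gets pruned
assert not pattern.match("2.3.1")  # a patch suffix does not match the anchored pattern
assert not pattern.match("2.5")    # newer versions keep deepspeed installed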

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ markers = [
     "cloud: Run the cloud tests for example",
 ]
 filterwarnings = [
+    # "error::DeprecationWarning",
     "error::FutureWarning",
     "ignore::FutureWarning:onnxscript", # Temporary ignore until onnxscript is updated
 ]

requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@

 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@

 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
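Both files move deepspeed from the strict `0.9.3` pin to the `>=0.14.1,<=0.15.0` window. A quick sanity check of the new specifier with `packaging` (a standalone sketch, not part of the diff):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=0.14.1,<=0.15.0")
assert "0.14.1" in spec and "0.15.0" in spec  # both bounds are inclusive
assert "0.9.3" not in spec                    # the previously pinned version falls outside the range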

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 1 addition & 5 deletions
@@ -47,7 +47,6 @@
 from torch.optim.lr_scheduler import _LRScheduler

 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
-_DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1")


 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -503,10 +502,7 @@ def load_checkpoint(
         )
         engine = engines[0]

-        if _DEEPSPEED_GREATER_EQUAL_0_14_1:
-            from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
-        else:
-            from deepspeed.runtime import DeepSpeedOptimizer
+        from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer

         optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values())

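With the requirement floor raised to `deepspeed >=0.14.1` above, the version-gated import becomes dead code, so its `RequirementCache` probe is dropped as well. For context, `RequirementCache` checks the installed distribution against a requirement string and caches the boolean; a minimal usage sketch (illustrative):

from lightning_utilities.core.imports import RequirementCache

_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")

if _DEEPSPEED_AVAILABLE:
    # truthiness confirms the distribution is installed and satisfies the spec
    from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer  # noqa: F401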

src/lightning/pytorch/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -34,8 +34,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `AsyncCheckpointIO` snapshots tensors to avoid race with parameter mutation ([#21079](https://github.com/Lightning-AI/pytorch-lightning/pull/21079))


+- Fixed `AsyncCheckpointIO` threadpool exception if calling fit or validate more than once ([#20952](https://github.com/Lightning-AI/pytorch-lightning/pull/20952))
+
+
 - Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068))

+
 ---

 ## [2.5.3] - 2025-08-13

src/lightning/pytorch/plugins/io/async_plugin.py

Lines changed: 31 additions & 9 deletions
@@ -13,15 +13,17 @@
 # limitations under the License.

 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional

 import torch
 from lightning_utilities.core.apply_func import apply_to_collection
 from typing_extensions import override

-from lightning.fabric.plugins import CheckpointIO
 from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO

+if TYPE_CHECKING:
+    from lightning.fabric.plugins import CheckpointIO
+

 class AsyncCheckpointIO(_WrappingCheckpointIO):
     """``AsyncCheckpointIO`` enables saving the checkpoints asynchronously in a thread.
@@ -33,20 +35,30 @@ class AsyncCheckpointIO(_WrappingCheckpointIO):

     """

+    _executor: Optional[ThreadPoolExecutor]
+    _error: Optional[BaseException]
+
     def __init__(self, checkpoint_io: Optional["CheckpointIO"] = None) -> None:
         super().__init__(checkpoint_io)
+        self._executor = None
+        self._error = None
+
+    # CheckpointIO doesn't have a setup method so we have to do something like.
+    def _ensure_setup(self) -> None:
+        """Ensures that the executor is setup.

-        self._executor = ThreadPoolExecutor(max_workers=1)
-        self._error: Optional[BaseException] = None
+        We can't do setup in __init__ because if train or validate is called more than once, the teardown method deletes
+        the executor.
+
+        """
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=1)

     @override
     def save_checkpoint(self, *args: Any, **kwargs: Any) -> None:
         """Uses the ``ThreadPoolExecutor`` to save the checkpoints using the base ``checkpoint_io``."""

-        # snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
-        def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
-            # detach to avoid autograd history and clone to take a point-in-time copy
-            return t.detach().clone()
+        self._ensure_setup()

         # rebuild args/kwargs with a cloned checkpoint (supports positional or kw form)
         if "checkpoint" in kwargs:
@@ -61,6 +73,7 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None:
         except BaseException as ex:
             self._error = ex

+        assert self._executor is not None
         self._executor.submit(_save_checkpoint, *args, **kwargs)

         # if an error was raised between the previous time `save_checkpoint`` was called and now,
@@ -71,8 +84,17 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None:
     @override
     def teardown(self) -> None:
         """This method is called to close the threads."""
-        self._executor.shutdown(wait=True)
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
+            self._executor = None

         # if an error was raised anytime in any of the `executor.submit` calls
         if self._error:
             raise self._error
+
+
+# snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation
+def _clone_tensor(t: torch.Tensor) -> torch.Tensor:
+    """Clones a tensor on the caller thread."""
+    # detach to avoid autograd history and clone to take a point-in-time copy
+    return t.detach().clone()
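Taken together, the diff moves executor creation out of `__init__` into a lazy `_ensure_setup()`, lets `teardown()` shut the executor down and reset it to `None`, and hoists the caller-thread `_clone_tensor` snapshot to module level. A minimal end-to-end sketch of the fixed behavior; the tiny module and data below are illustrative, not from the diff:

import torch
from torch.utils.data import DataLoader, TensorDataset

from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.plugins.io import AsyncCheckpointIO


class TinyModule(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", torch.nn.functional.mse_loss(self.layer(x), y))

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


data = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=8)
trainer = Trainer(plugins=[AsyncCheckpointIO()], max_epochs=1, logger=False)

trainer.fit(TinyModule(), data, data)  # teardown() shuts the executor down and resets it to None
trainer.validate(TinyModule(), data)   # previously raised from the shut-down executor; now recreated lazily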
