Skip to content

Commit 14b8890

Browse files
authored
Merge branch 'master' into torch_28_add_missing_device_id
2 parents 9cb2a8c + d85c474 commit 14b8890

File tree

10 files changed

+114
-61
lines changed

10 files changed

+114
-61
lines changed

.lightning/workflows/fabric.yml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@ trigger:
44
pull_request:
55
branches: ["master"]
66

7-
timeout: "75" # minutes
8-
machine: "L4_X_2"
7+
timeout: "55" # minutes
98
parametrize:
109
matrix: {}
1110
include:
12-
# note that this is setting also all oldest requirements which is linked to Torch == 2.0
11+
# note that this is setting also all oldest requirements which is linked to Torch == 2.1
1312
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
1413
PACKAGE_NAME: "fabric"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
14+
machine: "A100_X_2"
15+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
1616
PACKAGE_NAME: "fabric"
17+
machine: "L4_X_2"
1718
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
1819
# PACKAGE_NAME: "fabric"
19-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
2021
PACKAGE_NAME: "lightning"
22+
machine: "L4_X_2"
2123
exclude: []
2224

2325
env:
@@ -30,6 +32,7 @@ run: |
3032
python --version
3133
pip --version
3234
pip install -q fire wget packaging
35+
pip list
3336
set -ex
3437
3538
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
4043
echo "Torch URL: ${TORCH_URL}"
4144
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
4245
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46+
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4347
4448
if [ "${TORCH_VER}" == "2.1" ]; then
4549
echo "Set oldest versions"
46-
cd requirements/fabric
50+
pip uninstall -y deepspeed
4751
pip install -U "lightning-utilities[cli]"
52+
cd requirements/fabric
4853
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
54+
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
4955
cd ../..
5056
pip install "cython<3.0" wheel # for compatibility
5157
fi
@@ -92,6 +98,7 @@ run: |
9298
export PL_RUN_STANDALONE_TESTS=1
9399
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
94100
bash ./run_standalone_tests.sh "tests_fabric"
101+
export PL_RUN_STANDALONE_TESTS=0
95102
96103
# echo "Reporting coverage" # todo
97104
# python -m coverage report

.lightning/workflows/pytorch.yml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@ trigger:
44
pull_request:
55
branches: ["master"]
66

7-
timeout: "75" # minutes
8-
machine: "L4_X_2"
7+
timeout: "55" # minutes
98
parametrize:
109
matrix: {}
1110
include:
12-
# note that this is setting also all oldest requirements which is linked to Torch == 2.0
11+
# note that this is setting also all oldest requirements which is linked to Torch == 2.1
1312
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
1413
PACKAGE_NAME: "pytorch"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
14+
machine: "A100_X_2"
15+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
1616
PACKAGE_NAME: "pytorch"
17+
machine: "L4_X_2"
1718
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
1819
# PACKAGE_NAME: "pytorch"
19-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
2021
PACKAGE_NAME: "lightning"
22+
machine: "L4_X_2"
2123
exclude: []
2224

2325
env:
@@ -30,6 +32,7 @@ run: |
3032
python --version
3133
pip --version
3234
pip install -q fire wget packaging
35+
pip list
3336
set -ex
3437
3538
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
4043
echo "Torch URL: ${TORCH_URL}"
4144
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
4245
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46+
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4347
4448
if [ "${TORCH_VER}" == "2.1" ]; then
45-
recho "Set oldest versions"
46-
cd requirements/pytorch
49+
echo "Set oldest versions"
50+
pip uninstall -y deepspeed
4751
pip install -U "lightning-utilities[cli]"
52+
cd requirements/pytorch
4853
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
54+
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
4955
cd ../..
5056
pip install "cython<3.0" wheel # for compatibility
5157
fi
@@ -108,6 +114,7 @@ run: |
108114
export PL_RUN_STANDALONE_TESTS=1
109115
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
110116
bash ./run_standalone_tests.sh "tests_pytorch"
117+
export PL_RUN_STANDALONE_TESTS=0
111118
112119
echo "Testing: PyTorch standalone tasks"
113120
cd tests_pytorch/

dockers/base-cuda/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
ARG UBUNTU_VERSION=22.04
16-
ARG CUDA_VERSION=11.7.1
16+
ARG CUDA_VERSION=12.1.1
1717

1818

1919
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

docs/source-pytorch/versioning.rst

Lines changed: 44 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,8 @@ API Evolution
5353

5454
Lightning's development is driven by research and best practices in a rapidly developing field of AI and machine learning. Change is inevitable and when it happens, the Lightning team is committed to minimizing user friction and maximizing ease of transition from one version to the next. We take backwards compatibility and reproducibility very seriously.
5555

56-
For API removal, renaming or other forms of backwards-incompatible changes, the procedure is:
57-
58-
#. A deprecation process is initiated at a minor version ``MAJOR.MINOR.PATCH`` (e.g. ``1.5.0``), producing a deprecation warning at runtime and removing it from the documentation.
59-
#. The deprecated API remains unchanged during the deprecation phase for two minor versions or the next major update, whichever comes first.
60-
#. The breaking change is done in version ``MAJOR.(MINOR+2).0`` (e.g. ``1.7.0``), or ``(MAJOR+1).0.0`` (e.g. ``2.0.0``), whichever comes first.
61-
#. From that version onward, the deprecation warning gets converted into a helpful error, which will remain until next major release.
56+
Excepting extenuating circumstances (e.g. a critical bug), API removal, renaming or other forms of backwards-incompatible changes are limited to major version upgrades — that is ``(MAJOR+1).0.0``.
57+
Concretely, a breaking change for an API introduced in ``2.x.x`` can be introduced with Lightning ``3.0.0``.
6258

6359
This policy is not strict. Shorter or longer deprecation cycles may apply to some cases.
6460
For example, in the past DDP2 was removed without a deprecation process because the feature was broken and unusable beyond fixing as discussed in `#12584 <https://github.com/Lightning-AI/pytorch-lightning/issues/12584>`_.
@@ -69,6 +65,14 @@ Compatibility matrix
6965

7066
PyTorch Lightning follows `NEP 29 <https://numpy.org/neps/nep-0029-deprecation_policy.html>`_ which PyTorch also follows (`#74203 <https://github.com/pytorch/pytorch/issues/74203>`_).
7167
The table below indicates the coverage of tested versions in our CI. Versions outside the ranges may unofficially work in some cases.
68+
Since the release of PyTorch ``2.0``, Lightning strives to officially support the latest 5 PyTorch minor releases with no breaking changes within major versions [1]_.
69+
70+
71+
.. note::
72+
Legend used in the table below:
73+
74+
- "≥ X.Y" minimum supported version.
75+
- "▼ X.Y" last CI-tested version (informational, not a hard upper bound).
7276

7377
.. list-table::
7478
:header-rows: 1
@@ -82,102 +86,104 @@ The table below indicates the coverage of tested versions in our CI. Versions ou
8286
* - 2.5
8387
- 2.5
8488
- 2.5
85-
- ≥2.1, ≤2.7
89+
- ≥2.1 (▼ 2.8)
8690
- ≥0.7.0
87-
- ≥3.9, ≤3.12
91+
- ≥3.9 (▼ 3.12)
8892
* - 2.4
8993
- 2.4
9094
- 2.4
91-
- ≥2.1, ≤2.6
95+
- ≥2.1 (▼ 2.6)
9296
- ≥0.7.0
93-
- ≥3.9, ≤3.12
97+
- ≥3.9 (▼ 3.12)
9498
* - 2.3
9599
- 2.3
96100
- 2.3
97-
- ≥2.0, ≤2.3
101+
- ≥2.0 (▼ 2.3)
98102
- ≥0.7.0
99-
- ≥3.8, ≤3.11
103+
- ≥3.8 (▼ 3.11)
100104
* - 2.2
101105
- 2.2
102106
- 2.2
103-
- ≥1.13, ≤2.2
107+
- ≥1.13 (▼ 2.2)
104108
- ≥0.7.0
105-
- ≥3.8, ≤3.11
109+
- ≥3.8 (▼ 3.11)
106110
* - 2.1
107111
- 2.1
108112
- 2.1
109-
- ≥1.12, ≤2.1
113+
- ≥1.12 (▼ 2.1)
110114
- ≥0.7.0
111-
- ≥3.8, ≤3.11
115+
- ≥3.8 (▼ 3.11)
112116
* - 2.0
113117
- 2.0
114118
- 2.0 (GA)
115-
- ≥1.11, ≤2.0
119+
- ≥1.11 (▼ 2.0)
116120
- ≥0.7.0
117-
- ≥3.8, ≤3.10
121+
- ≥3.8 (▼ 3.10)
118122
* - 1.9
119123
- 1.9
120124
- 1.9 (experimental)
121-
- ≥1.10, ≤1.13
125+
- ≥1.10 (▼ 1.13)
122126
- ≥0.7.0
123-
- ≥3.7, ≤3.10
127+
- ≥3.7 (▼ 3.10)
124128
* - 1.8**
125129
- 1.8
126130
- n/a***
127-
- ≥1.10, ≤1.13
131+
- ≥1.10 (▼ 1.13)
128132
- ≥0.7.0
129-
- ≥3.7, ≤3.10
133+
- ≥3.7 (▼ 3.10)
130134
* - n/a
131135
- 1.7
132136
- n/a***
133-
- ≥1.9, ≤1.12
137+
- ≥1.9 (▼ 1.12)
134138
- ≥0.7.0
135-
- ≥3.7, ≤3.10
139+
- ≥3.7 (▼ 3.10)
136140
* - n/a
137141
- 1.6
138142
- n/a***
139-
- ≥1.8, ≤1.11
143+
- ≥1.8 (▼ 1.11)
140144
- ≥0.4.1
141-
- ≥3.7, ≤3.9
145+
- ≥3.7 (▼ 3.9)
142146
* - n/a
143147
- 1.5
144148
- n/a***
145-
- ≥1.7, ≤1.10
149+
- ≥1.7 (▼ 1.10)
146150
- ≥0.4.1
147-
- ≥3.6, ≤3.9
151+
- ≥3.6 (▼ 3.9)
148152
* - n/a
149153
- 1.4
150154
- n/a
151-
- ≥1.6, ≤1.9
155+
- ≥1.6 (▼ 1.9)
152156
- ≥0.4.0
153-
- ≥3.6, ≤3.9
157+
- ≥3.6 (▼ 3.9)
154158
* - n/a
155159
- 1.3
156160
- n/a
157-
- ≥1.4, ≤1.8
161+
- ≥1.4 (▼ 1.8)
158162
- ≥0.2.0
159-
- ≥3.6, ≤3.9
163+
- ≥3.6 (▼ 3.9)
160164
* - n/a
161165
- 1.2
162166
- n/a
163-
- ≥1.4, ≤1.8
167+
- ≥1.4 (▼ 1.8)
164168
- n/a*
165-
- ≥3.6, ≤3.8
169+
- ≥3.6 (▼ 3.8)
166170
* - n/a
167171
- 1.1
168172
- n/a
169-
- ≥1.3, ≤1.8
173+
- ≥1.3 (▼ 1.8)
170174
- n/a*
171-
- ≥3.6, ≤3.8
175+
- ≥3.6 (▼ 3.8)
172176
* - n/a
173177
- 1.0
174178
- n/a
175-
- ≥1.3, ≤1.7
179+
- ≥1.3 (▼ 1.7)
176180
- n/a*
177-
- ≥3.6, ≤3.8
181+
- ≥3.6 (▼ 3.8)
178182

179183
\* ``torchmetrics`` was part of ``pytorch_lightning`` at the time and was decoupled to a separate package in v1.3.
180184

181185
\*\* The joint ``lightning`` package was first published in version 1.8
182186

183187
\*\*\* Fabric is the evolution of ``LightningLite`` which was released inside ``pytorch_lightning`` 1.5 and was decoupled to a separate package in v1.9
188+
189+
.. [1] See `this community discussion <https://github.com/Lightning-AI/pytorch-lightning/issues/21073#issuecomment-3201706857>`_.

requirements/pytorch/test.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in
1919

2020
tensorboard >=2.11, <2.21.0 # for `TensorBoardLogger`
2121

22-
--find-links https://download.pytorch.org/whl/torch-tensorrt
2322
torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"

src/lightning/fabric/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
2929

3030
---
3131

32+
## [2.5.4] - 2025-08-29
33+
34+
### Changed
35+
36+
- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#21119](https://github.com/Lightning-AI/pytorch-lightning/pull/21119))
37+
38+
39+
3240
## [2.5.3] - 2025-08-13
3341

3442
### Changed

src/lightning/fabric/utilities/throughput.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,23 @@ def measure_flops(
304304

305305
_CUDA_FLOPS: dict[str, dict[Union[str, torch.dtype], float]] = {
306306
# Hopper
307+
# source: https://nvdam.widen.net/s/nb5zzzsjdf/hpc-datasheet-sc23-h200-datasheet-3002446
308+
"h200 sxm1": {
309+
torch.float64: 3.4e13,
310+
torch.float32: 6.7e13,
311+
"tfloat32": 9.9e14,
312+
torch.bfloat16: 2.0e15,
313+
torch.float16: 2.0e15,
314+
torch.int8: 4.0e15,
315+
},
316+
"h200 nvl1": {
317+
torch.float64: 3.0e13,
318+
torch.float32: 6.0e13,
319+
"tfloat32": 8.4e14,
320+
torch.bfloat16: 1.7e15,
321+
torch.float16: 1.7e15,
322+
torch.int8: 3.3e15,
323+
},
307324
# source: https://resources.nvidia.com/en-us-tensor-core
308325
"h100 nvl": {
309326
torch.float64: 67e12,
@@ -536,7 +553,12 @@ def get_available_flops(device: torch.device, dtype: Union[torch.dtype, str]) ->
536553
if device.type == "cuda":
537554
device_name = torch.cuda.get_device_name(device)
538555
chip = device_name.lower()
539-
if "h100" in chip:
556+
if "h200" in chip:
557+
if "sxm1" in chip:
558+
chip = "h200 sxm1"
559+
elif "nvl1" in chip:
560+
chip = "h200 nvl1"
561+
elif "h100" in chip:
540562
if "hbm3" in chip:
541563
chip = "h100 sxm"
542564
elif "nvl" in chip:

src/lightning/pytorch/CHANGELOG.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1818

1919
### Changed
2020

21-
- Default to RichProgressBar and RichModelSummary if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))
21+
- Default to `RichProgressBar` and `RichModelSummary` if the rich package is available. Fall back to `TQDMProgressBar` and `ModelSummary` otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))
2222

2323

2424
### Removed
@@ -28,21 +28,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
2828

2929
### Fixed
3030

31-
- Fixed `AsyncCheckpointIO` snapshots tensors to avoid race with parameter mutation ([#21079](https://github.com/Lightning-AI/pytorch-lightning/pull/21079))
31+
-
3232

3333

34-
- Fixed `AsyncCheckpointIO` threadpool exception if calling fit or validate more than one ([#20952](https://github.com/Lightning-AI/pytorch-lightning/pull/20952))
34+
---
3535

3636

37-
- Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068))
37+
## [2.5.4] - 2025-08-29
3838

39+
### Fixed
3940

41+
- Fixed `AsyncCheckpointIO` snapshots tensors to avoid race with parameter mutation ([#21079](https://github.com/Lightning-AI/pytorch-lightning/pull/21079))
42+
- Fixed `AsyncCheckpointIO` threadpool exception if calling fit or validate more than once ([#20952](https://github.com/Lightning-AI/pytorch-lightning/pull/20952))
43+
- Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068))
4044
- Fixed column misalignment while using rich model summary in `DeepSpeedStrategy` ([#21100](https://github.com/Lightning-AI/pytorch-lightning/pull/21100))
41-
42-
4345
- Fixed `RichProgressBar` crashing when sanity checking using val dataloader with 0 len ([#21108](https://github.com/Lightning-AI/pytorch-lightning/pull/21108))
4446

45-
---
4647

4748
## [2.5.3] - 2025-08-13
4849

0 commit comments

Comments
 (0)