Skip to content

Commit 3e9e0a9

Browse files
authored
Merge branch 'master' into deepspeed_mics_init
2 parents b65481e + 03635d2 commit 3e9e0a9

File tree

373 files changed

+1965
-1342
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

373 files changed

+1965
-1342
lines changed

.azure/gpu-benchmarks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,6 @@ jobs:
108108
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
109109
env:
110110
PL_RUN_CUDA_TESTS: "1"
111+
PL_RUN_STANDALONE_TESTS: "1"
111112
displayName: "Testing: fabric standalone tasks"
112113
timeoutInMinutes: "10"

.azure/gpu-tests-fabric.yml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,17 @@ jobs:
5656
options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
5757
strategy:
5858
matrix:
59+
"Fabric | oldest":
60+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
61+
PACKAGE_NAME: "fabric"
5962
"Fabric | latest":
60-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
63+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
64+
PACKAGE_NAME: "fabric"
65+
"Fabric | future":
66+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
6167
PACKAGE_NAME: "fabric"
6268
"Lightning | latest":
63-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0"
69+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
6470
PACKAGE_NAME: "lightning"
6571
workspace:
6672
clean: all
@@ -77,9 +83,8 @@ jobs:
7783
displayName: "set env. vars"
7884
- bash: |
7985
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
80-
echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl"
8186
condition: endsWith(variables['Agent.JobName'], 'future')
82-
displayName: "set env. vars 4 future"
87+
displayName: "extend env. vars 4 future"
8388
8489
- bash: |
8590
echo $(DEVICES)
@@ -105,15 +110,17 @@ jobs:
105110
displayName: "Adjust dependencies"
106111
107112
- bash: |
113+
set -e
108114
extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
109-
pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
115+
pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}"
110116
pip install setuptools==75.6.0 jsonargparse==4.35.0
111117
displayName: "Install package & dependencies"
112118
113119
- bash: |
114120
set -e
115121
python requirements/collect_env_details.py
116122
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
123+
python requirements/pytorch/check-avail-extras.py
117124
python -c "import bitsandbytes"
118125
displayName: "Env details"
119126
@@ -140,10 +147,12 @@ jobs:
140147
displayName: "Testing: fabric standard"
141148
timeoutInMinutes: "10"
142149

143-
- bash: bash ./run_standalone_tests.sh "tests_fabric"
150+
- bash: |
151+
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
152+
bash ./run_standalone_tests.sh "tests_fabric"
144153
workingDirectory: tests/
145154
env:
146-
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
155+
PL_RUN_STANDALONE_TESTS: "1"
147156
displayName: "Testing: fabric standalone"
148157
timeoutInMinutes: "10"
149158

.azure/gpu-tests-pytorch.yml

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,17 @@ jobs:
4949
cancelTimeoutInMinutes: "2"
5050
strategy:
5151
matrix:
52+
"PyTorch | oldest":
53+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
54+
PACKAGE_NAME: "pytorch"
5255
"PyTorch | latest":
53-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
56+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
57+
PACKAGE_NAME: "pytorch"
58+
"PyTorch | future":
59+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
5460
PACKAGE_NAME: "pytorch"
5561
"Lightning | latest":
56-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0"
62+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
5763
PACKAGE_NAME: "lightning"
5864
pool: lit-rtx-3090
5965
variables:
@@ -81,9 +87,8 @@ jobs:
8187
displayName: "set env. vars"
8288
- bash: |
8389
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
84-
echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl"
8590
condition: endsWith(variables['Agent.JobName'], 'future')
86-
displayName: "set env. vars 4 future"
91+
displayName: "extend env. vars 4 future"
8792
8893
- bash: |
8994
echo $(DEVICES)
@@ -109,8 +114,9 @@ jobs:
109114
displayName: "Adjust dependencies"
110115
111116
- bash: |
117+
set -e
112118
extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
113-
pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
119+
pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}"
114120
pip install setuptools==75.6.0 jsonargparse==4.35.0
115121
displayName: "Install package & dependencies"
116122
@@ -161,11 +167,13 @@ jobs:
161167
displayName: "Testing: PyTorch standard"
162168
timeoutInMinutes: "35"
163169

164-
- bash: bash ./run_standalone_tests.sh "tests_pytorch"
170+
- bash: |
171+
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
172+
bash ./run_standalone_tests.sh "tests_pytorch"
165173
workingDirectory: tests/
166174
env:
167175
PL_USE_MOCKED_MNIST: "1"
168-
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
176+
PL_RUN_STANDALONE_TESTS: "1"
169177
displayName: "Testing: PyTorch standalone tests"
170178
timeoutInMinutes: "35"
171179

.github/CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,14 +182,14 @@ We welcome any useful contribution! For your convenience here's a recommended wo
182182
1. Use tags in PR name for the following cases:
183183

184184
- **\[blocked by #<number>\]** if your work is dependent on other PRs.
185-
- **\[wip\]** when you start to re-edit your work, mark it so no one will accidentally merge it in meantime.
185+
- **[wip]** when you start to re-edit your work, mark it so no one will accidentally merge it in the meantime.
186186

187187
### Question & Answer
188188

189189
#### How can I help/contribute?
190190

191191
All types of contributions are welcome - reporting bugs, fixing documentation, adding test cases, solving issues, and preparing bug fixes.
192-
To get started with code contributions, look for issues marked with the label [good first issue](https://github.com/Lightning-AI/lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) or chose something close to your domain with the label [help wanted](https://github.com/Lightning-AI/lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22). Before coding, make sure that the issue description is clear and comment on the issue so that we can assign it to you (or simply self-assign if you can).
192+
To get started with code contributions, look for issues marked with the label [good first issue](https://github.com/Lightning-AI/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) or choose something close to your domain with the label [help wanted](https://github.com/Lightning-AI/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22). Before coding, make sure that the issue description is clear and comment on the issue so that we can assign it to you (or simply self-assign if you can).
193193

194194
#### Is there a recommendation for branch names?
195195

.github/actions/pip-wheels/action.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ runs:
4646
run: |
4747
# cat requirements.dump
4848
pip wheel -r requirements.dump --prefer-binary \
49-
--wheel-dir=.wheels \
50-
-f ${{ inputs.torch-url }} -f ${{ inputs.wheel-dir }}
49+
--wheel-dir=".wheels" \
50+
--extra-index-url=${{ inputs.torch-url }} -f ${{ inputs.wheel-dir }}
5151
ls -lh .wheels/
5252
shell: bash
5353

.github/checkgroup.yml

Lines changed: 6 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,7 @@ subprojects:
1919
- "!*.md"
2020
- "!**/*.md"
2121
checks:
22-
- "pl-cpu (macOS-14, lightning, 3.9, 2.1, oldest)"
23-
- "pl-cpu (macOS-14, lightning, 3.10, 2.1)"
24-
- "pl-cpu (macOS-14, lightning, 3.11, 2.2.2)"
25-
- "pl-cpu (macOS-14, lightning, 3.11, 2.3)"
26-
- "pl-cpu (macOS-14, lightning, 3.12, 2.4.1)"
27-
- "pl-cpu (macOS-14, lightning, 3.12, 2.5.1)"
28-
- "pl-cpu (ubuntu-20.04, lightning, 3.9, 2.1, oldest)"
29-
- "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
30-
- "pl-cpu (ubuntu-20.04, lightning, 3.11, 2.2.2)"
31-
- "pl-cpu (ubuntu-20.04, lightning, 3.11, 2.3)"
32-
- "pl-cpu (ubuntu-22.04, lightning, 3.12, 2.4.1)"
33-
- "pl-cpu (ubuntu-22.04, lightning, 3.12, 2.5.1)"
34-
- "pl-cpu (windows-2022, lightning, 3.9, 2.1, oldest)"
35-
- "pl-cpu (windows-2022, lightning, 3.10, 2.1)"
36-
- "pl-cpu (windows-2022, lightning, 3.11, 2.2.2)"
37-
- "pl-cpu (windows-2022, lightning, 3.11, 2.3)"
38-
- "pl-cpu (windows-2022, lightning, 3.12, 2.4.1)"
39-
- "pl-cpu (windows-2022, lightning, 3.12, 2.5.1)"
40-
- "pl-cpu (macOS-14, pytorch, 3.9, 2.1)"
41-
- "pl-cpu (ubuntu-20.04, pytorch, 3.9, 2.1)"
42-
- "pl-cpu (windows-2022, pytorch, 3.9, 2.1)"
43-
- "pl-cpu (macOS-14, pytorch, 3.12, 2.5.1)"
44-
- "pl-cpu (ubuntu-22.04, pytorch, 3.12, 2.5.1)"
45-
- "pl-cpu (windows-2022, pytorch, 3.12, 2.5.1)"
22+
- "pl-cpu-guardian" # aggregated check for all cases
4623

4724
- id: "pytorch_lightning: Azure GPU"
4825
paths:
@@ -150,12 +127,14 @@ subprojects:
150127
- "build-cuda (3.11, 2.3.1, 12.1.1)"
151128
- "build-cuda (3.11, 2.4.1, 12.1.1)"
152129
- "build-cuda (3.12, 2.5.1, 12.1.1)"
130+
- "build-cuda (3.12, 2.6.0, 12.4.1)"
153131
#- "build-NGC"
154132
- "build-pl (3.10, 2.1, 12.1.1)"
155133
- "build-pl (3.11, 2.2, 12.1.1)"
156134
- "build-pl (3.11, 2.3, 12.1.1)"
157135
- "build-pl (3.11, 2.4, 12.1.1)"
158-
- "build-pl (3.12, 2.5, 12.1.1, true)"
136+
- "build-pl (3.12, 2.5, 12.1.1)"
137+
- "build-pl (3.12, 2.6, 12.4.1, true)"
159138

160139
# SECTION: lightning_fabric
161140

@@ -172,30 +151,7 @@ subprojects:
172151
- "!*.md"
173152
- "!**/*.md"
174153
checks:
175-
- "fabric-cpu (macOS-14, lightning, 3.9, 2.1, oldest)"
176-
- "fabric-cpu (macOS-14, lightning, 3.10, 2.1)"
177-
- "fabric-cpu (macOS-14, lightning, 3.11, 2.2.2)"
178-
- "fabric-cpu (macOS-14, lightning, 3.11, 2.3)"
179-
- "fabric-cpu (macOS-14, lightning, 3.12, 2.4.1)"
180-
- "fabric-cpu (macOS-14, lightning, 3.12, 2.5.1)"
181-
- "fabric-cpu (ubuntu-20.04, lightning, 3.9, 2.1, oldest)"
182-
- "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
183-
- "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2.2)"
184-
- "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)"
185-
- "fabric-cpu (ubuntu-22.04, lightning, 3.12, 2.4.1)"
186-
- "fabric-cpu (ubuntu-22.04, lightning, 3.12, 2.5.1)"
187-
- "fabric-cpu (windows-2022, lightning, 3.9, 2.1, oldest)"
188-
- "fabric-cpu (windows-2022, lightning, 3.10, 2.1)"
189-
- "fabric-cpu (windows-2022, lightning, 3.11, 2.2.2)"
190-
- "fabric-cpu (windows-2022, lightning, 3.11, 2.3)"
191-
- "fabric-cpu (windows-2022, lightning, 3.12, 2.4.1)"
192-
- "fabric-cpu (windows-2022, lightning, 3.12, 2.5.1)"
193-
- "fabric-cpu (macOS-14, fabric, 3.9, 2.1)"
194-
- "fabric-cpu (ubuntu-20.04, fabric, 3.9, 2.1)"
195-
- "fabric-cpu (windows-2022, fabric, 3.9, 2.1)"
196-
- "fabric-cpu (macOS-14, fabric, 3.12, 2.5.1)"
197-
- "fabric-cpu (ubuntu-22.04, fabric, 3.12, 2.5.1)"
198-
- "fabric-cpu (windows-2022, fabric, 3.12, 2.5.1)"
154+
- "fabric-cpu-guardian" # aggregated check for all cases
199155

200156
- id: "lightning_fabric: Azure GPU"
201157
paths:
@@ -259,27 +215,4 @@ subprojects:
259215
- "!*.md"
260216
- "!**/*.md"
261217
checks:
262-
- "install-pkg (ubuntu-22.04, fabric, 3.9)"
263-
- "install-pkg (ubuntu-22.04, fabric, 3.11)"
264-
- "install-pkg (ubuntu-22.04, pytorch, 3.9)"
265-
- "install-pkg (ubuntu-22.04, pytorch, 3.11)"
266-
- "install-pkg (ubuntu-22.04, lightning, 3.9)"
267-
- "install-pkg (ubuntu-22.04, lightning, 3.11)"
268-
- "install-pkg (ubuntu-22.04, notset, 3.9)"
269-
- "install-pkg (ubuntu-22.04, notset, 3.11)"
270-
- "install-pkg (macOS-14, fabric, 3.9)"
271-
- "install-pkg (macOS-14, fabric, 3.11)"
272-
- "install-pkg (macOS-14, pytorch, 3.9)"
273-
- "install-pkg (macOS-14, pytorch, 3.11)"
274-
- "install-pkg (macOS-14, lightning, 3.9)"
275-
- "install-pkg (macOS-14, lightning, 3.11)"
276-
- "install-pkg (macOS-14, notset, 3.9)"
277-
- "install-pkg (macOS-14, notset, 3.11)"
278-
- "install-pkg (windows-2022, fabric, 3.9)"
279-
- "install-pkg (windows-2022, fabric, 3.11)"
280-
- "install-pkg (windows-2022, pytorch, 3.9)"
281-
- "install-pkg (windows-2022, pytorch, 3.11)"
282-
- "install-pkg (windows-2022, lightning, 3.9)"
283-
- "install-pkg (windows-2022, lightning, 3.11)"
284-
- "install-pkg (windows-2022, notset, 3.9)"
285-
- "install-pkg (windows-2022, notset, 3.11)"
218+
- "install-pkg-guardian" # aggregated check for all cases

.github/workflows/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Brief description of all our automation tools used for boosting development perf
1616
| .azure-pipelines/gpu-benchmarks.yml | Run speed/memory benchmarks for parity with vanilla PyTorch. | GPU |
1717
| .github/workflows/ci-flagship-apps.yml | Run end-2-end tests with full applications, including deployment to the production cloud. | CPU |
1818
| .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU |
19-
| .github/workflows/tpu-tests.yml | Run only TPU-specific tests. Requires that the PR title contains '\[TPU\]' | TPU |
19+
| .github/workflows/tpu-tests.yml | Run only TPU-specific tests. Requires that the PR title contains '[TPU]' | TPU |
2020

2121
\* Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases.
2222

.github/workflows/_legacy-checkpoints.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ on:
4343

4444
env:
4545
LEGACY_FOLDER: "tests/legacy"
46-
TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html"
46+
TORCH_URL: "https://download.pytorch.org/whl/cpu/"
4747

4848
defaults:
4949
run:
@@ -67,12 +67,12 @@ jobs:
6767
PACKAGE_NAME: pytorch
6868
FREEZE_REQUIREMENTS: 1
6969
timeout-minutes: 20
70-
run: pip install . -f ${TORCH_URL}
70+
run: pip install . --extra-index-url="${TORCH_URL}"
7171
if: inputs.pl_version == ''
7272

7373
- name: Install PL version
7474
timeout-minutes: 20
75-
run: pip install "pytorch-lightning==${{ inputs.pl_version }}" -f ${TORCH_URL}
75+
run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
7676
if: inputs.pl_version != ''
7777

7878
- name: Adjust tests -> PL
@@ -144,7 +144,7 @@ jobs:
144144
title: Adding test for legacy checkpoint created with ${{ env.PL_VERSION }}
145145
committer: GitHub <[email protected]>
146146
author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
147-
commit-message: "update tutorials to `${{ env.PL_VERSION }}`"
147+
commit-message: "adding `${{ env.PL_VERSION }}` checkpoint"
148148
body: "**This is automated addition of created checkpoints with the latest `lightning` release!**"
149149
delete-branch: true
150150
token: ${{ secrets.PAT_GHOST }}

.github/workflows/call-clear-cache.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,18 @@ on:
2323
jobs:
2424
cron-clear:
2525
if: github.event_name == 'schedule' || github.event_name == 'pull_request'
26-
uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.11.9
26+
uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.3
2727
with:
28-
scripts-ref: v0.11.8
28+
scripts-ref: v0.14.3
2929
dry-run: ${{ github.event_name == 'pull_request' }}
3030
pattern: "latest|docs"
3131
age-days: 7
3232

3333
direct-clear:
3434
if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
35-
uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.11.9
35+
uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.3
3636
with:
37-
scripts-ref: v0.11.8
37+
scripts-ref: v0.14.3
3838
dry-run: ${{ github.event_name == 'pull_request' }}
3939
pattern: ${{ inputs.pattern || 'pypi_wheels' }} # setting str in case of PR / debugging
4040
age-days: ${{ fromJSON(inputs.age-days) || 0 }} # setting 0 in case of PR / debugging

.github/workflows/ci-check-md-links.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ on:
1414

1515
jobs:
1616
check-md-links:
17-
uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.11.9
17+
uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.14.3
1818
with:
1919
config-file: ".github/markdown-links-config.json"
2020
base-branch: "master"

0 commit comments

Comments
 (0)