Commit ca91286

Merge branch 'master' into ioannis@18861-CSVLogger-fails-on-remote-fs

2 parents 936dc8b + 9f757c0
File tree

28 files changed: +710 -148 lines changed

.github/workflows/_legacy-checkpoints.yml

Lines changed: 10 additions & 6 deletions
@@ -57,28 +57,32 @@ jobs:
     steps:
       - uses: actions/checkout@v5

-      - uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
-          # Python version here needs to be supported by all PL versions listed in back-compatible-versions.txt.
           python-version: "3.9"
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true

       - name: Install PL from source
         env:
           PACKAGE_NAME: pytorch
           FREEZE_REQUIREMENTS: 1
         timeout-minutes: 20
-        run: pip install . --extra-index-url="${TORCH_URL}"
+        run: uv pip install . --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version == ''

       - name: Install PL version
         timeout-minutes: 20
-        run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
+        run: uv pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}"
         if: inputs.pl_version != ''

       - name: Adjust tests -> PL
         if: ${{ matrix.pkg-name != 'lightning' }}
         run: |
-          pip install -q -r .actions/requirements.txt
+          uv pip install -q -r .actions/requirements.txt
           python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
             --source_import="lightning.fabric,lightning.pytorch" \
             --target_import="lightning_fabric,pytorch_lightning"
@@ -115,7 +119,7 @@ jobs:
         # export to env bool if secrets.AWS_REGION is not empty
         run: echo "WITH_SECRETS=$([ -n '${{ secrets.AWS_REGION }}' ] && echo 1 || echo 0)" >> $GITHUB_ENV

-      - run: pip install -r requirements/ci.txt
+      - run: uv pip install -r requirements/ci.txt
       - name: Upload checkpoints to S3
         if: ${{ env.WITH_SECRETS == '1' }}
         working-directory: ${{ env.LEGACY_FOLDER }}

.github/workflows/ci-tests-fabric.yml

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ jobs:
         run: pip install -q -r .actions/requirements.txt

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
           pip install -U "lightning-utilities[cli]"
@@ -88,7 +88,7 @@ jobs:
           pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
           pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py

.github/workflows/ci-tests-pytorch.yml

Lines changed: 33 additions & 27 deletions
@@ -69,48 +69,49 @@ jobs:
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-      PYPI_CACHE_DIR: "_pip-wheels"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5

-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
+          enable-cache: true

-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV

       - name: Set min. dependencies
-        if: ${{ matrix.requires == 'oldest' }}
+        if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/pytorch
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
-        if: ${{ matrix.requires != 'oldest' }}
+        if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
            python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
          done
           cat requirements/pytorch/base.txt

-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -125,20 +126,22 @@ jobs:
       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
+          uv pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
             -r requirements/_integrations/accelerators.txt \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}" \
+            --find-links="${TORCH_URL}" \
            --find-links="https://download.pytorch.org/whl/torch-tensorrt"
-          pip list
+          uv pip list
+
       - name: Drop LAI from extensions
         if: ${{ matrix.config.pkg-name != 'lightning' }}
         # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it
-        run: pip uninstall -y lightning
+        run: uv pip uninstall lightning
+
       - name: Drop PL for LAI
         if: ${{ matrix.config.pkg-name == 'lightning' }}
-        run: pip uninstall -y pytorch-lightning
+        run: uv pip uninstall pytorch-lightning
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -215,6 +218,9 @@ jobs:
           name: CPU-coverage
           fail_ci_if_error: false

+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   pl-cpu-guardian:
     runs-on: ubuntu-latest
     needs: pl-cpu

docs/source-pytorch/accelerators/gpu_faq.rst

Lines changed: 54 additions & 14 deletions
@@ -5,31 +5,71 @@
 GPU training (FAQ)
 ==================

-******************************************************************
-How should I adjust the learning rate when using multiple devices?
-******************************************************************
+***************************************************************
+How should I adjust the batch size when using multiple devices?
+***************************************************************

-When using distributed training make sure to modify your learning rate according to your effective
-batch size.
+Lightning automatically shards your data across multiple GPUs, meaning that each device only sees a unique subset of your
+data, but the `batch_size` in your DataLoader remains the same. This means that the effective batch size, i.e. the
+total number of samples processed in one forward/backward pass, is

-Let's say you have a batch size of 7 in your dataloader.
+.. math::

-.. testcode::
+    \text{Effective Batch Size} = \text{DataLoader Batch Size} \times \text{Number of Devices} \times \text{Number of Nodes}

-    class LitModel(LightningModule):
-        def train_dataloader(self):
-            return Dataset(..., batch_size=7)
-
-Whenever you use multiple devices and/or nodes, your effective batch size will be 7 * devices * num_nodes.
+A couple of examples to illustrate this:

 .. code-block:: python

-    # effective batch size = 7 * 8
+    dataloader = DataLoader(..., batch_size=7)
+
+    # Single GPU: effective batch size = 7
+    Trainer(accelerator="gpu", devices=1)
+
+    # Multi-GPU: effective batch size = 7 * 8 = 56
     Trainer(accelerator="gpu", devices=8, strategy=...)

-    # effective batch size = 7 * 8 * 10
+    # Multi-node: effective batch size = 7 * 8 * 10 = 560
     Trainer(accelerator="gpu", devices=8, num_nodes=10, strategy=...)

+In general you should be able to use the same `batch_size` in your DataLoader regardless of the number of devices you are
+using.
+
+.. note::
+
+    If you want distributed training to work exactly the same as single GPU training, you need to set the `batch_size`
+    in your DataLoader to `original_batch_size / num_devices` to maintain the same effective batch size. However, this
+    can lead to poor GPU utilization.
+
+----
+
+******************************************************************
+How should I adjust the learning rate when using multiple devices?
+******************************************************************
+
+Because the effective batch size is larger when using multiple devices, you need to adjust your learning rate
+accordingly. Since the learning rate is a hyperparameter that controls how much to change the model in response to
+the estimated error each time the model weights are updated, it is important to scale it with the effective batch size.
+
+In general, there are two common scaling rules:
+
+1. **Linear scaling**: Increase the learning rate linearly with the number of devices.
+
+   .. code-block:: python
+
+       # Example: Linear scaling
+       base_lr = 1e-3
+       num_devices = 8
+       scaled_lr = base_lr * num_devices  # 8e-3
+
+2. **Square root scaling**: Increase the learning rate by the square root of the number of devices.
+
+   .. code-block:: python
+
+       # Example: Square root scaling
+       base_lr = 1e-3
+       num_devices = 8
+       scaled_lr = base_lr * (num_devices ** 0.5)  # 2.83e-3

 .. note:: Huge batch sizes are actually really bad for convergence. Check out:
     `Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour <https://arxiv.org/abs/1706.02677>`_
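
Since the commit documents these scaling rules, a natural follow-up is where to apply them. Below is a minimal sketch (not part of this commit) of one way to wire the linear rule into a LightningModule; the `base_lr` hyperparameter name is hypothetical, while `trainer.world_size` (devices times nodes) is an existing Lightning property.

.. code-block:: python

    import torch
    from lightning.pytorch import LightningModule


    class LitModel(LightningModule):
        def __init__(self, base_lr: float = 1e-3):
            super().__init__()
            self.save_hyperparameters()

        def configure_optimizers(self):
            # world_size == devices * num_nodes, matching the effective
            # batch size formula above; illustrative sketch only, not
            # code from this commit.
            scaled_lr = self.hparams.base_lr * self.trainer.world_size
            return torch.optim.SGD(self.parameters(), lr=scaled_lr)

With `devices=8` on a single node, `world_size` is 8 and the optimizer starts at `8e-3`, matching the linear-scaling example in the diff.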

docs/source-pytorch/common/hooks.rst

Lines changed: 38 additions & 19 deletions
@@ -83,13 +83,30 @@ with the source of each hook indicated:
     trainer.fit()

     ├── setup(stage="fit")
-    │   └── [Callbacks only]
-
-    ├── on_fit_start()
+    │   ├── [LightningDataModule]
     │   ├── [Callbacks]
     │   ├── [LightningModule]
+    │   ├── [LightningModule.configure_shared_model()]
+    │   ├── [LightningModule.configure_model()]
+    │   ├── Strategy.restore_checkpoint_before_setup
+    │   │   ├── [LightningModule.on_load_checkpoint()]
+    │   │   ├── [LightningModule.load_state_dict()]
+    │   │   ├── [LightningDataModule.load_state_dict()]
+    │   │   ├── [Callbacks.on_load_checkpoint()]
+    │   │   └── [Callbacks.load_state_dict()]
     │   └── [Strategy]

+    ├── on_fit_start()
+    │   ├── [Callbacks]
+    │   └── [LightningModule]
+
+    ├── Strategy.restore_checkpoint_after_setup
+    │   ├── [LightningModule.on_load_checkpoint()]
+    │   ├── [LightningModule.load_state_dict()]
+    │   ├── [LightningDataModule.load_state_dict()]
+    │   ├── [Callbacks.on_load_checkpoint()]
+    │   └── [Callbacks.load_state_dict()]
+
     ├── on_sanity_check_start()
     │   ├── [Callbacks]
     │   ├── [LightningModule]
@@ -143,23 +160,24 @@ with the source of each hook indicated:
     │   │   │   ├── [LightningModule]
     │   │   │   └── [Strategy]
     │   │   │
-    │   │   ├── on_before_zero_grad()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
-    │   │   │
     │   │   ├── [Forward Pass - training_step()]
     │   │   │   └── [Strategy only]
     │   │   │
-    │   │   ├── on_before_backward()
+    │   │   ├── on_before_zero_grad()
     │   │   │   ├── [Callbacks]
     │   │   │   └── [LightningModule]
     │   │   │
-    │   │   ├── [Backward Pass]
-    │   │   │   └── [Strategy only]
+    │   │   ├── optimizer_zero_grad()
+    │   │   │   └── [LightningModule only - optimizer_zero_grad()]
     │   │   │
-    │   │   ├── on_after_backward()
-    │   │   │   ├── [Callbacks]
-    │   │   │   └── [LightningModule]
+    │   │   ├── [Backward Pass - Strategy.backward()]
+    │   │   │   ├── on_before_backward()
+    │   │   │   │   ├── [Callbacks]
+    │   │   │   │   └── [LightningModule]
+    │   │   │   ├── LightningModule.backward()
+    │   │   │   └── on_after_backward()
+    │   │   │       ├── [Callbacks]
+    │   │   │       └── [LightningModule]
     │   │   │
     │   │   ├── on_before_optimizer_step()
     │   │   │   ├── [Callbacks]
@@ -212,13 +230,14 @@ with the source of each hook indicated:
     │   ├── [LightningModule]
     │   └── [Strategy]

-    ├── on_fit_end()
-    │   ├── [Callbacks]
-    │   ├── [LightningModule]
-    │   └── [Strategy]
-
     └── teardown(stage="fit")
-        └── [Callbacks only]
+        ├── [Strategy]
+        ├── on_fit_end()
+        │   ├── [Callbacks]
+        │   └── [LightningModule]
+        ├── [LightningDataModule]
+        ├── [Callbacks]
+        └── [LightningModule]

 ***********************
 Testing Loop Hook Order
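
The reordering above is easiest to verify empirically. Here is a minimal sketch (not part of this commit) of a callback that prints a few of the fit() hooks so the documented order can be observed on a toy run; the hook signatures are Lightning's real Callback API, only the class name is invented.

.. code-block:: python

    from lightning.pytorch.callbacks import Callback


    class HookOrderLogger(Callback):
        """Print a handful of fit() hooks to observe their order."""

        def setup(self, trainer, pl_module, stage):
            print(f"setup(stage={stage!r})")

        def on_fit_start(self, trainer, pl_module):
            print("on_fit_start()")

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            print(f"on_train_batch_start(batch_idx={batch_idx})")

        def on_fit_end(self, trainer, pl_module):
            print("on_fit_end()")

        def teardown(self, trainer, pl_module, stage):
            print(f"teardown(stage={stage!r})")

Passing `HookOrderLogger()` via `Trainer(callbacks=[...])` and running `trainer.fit(model)` prints these hooks in the order shown in the tree.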

docs/source-pytorch/common/trainer.rst

Lines changed: 3 additions & 1 deletion
@@ -510,6 +510,7 @@ limit_train_batches

 How much of training dataset to check.
 Useful when debugging or testing something that happens at the end of an epoch.
+Value is per device.

 .. testcode::

@@ -535,7 +536,7 @@ limit_test_batches
     :width: 400
     :muted:

-How much of test dataset to check.
+How much of test dataset to check. Value is per device.

 .. testcode::

@@ -560,6 +561,7 @@ limit_val_batches

 How much of validation dataset to check.
 Useful when debugging or testing something that happens at the end of an epoch.
+Value is per device.

 .. testcode::
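
To make the per-device wording concrete, here is a short illustrative sketch (not from this commit; the numbers are hypothetical):

.. code-block:: python

    from lightning.pytorch import Trainer

    # Integer limits count batches per device: with 8 devices, each one
    # processes up to 100 batches, i.e. up to 800 batches per epoch overall.
    trainer = Trainer(limit_train_batches=100, accelerator="gpu", devices=8)

    # Float limits are a fraction of the batches each device would see.
    trainer = Trainer(limit_val_batches=0.25)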

docs/source-pytorch/expertise_levels.rst

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ Learn to scale up your models and enable collaborative model development at acad
 .. Add callout items below this line

 .. displayitem::
-   :header: Level 7: Interactive cloud development
+   :header: Level 7: Hardware acceleration
    :description: Learn how to access GPUs and TPUs on the cloud.
    :button_link: levels/intermediate_level_7.html
    :col_css: col-md-6

docs/source-pytorch/levels/intermediate.rst

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Learn to scale up your models and enable collaborative model development at acad
 .. Add callout items below this line

 .. displayitem::
-   :header: Level 7: Interactive cloud development
+   :header: Level 7: Hardware acceleration
    :description: Learn how to access GPUs and TPUs on the cloud.
    :button_link: intermediate_level_7.html
    :col_css: col-md-6
