
Commit 37f0f30

Merge branch 'master' into docs/lightning_module_enhancements
2 parents dcf8b25 + 1fc077b

File tree: 30 files changed, +582 −103 lines

.github/workflows/ci-tests-fabric.yml

Lines changed: 34 additions & 33 deletions
```diff
@@ -62,49 +62,57 @@ jobs:
       env:
         PACKAGE_NAME: ${{ matrix.config.pkg-name }}
         FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
-        PYPI_CACHE_DIR: "_pip-wheels"
         TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/"
         TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/"
         # TODO: Remove this - Enable running MPS tests on this platform
         DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v5

-      - name: Set up Python ${{ matrix.config.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv and set Python version
+        uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.config.python-version || '3.9' }}
+          # TODO: Avoid activating environment like this
+          # see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
+          activate-environment: true
           enable-cache: true

-      - name: basic setup
-        run: pip install -q -r .actions/requirements.txt
+      - name: Basic setup
+        run: uv pip install -q -r .actions/requirements.txt
+
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for MacOS
+        if: ${{ runner.os == 'macOS' }}
+        run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
+
+      - name: Append Env. vars for Windows
+        if: ${{ runner.os == 'windows' }}
+        run: |
+          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
+          echo "USE_LIBUV=0" >> $GITHUB_ENV

       - name: Set min. dependencies
         if: ${{ matrix.config.requires == 'oldest' }}
         run: |
           cd requirements/fabric
-          pip install -U "lightning-utilities[cli]"
+          uv pip install -U "lightning-utilities[cli]"
           python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt', 'test.txt']"
-          pip install "cython<3.0" wheel
-          pip install "pyyaml==5.4" --no-build-isolation
+          uv pip install "cython<3.0" wheel
+          uv pip install "pyyaml==5.4" --no-build-isolation

       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.config.requires != 'oldest' }}
         run: |
-          pip install -q -r requirements/ci.txt
+          uv pip install -q -r requirements/ci.txt
           python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
           for fpath in `ls requirements/**/*.txt`; do \
             python ./adjust-torch-versions.py $fpath ${{ matrix.config.pytorch-version }}; \
           done

-      - name: pip wheels cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.PYPI_CACHE_DIR }}
-          key: pypi_wheels
-      - run: |
-          mkdir -p $PYPI_CACHE_DIR
-          ls -lh $PYPI_CACHE_DIR
-
       - name: Expand Env. variables
         run: |
           # Switch PyTorch URL between stable and test/future
@@ -113,25 +121,15 @@ jobs:
           python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.config.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV
           # if you install mono-package set dependency only for this subpackage
           python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.config.pkg-name}}' != 'lightning' else 'fabric-'))" >> $GITHUB_ENV
-      - name: Append Env. vars for MacOS
-        if: ${{ runner.os == 'macOS' }}
-        run: |
-          # trying to avoid "gloo" issue with SIGABRT
-          echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
-      - name: Append Env. vars for Windows
-        if: ${{ runner.os == 'windows' }}
-        run: |
-          # Avoid issue on Windows with PyTorch 2.4: "RuntimeError: use_libuv was requested but PyTorch was build without libuv support"
-          echo "USE_LIBUV=0" >> $GITHUB_ENV

       - name: Install package & dependencies
         timeout-minutes: 20
         run: |
-          pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
-            -U --upgrade-strategy=eager --prefer-binary \
-            --extra-index-url="${TORCH_URL}" \
-            --find-links="${PYPI_CACHE_DIR}"
-          pip list
+          uv pip install ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
+            --upgrade \
+            --find-links="${TORCH_URL}"
+          uv pip list
+
       - name: Dump handy wheels
         if: github.event_name == 'push' && github.ref == 'refs/heads/master'
         continue-on-error: true
@@ -179,6 +177,9 @@ jobs:
           name: CPU-coverage
           fail_ci_if_error: false

+      - name: Minimize uv cache
+        run: uv cache prune --ci
+
   fabric-cpu-guardian:
     runs-on: ubuntu-latest
     needs: fabric-cpu
```

.github/workflows/ci-tests-pytorch.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -89,6 +89,7 @@ jobs:
       - name: Append Env. vars for Linux
         if: ${{ runner.os == 'Linux' }}
         run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
       - name: Append Env. vars for MacOS
         if: ${{ runner.os == 'macOS' }}
         run: echo "GLOO_SOCKET_IFNAME=lo0" >> $GITHUB_ENV
```

docs/source-fabric/guide/index.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -78,7 +78,7 @@ Build your own Trainer
    <div class="row">

 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: lightning_module.html
    :col_css: col-md-4
```

docs/source-fabric/levels/intermediate.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ Intermediate skills
    <div class="row">

 .. displayitem::
-   :header: Organize your model code with with LightningModule
+   :header: Organize your model code with LightningModule
    :description: Organize your code in a LightningModule and use it with Fabric
    :button_link: ../guide/lightning_module.html
    :col_css: col-md-4
```

docs/source-pytorch/advanced/speed.rst

Lines changed: 12 additions & 1 deletion
```diff
@@ -297,7 +297,8 @@ Validation Within Training Epoch

 For large datasets, it's often desirable to check validation multiple times within a training epoch.
 Pass in a float to check that often within one training epoch. Pass in an int ``K`` to check every ``K`` training batch.
-Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
+Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`. Alternatively, pass a string ("DD:HH:MM:SS"),
+a dict of ``datetime.timedelta`` kwargs, or a ``datetime.timedelta`` to check validation after a given amount of wall-clock time.

 .. testcode::

@@ -310,6 +311,16 @@ Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
    # check every 100 train batches (ie: for IterableDatasets or fixed frequency)
    trainer = Trainer(val_check_interval=100)

+   # check validation every 15 minutes of wall-clock time
+   trainer = Trainer(val_check_interval="00:00:15:00")
+
+   # alternatively, pass a dict of timedelta kwargs
+   trainer = Trainer(val_check_interval={"minutes": 1})
+
+   # or use a timedelta object directly
+   from datetime import timedelta
+   trainer = Trainer(val_check_interval=timedelta(hours=1))
+
 Learn more in our :ref:`trainer_flags` guide.
```
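For reference, the three duration spellings above all describe the same kind of wall-clock interval. A minimal sketch of how such inputs normalize to one `datetime.timedelta` (the `_to_timedelta` helper here is hypothetical, not Lightning's actual parsing code):

```python
from datetime import timedelta
from typing import Union

def _to_timedelta(interval: Union[str, dict, timedelta]) -> timedelta:
    """Normalize a "DD:HH:MM:SS" string, a timedelta-kwargs dict, or a timedelta."""
    if isinstance(interval, timedelta):
        return interval
    if isinstance(interval, dict):
        # e.g. {"minutes": 15} -> timedelta(minutes=15)
        return timedelta(**interval)
    days, hours, minutes, seconds = (int(part) for part in interval.split(":"))
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

# All three spellings denote the same 15-minute interval:
assert _to_timedelta("00:00:15:00") == _to_timedelta({"minutes": 15}) == timedelta(minutes=15)
```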

docs/source-pytorch/common/trainer.rst

Lines changed: 28 additions & 1 deletion
```diff
@@ -991,11 +991,23 @@ val_check_interval
    :muted:

 How often within one training epoch to check the validation set.
-Can specify as float or int.
+Can specify as float, int, or a time-based duration.

 - pass a ``float`` in the range [0.0, 1.0] to check after a fraction of the training epoch.
 - pass an ``int`` to check after a fixed number of training batches. An ``int`` value can only be higher than the number of training
   batches when ``check_val_every_n_epoch=None``, which validates after every ``N`` training batches across epochs or iteration-based training.
+- pass a ``string`` duration in the format "DD:HH:MM:SS", a ``datetime.timedelta`` object, or a ``dictionary`` of keyword arguments that can be passed
+  to ``datetime.timedelta`` for time-based validation. When using a time-based duration, validation will trigger once the elapsed wall-clock time
+  since the last validation exceeds the interval. The validation check occurs after the current batch completes, the validation loop runs, and
+  the timer resets.
+
+**Time-based validation behavior with check_val_every_n_epoch:** When used together with ``val_check_interval`` (time-based) and
+``check_val_every_n_epoch > 1``, validation is aligned to epoch multiples:
+
+- If the time-based interval elapses **before** the next multiple-N epoch, validation runs at the start of that epoch (after the first batch),
+  and the timer resets.
+- If the interval elapses **during** a multiple-N epoch, validation runs after the current batch.
+- For cases where ``check_val_every_n_epoch=None`` or ``1``, the time-based behavior of ``val_check_interval`` applies without additional alignment.

 .. testcode::

@@ -1013,10 +1025,25 @@ Can specify as float or int.
    # (ie: production cases with streaming data)
    trainer = Trainer(val_check_interval=1000, check_val_every_n_epoch=None)

+   # check validation every 15 minutes of wall-clock time using a string-based approach
+   trainer = Trainer(val_check_interval="00:00:15:00")
+
+   # check validation every 15 minutes of wall-clock time using a dictionary-based approach
+   trainer = Trainer(val_check_interval={"minutes": 15})
+
+   # check validation every 1 hour of wall-clock time using a dictionary-based approach
+   trainer = Trainer(val_check_interval={"hours": 1})
+
+   # check validation every 1 hour of wall-clock time using a datetime.timedelta object
+   from datetime import timedelta
+   trainer = Trainer(val_check_interval=timedelta(hours=1))
+
 .. code-block:: python

    # Here is the computation to estimate the total number of batches seen within an epoch.
+   # This logic applies when `val_check_interval` is specified as an integer or a float.

    # Find the total number of train batches
    total_train_batches = total_train_samples // (train_batch_size * world_size)
```
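The documented trigger-and-reset semantics can be modeled in a few lines. This is a simplified illustration of the behavior the docs describe, not the Trainer's internal implementation:

```python
import time
from datetime import timedelta

interval = timedelta(minutes=15)   # e.g. val_check_interval={"minutes": 15}
last_validation = time.monotonic()

def should_validate_after_batch() -> bool:
    """Checked after each training batch completes."""
    global last_validation
    if time.monotonic() - last_validation >= interval.total_seconds():
        last_validation = time.monotonic()  # the timer resets once validation runs
        return True
    return False
```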

src/lightning/fabric/CHANGELOG.md

Lines changed: 5 additions & 1 deletion
```diff
@@ -22,14 +22,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Changed

--
+- let `_get_default_process_group_backend_for_device` support more hardware platforms
+  ([#21057](https://github.com/Lightning-AI/pytorch-lightning/pull/21057), [#21093](https://github.com/Lightning-AI/pytorch-lightning/pull/21093))


 ### Fixed

 - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105))


+- Respect `verbose=False` in `seed_everything` when no seed is provided
+
 ---

 ## [2.5.4] - 2025-08-29
```

src/lightning/fabric/strategies/ddp.py

Lines changed: 11 additions & 1 deletion
```diff
@@ -160,7 +160,17 @@ def barrier(self, *args: Any, **kwargs: Any) -> None:
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=self._determine_ddp_device_ids())
         else:
-            torch.distributed.barrier()
+            # Handle PyTorch bug where barrier() fails on CPU with "PrivateUse1HooksInterface" error
+            try:
+                torch.distributed.barrier()
+            except RuntimeError as e:
+                if "PrivateUse1HooksInterface" in str(e):
+                    # Fallback: Use all_reduce as barrier - all processes must participate
+                    # This achieves the same synchronization effect as barrier()
+                    dummy_tensor = torch.tensor(0.0, device=self.root_device)
+                    torch.distributed.all_reduce(dummy_tensor)
+                else:
+                    raise

     @override
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
```
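The fallback works because `all_reduce` is a blocking collective: no rank can return from it until every rank has entered it, which gives the same rendezvous guarantee as `barrier()`. A standalone sketch of the idea, assuming a multi-process launch such as `torchrun --nproc_per_node=2`:

```python
import torch
import torch.distributed as dist

def barrier_via_all_reduce() -> None:
    # Each rank contributes a dummy scalar; all_reduce only completes once
    # every rank participates, so returning implies all ranks have arrived.
    dummy = torch.tensor(0.0)
    dist.all_reduce(dummy)

if __name__ == "__main__":
    dist.init_process_group("gloo")  # rendezvous config comes from torchrun env vars
    barrier_via_all_reduce()
    print(f"rank {dist.get_rank()} passed the barrier")
    dist.destroy_process_group()
```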

src/lightning/fabric/utilities/distributed.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -319,7 +319,11 @@ def _destroy_dist_connection() -> None:


 def _get_default_process_group_backend_for_device(device: torch.device) -> str:
-    return "nccl" if device.type == "cuda" else "gloo"
+    """Return corresponding distributed backend for a given device."""
+    device_backend_map = torch.distributed.Backend.default_device_backend_map
+    if device.type in device_backend_map:
+        return device_backend_map[device.type]
+    return "gloo"


 class _DatasetSamplerWrapper(Dataset):
```
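`torch.distributed.Backend.default_device_backend_map` maps device types to their registered default backends (e.g. `cpu` → `gloo`, `cuda` → `nccl`), so device types registered by out-of-tree accelerator plugins are now resolved automatically instead of falling through the old cuda-only check. A quick look at the lookup the new helper performs (exact map contents depend on the installed PyTorch build):

```python
import torch

backend_map = torch.distributed.Backend.default_device_backend_map
print(backend_map)  # e.g. {'cpu': 'gloo', 'cuda': 'nccl', ...}

# Equivalent lookup with "gloo" as the fallback, as in the updated helper:
device = torch.device("cpu")
print(backend_map.get(device.type, "gloo"))  # -> 'gloo'
```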

src/lightning/fabric/utilities/seed.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -40,7 +40,8 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
     env_seed = os.environ.get("PL_GLOBAL_SEED")
     if env_seed is None:
         seed = 0
-        rank_zero_warn(f"No seed found, seed set to {seed}")
+        if verbose:
+            rank_zero_warn(f"No seed found, seed set to {seed}")
     else:
         try:
             seed = int(env_seed)
```
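With this change, the "No seed found" warning respects the `verbose` flag when neither a `seed` argument nor `PL_GLOBAL_SEED` is set:

```python
from lightning.fabric.utilities.seed import seed_everything

# Assuming PL_GLOBAL_SEED is unset in the environment:
seed_everything(verbose=False)  # falls back to seed 0 silently
seed_everything()               # warns: "No seed found, seed set to 0"
```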
