Skip to content

Commit a83fcef

Browse files
Allow cross-device local checkpoints with fsspec>=2025.5.0 (Lightning-AI#20780)
Cross-device transactions via fsspec (used, for example, in ModelCheckpoint) resulted in permission errors. The permission errors were caused by attempts to change file modes on a different filesystem. This was fixed in fsspec 2025.5.0. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e15cd67 commit a83fcef

File tree

4 files changed

+45
-5
lines changed

4 files changed

+45
-5
lines changed

src/lightning/fabric/utilities/cloud_io.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
"""Utilities related to data saving/loading."""
1515

16+
import errno
1617
import io
1718
import logging
1819
from pathlib import Path
@@ -84,10 +85,16 @@ def _atomic_save(checkpoint: dict[str, Any], filepath: Union[str, Path]) -> None
8485
log.debug(f"Saving checkpoint: {filepath}")
8586
torch.save(checkpoint, bytesbuffer)
8687

87-
# We use a transaction here to avoid file corruption if the save gets interrupted
88-
fs, urlpath = fsspec.core.url_to_fs(str(filepath))
89-
with fs.transaction, fs.open(urlpath, "wb") as f:
90-
f.write(bytesbuffer.getvalue())
88+
try:
89+
# We use a transaction here to avoid file corruption if the save gets interrupted
90+
fs, urlpath = fsspec.core.url_to_fs(str(filepath))
91+
with fs.transaction, fs.open(urlpath, "wb") as f:
92+
f.write(bytesbuffer.getvalue())
93+
except PermissionError as e:
94+
if isinstance(e.__context__, OSError) and getattr(e.__context__, "errno", None) == errno.EXDEV:
95+
raise RuntimeError(
96+
'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"',
97+
) from e
9198

9299

93100
def _is_object_storage(fs: AbstractFileSystem) -> bool:

src/lightning/pytorch/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1111

1212
- Add enable_autolog_hparams argument to Trainer ([#20593](https://github.com/Lightning-AI/pytorch-lightning/pull/20593))
1313

14+
- For cross-device local checkpoints, instruct users to install `fsspec>=2025.5.0` if unavailable ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780))
15+
1416

1517
### Changed
1618

1719
-
1820

19-
2021
### Removed
2122

2223
-

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ class ModelCheckpoint(Checkpoint):
155155
If the checkpoint's ``dirpath`` changed from what it was before while resuming the training,
156156
only ``best_model_path`` will be reloaded and a warning will be issued.
157157
158+
If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod``
159+
to raise a ``PermissionError``), install ``fsspec>=2025.5.0``. Then the error is caught, the file's permissions
160+
remain unchanged, and the checkpoint is still saved. Otherwise, no checkpoint will be saved and training stops.
161+
158162
Raises:
159163
MisconfigurationException:
160164
If ``save_top_k`` is smaller than ``-1``,

tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import errno
1415
import os
16+
import re
1517
from unittest import mock
1618
from unittest.mock import ANY, Mock
1719

20+
import fsspec
1821
import pytest
1922
import torch
2023

@@ -105,6 +108,31 @@ def test_hpc_max_ckpt_version(tmp_path):
105108
)
106109

107110

111+
def test_local_cross_device_checkpoint(tmpdir):
    """Test that the _CheckpointConnector can write local cross-device files or raises an error if fsspec<2025.5.0.

    ``os.rename`` is patched to raise ``OSError(errno.EXDEV)`` and ``os.chmod`` to raise ``PermissionError``
    while the checkpoint is saved. With an older fsspec the save is expected to raise a ``RuntimeError`` that
    tells the user to upgrade; otherwise the save is expected to complete without raising.

    """
    # Compare the version numerically: a plain string comparison is lexicographic and would
    # order e.g. "2025.10.0" before "2025.5.0", wrongly selecting the "outdated" branch.
    parsed_version = re.match(r"(\d+)\.(\d+)\.(\d+)", fsspec.__version__)
    fsspec_is_outdated = parsed_version is not None and tuple(map(int, parsed_version.groups())) < (2025, 5, 0)

    model = BoringModel()
    # hardcoding dir since `tmp_path` can be windows path
    trainer = Trainer(
        default_root_dir="memory://test_ckpt_for_fsspec", limit_train_batches=1, limit_val_batches=1, max_epochs=1
    )
    trainer.fit(model)
    ckpt_path = tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt.ckpt"
    # Simulate the behavior of fsspec when writing to a local file system on another device.
    with (
        mock.patch("os.rename", side_effect=OSError(errno.EXDEV, "Invalid cross-device link")),
        mock.patch("os.chmod", side_effect=PermissionError("Operation not permitted")),
    ):
        if fsspec_is_outdated:
            with pytest.raises(
                RuntimeError,
                match=re.escape(
                    'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"'
                ),
            ):
                trainer.save_checkpoint(ckpt_path)
        else:
            trainer.save_checkpoint(ckpt_path)
134+
135+
108136
def test_ckpt_for_fsspec():
109137
"""Test that the _CheckpointConnector is able to write to fsspec file systems."""
110138
model = BoringModel()

0 commit comments

Comments
 (0)