Commit f014639

Merge branch 'master' into feat/dynamo_export_onnx
2 parents: 819d3c8 + c6b6553

File tree

8 files changed: +51 -9 lines changed

.github/CONTRIBUTING.md
Lines changed: 30 additions & 0 deletions

@@ -109,6 +109,36 @@
 
 ## Guidelines
 
+### Development environment
+
+To set up a local development environment, we recommend using `uv`, which can be installed by following its [installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
+
+Once `uv` has been installed, begin by cloning the repository:
+
+```bash
+git clone https://github.com/Lightning-AI/lightning.git
+cd lightning
+```
+
+Once at the root level of the repository, create a new virtual environment and install the project dependencies:
+
+```bash
+uv venv
+# uv venv --python 3.11  # use this instead if you need a specific Python version
+
+source .venv/bin/activate  # command may differ based on your shell
+uv pip install ".[dev, examples]"
+```
+
+Once the dependencies have been installed, install pre-commit and set up the git hook scripts:
+
+```bash
+uv pip install pre-commit
+pre-commit install
+```
+
+For more details on the uv commands used above, refer to uv's documentation on its [pip interface](https://docs.astral.sh/uv/pip/).
+
 ### Development scripts
 
 To build the documentation locally, simply execute the following commands from the project root (only for Unix):

requirements/ci.txt
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 setuptools <80.9.1
 wheel <0.46.0
-awscli >=1.30.0, <1.41.0
+awscli >=1.30.0, <1.42.0
 twine ==6.1.0
 importlib-metadata <9.0.0
 wget

requirements/fabric/test.txt
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-coverage ==7.9.1
+coverage ==7.9.2
 numpy >=1.17.2, <1.27.0
 pytest ==8.4.1
 pytest-cov ==6.2.1

requirements/pytorch/base.txt
Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
-torch >=2.1.0, <2.8.0
+torch >=2.1.0, <=2.8.0
 tqdm >=4.57.0, <4.68.0
 PyYAML >5.4, <6.1.0
 fsspec[http] >=2022.5.0, <2025.6.0

requirements/pytorch/test.txt
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-coverage ==7.9.1
+coverage ==7.9.2
 pytest ==8.4.1
 pytest-cov ==6.2.1
 pytest-timeout ==2.4.0

src/lightning/pytorch/callbacks/model_checkpoint.py
Lines changed: 8 additions & 2 deletions

@@ -133,9 +133,15 @@ class ModelCheckpoint(Checkpoint):
             will only save checkpoints at epochs 0 < E <= N
             where both values for ``every_n_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
         save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch.
-            If this is ``False``, then the check runs at the end of the validation.
+            If ``True``, checkpoints are saved at the end of every training epoch.
+            If ``False``, checkpoints are saved at the end of validation.
+            If ``None`` (default), the behavior is inferred from the training configuration:
+            if ``check_val_every_n_epoch != 1``, checkpointing is not performed at the end of
+            every training epoch; if there are no validation batches, checkpointing occurs at the
+            end of the training epoch; and if there is a non-default number of validation runs per
+            training epoch (``val_check_interval != 1``), checkpointing is performed after validation.
         enable_version_counter: Whether to append a version to the existing file name.
-            If this is ``False``, then the checkpoint files will be overwritten.
+            If ``False``, then the checkpoint files will be overwritten.
 
     Note:
         For extra customization, ModelCheckpoint includes the following attributes:
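As a minimal usage sketch of the documented `save_on_train_epoch_end` values (not part of this commit; the checkpoint directory and trainer settings are illustrative placeholders):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

# Always checkpoint at the end of each training epoch.
ckpt_train_end = ModelCheckpoint(dirpath="checkpoints/", save_on_train_epoch_end=True)

# Checkpoint after validation instead.
ckpt_after_val = ModelCheckpoint(dirpath="checkpoints/", save_on_train_epoch_end=False)

# Default (None): validation runs four times per epoch (val_check_interval != 1),
# so per the docstring above, checkpointing follows validation.
trainer = Trainer(max_epochs=3, val_check_interval=0.25,
                  callbacks=[ModelCheckpoint(dirpath="checkpoints/")])
```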

src/lightning/pytorch/trainer/connectors/accelerator_connector.py
Lines changed: 4 additions & 3 deletions

@@ -453,10 +453,11 @@ def _check_strategy_and_fallback(self) -> None:
 
         if (
             strategy_flag in FSDPStrategy.get_registered_strategies() or type(self._strategy_flag) is FSDPStrategy
-        ) and self._accelerator_flag not in ("cuda", "gpu"):
+        ) and not (self._accelerator_flag in ("cuda", "gpu") or isinstance(self._accelerator_flag, CUDAAccelerator)):
             raise ValueError(
-                f"The strategy `{FSDPStrategy.strategy_name}` requires a GPU accelerator, but got:"
-                f" {self._accelerator_flag}"
+                f"The strategy `{FSDPStrategy.strategy_name}` requires a GPU accelerator, but received "
+                f"`accelerator={self._accelerator_flag!r}`. Please set `accelerator='cuda'`, `accelerator='gpu'`,"
+                " or pass a `CUDAAccelerator()` instance to use FSDP."
             )
         if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
            raise ValueError(
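As a hedged illustration of the relaxed check (not part of the diff; assumes a machine with at least one CUDA GPU), the following configurations would all pass:

```python
from lightning.pytorch import Trainer
from lightning.pytorch.accelerators import CUDAAccelerator

# String flags were accepted before this change and still are.
Trainer(strategy="fsdp", accelerator="cuda")
Trainer(strategy="fsdp", accelerator="gpu")

# A CUDAAccelerator instance is now also accepted via the isinstance() branch.
Trainer(strategy="fsdp", accelerator=CUDAAccelerator())

# Any other accelerator still raises the (now clearer) ValueError:
# Trainer(strategy="fsdp", accelerator="cpu")  # requires a GPU accelerator
```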

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
Lines changed: 5 additions & 0 deletions

@@ -582,6 +582,11 @@ class AcceleratorSubclass(CPUAccelerator):
     Trainer(accelerator=AcceleratorSubclass(), strategy=FSDPStrategySubclass())
 
 
+@RunIf(min_cuda_gpus=1)
+def test_check_fsdp_strategy_and_fallback_with_cudaaccelerator():
+    Trainer(strategy="fsdp", accelerator=CUDAAccelerator())
+
+
 @mock.patch.dict(os.environ, {}, clear=True)
 def test_unsupported_tpu_choice(xla_available, tpu_available):
     # if user didn't set strategy, _Connector will choose the SingleDeviceXLAStrategy or XLAStrategy
