
Commit 051a86b

Merge branch 'master' into docs/links
2 parents 6c148ca + 4e3cf67 commit 051a86b

File tree

7 files changed: +30 -8 lines changed


.github/markdown-links-config.json

Lines changed: 5 additions & 1 deletion
@@ -22,5 +22,9 @@
       "Accept-Encoding": "zstd, br, gzip, deflate"
     }
   }
-  ]
+  ],
+  "timeout": "20s",
+  "retryOn429": true,
+  "retryCount": 5,
+  "fallbackRetryDelay": "20s"
 }
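The keys added here (`timeout`, `retryOn429`, `retryCount`, `fallbackRetryDelay`) are retry and timeout options in the markdown-link-check style of config that this file follows. As a rough sketch of what they mean in practice (not part of this commit; the `check_link` helper and the use of `requests` are purely illustrative), a checker honoring this config would retry 429 responses up to `retryCount` times, preferring the server's `Retry-After` header and otherwise waiting `fallbackRetryDelay`:

```python
# Illustrative sketch only: how a link checker might apply the retry settings above.
# Assumes the `requests` package and a numeric Retry-After header.
import json
import time

import requests

with open(".github/markdown-links-config.json") as f:
    config = json.load(f)

retry_count = config.get("retryCount", 0)                                   # e.g. 5
fallback_delay = int(config.get("fallbackRetryDelay", "20s").rstrip("s"))   # e.g. 20 seconds
timeout = int(config.get("timeout", "20s").rstrip("s"))


def check_link(url: str) -> int:
    """Return the final HTTP status for `url`, retrying 429 (rate-limited) responses."""
    for _ in range(retry_count + 1):
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        if response.status_code != 429 or not config.get("retryOn429", False):
            return response.status_code
        # Prefer the server-provided Retry-After delay, else fall back to the configured one.
        delay = int(response.headers.get("Retry-After", fallback_delay))
        time.sleep(delay)
    return response.status_code
```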

README.md

Lines changed: 6 additions & 0 deletions
@@ -55,6 +55,12 @@ ______________________________________________________________________
 
 
 
+# Why PyTorch Lightning?
+
+Training models in plain PyTorch is tedious and error-prone - you have to manually handle things like backprop, mixed precision, multi-GPU, and distributed training, often rewriting code for every new project. PyTorch Lightning organizes PyTorch code to automate those complexities so you can focus on your model and data, while keeping full control and scaling from CPU to multi-node without changing your core code.
+
+Fun analogy: If PyTorch is Javascript, PyTorch Lightning is ReactJS or NextJS.
+
 # Lightning has 2 core packages
 
 [PyTorch Lightning: Train and deploy PyTorch at scale](#why-pytorch-lightning).
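For readers landing on this new README section, a minimal sketch of the workflow it describes may help; the model, data, and hyperparameters below are placeholders, not part of the commit:

```python
# Minimal sketch: the LightningModule holds model/optimizer logic, and the Trainer
# automates the training loop, device placement, and scaling.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import lightning as L


class LitRegressor(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = nn.functional.mse_loss(self.model(x), y)
        self.log("train_loss", loss)
        return loss  # Lightning handles backward(), optimizer.step(), and zero_grad()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


if __name__ == "__main__":
    dataset = TensorDataset(torch.randn(256, 32), torch.randn(256, 1))
    trainer = L.Trainer(max_epochs=2, accelerator="auto", devices="auto")
    trainer.fit(LitRegressor(), DataLoader(dataset, batch_size=32))
```

The same script scales from CPU to multi-GPU or multi-node by changing Trainer arguments such as `accelerator`, `devices`, and `strategy`, rather than rewriting the training loop.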

_notebooks

requirements/pytorch/base.txt

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
-torch >=2.1.0, <2.8.0
+torch >=2.1.0, <=2.8.0
 tqdm >=4.57.0, <4.68.0
 PyYAML >5.4, <6.1.0
 fsspec[http] >=2022.5.0, <2025.6.0

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 8 additions & 2 deletions
@@ -133,9 +133,15 @@ class ModelCheckpoint(Checkpoint):
             will only save checkpoints at epochs 0 < E <= N
             where both values for ``every_n_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
         save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch.
-            If this is ``False``, then the check runs at the end of the validation.
+            If ``True``, checkpoints are saved at the end of every training epoch.
+            If ``False``, checkpoints are saved at the end of validation.
+            If ``None`` (default), checkpointing behavior is determined based on training configuration.
+            If ``check_val_every_n_epoch != 1``, checkpointing will not be performed at the end of
+            every training epoch. If there are no validation batches of data, checkpointing will occur at the
+            end of the training epoch. If there is a non-default number of validation runs per training epoch
+            (``val_check_interval != 1``), checkpointing is performed after validation.
         enable_version_counter: Whether to append a version to the existing file name.
-            If this is ``False``, then the checkpoint files will be overwritten.
+            If ``False``, then the checkpoint files will be overwritten.
 
     Note:
         For extra customization, ModelCheckpoint includes the following attributes:
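To make the expanded docstring concrete, here is a minimal usage sketch of `save_on_train_epoch_end`; the directory, metric name, and surrounding Trainer setup are placeholders, not part of the commit:

```python
# Usage sketch for the documented flag (paths and metric names are placeholders).
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

# save_on_train_epoch_end=False: checkpoints are written after validation, so a
# monitored validation metric is up to date when the checkpoint is saved.
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints/",
    monitor="val_loss",
    save_top_k=3,
    save_on_train_epoch_end=False,
)

# save_on_train_epoch_end=None (default): Lightning derives the behavior from the
# training configuration, e.g. check_val_every_n_epoch and val_check_interval.
trainer = Trainer(max_epochs=10, callbacks=[checkpoint_cb])
```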

src/lightning/pytorch/trainer/connectors/accelerator_connector.py

Lines changed: 4 additions & 3 deletions
@@ -453,10 +453,11 @@ def _check_strategy_and_fallback(self) -> None:
 
         if (
             strategy_flag in FSDPStrategy.get_registered_strategies() or type(self._strategy_flag) is FSDPStrategy
-        ) and self._accelerator_flag not in ("cuda", "gpu"):
+        ) and not (self._accelerator_flag in ("cuda", "gpu") or isinstance(self._accelerator_flag, CUDAAccelerator)):
             raise ValueError(
-                f"The strategy `{FSDPStrategy.strategy_name}` requires a GPU accelerator, but got:"
-                f" {self._accelerator_flag}"
+                f"The strategy `{FSDPStrategy.strategy_name}` requires a GPU accelerator, but received "
+                f"`accelerator={self._accelerator_flag!r}`. Please set `accelerator='cuda'`, `accelerator='gpu'`,"
+                " or pass a `CUDAAccelerator()` instance to use FSDP."
             )
         if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
             raise ValueError(
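A short sketch of the behavior this change enables, assuming at least one CUDA GPU is available; the surrounding script is illustrative and not part of the commit:

```python
# Sketch of the new FSDP accelerator check (requires a CUDA GPU).
from lightning.pytorch import Trainer
from lightning.pytorch.accelerators import CUDAAccelerator

# Previously, passing a CUDAAccelerator instance raised ValueError even though it is
# a GPU accelerator; with this change an instance is accepted alongside the strings.
Trainer(strategy="fsdp", accelerator=CUDAAccelerator())
Trainer(strategy="fsdp", accelerator="cuda")

# A non-GPU accelerator still fails, now with a message that suggests valid options.
try:
    Trainer(strategy="fsdp", accelerator="cpu")
except ValueError as err:
    print(err)  # "The strategy `fsdp` requires a GPU accelerator, but received ..."
```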

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py

Lines changed: 5 additions & 0 deletions
@@ -582,6 +582,11 @@ class AcceleratorSubclass(CPUAccelerator):
     Trainer(accelerator=AcceleratorSubclass(), strategy=FSDPStrategySubclass())
 
 
+@RunIf(min_cuda_gpus=1)
+def test_check_fsdp_strategy_and_fallback_with_cudaaccelerator():
+    Trainer(strategy="fsdp", accelerator=CUDAAccelerator())
+
+
 @mock.patch.dict(os.environ, {}, clear=True)
 def test_unsupported_tpu_choice(xla_available, tpu_available):
     # if user didn't set strategy, _Connector will choose the SingleDeviceXLAStrategy or XLAStrategy
