Commit 169e20c

Merge branch 'master' into fsdp-grad-clip-by-norm
2 parents dee2225 + 1fc077b commit 169e20c

File tree

24 files changed: +482, -65 lines

docs/source-pytorch/advanced/speed.rst

Lines changed: 12 additions & 1 deletion
@@ -297,7 +297,8 @@ Validation Within Training Epoch
 
 For large datasets, it's often desirable to check validation multiple times within a training epoch.
 Pass in a float to check that often within one training epoch. Pass in an int ``K`` to check every ``K`` training batch.
-Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
+Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`. Alternatively, pass a string ("DD:HH:MM:SS"),
+a dict of ``datetime.timedelta`` kwargs, or a ``datetime.timedelta`` to check validation after a given amount of wall-clock time.
 
 .. testcode::
 
@@ -310,6 +311,16 @@ Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
     # check every 100 train batches (ie: for IterableDatasets or fixed frequency)
     trainer = Trainer(val_check_interval=100)
 
+    # check validation every 15 minutes of wall-clock time
+    trainer = Trainer(val_check_interval="00:00:15:00")
+
+    # alternatively, pass a dict of timedelta kwargs
+    trainer = Trainer(val_check_interval={"minutes": 1})
+
+    # or use a timedelta object directly
+    from datetime import timedelta
+    trainer = Trainer(val_check_interval=timedelta(hours=1))
+
 Learn more in our :ref:`trainer_flags` guide.
 
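The documented "DD:HH:MM:SS" format maps directly onto `datetime.timedelta` fields. As a rough illustration of that mapping (the `parse_duration` helper below is hypothetical and not part of this commit):

    # Hypothetical helper: shows how a "DD:HH:MM:SS" duration string corresponds
    # to a datetime.timedelta. Lightning's own parsing may differ in its details.
    from datetime import timedelta

    def parse_duration(value: str) -> timedelta:
        days, hours, minutes, seconds = (int(part) for part in value.split(":"))
        return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

    assert parse_duration("00:00:15:00") == timedelta(minutes=15)
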
docs/source-pytorch/common/trainer.rst

Lines changed: 28 additions & 1 deletion
@@ -991,11 +991,23 @@ val_check_interval
     :muted:
 
 How often within one training epoch to check the validation set.
-Can specify as float or int.
+Can specify as float, int, or a time-based duration.
 
 - pass a ``float`` in the range [0.0, 1.0] to check after a fraction of the training epoch.
 - pass an ``int`` to check after a fixed number of training batches. An ``int`` value can only be higher than the number of training
   batches when ``check_val_every_n_epoch=None``, which validates after every ``N`` training batches across epochs or iteration-based training.
+- pass a ``string`` duration in the format "DD:HH:MM:SS", a ``datetime.timedelta`` object, or a ``dictionary`` of keyword arguments that can be passed
+  to ``datetime.timedelta`` for time-based validation. When using a time-based duration, validation triggers once the elapsed wall-clock time
+  since the last validation exceeds the interval. The validation check occurs after the current batch completes, the validation loop runs, and
+  the timer resets.
+
+**Time-based validation behavior with check_val_every_n_epoch:** When a time-based ``val_check_interval`` is used together with
+``check_val_every_n_epoch > 1``, validation is aligned to epoch multiples:
+
+- If the time-based interval elapses **before** the next multiple-of-N epoch, validation runs at the start of that epoch (after the first batch),
+  and the timer resets.
+- If the interval elapses **during** a multiple-of-N epoch, validation runs after the current batch.
+- If ``check_val_every_n_epoch`` is ``None`` or ``1``, the time-based behavior of ``val_check_interval`` applies without additional alignment.
 
 .. testcode::
 
@@ -1013,10 +1025,25 @@ Can specify as float or int.
     # (ie: production cases with streaming data)
     trainer = Trainer(val_check_interval=1000, check_val_every_n_epoch=None)
 
+    # check validation every 15 minutes of wall-clock time using a string-based approach
+    trainer = Trainer(val_check_interval="00:00:15:00")
+
+    # check validation every 15 minutes of wall-clock time using a dictionary-based approach
+    trainer = Trainer(val_check_interval={"minutes": 15})
+
+    # check validation every 1 hour of wall-clock time using a dictionary-based approach
+    trainer = Trainer(val_check_interval={"hours": 1})
+
+    # check validation every 1 hour of wall-clock time using a datetime.timedelta object
+    from datetime import timedelta
+    trainer = Trainer(val_check_interval=timedelta(hours=1))
+
+
 
 .. code-block:: python
 
     # Here is the computation to estimate the total number of batches seen within an epoch.
+    # This logic applies when `val_check_interval` is specified as an integer or a float.
 
     # Find the total number of train batches
     total_train_batches = total_train_samples // (train_batch_size * world_size)
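To make the epoch-alignment rules documented above concrete, here is a small, self-contained sketch of the decision they describe. It is an illustration under stated assumptions, not the code path Lightning uses; the function name and arguments are invented for the example:

    from datetime import timedelta

    def should_validate_now(elapsed_seconds: float, interval: timedelta, aligned_epoch: bool) -> bool:
        """Illustrative decision only.

        elapsed_seconds: wall-clock time since the last validation run.
        aligned_epoch: True when check_val_every_n_epoch is None or 1, or when
        the current epoch is a multiple of N.
        """
        if elapsed_seconds < interval.total_seconds():
            return False  # timer has not expired yet
        return aligned_epoch  # expired: validate once an eligible epoch is reached

    # A 15-minute interval that expired midway through an eligible epoch:
    assert should_validate_now(20 * 60, timedelta(minutes=15), aligned_epoch=True)

When validation does run, the evaluation loop change further below resets the timer by recording `time.monotonic()` at the end of the loop.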

src/lightning/fabric/CHANGELOG.md

Lines changed: 5 additions & 1 deletion
@@ -22,14 +22,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
--
+- let `_get_default_process_group_backend_for_device` support more hardware platforms (
+  [#21057](https://github.com/Lightning-AI/pytorch-lightning/pull/21057), [#21093](https://github.com/Lightning-AI/pytorch-lightning/pull/21093))
 
 
 ### Fixed
 
 - Fixed with adding a missing device id for pytorch 2.8 ([#21105](https://github.com/Lightning-AI/pytorch-lightning/pull/21105))
 
 
+- Respect `verbose=False` in `seed_everything` when no seed is provided
+
+
 ---
 
 ## [2.5.4] - 2025-08-29

src/lightning/fabric/strategies/ddp.py

Lines changed: 11 additions & 1 deletion
@@ -160,7 +160,17 @@ def barrier(self, *args: Any, **kwargs: Any) -> None:
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=self._determine_ddp_device_ids())
         else:
-            torch.distributed.barrier()
+            # Handle PyTorch bug where barrier() fails on CPU with "PrivateUse1HooksInterface" error
+            try:
+                torch.distributed.barrier()
+            except RuntimeError as e:
+                if "PrivateUse1HooksInterface" in str(e):
+                    # Fallback: Use all_reduce as barrier - all processes must participate
+                    # This achieves the same synchronization effect as barrier()
+                    dummy_tensor = torch.tensor(0.0, device=self.root_device)
+                    torch.distributed.all_reduce(dummy_tensor)
+                else:
+                    raise
 
     @override
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:

src/lightning/fabric/utilities/distributed.py

Lines changed: 5 additions & 1 deletion
@@ -319,7 +319,11 @@ def _destroy_dist_connection() -> None:
 
 
 def _get_default_process_group_backend_for_device(device: torch.device) -> str:
-    return "nccl" if device.type == "cuda" else "gloo"
+    """Return corresponding distributed backend for a given device."""
+    device_backend_map = torch.distributed.Backend.default_device_backend_map
+    if device.type in device_backend_map:
+        return device_backend_map[device.type]
+    return "gloo"
 
 
 class _DatasetSamplerWrapper(Dataset):
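For context on the new lookup, a quick check of what it resolves to; the exact contents of `default_device_backend_map` depend on the installed PyTorch build, so the CUDA assertion below is an assumption about a CUDA-enabled build:

    import torch
    from lightning.fabric.utilities.distributed import _get_default_process_group_backend_for_device

    # Typical defaults (build-dependent): {"cpu": "gloo", "cuda": "nccl", ...}
    print(torch.distributed.Backend.default_device_backend_map)

    assert _get_default_process_group_backend_for_device(torch.device("cpu")) == "gloo"
    assert _get_default_process_group_backend_for_device(torch.device("cuda")) == "nccl"  # assumes a CUDA build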

src/lightning/fabric/utilities/seed.py

Lines changed: 2 additions & 1 deletion
@@ -40,7 +40,8 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False, verbose:
     env_seed = os.environ.get("PL_GLOBAL_SEED")
     if env_seed is None:
         seed = 0
-        rank_zero_warn(f"No seed found, seed set to {seed}")
+        if verbose:
+            rank_zero_warn(f"No seed found, seed set to {seed}")
     else:
         try:
             seed = int(env_seed)
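A minimal usage sketch of the behavior this change enables, assuming `PL_GLOBAL_SEED` is not set in the environment:

    from lightning.fabric.utilities.seed import seed_everything

    # No seed argument and no PL_GLOBAL_SEED: falls back to seed 0.
    seed_everything()                # warns "No seed found, seed set to 0"
    seed_everything(verbose=False)   # same fallback, but the warning is now suppressed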

src/lightning/pytorch/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `PossibleUserWarning` that is raised if modules are in eval mode when training starts ([#21146](https://github.com/Lightning-AI/pytorch-lightning/pull/21146))
 
 
+- Added time-based validation support through `val_check_interval` ([#21071](https://github.com/Lightning-AI/pytorch-lightning/pull/21071))
+
+
 ### Changed
 
 - Default to `RichProgressBar` and `RichModelSummary` if the rich package is available. Fallback to TQDMProgressBar and ModelSummary otherwise. ([#9580](https://github.com/Lightning-AI/pytorch-lightning/pull/9580))
@@ -48,6 +51,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `TQDMProgressBar` not resetting correctly when using both a finite and iterable dataloader ([#21147](https://github.com/Lightning-AI/pytorch-lightning/pull/21147))
 
+
+- Fixed cleanup of temporary files from `Tuner` on crashes ([#21162](https://github.com/Lightning-AI/pytorch-lightning/pull/21162))
+
 ---
 
 ## [2.5.4] - 2025-08-29

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 6 additions & 0 deletions
@@ -137,6 +137,8 @@ class ModelCheckpoint(Checkpoint):
             If ``True``, checkpoints are saved at the end of every training epoch.
             If ``False``, checkpoints are saved at the end of validation.
             If ``None`` (default), checkpointing behavior is determined based on training configuration.
+            If ``val_check_interval`` is a str, dict, or ``timedelta`` (time-based), checkpointing is performed after
+            validation.
             If ``check_val_every_n_epoch != 1``, checkpointing will not be performed at the end of
             every training epoch. If there are no validation batches of data, checkpointing will occur at the
             end of the training epoch. If there is a non-default number of validation runs per training epoch
@@ -517,6 +519,10 @@ def _should_save_on_train_epoch_end(self, trainer: "pl.Trainer") -> bool:
         if self._save_on_train_epoch_end is not None:
             return self._save_on_train_epoch_end
 
+        # time-based validation: always defer saving to validation end
+        if getattr(trainer, "_val_check_time_interval", None) is not None:
+            return False
+
         # if `check_val_every_n_epoch != 1`, we can't say when the validation dataloader will be loaded
         # so let's not enforce saving at every training epoch end
         if trainer.check_val_every_n_epoch != 1:
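A brief usage sketch of the documented interaction; the model, dataloaders, and the `val_loss` metric name are placeholders for the example:

    from datetime import timedelta
    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import ModelCheckpoint

    # With a time-based val_check_interval, ModelCheckpoint defers saving to the
    # end of each validation run rather than the end of the training epoch.
    checkpoint = ModelCheckpoint(monitor="val_loss")
    trainer = Trainer(val_check_interval=timedelta(minutes=30), callbacks=[checkpoint])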

src/lightning/pytorch/loops/evaluation_loop.py

Lines changed: 4 additions & 0 deletions
@@ -15,6 +15,7 @@
 import os
 import shutil
 import sys
+import time
 from collections import ChainMap, OrderedDict, defaultdict
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
@@ -314,6 +315,9 @@ def on_run_end(self) -> list[_OUT_DICT]:
         if self.verbose and self.trainer.is_global_zero:
             self._print_results(logged_outputs, self._stage.value)
 
+        now = time.monotonic()
+        self.trainer._last_val_time = now
+
         return logged_outputs
 
     def teardown(self) -> None:

src/lightning/pytorch/loops/fit_loop.py

Lines changed: 9 additions & 2 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import time
 from dataclasses import dataclass
 from typing import Any, Optional, Union
 
@@ -283,7 +284,13 @@ def setup_data(self) -> None:
         # store epoch of dataloader reset for reload_dataloaders_every_n_epochs
         self._last_train_dl_reload_epoch = trainer.current_epoch
 
-        if isinstance(trainer.val_check_interval, int):
+        # If time-based validation is enabled, disable batch-based scheduling here.
+        # Use None to clearly signal "no batch-based validation"; wall-time logic will run elsewhere.
+        if getattr(trainer, "_val_check_time_interval", None) is not None:
+            trainer.val_check_batch = None
+            trainer._train_start_time = time.monotonic()
+            trainer._last_val_time = trainer._train_start_time
+        elif isinstance(trainer.val_check_interval, int):
             trainer.val_check_batch = trainer.val_check_interval
             if trainer.val_check_batch > self.max_batches and trainer.check_val_every_n_epoch is not None:
                 raise ValueError(
@@ -299,7 +306,7 @@ def setup_data(self) -> None:
                 else:
                     raise MisconfigurationException(
                         "When using an IterableDataset for `train_dataloader`,"
-                        " `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies"
+                        " `Trainer(val_check_interval)` must be time-based, `1.0`, or an int. An int k specifies"
                         " checking validation every k training batches."
                     )
             else:
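The hunks above only set up the timer state (`_train_start_time`, `_last_val_time`) and disable batch-based scheduling; the check that actually consumes the interval is not part of this excerpt. A rough sketch of what that wall-clock check could look like, with names and placement assumed rather than taken from the implementation:

    import time

    def _should_check_val_by_time(trainer) -> bool:
        # Assumed attributes: trainer._val_check_time_interval (timedelta or None)
        # and trainer._last_val_time (monotonic timestamp of the last validation).
        interval = getattr(trainer, "_val_check_time_interval", None)
        if interval is None:
            return False  # time-based validation not enabled
        elapsed = time.monotonic() - trainer._last_val_time
        return elapsed >= interval.total_seconds()

    # Sketch of use inside the batch loop: run validation after the current batch
    # whenever the interval has elapsed; evaluation_loop.on_run_end() then resets
    # trainer._last_val_time via time.monotonic().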
