
Commit fa2b839

Merge branch 'master' into batch_size_scaler_newargs
2 parents: 6264e8f + 6a8d943

20 files changed, +1271 -21 lines

docs/source-pytorch/cli/lightning_cli_intermediate_2.rst

Lines changed: 3 additions & 3 deletions
@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t
 
 .. code:: bash
 
-    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
+    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss
 
-Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
+(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
+``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
 
 .. code:: python
 
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
     from lightning.pytorch.cli import LightningCLI
     from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule
 
-
     class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
         def step(self):
             print("", "using LitLRScheduler", "")

docs/source-pytorch/common/precision_basic.rst

Lines changed: 8 additions & 0 deletions
@@ -39,6 +39,14 @@ However, this setting can sometimes lead to unstable training.
 
     Trainer(precision="16-true")
 
+.. warning::
+
+    Float16 cannot represent values smaller than ~6e-5. Values like Adam's default ``eps=1e-8`` become zero, which can cause
+    NaN during training. Increase ``eps`` to 1e-4 or higher, and avoid extremely small values in your model weights and data.
+
+.. note::
+
+    BFloat16 (``"bf16-mixed"`` or ``"bf16-true"``) has better numerical stability with a wider dynamic range.
 
 ----
 
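A minimal sketch of the optimizer adjustment the new warning suggests. The model is hypothetical; the point is passing a larger eps than Adam's default when training with "16-true":

    import torch
    from torch import nn
    from lightning.pytorch import LightningModule, Trainer


    class HalfPrecisionModel(LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(32, 2)

        def configure_optimizers(self):
            # Adam's default eps=1e-8 underflows to zero in float16; use a value
            # representable in half precision (>= ~6e-5) to avoid NaNs.
            return torch.optim.Adam(self.parameters(), lr=1e-3, eps=1e-4)


    trainer = Trainer(precision="16-true")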

requirements/doctests.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 pytest ==8.4.2
-pytest-doctestplus ==1.4.0
+pytest-doctestplus ==1.5.0

requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/extra.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 matplotlib>3.1, <3.11.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures,jsonnet] >=4.39.0, <4.42.0
+jsonargparse[signatures,jsonnet] >=4.39.0, <4.43.0
 rich >=12.3.0, <14.2.0
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict

requirements/pytorch/test.txt

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ numpy >1.20.0, <1.27.0
 onnx >1.12.0, <1.20.0
 onnxruntime >=1.12.0, <1.24.0
 onnxscript >= 0.1.0, < 0.5.0
-psutil <7.1.1 # for `DeviceStatsMonitor`
+psutil <7.1.2 # for `DeviceStatsMonitor`
 pandas >2.0, <2.4.0 # needed in benchmarks
 fastapi # for `ServableModuleValidator` # not setting version as re-defined in App
 uvicorn # for `ServableModuleValidator` # not setting version as re-defined in App

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
 
 
 ---

src/lightning/fabric/plugins/environments/lightning.py

Lines changed: 28 additions & 6 deletions
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 import os
-import socket
 
 from typing_extensions import override
 
 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
+from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only
 
 
@@ -104,16 +104,38 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]
 
+        if self._main_port != -1:
+            get_port_manager().release_port(self._main_port)
+            self._main_port = -1
+
+        os.environ.pop("MASTER_PORT", None)
+        os.environ.pop("MASTER_ADDR", None)
+
 
 def find_free_network_port() -> int:
     """Finds a free port on localhost.
 
     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
 
+    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
+
+    Returns:
+        A port number that is reserved and free at the time of allocation
+
     """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
+    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
+    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
+    if "MASTER_PORT" in os.environ:
+        master_port_str = os.environ["MASTER_PORT"]
+        try:
+            existing_port = int(master_port_str)
+        except ValueError:
+            pass
+        else:
+            port_manager = get_port_manager()
+            if port_manager.reserve_existing_port(existing_port):
+                return existing_port
+
+    port_manager = get_port_manager()
+    return port_manager.allocate_port()
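A rough usage sketch of the reserve/release cycle introduced here, assuming only the port manager calls visible in this diff (get_port_manager, allocate_port, release_port):

    from lightning.fabric.plugins.environments.lightning import find_free_network_port
    from lightning.fabric.utilities.port_manager import get_port_manager

    # The returned port stays reserved, so a concurrent call to
    # find_free_network_port() cannot hand out the same number.
    port = find_free_network_port()
    try:
        ...  # e.g. set MASTER_PORT=port and launch the distributed processes
    finally:
        # Release the reservation once the processes are down, mirroring what
        # LightningEnvironment.teardown() now does with self._main_port.
        get_port_manager().release_port(port)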

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 15 additions & 0 deletions
@@ -37,6 +37,7 @@
 from lightning.fabric.strategies.registry import _StrategyRegistry
 from lightning.fabric.strategies.strategy import _Sharded
 from lightning.fabric.utilities.distributed import log
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.load import _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_info, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
@@ -47,6 +48,7 @@
     from torch.optim.lr_scheduler import _LRScheduler
 
 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
+_DEEPSPEED_GREATER_EQUAL_0_16 = RequirementCache("deepspeed>=0.16.0")
 
 
 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -239,6 +241,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
            accelerator=accelerator,
            parallel_devices=parallel_devices,
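A small sketch of how the same compatibility check could be reproduced in user code before constructing the strategy. It assumes RequirementCache from lightning_utilities (already used by this module) and only mirrors the guard above, not the internal _TORCH_GREATER_EQUAL_2_6 flag:

    from lightning_utilities.core.imports import RequirementCache

    # PyTorch 2.6 switched torch.load to weights_only=True by default;
    # DeepSpeed handles that behavior starting with 0.16.0.
    torch_ge_2_6 = bool(RequirementCache("torch>=2.6.0"))
    deepspeed_ge_0_16 = bool(RequirementCache("deepspeed>=0.16.0"))

    if torch_ge_2_6 and not deepspeed_ge_0_16:
        raise ImportError("Please upgrade: pip install -U 'deepspeed>=0.16.0'")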
