Lightning-AI
diff --git a/‎docs/source-pytorch/cli/lightning_cli_intermediate_2.rst‎
Lines changed: 3 additions & 3 deletions b/‎docs/source-pytorch/cli/lightning_cli_intermediate_2.rst‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/source-pytorch/common/precision_basic.rst‎
Lines changed: 8 additions & 0 deletions b/‎docs/source-pytorch/common/precision_basic.rst‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/source-pytorch/deploy/production_advanced_2.rst‎
Lines changed: 30 additions & 17 deletions b/‎docs/source-pytorch/deploy/production_advanced_2.rst‎
Lines changed: 30 additions & 17 deletions
diff --git a/‎requirements/doctests.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/doctests.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/fabric/strategies.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/fabric/strategies.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/pytorch/extra.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/pytorch/extra.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/pytorch/strategies.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/pytorch/strategies.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/pytorch/test.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/pytorch/test.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/lightning/fabric/CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion b/‎src/lightning/fabric/CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/lightning/fabric/plugins/environments/lightning.py‎
Lines changed: 28 additions & 6 deletions b/‎src/lightning/fabric/plugins/environments/lightning.py‎
Lines changed: 28 additions & 6 deletions
@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t
 
 .. code:: bash
 
-    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
+    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss
 
-Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
+(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
+``torch.optim.lr_scheduler.LRScheduler`` can be used as a learning rate scheduler:
 
 .. code:: python
 
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
     from lightning.pytorch.cli import LightningCLI
     from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule
 
-
     class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
         def step(self):
             print("⚡", "using LitLRScheduler", "⚡")
 
@@ -39,6 +39,14 @@ However, this setting can sometimes lead to unstable training.
 
     Trainer(precision="16-true")
 
+.. warning::
+
+    Float16 loses precision below ~6e-5 and cannot represent values smaller than ~6e-8. Values like Adam's default ``eps=1e-8`` become zero, which can cause
+    NaN during training. Increase ``eps`` to 1e-4 or higher, and avoid extremely small values in your model weights and data.
+
+.. note::
+
+    BFloat16 (``"bf16-mixed"`` or ``"bf16-true"``) has a wider dynamic range than Float16, which makes it less prone to underflow and overflow.
 
 ----
 
 
@@ -7,15 +7,20 @@ Deploy models into production (advanced)
 
 ----
 
-*********************************
-Compile your model to TorchScript
-*********************************
-`TorchScript <https://pytorch.org/docs/stable/jit.html>`_ allows you to serialize your models in a way that it can be loaded in non-Python environments.
-The ``LightningModule`` has a handy method :meth:`~lightning.pytorch.core.LightningModule.to_torchscript` that returns a scripted module which you
-can save or directly use.
+************************************
+Export your model with torch.export
+************************************
+
+`torch.export <https://pytorch.org/docs/stable/export.html>`_ is the recommended way to capture PyTorch models for
+deployment in production environments. It produces a clean intermediate representation with strong soundness guarantees,
+making models suitable for inference optimization and cross-platform deployment.
+You can export any ``LightningModule`` using the ``torch.export.export()`` API.
 
 .. testcode:: python
 
+    import torch
+    from torch.export import export
+
     class SimpleModel(LightningModule):
         def __init__(self):
             super().__init__()
@@ -25,25 +30,27 @@ can save or directly use.
             return torch.relu(self.l1(x.view(x.size(0), -1)))
 
 
-    # create the model
+    # create the model and example input
     model = SimpleModel()
-    script = model.to_torchscript()
+    example_input = torch.randn(1, 64)
 
-    # save for use in production environment
-    torch.jit.save(script, "model.pt")
+    # export the model
+    exported_program = export(model, (example_input,))
 
-It is recommended that you install the latest supported version of PyTorch to use this feature without limitations.
+    # save for use in production environment
+    torch.export.save(exported_program, "model.pt2")
 
-Once you have the exported model, you can run it in PyTorch or C++ runtime:
+It is recommended that you install the latest supported version of PyTorch to use this feature without
+limitations. Once you have the exported model, you can load and run it:
 
 .. code-block:: python
 
     inp = torch.rand(1, 64)
-    scripted_module = torch.jit.load("model.pt")
-    output = scripted_module(inp)
+    loaded_program = torch.export.load("model.pt2")
+    output = loaded_program.module()(inp)
 
 
-If you want to script a different method, you can decorate the method with :func:`torch.jit.export`:
+For more complex models, you can also export specific methods by creating a wrapper:
 
 .. code-block:: python
 
@@ -54,7 +61,6 @@ If you want to script a different method, you can decorate the method with :func
             self.dropout = nn.Dropout()
             self.mc_iteration = mc_iteration
 
-        @torch.jit.export
         def predict_step(self, batch, batch_idx):
             # enable Monte Carlo Dropout
             self.dropout.train()
@@ -66,4 +72,11 @@ If you want to script a different method, you can decorate the method with :func
 
 
     model = LitMCdropoutModel(...)
-    script = model.to_torchscript(file_path="model.pt", method="script")
+    example_batch = torch.randn(32, 10)  # example input
+
+    # Export the predict_step method
+    exported_program = torch.export.export(
+        lambda batch, idx: model.predict_step(batch, idx),
+        (example_batch, 0)
+    )
+    torch.export.save(exported_program, "mc_dropout_model.pt2")
@@ -1,2 +1,2 @@
 pytest ==8.4.2
-pytest-doctestplus ==1.4.0
+pytest-doctestplus ==1.5.0
@@ -5,5 +5,5 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 #  shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"
@@ -5,7 +5,7 @@
 matplotlib>3.1, <3.11.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures,jsonnet] >=4.39.0, <4.42.0
+jsonargparse[signatures,jsonnet] >=4.39.0, <4.43.0
 rich >=12.3.0, <14.2.0
 tensorboardX >=2.2, <2.7.0  # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"
@@ -3,4 +3,4 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 #  shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin"  # strict
@@ -13,7 +13,7 @@ numpy >1.20.0, <1.27.0
 onnx >1.12.0, <1.20.0
 onnxruntime >=1.12.0, <1.24.0
 onnxscript >= 0.1.0, < 0.5.0
-psutil <7.1.1 # for `DeviceStatsMonitor`
+psutil <7.1.2 # for `DeviceStatsMonitor`
 pandas >2.0, <2.4.0  # needed in benchmarks
 fastapi  # for `ServableModuleValidator`  # not setting version as re-defined in App
 uvicorn  # for `ServableModuleValidator`  # not setting version as re-defined in App
 
@@ -25,7 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed `EADDRINUSE` errors in distributed tests by adding a port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
 
 
 ---
 
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 import os
-import socket
 
 from typing_extensions import override
 
 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
+from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only
 
 
@@ -104,16 +104,38 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]
 
+        if self._main_port != -1:
+            get_port_manager().release_port(self._main_port)
+            self._main_port = -1
+
+        os.environ.pop("MASTER_PORT", None)
+        os.environ.pop("MASTER_ADDR", None)
+
 
 def find_free_network_port() -> int:
     """Finds a free port on localhost.
 
     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
 
+    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
+
+    Returns:
+        A port number that is reserved and free at the time of allocation
+
     """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
+    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
+    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
+    if "MASTER_PORT" in os.environ:
+        master_port_str = os.environ["MASTER_PORT"]
+        try:
+            existing_port = int(master_port_str)
+        except ValueError:
+            pass
+        else:
+            port_manager = get_port_manager()
+            if port_manager.reserve_existing_port(existing_port):
+                return existing_port
+
+    port_manager = get_port_manager()
+    return port_manager.allocate_port()
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`pytest ==8.4.2`
`2`		`-pytest-doctestplus ==1.4.0`
	`2`	`+pytest-doctestplus ==1.5.0`