
Commit 01b9247

Merge branch 'master' into feat/logger_dict
2 parents 9d0d39d + b554e99

File tree

24 files changed: +1377 −49 lines

.azure/gpu-tests-fabric.yml

Lines changed: 7 additions & 2 deletions

@@ -85,6 +85,7 @@ jobs:
       displayName: "extend env. vars 4 future"

   - bash: |
+      set -ex
       echo $(DEVICES)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
       python --version
       pip --version
       pip list
+      # todo: rather use devel base image
+      apt-get update -qq --fix-missing
+      apt-get install -y cuda-toolkit
+      nvcc --version
     displayName: "Image info & NVIDIA"

   - bash: |
@@ -156,7 +161,7 @@ jobs:
   - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
     workingDirectory: tests/
     displayName: "Testing: fabric standard"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@ jobs:
     env:
       PL_RUN_STANDALONE_TESTS: "1"
     displayName: "Testing: fabric standalone"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       python -m coverage report
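Both Azure GPU jobs add this install step because the `-runtime` CUDA base images ship only runtime libraries, without the `nvcc` compiler (the `todo` comment suggests moving to a `-devel` base image instead, which the `.lightning` workflows below already do). As a minimal sketch, not part of this commit, a pre-flight check for the toolchain could look like this:

    # Hypothetical sanity check (names assumed, not from this commit): confirm
    # the nvcc compiler installed by the new CI step is actually usable.
    import shutil
    import subprocess

    def cuda_toolchain_available() -> bool:
        """Return True if nvcc is on PATH and runs successfully."""
        nvcc = shutil.which("nvcc")
        if nvcc is None:
            return False  # runtime-only CUDA images typically lack nvcc
        return subprocess.run([nvcc, "--version"], capture_output=True).returncode == 0

    if __name__ == "__main__":
        print("nvcc available:", cuda_toolchain_available())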

.azure/gpu-tests-pytorch.yml

Lines changed: 6 additions & 1 deletion

@@ -84,6 +84,7 @@ jobs:
       displayName: "extend env. vars 4 future"

   - bash: |
+      set -ex
       echo $(DEVICES)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
@@ -95,6 +96,10 @@ jobs:
       python --version
       pip --version
       pip list
+      # todo: rather use devel base image
+      apt-get update -qq --fix-missing
+      apt-get install -y cuda-toolkit
+      nvcc --version
     displayName: "Image info & NVIDIA"

   - bash: |
@@ -189,7 +194,7 @@ jobs:
     env:
       PL_USE_MOCKED_MNIST: "1"
     displayName: "Testing: PyTorch standalone tasks"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       python -m coverage report

.github/checkgroup.yml

Lines changed: 2 additions & 2 deletions

@@ -47,7 +47,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10)"
+      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, pytorch, 3.10)"
       - "pytorch.yml / Lit Job (lightning, 3.12)"
       - "pytorch.yml / Lit Job (pytorch, 3.12)"

@@ -148,7 +148,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10)"
+      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, fabric, 3.10)"
       - "fabric.yml / Lit Job (fabric, 3.12)"
       - "fabric.yml / Lit Job (lightning, 3.12)"

.lightning/workflows/fabric.yml

Lines changed: 4 additions & 4 deletions

@@ -6,18 +6,18 @@ trigger:

 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this is setting also all oldest requirements which is linked to python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "fabric"
       python_version: "3.10"
     - PACKAGE_NAME: "fabric"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "fabric"
+    #- image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
+    #  PACKAGE_NAME: "fabric"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []

.lightning/workflows/pytorch.yml

Lines changed: 4 additions & 4 deletions

@@ -6,18 +6,18 @@ trigger:

 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets oldest requirements which are linked to Python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
       python_version: "3.10"
     - PACKAGE_NAME: "pytorch"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "pytorch"
+    #- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
+    #  PACKAGE_NAME: "pytorch"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []

docs/source-pytorch/cli/lightning_cli_intermediate_2.rst

Lines changed: 3 additions & 3 deletions

@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t

 .. code:: bash

-    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
+    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss

-Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
+(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
+``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:

 .. code:: python
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
     from lightning.pytorch.cli import LightningCLI
     from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule

-
     class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
         def step(self):
             print("", "using LitLRScheduler", "")

requirements/fabric/test.txt

Lines changed: 1 addition & 0 deletions

@@ -9,3 +9,4 @@ pytest-random-order ==1.2.0
 click ==8.1.8; python_version < "3.11"
 click ==8.3.0; python_version > "3.10"
 tensorboardX >=2.6, <2.7.0  # todo: relax it back to `>=2.2` after fixing tests
+huggingface-hub

requirements/pytorch/test.txt

Lines changed: 1 addition & 0 deletions

@@ -21,3 +21,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in
 tensorboard >=2.11, <2.21.0  # for `TensorBoardLogger`

 torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"
+huggingface-hub

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

--
+- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))


 ---
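The entry pairs a process-wide port manager (see the `lightning.py` diff below) with retry logic for the races a reservation table cannot see across processes. A rough sketch of what the retry side could look like, with assumed helper names (the actual implementation is in PR #21309 and is not reproduced on this page):

    import errno
    import socket

    def bind_with_retry(candidate_ports: list[int]) -> socket.socket:
        """Try candidate ports in order, skipping any that are already bound."""
        for port in candidate_ports:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.bind(("", port))
                return sock  # caller is responsible for closing the socket
            except OSError as err:
                sock.close()
                if err.errno != errno.EADDRINUSE:
                    raise  # only retry on "address already in use"
        raise RuntimeError("all candidate ports are in use")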

src/lightning/fabric/plugins/environments/lightning.py

Lines changed: 28 additions & 6 deletions

@@ -13,11 +13,11 @@
 # limitations under the License.

 import os
-import socket

 from typing_extensions import override

 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
+from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only


@@ -104,16 +104,38 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]

+        if self._main_port != -1:
+            get_port_manager().release_port(self._main_port)
+            self._main_port = -1
+
+        os.environ.pop("MASTER_PORT", None)
+        os.environ.pop("MASTER_ADDR", None)
+

 def find_free_network_port() -> int:
     """Finds a free port on localhost.

     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
+
+    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
+
+    Returns:
+        A port number that is reserved and free at the time of allocation
+
     """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
+    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
+    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
+    if "MASTER_PORT" in os.environ:
+        master_port_str = os.environ["MASTER_PORT"]
+        try:
+            existing_port = int(master_port_str)
+        except ValueError:
+            pass
+        else:
+            port_manager = get_port_manager()
+            if port_manager.reserve_existing_port(existing_port):
+                return existing_port
+
+    port_manager = get_port_manager()
+    return port_manager.allocate_port()
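The diff shows only the call sites of `get_port_manager()`; the manager itself lives in `lightning.fabric.utilities.port_manager`, which is not part of this page. A minimal sketch consistent with the three methods used above (`allocate_port`, `release_port`, `reserve_existing_port`), purely illustrative and not the real implementation:

    import socket
    import threading

    class PortManager:
        """Tracks reserved ports so concurrent tests don't pick the same one."""

        def __init__(self) -> None:
            self._lock = threading.Lock()
            self._reserved: set[int] = set()

        def allocate_port(self) -> int:
            # Ask the OS for a free port until we find one not already reserved.
            with self._lock:
                while True:
                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                        sock.bind(("", 0))
                        port = sock.getsockname()[1]
                    if port not in self._reserved:
                        self._reserved.add(port)
                        return port

        def reserve_existing_port(self, port: int) -> bool:
            # Record a port assigned externally (e.g. via MASTER_PORT);
            # False means it is already tracked by this process.
            with self._lock:
                if port in self._reserved:
                    return False
                self._reserved.add(port)
                return True

        def release_port(self, port: int) -> None:
            with self._lock:
                self._reserved.discard(port)

    _manager = PortManager()

    def get_port_manager() -> PortManager:
        return _manager

Whatever the real module does, the property the new code relies on is that a port returned by `allocate_port` stays out of circulation until `teardown` calls `release_port`, closing the window in which two tests could both "find" the same free port.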
