Skip to content

Commit bfeaa63

Browse files
committed
Add port manager diagnostics and improve retry handling

- Add logging when port queue utilization >80% to detect exhaustion
- Enhance RuntimeError with detailed diagnostics (queue utilization %)
- Add DistNetworkError type check in pytorch conftest for subprocess failures
- Add test coverage for high queue utilization warning
- Helps diagnose EADDRINUSE issues in CI distributed tests
1 parent 92d5a9e commit bfeaa63

File tree

4 files changed

+58
-5
lines changed

4 files changed

+58
-5
lines changed

src/lightning/fabric/utilities/port_manager.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,19 @@
1414
"""Port allocation manager to prevent race conditions in distributed training."""
1515

1616
import atexit
17+
import logging
1718
import socket
1819
import threading
1920
from collections import deque
2021
from collections.abc import Iterator
2122
from contextlib import contextmanager
2223
from typing import Optional
2324

25+
log = logging.getLogger(__name__)
26+
2427
# Size of the recently released ports queue
2528
# This prevents immediate reuse of ports that were just released
26-
# Increased to 1024 to reduce the chance of cycling back to TIME_WAIT ports
29+
# Set to 1024 to balance memory usage vs TIME_WAIT protection
2730
_RECENTLY_RELEASED_PORTS_MAXLEN = 1024
2831

2932

@@ -78,12 +81,29 @@ def allocate_port(self, preferred_port: Optional[int] = None, max_attempts: int
7881
# This prevents race conditions within our process
7982
if port not in self._allocated_ports and port not in self._recently_released:
8083
self._allocated_ports.add(port)
84+
85+
# Log diagnostics if queue utilization is high
86+
queue_count = len(self._recently_released)
87+
if queue_count > _RECENTLY_RELEASED_PORTS_MAXLEN * 0.8: # >80% full
88+
log.warning(
89+
f"Port queue utilization high: {queue_count}/{_RECENTLY_RELEASED_PORTS_MAXLEN} "
90+
f"({queue_count / _RECENTLY_RELEASED_PORTS_MAXLEN * 100:.1f}% full). "
91+
f"Allocated port {port}. Active allocations: {len(self._allocated_ports)}"
92+
)
93+
8194
return port
8295

96+
# Provide detailed diagnostics to understand allocation failures
97+
allocated_count = len(self._allocated_ports)
98+
queue_count = len(self._recently_released)
99+
queue_capacity = _RECENTLY_RELEASED_PORTS_MAXLEN
100+
queue_utilization = (queue_count / queue_capacity * 100) if queue_capacity > 0 else 0
101+
83102
raise RuntimeError(
84103
f"Failed to allocate a free port after {max_attempts} attempts. "
85-
f"Currently allocated: {len(self._allocated_ports)}, "
86-
f"recently released: {len(self._recently_released)}"
104+
f"Diagnostics: allocated={allocated_count}, "
105+
f"recently_released={queue_count}/{queue_capacity} ({queue_utilization:.1f}% full). "
106+
f"If queue is near capacity, consider increasing _RECENTLY_RELEASED_PORTS_MAXLEN."
87107
)
88108

89109
def release_port(self, port: int) -> None:

tests/tests_fabric/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,14 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
230230
"""
231231
if call.excinfo is not None and call.when == "call":
232232
exception_msg = str(call.excinfo.value)
233+
exception_type = str(type(call.excinfo.value).__name__)
233234
# Check if this is an EADDRINUSE error from distributed training
234-
if "EADDRINUSE" in exception_msg or "address already in use" in exception_msg.lower():
235+
# Catch both direct EADDRINUSE errors and DistNetworkError which wraps them
236+
if (
237+
"EADDRINUSE" in exception_msg
238+
or "address already in use" in exception_msg.lower()
239+
or "DistNetworkError" in exception_type
240+
):
235241
# Get the retry count from the test node
236242
retry_count = getattr(item, "_port_retry_count", 0)
237243
max_retries = 3

tests/tests_fabric/utilities/test_port_manager.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,3 +753,24 @@ def test_port_manager_reserve_clears_recently_released():
753753
assert port in manager._allocated_ports
754754

755755
manager.release_port(port)
756+
757+
758+
def test_port_manager_high_queue_utilization_warning(caplog):
    """An allocation should emit a WARNING once the recently-released queue exceeds 80% capacity."""
    import logging

    manager = PortManager()

    # Cycle 821 ports through allocate/release so the recently-released
    # queue sits just above the 80% threshold (821/1024 ~ 80.2%).
    for _ in range(821):
        recycled = manager.allocate_port()
        manager.release_port(recycled)

    # With the queue above threshold, the next allocation must log the warning.
    with caplog.at_level(logging.WARNING):
        probe = manager.allocate_port()
    manager.release_port(probe)

    messages = [record.message for record in caplog.records]
    assert any("Port queue utilization high" in msg for msg in messages)
    # The logged percentage should be in the 80.x range.
    assert any("80." in msg for msg in messages)

tests/tests_pytorch/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,14 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
358358
"""
359359
if call.excinfo is not None and call.when == "call":
360360
exception_msg = str(call.excinfo.value)
361+
exception_type = str(type(call.excinfo.value).__name__)
361362
# Check if this is an EADDRINUSE error from distributed training
362-
if "EADDRINUSE" in exception_msg or "address already in use" in exception_msg.lower():
363+
# Catch both direct EADDRINUSE errors and DistNetworkError which wraps them
364+
if (
365+
"EADDRINUSE" in exception_msg
366+
or "address already in use" in exception_msg.lower()
367+
or "DistNetworkError" in exception_type
368+
):
363369
# Get the retry count from the test node
364370
retry_count = getattr(item, "_port_retry_count", 0)
365371
max_retries = 3

0 commit comments

Comments (0)