| 
14 | 14 | """Port allocation manager to prevent race conditions in distributed training."""  | 
15 | 15 | 
 
  | 
16 | 16 | import atexit  | 
 | 17 | +import logging  | 
17 | 18 | import socket  | 
18 | 19 | import threading  | 
19 | 20 | from collections import deque  | 
20 | 21 | from collections.abc import Iterator  | 
21 | 22 | from contextlib import contextmanager  | 
22 | 23 | from typing import Optional  | 
23 | 24 | 
 
  | 
 | 25 | +log = logging.getLogger(__name__)  | 
 | 26 | + | 
24 | 27 | # Size of the recently released ports queue  | 
25 | 28 | # This prevents immediate reuse of ports that were just released  | 
26 |  | -# Increased to 1024 to reduce the chance of cycling back to TIME_WAIT ports  | 
 | 29 | +# Set to 1024 to balance memory usage vs TIME_WAIT protection  | 
27 | 30 | _RECENTLY_RELEASED_PORTS_MAXLEN = 1024  | 
28 | 31 | 
 
  | 
29 | 32 | 
 
  | 
@@ -78,12 +81,29 @@ def allocate_port(self, preferred_port: Optional[int] = None, max_attempts: int  | 
78 | 81 |                 # This prevents race conditions within our process  | 
79 | 82 |                 if port not in self._allocated_ports and port not in self._recently_released:  | 
80 | 83 |                     self._allocated_ports.add(port)  | 
 | 84 | + | 
 | 85 | +                    # Log diagnostics if queue utilization is high  | 
 | 86 | +                    queue_count = len(self._recently_released)  | 
 | 87 | +                    if queue_count > _RECENTLY_RELEASED_PORTS_MAXLEN * 0.8:  # >80% full  | 
 | 88 | +                        log.warning(  | 
 | 89 | +                            f"Port queue utilization high: {queue_count}/{_RECENTLY_RELEASED_PORTS_MAXLEN} "  | 
 | 90 | +                            f"({queue_count / _RECENTLY_RELEASED_PORTS_MAXLEN * 100:.1f}% full). "  | 
 | 91 | +                            f"Allocated port {port}. Active allocations: {len(self._allocated_ports)}"  | 
 | 92 | +                        )  | 
 | 93 | + | 
81 | 94 |                     return port  | 
82 | 95 | 
 
  | 
 | 96 | +            # Provide detailed diagnostics to understand allocation failures  | 
 | 97 | +            allocated_count = len(self._allocated_ports)  | 
 | 98 | +            queue_count = len(self._recently_released)  | 
 | 99 | +            queue_capacity = _RECENTLY_RELEASED_PORTS_MAXLEN  | 
 | 100 | +            queue_utilization = (queue_count / queue_capacity * 100) if queue_capacity > 0 else 0  | 
 | 101 | + | 
83 | 102 |             raise RuntimeError(  | 
84 | 103 |                 f"Failed to allocate a free port after {max_attempts} attempts. "  | 
85 |  | -                f"Currently allocated: {len(self._allocated_ports)}, "  | 
86 |  | -                f"recently released: {len(self._recently_released)}"  | 
 | 104 | +                f"Diagnostics: allocated={allocated_count}, "  | 
 | 105 | +                f"recently_released={queue_count}/{queue_capacity} ({queue_utilization:.1f}% full). "  | 
 | 106 | +                f"If queue is near capacity, consider increasing _RECENTLY_RELEASED_PORTS_MAXLEN."  | 
87 | 107 |             )  | 
88 | 108 | 
 
  | 
89 | 109 |     def release_port(self, port: int) -> None:  | 
 | 
0 commit comments