Merged
2 changes: 1 addition & 1 deletion src/lightning/fabric/CHANGELOG.md
@@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
+-
 
 
 ---
39 changes: 11 additions & 28 deletions src/lightning/fabric/plugins/environments/lightning.py
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 import os
+import socket
 
 from typing_extensions import override
 
 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
-from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only
 
 
@@ -104,38 +104,21 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]
 
-        if self._main_port != -1:
-            get_port_manager().release_port(self._main_port)
-            self._main_port = -1
-
         os.environ.pop("MASTER_PORT", None)
         os.environ.pop("MASTER_ADDR", None)
 
 
 def find_free_network_port() -> int:
     """Finds a free port on localhost.
 
+    If the environment variable `STANDALONE_PORT` is set, its value is used as the port number.
+
     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
 
-    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
-
-    Returns:
-        A port number that is reserved and free at the time of allocation
-
     """
-    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
-    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
-    if "MASTER_PORT" in os.environ:
-        master_port_str = os.environ["MASTER_PORT"]
-        try:
-            existing_port = int(master_port_str)
-        except ValueError:
-            pass
-        else:
-            port_manager = get_port_manager()
-            if port_manager.reserve_existing_port(existing_port):
-                return existing_port
-
-    port_manager = get_port_manager()
-    return port_manager.allocate_port()
+    if "STANDALONE_PORT" in os.environ:
+        _port = os.environ["STANDALONE_PORT"]
+        return int(_port)
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.bind(("", 0))
+    port = s.getsockname()[1]
+    s.close()
+    return port
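
For context on the restored behavior: the helper asks the OS for a free port by binding to port 0, reads the assigned number, and closes the socket immediately, so nothing holds the port between allocation and use. That window is what allowed the `EADDRINUSE` failures described in the reverted changelog entry. A minimal usage sketch follows (seeding `MASTER_PORT` this way is an assumption based on the docstring, not code from this PR):

```python
import os

from lightning.fabric.plugins.environments.lightning import find_free_network_port

# Pick a free port and expose it to the process group before spawning workers.
# Note: between the close() inside the helper and the first bind by a worker,
# another process may grab the same port (the EADDRINUSE window).
os.environ.setdefault("MASTER_PORT", str(find_free_network_port()))
```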
233 changes: 0 additions & 233 deletions src/lightning/fabric/utilities/port_manager.py

This file was deleted.
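
The deleted module is not shown in this view. Purely as an illustrative sketch of the API surface the removed call sites above rely on (`get_port_manager`, `allocate_port`, `release_port`, `reserve_existing_port`), a minimal port manager could look like the following; the actual 233-line module certainly did more, such as the retry logic the changelog entry mentions:

```python
import socket
import threading


class _PortManager:
    """Hand out free ports and remember them so concurrent callers never collide."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._reserved: set[int] = set()

    def allocate_port(self) -> int:
        """Ask the OS for a free port (bind to port 0) and reserve it until released."""
        with self._lock:
            while True:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("", 0))
                    port = s.getsockname()[1]
                if port not in self._reserved:
                    self._reserved.add(port)
                    return port

    def reserve_existing_port(self, port: int) -> bool:
        """Reserve a port chosen elsewhere (e.g. a preset MASTER_PORT). True on success."""
        with self._lock:
            if port in self._reserved:
                return False
            self._reserved.add(port)
            return True

    def release_port(self, port: int) -> None:
        """Return a reserved port to the pool so later allocations may reuse it."""
        with self._lock:
            self._reserved.discard(port)


_manager = _PortManager()


def get_port_manager() -> _PortManager:
    """Module-level accessor mirroring the import seen in the removed code above."""
    return _manager
```

A singleton guarded by a lock is the natural shape here: every caller that asks for a port goes through one shared reservation set, which is the kind of cross-test coordination this revert removes.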
