
Commit 01b9247

Merge branch 'master' into feat/logger_dict
2 parents 9d0d39d + b554e99

File tree

24 files changed: +1377 −49 lines

.azure/gpu-tests-fabric.yml

Lines changed: 7 additions & 2 deletions

@@ -85,6 +85,7 @@ jobs:
       displayName: "extend env. vars 4 future"

   - bash: |
+      set -ex
       echo $(DEVICES)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
       python --version
       pip --version
       pip list
+      # todo: rather use devel base image
+      apt-get update -qq --fix-missing
+      apt-get install -y cuda-toolkit
+      nvcc --version
     displayName: "Image info & NVIDIA"

   - bash: |
@@ -156,7 +161,7 @@ jobs:
   - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
     workingDirectory: tests/
     displayName: "Testing: fabric standard"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@ jobs:
     env:
       PL_RUN_STANDALONE_TESTS: "1"
     displayName: "Testing: fabric standalone"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       python -m coverage report
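Both Azure GPU jobs add this install step because the `-runtime` CUDA base images ship only runtime libraries, without the `nvcc` compiler (the `todo` comment suggests moving to a `-devel` base image instead, which the `.lightning` workflows below already do). As a minimal sketch, not part of this commit, a pre-flight check for the toolchain could look like this:

    # Hypothetical sanity check (names assumed, not from this commit): confirm
    # the nvcc compiler installed by the new CI step is actually usable.
    import shutil
    import subprocess

    def cuda_toolchain_available() -> bool:
        """Return True if nvcc is on PATH and runs successfully."""
        nvcc = shutil.which("nvcc")
        if nvcc is None:
            return False  # runtime-only CUDA images typically lack nvcc
        return subprocess.run([nvcc, "--version"], capture_output=True).returncode == 0

    if __name__ == "__main__":
        print("nvcc available:", cuda_toolchain_available())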

.azure/gpu-tests-pytorch.yml

Lines changed: 6 additions & 1 deletion

@@ -84,6 +84,7 @@ jobs:
       displayName: "extend env. vars 4 future"

   - bash: |
+      set -ex
       echo $(DEVICES)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
@@ -95,6 +96,10 @@ jobs:
       python --version
       pip --version
       pip list
+      # todo: rather use devel base image
+      apt-get update -qq --fix-missing
+      apt-get install -y cuda-toolkit
+      nvcc --version
     displayName: "Image info & NVIDIA"

   - bash: |
@@ -189,7 +194,7 @@ jobs:
     env:
       PL_USE_MOCKED_MNIST: "1"
     displayName: "Testing: PyTorch standalone tasks"
-    timeoutInMinutes: "10"
+    timeoutInMinutes: "15"

   - bash: |
       python -m coverage report

.github/checkgroup.yml

Lines changed: 2 additions & 2 deletions

@@ -47,7 +47,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10)"
+      - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, pytorch, 3.10)"
       - "pytorch.yml / Lit Job (lightning, 3.12)"
       - "pytorch.yml / Lit Job (pytorch, 3.12)"

@@ -148,7 +148,7 @@ subprojects:
       - "!*.md"
       - "!**/*.md"
     checks:
-      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10)"
+      - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, fabric, 3.10)"
       - "fabric.yml / Lit Job (fabric, 3.12)"
       - "fabric.yml / Lit Job (lightning, 3.12)"

.lightning/workflows/fabric.yml

Lines changed: 4 additions & 4 deletions

@@ -6,18 +6,18 @@ trigger:

 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this is setting also all oldest requirements which is linked to python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "fabric"
       python_version: "3.10"
     - PACKAGE_NAME: "fabric"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "fabric"
+    #- image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
+    #  PACKAGE_NAME: "fabric"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []

.lightning/workflows/pytorch.yml

Lines changed: 4 additions & 4 deletions

@@ -6,18 +6,18 @@ trigger:

 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets oldest requirements which are linked to Python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
       python_version: "3.10"
     - PACKAGE_NAME: "pytorch"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "pytorch"
+    #- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
+    #  PACKAGE_NAME: "pytorch"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []

docs/source-pytorch/cli/lightning_cli_intermediate_2.rst

Lines changed: 3 additions & 3 deletions

@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t

 .. code:: bash

-    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
+    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss

-Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
+(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
+``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:

 .. code:: python
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
     from lightning.pytorch.cli import LightningCLI
     from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule

-
     class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
         def step(self):
             print("", "using LitLRScheduler", "")

requirements/fabric/test.txt

Lines changed: 1 addition & 0 deletions

@@ -9,3 +9,4 @@ pytest-random-order ==1.2.0
 click ==8.1.8; python_version < "3.11"
 click ==8.3.0; python_version > "3.10"
 tensorboardX >=2.6, <2.7.0  # todo: relax it back to `>=2.2` after fixing tests
+huggingface-hub

requirements/pytorch/test.txt

Lines changed: 1 addition & 0 deletions

@@ -21,3 +21,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in
 tensorboard >=2.11, <2.21.0  # for `TensorBoardLogger`

 torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"
+huggingface-hub

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

--
+- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))


 ---
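The entry pairs a process-wide port manager (see the `lightning.py` diff below) with retry logic for the races a reservation table cannot see across processes. A rough sketch of what the retry side could look like, with assumed helper names (the actual implementation is in PR #21309 and is not reproduced on this page):

    import errno
    import socket

    def bind_with_retry(candidate_ports: list[int]) -> socket.socket:
        """Try candidate ports in order, skipping any that are already bound."""
        for port in candidate_ports:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.bind(("", port))
                return sock  # caller is responsible for closing the socket
            except OSError as err:
                sock.close()
                if err.errno != errno.EADDRINUSE:
                    raise  # only retry on "address already in use"
        raise RuntimeError("all candidate ports are in use")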

src/lightning/fabric/plugins/environments/lightning.py

Lines changed: 28 additions & 6 deletions

@@ -13,11 +13,11 @@
 # limitations under the License.

 import os
-import socket

 from typing_extensions import override

 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
+from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only


@@ -104,16 +104,38 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]

+        if self._main_port != -1:
+            get_port_manager().release_port(self._main_port)
+            self._main_port = -1
+
+        os.environ.pop("MASTER_PORT", None)
+        os.environ.pop("MASTER_ADDR", None)
+

 def find_free_network_port() -> int:
     """Finds a free port on localhost.

     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
+
+    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
+
+    Returns:
+        A port number that is reserved and free at the time of allocation
+
     """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
+    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
+    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
+    if "MASTER_PORT" in os.environ:
+        master_port_str = os.environ["MASTER_PORT"]
+        try:
+            existing_port = int(master_port_str)
+        except ValueError:
+            pass
+        else:
+            port_manager = get_port_manager()
+            if port_manager.reserve_existing_port(existing_port):
+                return existing_port
+
+    port_manager = get_port_manager()
+    return port_manager.allocate_port()
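The diff shows only the call sites of `get_port_manager()`; the manager itself lives in `lightning.fabric.utilities.port_manager`, which is not part of this page. A minimal sketch consistent with the three methods used above (`allocate_port`, `release_port`, `reserve_existing_port`), purely illustrative and not the real implementation:

    import socket
    import threading

    class PortManager:
        """Tracks reserved ports so concurrent tests don't pick the same one."""

        def __init__(self) -> None:
            self._lock = threading.Lock()
            self._reserved: set[int] = set()

        def allocate_port(self) -> int:
            # Ask the OS for a free port until we find one not already reserved.
            with self._lock:
                while True:
                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                        sock.bind(("", 0))
                        port = sock.getsockname()[1]
                    if port not in self._reserved:
                        self._reserved.add(port)
                        return port

        def reserve_existing_port(self, port: int) -> bool:
            # Record a port assigned externally (e.g. via MASTER_PORT);
            # False means it is already tracked by this process.
            with self._lock:
                if port in self._reserved:
                    return False
                self._reserved.add(port)
                return True

        def release_port(self, port: int) -> None:
            with self._lock:
                self._reserved.discard(port)

    _manager = PortManager()

    def get_port_manager() -> PortManager:
        return _manager

Whatever the real module does, the property the new code relies on is that a port returned by `allocate_port` stays out of circulation until `teardown` calls `release_port`, closing the window in which two tests could both "find" the same free port.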
