
Commit fa2b839

Merge branch 'master' into batch_size_scaler_newargs
2 parents: 6264e8f + 6a8d943

20 files changed, +1271 -21 lines

docs/source-pytorch/cli/lightning_cli_intermediate_2.rst

Lines changed: 3 additions & 3 deletions
@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t
 
 .. code:: bash
 
-    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
+    python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss
 
-Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
+(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
+``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
 
 .. code:: python
 
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
     from lightning.pytorch.cli import LightningCLI
     from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule
 
-
     class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
         def step(self):
             print("", "using LitLRScheduler", "")

docs/source-pytorch/common/precision_basic.rst

Lines changed: 8 additions & 0 deletions
@@ -39,6 +39,14 @@ However, this setting can sometimes lead to unstable training.
 
     Trainer(precision="16-true")
 
+.. warning::
+
+    Float16 cannot represent values smaller than ~6e-5. Values like Adam's default ``eps=1e-8`` become zero, which can cause
+    NaN during training. Increase ``eps`` to 1e-4 or higher, and avoid extremely small values in your model weights and data.
+
+.. note::
+
+    BFloat16 (``"bf16-mixed"`` or ``"bf16-true"``) has better numerical stability with a wider dynamic range.
 
 ----
 
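A minimal sketch of the optimizer adjustment the new warning suggests. The model is hypothetical; the point is passing a larger eps than Adam's default when training with "16-true":

    import torch
    from torch import nn
    from lightning.pytorch import LightningModule, Trainer


    class HalfPrecisionModel(LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(32, 2)

        def configure_optimizers(self):
            # Adam's default eps=1e-8 underflows to zero in float16; use a value
            # representable in half precision (>= ~6e-5) to avoid NaNs.
            return torch.optim.Adam(self.parameters(), lr=1e-3, eps=1e-4)


    trainer = Trainer(precision="16-true")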

requirements/doctests.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 pytest ==8.4.2
-pytest-doctestplus ==1.4.0
+pytest-doctestplus ==1.5.0

requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/extra.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 matplotlib>3.1, <3.11.0
 omegaconf >=2.2.3, <2.4.0
 hydra-core >=1.2.0, <1.4.0
-jsonargparse[signatures,jsonnet] >=4.39.0, <4.42.0
+jsonargparse[signatures,jsonnet] >=4.39.0, <4.43.0
 rich >=12.3.0, <14.2.0
 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute
 bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
 # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
-deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
+deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict

requirements/pytorch/test.txt

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ numpy >1.20.0, <1.27.0
 onnx >1.12.0, <1.20.0
 onnxruntime >=1.12.0, <1.24.0
 onnxscript >= 0.1.0, < 0.5.0
-psutil <7.1.1 # for `DeviceStatsMonitor`
+psutil <7.1.2 # for `DeviceStatsMonitor`
 pandas >2.0, <2.4.0 # needed in benchmarks
 fastapi # for `ServableModuleValidator` # not setting version as re-defined in App
 uvicorn # for `ServableModuleValidator` # not setting version as re-defined in App

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed `EADDRINUSE` errors in distributed tests with port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
 
 
 ---

src/lightning/fabric/plugins/environments/lightning.py

Lines changed: 28 additions & 6 deletions
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 import os
-import socket
 
 from typing_extensions import override
 
 from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
+from lightning.fabric.utilities.port_manager import get_port_manager
 from lightning.fabric.utilities.rank_zero import rank_zero_only
 
 
@@ -104,16 +104,38 @@ def teardown(self) -> None:
         if "WORLD_SIZE" in os.environ:
             del os.environ["WORLD_SIZE"]
 
+        if self._main_port != -1:
+            get_port_manager().release_port(self._main_port)
+            self._main_port = -1
+
+        os.environ.pop("MASTER_PORT", None)
+        os.environ.pop("MASTER_ADDR", None)
+
 
 def find_free_network_port() -> int:
     """Finds a free port on localhost.
 
     It is useful in single-node training when we don't want to connect to a real main node but have to set the
     `MASTER_PORT` environment variable.
 
+    The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
+
+    Returns:
+        A port number that is reserved and free at the time of allocation
+
     """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    port = s.getsockname()[1]
-    s.close()
-    return port
+    # If an external launcher already specified a MASTER_PORT (for example, torch.distributed.spawn or
+    # multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
+    if "MASTER_PORT" in os.environ:
+        master_port_str = os.environ["MASTER_PORT"]
+        try:
+            existing_port = int(master_port_str)
+        except ValueError:
+            pass
+        else:
+            port_manager = get_port_manager()
+            if port_manager.reserve_existing_port(existing_port):
+                return existing_port
+
+    port_manager = get_port_manager()
+    return port_manager.allocate_port()
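A rough usage sketch of the reserve/release cycle introduced here, assuming only the port manager calls visible in this diff (get_port_manager, allocate_port, release_port):

    from lightning.fabric.plugins.environments.lightning import find_free_network_port
    from lightning.fabric.utilities.port_manager import get_port_manager

    # The returned port stays reserved, so a concurrent call to
    # find_free_network_port() cannot hand out the same number.
    port = find_free_network_port()
    try:
        ...  # e.g. set MASTER_PORT=port and launch the distributed processes
    finally:
        # Release the reservation once the processes are down, mirroring what
        # LightningEnvironment.teardown() now does with self._main_port.
        get_port_manager().release_port(port)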

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 15 additions & 0 deletions
@@ -37,6 +37,7 @@
 from lightning.fabric.strategies.registry import _StrategyRegistry
 from lightning.fabric.strategies.strategy import _Sharded
 from lightning.fabric.utilities.distributed import log
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.load import _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_info, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
@@ -47,6 +48,7 @@
     from torch.optim.lr_scheduler import _LRScheduler
 
 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
+_DEEPSPEED_GREATER_EQUAL_0_16 = RequirementCache("deepspeed>=0.16.0")
 
 
 # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -239,6 +241,19 @@ def __init__(
                 " Install it by running `pip install -U deepspeed`."
             )
 
+        if _TORCH_GREATER_EQUAL_2_6 and not _DEEPSPEED_GREATER_EQUAL_0_16:
+            # Starting with PyTorch 2.6, `torch.load` defaults to `weights_only=True` when loading full checkpoints.
+            # DeepSpeed added support for this behavior in version 0.16.0.
+            import deepspeed
+
+            deepspeed_version = deepspeed.__version__
+
+            raise ImportError(
+                f"PyTorch >= 2.6 requires DeepSpeed >= 0.16.0. "
+                f"Detected DeepSpeed version: {deepspeed_version}. "
+                "Please upgrade by running `pip install -U 'deepspeed>=0.16.0'`."
+            )
+
         super().__init__(
            accelerator=accelerator,
            parallel_devices=parallel_devices,
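A small sketch of how the same compatibility check could be reproduced in user code before constructing the strategy. It assumes RequirementCache from lightning_utilities (already used by this module) and only mirrors the guard above, not the internal _TORCH_GREATER_EQUAL_2_6 flag:

    from lightning_utilities.core.imports import RequirementCache

    # PyTorch 2.6 switched torch.load to weights_only=True by default;
    # DeepSpeed handles that behavior starting with 0.16.0.
    torch_ge_2_6 = bool(RequirementCache("torch>=2.6.0"))
    deepspeed_ge_0_16 = bool(RequirementCache("deepspeed>=0.16.0"))

    if torch_ge_2_6 and not deepspeed_ge_0_16:
        raise ImportError("Please upgrade: pip install -U 'deepspeed>=0.16.0'")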
