Skip to content

Commit 4853c4b

Browse files
authored
Merge branch 'master' into weights-only-compatibility
2 parents ef11c76 + 6a8d943 commit 4853c4b

File tree

25 files changed

+1400
-47
lines changed

25 files changed

+1400
-47
lines changed

docs/source-pytorch/cli/lightning_cli_intermediate_2.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,10 @@ If the scheduler you want needs other arguments, add them via the CLI (no need t
201201

202202
.. code:: bash
203203
204-
python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
204+
python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=train_loss
205205
206-
Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
206+
(assuming you have a ``train_loss`` metric logged). Furthermore, any custom subclass of
207+
``torch.optim.lr_scheduler.LRScheduler`` can be used as learning rate scheduler:
207208

208209
.. code:: python
209210
@@ -212,7 +213,6 @@ Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can
212213
from lightning.pytorch.cli import LightningCLI
213214
from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule
214215
215-
216216
class LitLRScheduler(torch.optim.lr_scheduler.CosineAnnealingLR):
217217
def step(self):
218218
print("", "using LitLRScheduler", "")

docs/source-pytorch/common/precision_basic.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ However, this setting can sometimes lead to unstable training.
3939
4040
Trainer(precision="16-true")
4141
42+
.. warning::
43+
44+
Float16 cannot represent normal values smaller in magnitude than ~6e-5 (subnormals bottom out near 6e-8). Values like Adam's default ``eps=1e-8`` underflow to zero, which can cause
45+
NaN during training. Increase ``eps`` to 1e-4 or higher, and avoid extremely small values in your model weights and data.
46+
47+
.. note::
48+
49+
BFloat16 (``"bf16-mixed"`` or ``"bf16-true"``) has better numerical stability with a wider dynamic range.
4250

4351
----
4452

docs/source-pytorch/deploy/production_advanced_2.rst

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,20 @@ Deploy models into production (advanced)
77

88
----
99

10-
*********************************
11-
Compile your model to TorchScript
12-
*********************************
13-
`TorchScript <https://pytorch.org/docs/stable/jit.html>`_ allows you to serialize your models in a way that it can be loaded in non-Python environments.
14-
The ``LightningModule`` has a handy method :meth:`~lightning.pytorch.core.LightningModule.to_torchscript` that returns a scripted module which you
15-
can save or directly use.
10+
************************************
11+
Export your model with torch.export
12+
************************************
13+
14+
`torch.export <https://pytorch.org/docs/stable/export.html>`_ is the recommended way to capture PyTorch models for
15+
deployment in production environments. It produces a clean intermediate representation with strong soundness guarantees,
16+
making models suitable for inference optimization and cross-platform deployment.
17+
You can export any ``LightningModule`` using the ``torch.export.export()`` API.
1618

1719
.. testcode:: python
1820

21+
import torch
22+
from torch.export import export
23+
1924
class SimpleModel(LightningModule):
2025
def __init__(self):
2126
super().__init__()
@@ -25,25 +30,27 @@ can save or directly use.
2530
return torch.relu(self.l1(x.view(x.size(0), -1)))
2631

2732

28-
# create the model
33+
# create the model and example input
2934
model = SimpleModel()
30-
script = model.to_torchscript()
35+
example_input = torch.randn(1, 64)
3136

32-
# save for use in production environment
33-
torch.jit.save(script, "model.pt")
37+
# export the model
38+
exported_program = export(model, (example_input,))
3439

35-
It is recommended that you install the latest supported version of PyTorch to use this feature without limitations.
40+
# save for use in production environment
41+
torch.export.save(exported_program, "model.pt2")
3642

37-
Once you have the exported model, you can run it in PyTorch or C++ runtime:
43+
It is recommended that you install the latest supported version of PyTorch to use this feature without
44+
limitations. Once you have the exported model, you can load and run it:
3845

3946
.. code-block:: python
4047
4148
inp = torch.rand(1, 64)
42-
scripted_module = torch.jit.load("model.pt")
43-
output = scripted_module(inp)
49+
loaded_program = torch.export.load("model.pt2")
50+
output = loaded_program.module()(inp)
4451
4552
46-
If you want to script a different method, you can decorate the method with :func:`torch.jit.export`:
53+
For more complex models, you can also export specific methods by creating a wrapper:
4754

4855
.. code-block:: python
4956
@@ -54,7 +61,6 @@ If you want to script a different method, you can decorate the method with :func
5461
self.dropout = nn.Dropout()
5562
self.mc_iteration = mc_iteration
5663
57-
@torch.jit.export
5864
def predict_step(self, batch, batch_idx):
5965
# enable Monte Carlo Dropout
6066
self.dropout.train()
@@ -66,4 +72,11 @@ If you want to script a different method, you can decorate the method with :func
6672
6773
6874
model = LitMCdropoutModel(...)
69-
script = model.to_torchscript(file_path="model.pt", method="script")
75+
example_batch = torch.randn(32, 10) # example input
76+
77+
# Export the predict_step method
78+
exported_program = torch.export.export(
79+
lambda batch, idx: model.predict_step(batch, idx),
80+
(example_batch, 0)
81+
)
82+
torch.export.save(exported_program, "mc_dropout_model.pt2")

requirements/doctests.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
pytest ==8.4.2
2-
pytest-doctestplus ==1.4.0
2+
pytest-doctestplus ==1.5.0

requirements/fabric/strategies.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55

66
# note: there is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
77
# shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
8-
deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
8+
deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict
99
bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/extra.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
matplotlib>3.1, <3.11.0
66
omegaconf >=2.2.3, <2.4.0
77
hydra-core >=1.2.0, <1.4.0
8-
jsonargparse[signatures,jsonnet] >=4.39.0, <4.42.0
8+
jsonargparse[signatures,jsonnet] >=4.39.0, <4.43.0
99
rich >=12.3.0, <14.2.0
1010
tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute
1111
bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"

requirements/pytorch/strategies.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33

44
# note: there is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
55
# shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
6-
deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
6+
deepspeed >=0.15.0,<0.17.0; platform_system != "Windows" and platform_system != "Darwin" # strict

requirements/pytorch/test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ numpy >1.20.0, <1.27.0
1313
onnx >1.12.0, <1.20.0
1414
onnxruntime >=1.12.0, <1.24.0
1515
onnxscript >= 0.1.0, < 0.5.0
16-
psutil <7.1.1 # for `DeviceStatsMonitor`
16+
psutil <7.1.2 # for `DeviceStatsMonitor`
1717
pandas >2.0, <2.4.0 # needed in benchmarks
1818
fastapi # for `ServableModuleValidator` # not setting version as re-defined in App
1919
uvicorn # for `ServableModuleValidator` # not setting version as re-defined in App

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
2525

2626
### Fixed
2727

28-
-
28+
- Fixed `EADDRINUSE` errors in distributed tests by adding a port manager and retry logic ([#21309](https://github.com/Lightning-AI/pytorch-lightning/pull/21309))
2929

3030

3131
---

src/lightning/fabric/plugins/environments/lightning.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
# limitations under the License.
1414

1515
import os
16-
import socket
1716

1817
from typing_extensions import override
1918

2019
from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
20+
from lightning.fabric.utilities.port_manager import get_port_manager
2121
from lightning.fabric.utilities.rank_zero import rank_zero_only
2222

2323

@@ -104,16 +104,38 @@ def teardown(self) -> None:
104104
if "WORLD_SIZE" in os.environ:
105105
del os.environ["WORLD_SIZE"]
106106

107+
if self._main_port != -1:
108+
get_port_manager().release_port(self._main_port)
109+
self._main_port = -1
110+
111+
os.environ.pop("MASTER_PORT", None)
112+
os.environ.pop("MASTER_ADDR", None)
113+
107114

108115
def find_free_network_port() -> int:
109116
"""Finds a free port on localhost.
110117
111118
It is useful in single-node training when we don't want to connect to a real main node but have to set the
112119
`MASTER_PORT` environment variable.
113120
121+
The allocated port is reserved and won't be returned by subsequent calls until it's explicitly released.
122+
123+
Returns:
124+
A port number that is reserved and free at the time of allocation
125+
114126
"""
115-
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
116-
s.bind(("", 0))
117-
port = s.getsockname()[1]
118-
s.close()
119-
return port
127+
# If an external launcher already specified a MASTER_PORT (for example, torchrun / torch.distributed.run or
128+
# multiprocessing helpers), reserve it through the port manager so no other test reuses the same number.
129+
if "MASTER_PORT" in os.environ:
130+
master_port_str = os.environ["MASTER_PORT"]
131+
try:
132+
existing_port = int(master_port_str)
133+
except ValueError:
134+
pass
135+
else:
136+
port_manager = get_port_manager()
137+
if port_manager.reserve_existing_port(existing_port):
138+
return existing_port
139+
140+
port_manager = get_port_manager()
141+
return port_manager.allocate_port()

0 commit comments

Comments
 (0)