
Commit c6a77a9

Merge branch 'master' into chualan/fix-19658

2 parents: 4567c49 + 601c060

File tree: 17 files changed (+176 / -11 lines)

.azure/gpu-benchmarks.yml
Lines changed: 3 additions & 1 deletion

@@ -75,7 +75,9 @@ jobs:
       pip list
     displayName: "Image info & NVIDIA"

-  - bash: pip install -e .[dev] --find-links ${TORCH_URL}
+  - bash: |
+      pip install -e .[dev] --find-links ${TORCH_URL}
+      pip install setuptools==75.6.0
     env:
       FREEZE_REQUIREMENTS: "1"
     displayName: "Install package"

.azure/gpu-tests-fabric.yml
Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ jobs:
   - bash: |
       extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
       pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
+      pip install setuptools==75.6.0
     displayName: "Install package & dependencies"

   - bash: |

.azure/gpu-tests-pytorch.yml
Lines changed: 1 addition & 0 deletions

@@ -111,6 +111,7 @@ jobs:
   - bash: |
       extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
       pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
+      pip install setuptools==75.6.0
     displayName: "Install package & dependencies"

   - bash: pip uninstall -y lightning

dockers/base-cuda/Dockerfile
Lines changed: 2 additions & 1 deletion

@@ -59,7 +59,6 @@ RUN \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get install -y \
         python${PYTHON_VERSION} \
-        python3-setuptools \
         python${PYTHON_VERSION}-dev \
     && \
     update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \

@@ -79,6 +78,8 @@ RUN \
     curl https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} && \
     # Disable cache \
     pip config set global.cache-dir false && \
+    # Install recent setuptools to obtain pkg_resources \
+    pip install setuptools==75.6.0 && \
     # set particular PyTorch version \
     pip install -q wget packaging && \
     python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py && \
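The CI jobs and Dockerfiles above all pin setuptools to 75.6.0 so that pkg_resources is available in the image; pkg_resources ships with setuptools and is no longer present by default in newer Python environments, hence the explicit install. A minimal sanity check one could run in a built image (not part of this commit, names are only illustrative):

    # Hypothetical check: confirm the pinned setuptools still provides pkg_resources.
    import pkg_resources
    import setuptools

    assert setuptools.__version__ == "75.6.0", setuptools.__version__
    print("pkg_resources available from:", pkg_resources.__file__)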

dockers/release/Dockerfile
Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ RUN \
     fi && \
     # otherwise there is collision with folder name and pkg name on Pypi
     cd pytorch-lightning && \
-    pip install setuptools && \
+    pip install setuptools==75.6.0 && \
     PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \
     cd .. && \

docs/source-pytorch/common/checkpointing_basic.rst
Lines changed: 23 additions & 1 deletion

@@ -20,6 +20,13 @@ PyTorch Lightning checkpoints are fully usable in plain PyTorch.

 ----

+.. important::
+
+   **Important Update: Deprecated Method**
+
+   Starting from PyTorch Lightning v1.0.0, the `resume_from_checkpoint` argument has been deprecated. To resume training from a checkpoint, use the `ckpt_path` argument in the `fit()` method.
+   Please update your code accordingly to avoid potential compatibility issues.
+
 ************************
 Contents of a checkpoint
 ************************

@@ -197,16 +204,31 @@ You can disable checkpointing by passing:

 ----

+
 *********************
 Resume training state
 *********************

 If you don't just want to load weights, but instead restore the full training, do the following:

+Correct usage:
+
 .. code-block:: python

     model = LitModel()
     trainer = Trainer()

     # automatically restores model, epoch, step, LR schedulers, etc...
-    trainer.fit(model, ckpt_path="some/path/to/my_checkpoint.ckpt")
+    trainer.fit(model, ckpt_path="path/to/your/checkpoint.ckpt")
+
+.. warning::
+
+   The argument `resume_from_checkpoint` has been deprecated in versions of PyTorch Lightning >= 1.0.0.
+   To resume training from a checkpoint, use the `ckpt_path` argument in the `fit()` method instead.
+
+Incorrect (deprecated) usage:
+
+.. code-block:: python
+
+    trainer = Trainer(resume_from_checkpoint="path/to/your/checkpoint.ckpt")
+    trainer.fit(model)
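The documentation change above replaces the removed Trainer(resume_from_checkpoint=...) pattern with the ckpt_path argument of fit(). As a rough sketch of how the two common cases differ (LitModel and the checkpoint path are placeholders from the doc, not real artifacts):

    # Sketch only: LitModel and the checkpoint path are placeholders.
    from lightning.pytorch import Trainer

    # Load weights only: rebuild the module from a checkpoint file.
    model = LitModel.load_from_checkpoint("path/to/your/checkpoint.ckpt")

    # Resume the full training state: pass ckpt_path to fit(); epoch, global step,
    # optimizer and LR scheduler states are restored along with the weights.
    trainer = Trainer(max_epochs=10)
    trainer.fit(LitModel(), ckpt_path="path/to/your/checkpoint.ckpt")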

docs/source-pytorch/common/index.rst
Lines changed: 8 additions & 0 deletions

@@ -23,6 +23,7 @@
    ../data/data
    ../model/own_your_loop
    ../advanced/model_init
+   ../common/tbptt


 #############

@@ -202,6 +203,13 @@ How-to Guides
    :col_css: col-md-4
    :height: 180

+.. displayitem::
+   :header: Truncated Back-Propagation Through Time
+   :description: Efficiently step through time when training recurrent models
+   :button_link: ../common/tbptt.html
+   :col_css: col-md-4
+   :height: 180
+
 .. raw:: html

    </div>
docs/source-pytorch/common/tbptt.rst
Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+##############################################
+Truncated Backpropagation Through Time (TBPTT)
+##############################################
+
+Truncated Backpropagation Through Time (TBPTT) performs backpropagation every k steps of
+a much longer sequence. This is made possible by passing training batches
+split along the time-dimensions into splits of size k to the
+``training_step``. In order to keep the same forward propagation behavior, all
+hidden states should be kept in-between each time-dimension split.
+
+
+.. code-block:: python
+
+    import torch
+    import torch.optim as optim
+    import pytorch_lightning as pl
+    from pytorch_lightning import LightningModule
+
+    class LitModel(LightningModule):
+
+        def __init__(self):
+            super().__init__()
+
+            # 1. Switch to manual optimization
+            self.automatic_optimization = False
+
+            self.truncated_bptt_steps = 10
+            self.my_rnn = ParityModuleRNN()  # Define RNN model using ParityModuleRNN
+
+        # 2. Remove the `hiddens` argument
+        def training_step(self, batch, batch_idx):
+
+            # 3. Split the batch in chunks along the time dimension
+            split_batches = split_batch(batch, self.truncated_bptt_steps)
+
+            batch_size = 10
+            hidden_dim = 20
+            hiddens = torch.zeros(1, batch_size, hidden_dim, device=self.device)
+            for split_batch in range(split_batches):
+                # 4. Perform the optimization in a loop
+                loss, hiddens = self.my_rnn(split_batch, hiddens)
+                self.backward(loss)
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+
+                # 5. "Truncate"
+                hiddens = hiddens.detach()
+
+            # 6. Remove the return of `hiddens`
+            # Returning loss in manual optimization is not needed
+            return None
+
+        def configure_optimizers(self):
+            return optim.Adam(self.my_rnn.parameters(), lr=0.001)
+
+    if __name__ == "__main__":
+        model = LitModel()
+        trainer = pl.Trainer(max_epochs=5)
+        trainer.fit(model, train_dataloader)  # Define your own dataloader
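The snippet in the new page relies on ParityModuleRNN and a split_batch helper that it does not define. Below is a self-contained sketch of the same manual-optimization TBPTT pattern, using a plain torch.nn.RNN and a hypothetical split_time helper in place of those names; it is illustrative, not the code shipped in the docs.

    # Self-contained TBPTT sketch; split_time, LitTBPTT and the toy data are assumptions, not Lightning API.
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
    import pytorch_lightning as pl


    def split_time(batch, k):
        # Split (x, y) tensors of shape (batch, time, features) into chunks of k time steps.
        x, y = batch
        return [(x[:, i : i + k], y[:, i : i + k]) for i in range(0, x.size(1), k)]


    class LitTBPTT(pl.LightningModule):
        def __init__(self, input_dim=8, hidden_dim=20):
            super().__init__()
            self.automatic_optimization = False  # manual optimization, as in the page above
            self.truncated_bptt_steps = 10
            self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
            self.head = nn.Linear(hidden_dim, input_dim)

        def training_step(self, batch, batch_idx):
            opt = self.optimizers()
            hiddens = None
            for x, y in split_time(batch, self.truncated_bptt_steps):
                out, hiddens = self.rnn(x, hiddens)
                loss = nn.functional.mse_loss(self.head(out), y)
                opt.zero_grad()
                self.manual_backward(loss)
                opt.step()
                hiddens = hiddens.detach()  # truncate: cut the graph between chunks

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)


    if __name__ == "__main__":
        x = torch.randn(32, 100, 8)  # 32 sequences, 100 time steps, 8 features
        loader = DataLoader(TensorDataset(x, x), batch_size=4)
        pl.Trainer(max_epochs=1, logger=False, enable_checkpointing=False).fit(LitTBPTT(), loader)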

docs/source-pytorch/conf.py
Lines changed: 3 additions & 1 deletion

@@ -462,7 +462,9 @@ def _load_py_module(name: str, location: str) -> ModuleType:
     ("py:obj", "lightning.pytorch.utilities.memory.is_out_of_cpu_memory"),
     ("py:func", "lightning.pytorch.utilities.rank_zero.rank_zero_only"),
     ("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfig"),
-    ("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfig"),
+    ("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfigType"),
+    ("py:class", "lightning.pytorch.utilities.types.OptimizerConfigType"),
+    ("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfigType"),
     ("py:class", "lightning_habana.pytorch.plugins.precision.HPUPrecisionPlugin"),
     ("py:class", "lightning_habana.pytorch.strategies.HPUDDPStrategy"),
     ("py:class", "lightning_habana.pytorch.strategies.HPUParallelStrategy"),

src/lightning/fabric/plugins/precision/fsdp.py
Lines changed: 6 additions & 0 deletions

@@ -74,6 +74,12 @@ def __init__(self, precision: _PRECISION_INPUT, scaler: Optional["ShardedGradSca
         }
         self._desired_input_dtype = precision_to_type[self.precision]

+    @override
+    def convert_module(self, module: Module) -> Module:
+        if "true" in self.precision:
+            return module.to(dtype=self._desired_input_dtype)
+        return module
+
     @property
     def mixed_precision_config(self) -> "TorchMixedPrecision":
         from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision
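The new convert_module hook eagerly casts module parameters only for "-true" precision settings; mixed settings leave parameters in float32 and rely on FSDP's MixedPrecision config for per-operation casting. A rough standalone sketch of that selection logic (the names and dtype table below are assumptions for illustration, not Fabric internals):

    # Standalone sketch of the "-true" vs "-mixed" behaviour; not Fabric code.
    import torch
    import torch.nn as nn

    _PRECISION_TO_DTYPE = {
        "32-true": torch.float32,
        "16-true": torch.float16,
        "bf16-true": torch.bfloat16,
        # "16-mixed" / "bf16-mixed": parameters stay float32; casting happens per operation.
    }

    def convert_module_like(module: nn.Module, precision: str) -> nn.Module:
        # Mirror the hook above: only "-true" modes convert the parameters themselves.
        if "true" in precision:
            return module.to(dtype=_PRECISION_TO_DTYPE[precision])
        return module

    layer = convert_module_like(nn.Linear(4, 4), "bf16-true")
    print(next(layer.parameters()).dtype)  # torch.bfloat16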
