Commit e25e22a

Merge branch 'master' into docs_checkpoint_location

2 parents 0f3fd0f + b7ca4d3 commit e25e22a

7 files changed: +121 -22 lines changed

.azure/gpu-tests-pytorch.yml

Lines changed: 4 additions & 9 deletions

@@ -1,8 +1,3 @@
-# Python package
-# Create and test a Python package on multiple Python versions.
-# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
-# https://docs.microsoft.com/azure/devops/pipelines/languages/python
-
 trigger:
   tags:
     include: ["*"]
@@ -24,18 +19,18 @@ pr:
       - "examples/run_pl_examples.sh"
       - "examples/pytorch/basics/backbone_image_classifier.py"
       - "examples/pytorch/basics/autoencoder.py"
+      - "requirements/fabric/**"
       - "requirements/pytorch/**"
       - "src/lightning/__init__.py"
       - "src/lightning/__setup__.py"
       - "src/lightning/__version__.py"
-      - "src/lightning/pytorch/**"
+      - "src/lightning_fabric/*"
+      - "src/lightning/fabric/**"
       - "src/pytorch_lightning/*"
+      - "src/lightning/pytorch/**"
       - "tests/tests_pytorch/**"
       - "tests/run_standalone_*.sh"
       - "pyproject.toml" # includes pytest config
-      - "requirements/fabric/**"
-      - "src/lightning/fabric/**"
-      - "src/lightning_fabric/*"
     exclude:
       - "requirements/*/docs.txt"
      - "*.md"

docs/source-pytorch/common/trainer.rst

Lines changed: 107 additions & 2 deletions

@@ -246,6 +246,27 @@ Example::
 See also: :ref:`gradient_accumulation` to enable more fine-grained accumulation schedules.
 
 
+barebones
+^^^^^^^^^
+
+Whether to run in "barebones mode", where all features that may impact raw speed are disabled. This is meant for
+analyzing the Trainer overhead and is discouraged during regular training runs.
+
+When enabled, the following features are automatically deactivated:
+- Checkpointing: ``enable_checkpointing=False``
+- Logging: ``logger=False``, ``log_every_n_steps=0``
+- Progress bar: ``enable_progress_bar=False``
+- Model summary: ``enable_model_summary=False``
+- Sanity checking: ``num_sanity_val_steps=0``
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(barebones=False)
+
+    # enable barebones mode for speed analysis
+    trainer = Trainer(barebones=True)
+
 benchmark
 ^^^^^^^^^
 
@@ -364,6 +385,22 @@ will need to be set up to use remote filepaths.
     # default used by the Trainer
     trainer = Trainer(default_root_dir=os.getcwd())
 
+
+detect_anomaly
+^^^^^^^^^^^^^^
+
+Enable anomaly detection for the autograd engine. This will significantly slow down compute speed and is recommended
+only for model debugging.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(detect_anomaly=False)
+
+    # enable anomaly detection for debugging
+    trainer = Trainer(detect_anomaly=True)
+
+
 devices
 ^^^^^^^
 
@@ -548,6 +585,24 @@ impact to subsequent runs. These are the changes enabled:
 - If using the CLI, the configuration file is not saved.
 
 
+gradient_clip_algorithm
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The gradient clipping algorithm to use. Pass ``gradient_clip_algorithm="value"`` to clip by value, and
+``gradient_clip_algorithm="norm"`` to clip by norm. By default it will be set to ``"norm"``.
+
+.. testcode::
+
+    # default used by the Trainer (defaults to "norm" when gradient_clip_val is set)
+    trainer = Trainer(gradient_clip_algorithm=None)
+
+    # clip by value
+    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")
+
+    # clip by norm
+    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="norm")
+
+
 gradient_clip_val
 ^^^^^^^^^^^^^^^^^
 
@@ -624,6 +679,26 @@ Example::
     # run through only 10 batches of the training set each epoch
     trainer = Trainer(limit_train_batches=10)
 
+
+limit_predict_batches
+^^^^^^^^^^^^^^^^^^^^^
+
+How much of the prediction dataset to check. Value is per device.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(limit_predict_batches=1.0)
+
+    # run through only 25% of the prediction set
+    trainer = Trainer(limit_predict_batches=0.25)
+
+    # run for only 10 batches
+    trainer = Trainer(limit_predict_batches=10)
+
+In the case of multiple prediction dataloaders, the limit applies to each dataloader individually.
+
+
 limit_test_batches
 ^^^^^^^^^^^^^^^^^^
 
@@ -801,6 +876,23 @@ For customizable options use the :class:`~lightning.pytorch.callbacks.timer.Timer`
 In case ``max_time`` is used together with ``min_steps`` or ``min_epochs``, the ``min_*`` requirement
 always has precedence.
 
+
+model_registry
+^^^^^^^^^^^^^^
+
+If specified, the model will be uploaded to the Lightning Model Registry under the provided name.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(model_registry=None)
+
+    # specify a model name for model hub upload
+    trainer = Trainer(model_registry="my-model-name")
+
+See the `Lightning model registry docs <https://lightning.ai/docs/overview/finetune-models/model-registry>`_ for more info.
+
+
 num_nodes
 ^^^^^^^^^
 
@@ -875,12 +967,25 @@ Useful for quickly debugging or trying to overfit on purpose.
 
     # debug using a single consistent train batch and a single consistent val batch
 
+plugins
+^^^^^^^
 
-:ref:`Plugins` allow you to connect arbitrary backends, precision libraries, clusters etc. For example:
-
+Plugins allow you to connect arbitrary backends, precision libraries, clusters, etc., and to modify core Lightning logic.
+Examples of plugin types:
 - :ref:`Checkpoint IO <checkpointing_expert>`
 - `TorchElastic <https://pytorch.org/elastic/0.2.2/index.html>`_
 - :ref:`Precision Plugins <precision_expert>`
+- :class:`~lightning.pytorch.plugins.environments.ClusterEnvironment`
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(plugins=None)
+
+    # example using the built-in SLURM plugin
+    from lightning.fabric.plugins.environments import SLURMEnvironment
+    trainer = Trainer(plugins=[SLURMEnvironment()])
+
 
 To define your own behavior, subclass the relevant class and pass it in. Here's an example linking up your own
 :class:`~lightning.pytorch.plugins.environments.ClusterEnvironment`.
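The restored ``plugins`` section ends by pointing at a custom ClusterEnvironment; the docs' full example sits outside this hunk. For orientation, here is a minimal sketch of such a subclass, assuming the abstract interface of ``lightning.fabric.plugins.environments.ClusterEnvironment`` in recent Lightning releases and reading made-up ``MY_*`` environment variables:

import os

from lightning.fabric.plugins.environments import ClusterEnvironment
from lightning.pytorch import Trainer


class MyClusterEnvironment(ClusterEnvironment):
    """Sketch: rank and address info comes from hypothetical MY_* variables."""

    @property
    def creates_processes_externally(self) -> bool:
        return True  # the cluster launcher starts the processes, not Lightning

    @property
    def main_address(self) -> str:
        return os.environ["MY_MAIN_ADDRESS"]

    @property
    def main_port(self) -> int:
        return int(os.environ["MY_MAIN_PORT"])

    @staticmethod
    def detect() -> bool:
        # whether this environment is active for the current process
        return "MY_MAIN_ADDRESS" in os.environ

    def world_size(self) -> int:
        return int(os.environ["MY_WORLD_SIZE"])

    def set_world_size(self, size: int) -> None:
        pass  # fixed by the launcher, nothing to persist

    def global_rank(self) -> int:
        return int(os.environ["MY_RANK"])

    def set_global_rank(self, rank: int) -> None:
        pass  # fixed by the launcher, nothing to persist

    def local_rank(self) -> int:
        return int(os.environ["MY_LOCAL_RANK"])

    def node_rank(self) -> int:
        return int(os.environ["MY_NODE_RANK"])


trainer = Trainer(plugins=[MyClusterEnvironment()])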

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -120,7 +120,6 @@ ignore = [
     "S607", # todo: Starting a process with a partial executable path
     "RET504", # todo:Unnecessary variable assignment before `return` statement
     "PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore
-    "PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
     "PT012", # todo: `pytest.raises()` block should contain a single simple statement
     "PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead
 ]
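Removing the ``PT011`` ignore means Ruff now flags any bare ``pytest.raises(ValueError)``, which is exactly what the test changes below address. As a minimal sketch of the pattern (the ``parse_port`` helper is hypothetical, not from this commit; ``match`` is applied as a regular expression via ``re.search`` against the exception message):

import pytest


def parse_port(value: str) -> int:
    """Hypothetical helper used only to illustrate the PT011 fix."""
    port = int(value)
    if not 0 < port < 65536:
        raise ValueError(f"Port {port} is out of range")
    return port


def test_parse_port_out_of_range():
    # Too broad (what PT011 flags): any ValueError in the block would pass,
    # even one raised by an unrelated bug:
    #     with pytest.raises(ValueError):
    #         parse_port("70000")

    # Preferred: pin the test to the specific failure with a match regex.
    with pytest.raises(ValueError, match="out of range"):
        parse_port("70000")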

tests/tests_pytorch/loggers/test_neptune.py

Lines changed: 5 additions & 5 deletions

@@ -121,19 +121,19 @@ def test_online_with_wrong_kwargs(neptune_mock):
     init."""
     run = neptune_mock.init_run()
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Run parameter expected to be of type `neptune.Run`*"):
         NeptuneLogger(run="some string")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, project="redundant project")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, api_key="redundant api key")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, name="redundant api name")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, foo="random **kwarg")
 
     # this should work
tests/tests_pytorch/models/test_hparams.py

Lines changed: 1 addition & 1 deletion

@@ -527,7 +527,7 @@ def __init__(self, arg1, arg2):
 )
 def test_single_config_models_fail(tmp_path, cls, config):
     """Test fail on passing unsupported config type."""
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match=r"Primitives \(<class 'bool'>*"):
         _ = cls(**config)
 
 
tests/tests_pytorch/profilers/test_profiler.py

Lines changed: 3 additions & 3 deletions

@@ -86,12 +86,12 @@ def test_simple_profiler_overhead(simple_profiler):
 def test_simple_profiler_value_errors(simple_profiler):
     """Ensure errors are raised where expected."""
     action = "test"
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempting to stop recording an action*"):
         simple_profiler.stop(action)
 
     simple_profiler.start(action)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempted to start test*"):
         simple_profiler.start(action)
 
     simple_profiler.stop(action)
@@ -325,7 +325,7 @@ def test_advanced_profiler_dump_states(tmp_path):
 def test_advanced_profiler_value_errors(advanced_profiler):
     """Ensure errors are raised where expected."""
     action = "test"
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempting to stop recording*"):
         advanced_profiler.stop(action)
 
     advanced_profiler.start(action)
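The messages asserted here pin down the profiler's start/stop contract: stopping an action that was never started raises, and so does starting one that is already running. A hedged sketch of that contract, assuming only the public ``SimpleProfiler`` API:

from lightning.pytorch.profilers import SimpleProfiler

profiler = SimpleProfiler()

# stop() before start() would raise ValueError ("Attempting to stop recording ...")
profiler.start("my_action")  # begin timing "my_action"
# a second start("my_action") here would raise ("Attempted to start ...")
profiler.stop("my_action")   # end the recording

print(profiler.summary())    # per-action timing report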

tests/tests_pytorch/test_cli.py

Lines changed: 1 addition & 1 deletion

@@ -339,7 +339,7 @@ def test_lightning_cli_save_config_seed_everything(cleandir):
 
 
 def test_save_to_log_dir_false_error():
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="`save_to_log_dir=False` only makes sense*"):
         SaveConfigCallback(
             LightningArgumentParser(),
             Namespace(),
