Commit e25e22a

Merge branch 'master' into docs_checkpoint_location

2 parents 0f3fd0f + b7ca4d3 commit e25e22a

7 files changed: +121 -22 lines changed

.azure/gpu-tests-pytorch.yml

Lines changed: 4 additions & 9 deletions

@@ -1,8 +1,3 @@
-# Python package
-# Create and test a Python package on multiple Python versions.
-# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
-# https://docs.microsoft.com/azure/devops/pipelines/languages/python
-
 trigger:
   tags:
     include: ["*"]
@@ -24,18 +19,18 @@ pr:
       - "examples/run_pl_examples.sh"
       - "examples/pytorch/basics/backbone_image_classifier.py"
       - "examples/pytorch/basics/autoencoder.py"
+      - "requirements/fabric/**"
       - "requirements/pytorch/**"
       - "src/lightning/__init__.py"
       - "src/lightning/__setup__.py"
       - "src/lightning/__version__.py"
-      - "src/lightning/pytorch/**"
+      - "src/lightning_fabric/*"
+      - "src/lightning/fabric/**"
       - "src/pytorch_lightning/*"
+      - "src/lightning/pytorch/**"
       - "tests/tests_pytorch/**"
       - "tests/run_standalone_*.sh"
       - "pyproject.toml" # includes pytest config
-      - "requirements/fabric/**"
-      - "src/lightning/fabric/**"
-      - "src/lightning_fabric/*"
     exclude:
       - "requirements/*/docs.txt"
      - "*.md"

docs/source-pytorch/common/trainer.rst

Lines changed: 107 additions & 2 deletions

@@ -246,6 +246,27 @@ Example::
 See also: :ref:`gradient_accumulation` to enable more fine-grained accumulation schedules.
 
 
+barebones
+^^^^^^^^^
+
+Whether to run in "barebones mode", where all features that may impact raw speed are disabled. This is meant for
+analyzing the Trainer overhead and is discouraged during regular training runs.
+
+When enabled, the following features are automatically deactivated:
+- Checkpointing: ``enable_checkpointing=False``
+- Logging: ``logger=False``, ``log_every_n_steps=0``
+- Progress bar: ``enable_progress_bar=False``
+- Model summary: ``enable_model_summary=False``
+- Sanity checking: ``num_sanity_val_steps=0``
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(barebones=False)
+
+    # enable barebones mode for speed analysis
+    trainer = Trainer(barebones=True)
+
 benchmark
 ^^^^^^^^^
 
@@ -364,6 +385,22 @@ will need to be set up to use remote filepaths.
     # default used by the Trainer
     trainer = Trainer(default_root_dir=os.getcwd())
 
+
+detect_anomaly
+^^^^^^^^^^^^^^
+
+Enable anomaly detection for the autograd engine. This will significantly slow down compute speed and is recommended
+only for model debugging.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(detect_anomaly=False)
+
+    # enable anomaly detection for debugging
+    trainer = Trainer(detect_anomaly=True)
+
+
 devices
 ^^^^^^^
 
@@ -548,6 +585,24 @@ impact to subsequent runs. These are the changes enabled:
 - If using the CLI, the configuration file is not saved.
 
 
+gradient_clip_algorithm
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The gradient clipping algorithm to use. Pass ``gradient_clip_algorithm="value"`` to clip by value, and
+``gradient_clip_algorithm="norm"`` to clip by norm. By default it will be set to ``"norm"``.
+
+.. testcode::
+
+    # default used by the Trainer (defaults to "norm" when gradient_clip_val is set)
+    trainer = Trainer(gradient_clip_algorithm=None)
+
+    # clip by value
+    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")
+
+    # clip by norm
+    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="norm")
+
+
 gradient_clip_val
 ^^^^^^^^^^^^^^^^^
 
@@ -624,6 +679,26 @@ Example::
     # run through only 10 batches of the training set each epoch
     trainer = Trainer(limit_train_batches=10)
 
+
+limit_predict_batches
+^^^^^^^^^^^^^^^^^^^^^
+
+How much of the prediction dataset to check. Value is per device.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(limit_predict_batches=1.0)
+
+    # run through only 25% of the prediction set
+    trainer = Trainer(limit_predict_batches=0.25)
+
+    # run for only 10 batches
+    trainer = Trainer(limit_predict_batches=10)
+
+In the case of multiple prediction dataloaders, the limit applies to each dataloader individually.
+
+
 limit_test_batches
 ^^^^^^^^^^^^^^^^^^
 
@@ -801,6 +876,23 @@ For customizable options use the :class:`~lightning.pytorch.callbacks.timer.Timer`
 In case ``max_time`` is used together with ``min_steps`` or ``min_epochs``, the ``min_*`` requirement
 always has precedence.
 
+
+model_registry
+^^^^^^^^^^^^^^
+
+If specified, the model will be uploaded to the Lightning Model Registry under the provided name.
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(model_registry=None)
+
+    # specify a model name for model hub upload
+    trainer = Trainer(model_registry="my-model-name")
+
+See the `Lightning model registry docs <https://lightning.ai/docs/overview/finetune-models/model-registry>`_ for more info.
+
+
 num_nodes
 ^^^^^^^^^
 
@@ -875,12 +967,25 @@ Useful for quickly debugging or trying to overfit on purpose.
 
     # debug using a single consistent train batch and a single consistent val batch
 
+plugins
+^^^^^^^
 
-:ref:`Plugins` allow you to connect arbitrary backends, precision libraries, clusters etc. For example:
-
+Plugins allow you to connect arbitrary backends, precision libraries, clusters, etc., and to modify core Lightning logic.
+Examples of plugin types:
 - :ref:`Checkpoint IO <checkpointing_expert>`
 - `TorchElastic <https://pytorch.org/elastic/0.2.2/index.html>`_
 - :ref:`Precision Plugins <precision_expert>`
+- :class:`~lightning.pytorch.plugins.environments.ClusterEnvironment`
+
+.. testcode::
+
+    # default used by the Trainer
+    trainer = Trainer(plugins=None)
+
+    # example using the built-in SLURM plugin
+    from lightning.fabric.plugins.environments import SLURMEnvironment
+    trainer = Trainer(plugins=[SLURMEnvironment()])
+
 
 To define your own behavior, subclass the relevant class and pass it in. Here's an example linking up your own
 :class:`~lightning.pytorch.plugins.environments.ClusterEnvironment`.
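The restored ``plugins`` section ends by pointing at a custom ClusterEnvironment; the docs' full example sits outside this hunk. For orientation, here is a minimal sketch of such a subclass, assuming the abstract interface of ``lightning.fabric.plugins.environments.ClusterEnvironment`` in recent Lightning releases and reading made-up ``MY_*`` environment variables:

import os

from lightning.fabric.plugins.environments import ClusterEnvironment
from lightning.pytorch import Trainer


class MyClusterEnvironment(ClusterEnvironment):
    """Sketch: rank and address info comes from hypothetical MY_* variables."""

    @property
    def creates_processes_externally(self) -> bool:
        return True  # the cluster launcher starts the processes, not Lightning

    @property
    def main_address(self) -> str:
        return os.environ["MY_MAIN_ADDRESS"]

    @property
    def main_port(self) -> int:
        return int(os.environ["MY_MAIN_PORT"])

    @staticmethod
    def detect() -> bool:
        # whether this environment is active for the current process
        return "MY_MAIN_ADDRESS" in os.environ

    def world_size(self) -> int:
        return int(os.environ["MY_WORLD_SIZE"])

    def set_world_size(self, size: int) -> None:
        pass  # fixed by the launcher, nothing to persist

    def global_rank(self) -> int:
        return int(os.environ["MY_RANK"])

    def set_global_rank(self, rank: int) -> None:
        pass  # fixed by the launcher, nothing to persist

    def local_rank(self) -> int:
        return int(os.environ["MY_LOCAL_RANK"])

    def node_rank(self) -> int:
        return int(os.environ["MY_NODE_RANK"])


trainer = Trainer(plugins=[MyClusterEnvironment()])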

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -120,7 +120,6 @@ ignore = [
     "S607", # todo: Starting a process with a partial executable path
     "RET504", # todo:Unnecessary variable assignment before `return` statement
     "PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore
-    "PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
     "PT012", # todo: `pytest.raises()` block should contain a single simple statement
     "PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead
 ]
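Removing the ``PT011`` ignore means Ruff now flags any bare ``pytest.raises(ValueError)``, which is exactly what the test changes below address. As a minimal sketch of the pattern (the ``parse_port`` helper is hypothetical, not from this commit; ``match`` is applied as a regular expression via ``re.search`` against the exception message):

import pytest


def parse_port(value: str) -> int:
    """Hypothetical helper used only to illustrate the PT011 fix."""
    port = int(value)
    if not 0 < port < 65536:
        raise ValueError(f"Port {port} is out of range")
    return port


def test_parse_port_out_of_range():
    # Too broad (what PT011 flags): any ValueError in the block would pass,
    # even one raised by an unrelated bug:
    #     with pytest.raises(ValueError):
    #         parse_port("70000")

    # Preferred: pin the test to the specific failure with a match regex.
    with pytest.raises(ValueError, match="out of range"):
        parse_port("70000")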

tests/tests_pytorch/loggers/test_neptune.py

Lines changed: 5 additions & 5 deletions

@@ -121,19 +121,19 @@ def test_online_with_wrong_kwargs(neptune_mock):
     init."""
     run = neptune_mock.init_run()
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Run parameter expected to be of type `neptune.Run`*"):
         NeptuneLogger(run="some string")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, project="redundant project")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, api_key="redundant api key")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, name="redundant api name")
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="When an already initialized run object is provided*"):
         NeptuneLogger(run=run, foo="random **kwarg")
 
     # this should work
tests/tests_pytorch/models/test_hparams.py

Lines changed: 1 addition & 1 deletion

@@ -527,7 +527,7 @@ def __init__(self, arg1, arg2):
 )
 def test_single_config_models_fail(tmp_path, cls, config):
     """Test fail on passing unsupported config type."""
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match=r"Primitives \(<class 'bool'>*"):
         _ = cls(**config)
 
 
tests/tests_pytorch/profilers/test_profiler.py

Lines changed: 3 additions & 3 deletions

@@ -86,12 +86,12 @@ def test_simple_profiler_overhead(simple_profiler):
 def test_simple_profiler_value_errors(simple_profiler):
     """Ensure errors are raised where expected."""
     action = "test"
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempting to stop recording an action*"):
         simple_profiler.stop(action)
 
     simple_profiler.start(action)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempted to start test*"):
         simple_profiler.start(action)
 
     simple_profiler.stop(action)
@@ -325,7 +325,7 @@ def test_advanced_profiler_dump_states(tmp_path):
 def test_advanced_profiler_value_errors(advanced_profiler):
     """Ensure errors are raised where expected."""
     action = "test"
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Attempting to stop recording*"):
         advanced_profiler.stop(action)
 
     advanced_profiler.start(action)
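The messages asserted here pin down the profiler's start/stop contract: stopping an action that was never started raises, and so does starting one that is already running. A hedged sketch of that contract, assuming only the public ``SimpleProfiler`` API:

from lightning.pytorch.profilers import SimpleProfiler

profiler = SimpleProfiler()

# stop() before start() would raise ValueError ("Attempting to stop recording ...")
profiler.start("my_action")  # begin timing "my_action"
# a second start("my_action") here would raise ("Attempted to start ...")
profiler.stop("my_action")   # end the recording

print(profiler.summary())    # per-action timing report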

tests/tests_pytorch/test_cli.py

Lines changed: 1 addition & 1 deletion

@@ -339,7 +339,7 @@ def test_lightning_cli_save_config_seed_everything(cleandir):
 
 
 def test_save_to_log_dir_false_error():
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="`save_to_log_dir=False` only makes sense*"):
         SaveConfigCallback(
             LightningArgumentParser(),
             Namespace(),
