
Commit 1ef3940

Merge branch 'master' into ci/bump-pt-2.6
2 parents: 74b11a4 + 5073ac1

13 files changed: +165 -14 lines changed

.github/workflows/call-clear-cache.yml

Lines changed: 4 additions & 4 deletions

@@ -23,18 +23,18 @@ on:
 jobs:
   cron-clear:
     if: github.event_name == 'schedule' || github.event_name == 'pull_request'
-    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.12.0
+    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.0
     with:
-      scripts-ref: v0.11.8
+      scripts-ref: v0.14.0
       dry-run: ${{ github.event_name == 'pull_request' }}
       pattern: "latest|docs"
       age-days: 7
 
   direct-clear:
     if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
-    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.12.0
+    uses: Lightning-AI/utilities/.github/workflows/cleanup-caches.yml@v0.14.0
     with:
-      scripts-ref: v0.11.8
+      scripts-ref: v0.14.0
       dry-run: ${{ github.event_name == 'pull_request' }}
       pattern: ${{ inputs.pattern || 'pypi_wheels' }} # setting str in case of PR / debugging
       age-days: ${{ fromJSON(inputs.age-days) || 0 }} # setting 0 in case of PR / debugging

.github/workflows/ci-check-md-links.yml

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ on:
 
 jobs:
   check-md-links:
-    uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.12.0
+    uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.14.0
     with:
       config-file: ".github/markdown-links-config.json"
       base-branch: "master"

.github/workflows/ci-schema.yml

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ on:
 
 jobs:
   check:
-    uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.12.0
+    uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.14.0
     with:
       # skip azure due to the wrong schema file by MSFT
       # https://github.com/Lightning-AI/lightning-flash/pull/1455#issuecomment-1244793607

docs/source-pytorch/visualize/loggers.rst

Lines changed: 34 additions & 0 deletions

@@ -54,3 +54,37 @@ Track and Visualize Experiments
 
 </div>
 </div>
+
+.. _mlflow_logger:
+
+MLflow Logger
+-------------
+
+The MLflow logger in PyTorch Lightning now includes a `checkpoint_path_prefix` parameter. This parameter allows you to prefix the checkpoint artifact's path when logging checkpoints as artifacts.
+
+Example usage:
+
+.. code-block:: python
+
+    import lightning as L
+    from lightning.pytorch.loggers import MLFlowLogger
+
+    mlf_logger = MLFlowLogger(
+        experiment_name="lightning_logs",
+        tracking_uri="file:./ml-runs",
+        checkpoint_path_prefix="my_prefix"
+    )
+    trainer = L.Trainer(logger=mlf_logger)
+
+    # Your LightningModule definition
+    class LitModel(L.LightningModule):
+        def training_step(self, batch, batch_idx):
+            # example
+            self.logger.experiment.whatever_ml_flow_supports(...)
+
+        def any_lightning_module_function_or_hook(self):
+            self.logger.experiment.whatever_ml_flow_supports(...)
+
+    # Train your model
+    model = LitModel()
+    trainer.fit(model)
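
Not part of the diff: a hedged sketch of how one might check where the prefixed checkpoints land after training, assuming ``log_model=True`` (or ``"all"``) so that ModelCheckpoint files are actually uploaded. It reuses the "my_prefix" name from the snippet above and relies on MLFlowLogger.experiment being the underlying mlflow.tracking.MlflowClient.

# Sketch only: inspect the checkpoint artifacts after training, assuming
# log_model=True so ModelCheckpoint files are uploaded to MLflow.
from lightning.pytorch.loggers import MLFlowLogger

mlf_logger = MLFlowLogger(
    experiment_name="lightning_logs",
    tracking_uri="file:./ml-runs",
    log_model=True,                      # upload checkpoints as artifacts
    checkpoint_path_prefix="my_prefix",  # the new parameter documented above
)

# ... run trainer.fit(model) with logger=mlf_logger as in the example above ...

# MLFlowLogger.experiment is an mlflow.tracking.MlflowClient, so the uploaded
# checkpoints can be listed under the prefix directory:
for artifact in mlf_logger.experiment.list_artifacts(mlf_logger.run_id, "my_prefix"):
    print(artifact.path)  # e.g. "my_prefix/epoch=0-step=100"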

src/lightning/pytorch/CHANGELOG.md

Lines changed: 16 additions & 0 deletions

@@ -8,15 +8,31 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Allow LightningCLI to use a customized argument parser class ([#20596](https://github.com/Lightning-AI/pytorch-lightning/pull/20596))
+
+
 ### Changed
 
+- Change `wandb` default x-axis to `tensorboard`'s `global_step` when `sync_tensorboard=True` ([#20611](https://github.com/Lightning-AI/pytorch-lightning/pull/20611))
+
+
+- Added a new `checkpoint_path_prefix` parameter to the MLflow logger which can control the path to where the MLflow artifacts for the model checkpoints are stored ([#20538](https://github.com/Lightning-AI/pytorch-lightning/pull/20538))
+
+
+
 ### Removed
 
+-
+
+
 ### Fixed
 
 - Fix CSVLogger logging hyperparameter at every write which increase latency ([#20594](https://github.com/Lightning-AI/pytorch-lightning/pull/20594))
 
 
+- Always call `WandbLogger.experiment` first in `_call_setup_hook` to ensure `tensorboard` logs can sync to `wandb` ([#20610](https://github.com/Lightning-AI/pytorch-lightning/pull/20610))
+
+
 ## [2.5.0] - 2024-12-19
 
 ### Added

src/lightning/pytorch/cli.py

Lines changed: 3 additions & 1 deletion

@@ -314,6 +314,7 @@ def __init__(
         trainer_defaults: Optional[dict[str, Any]] = None,
         seed_everything_default: Union[bool, int] = True,
         parser_kwargs: Optional[Union[dict[str, Any], dict[str, dict[str, Any]]]] = None,
+        parser_class: type[LightningArgumentParser] = LightningArgumentParser,
         subclass_mode_model: bool = False,
         subclass_mode_data: bool = False,
         args: ArgsType = None,
@@ -367,6 +368,7 @@ def __init__(
         self.trainer_defaults = trainer_defaults or {}
         self.seed_everything_default = seed_everything_default
         self.parser_kwargs = parser_kwargs or {}
+        self.parser_class = parser_class
         self.auto_configure_optimizers = auto_configure_optimizers
 
         self.model_class = model_class
@@ -404,7 +406,7 @@ def _setup_parser_kwargs(self, parser_kwargs: dict[str, Any]) -> tuple[dict[str,
     def init_parser(self, **kwargs: Any) -> LightningArgumentParser:
         """Method that instantiates the argument parser."""
         kwargs.setdefault("dump_header", [f"lightning.pytorch=={pl.__version__}"])
-        parser = LightningArgumentParser(**kwargs)
+        parser = self.parser_class(**kwargs)
         parser.add_argument(
             "-c", "--config", action=ActionConfigFile, help="Path to a configuration file in json or yaml format."
         )
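
A hedged sketch of how the new parser_class hook could be used, since init_parser now instantiates whatever class is passed in. MyModel is a hypothetical LightningModule defined elsewhere, and the extra --run-name option is purely illustrative.

# Sketch only: MyModel is a placeholder LightningModule defined elsewhere.
from lightning.pytorch.cli import LightningArgumentParser, LightningCLI


class CustomArgumentParser(LightningArgumentParser):
    """Parser subclass that, for example, always registers an extra option."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--run-name", type=str, default="default-run")


def cli_main():
    # LightningCLI now builds its parser with CustomArgumentParser instead of
    # the stock LightningArgumentParser.
    LightningCLI(MyModel, parser_class=CustomArgumentParser)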

src/lightning/pytorch/loggers/mlflow.py

Lines changed: 4 additions & 2 deletions

@@ -97,7 +97,7 @@ def any_lightning_module_function_or_hook(self):
           :paramref:`~lightning.pytorch.callbacks.Checkpoint.save_top_k` ``== -1``
           which also logs every checkpoint during training.
         * if ``log_model == False`` (default), no checkpoint is logged.
-
+        checkpoint_path_prefix: A string to prefix the checkpoint artifact's path.
         prefix: A string to put at the beginning of metric keys.
         artifact_location: The location to store run artifacts. If not provided, the server picks an appropriate
             default.
@@ -121,6 +121,7 @@ def __init__(
         tags: Optional[dict[str, Any]] = None,
         save_dir: Optional[str] = "./mlruns",
         log_model: Literal[True, False, "all"] = False,
+        checkpoint_path_prefix: str = "",
         prefix: str = "",
         artifact_location: Optional[str] = None,
         run_id: Optional[str] = None,
@@ -147,6 +148,7 @@ def __init__(
         self._artifact_location = artifact_location
         self._log_batch_kwargs = {} if synchronous is None else {"synchronous": synchronous}
         self._initialized = False
+        self._checkpoint_path_prefix = checkpoint_path_prefix
 
         from mlflow.tracking import MlflowClient
 
@@ -361,7 +363,7 @@ def _scan_and_log_checkpoints(self, checkpoint_callback: ModelCheckpoint) -> Non
             aliases = ["latest", "best"] if p == checkpoint_callback.best_model_path else ["latest"]
 
             # Artifact path on mlflow
-            artifact_path = Path(p).stem
+            artifact_path = Path(self._checkpoint_path_prefix) / Path(p).stem
 
             # Log the checkpoint
             self.experiment.log_artifact(self._run_id, p, artifact_path)
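
The behavioural change is confined to the artifact path. A standalone sketch of the path arithmetic in the changed line, with illustrative values:

# Standalone illustration of the path arithmetic in the changed line above.
from pathlib import Path

checkpoint_path_prefix = "my_prefix"      # value passed to MLFlowLogger
p = "checkpoints/epoch=2-step=300.ckpt"   # an on-disk checkpoint path (illustrative)

# Same expression as in the diff: prefix joined with the checkpoint file's stem.
artifact_path = Path(checkpoint_path_prefix) / Path(p).stem
print(artifact_path)  # my_prefix/epoch=2-step=300 (POSIX)

# With the default empty prefix the result is unchanged from the old behaviour:
print(Path("") / Path(p).stem)  # epoch=2-step=300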

src/lightning/pytorch/loggers/wandb.py

Lines changed: 6 additions & 3 deletions

@@ -410,8 +410,11 @@ def experiment(self) -> Union["Run", "RunDisabled"]:
         if isinstance(self._experiment, (Run, RunDisabled)) and getattr(
             self._experiment, "define_metric", None
         ):
-            self._experiment.define_metric("trainer/global_step")
-            self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True)
+            if self._wandb_init.get("sync_tensorboard"):
+                self._experiment.define_metric("*", step_metric="global_step")
+            else:
+                self._experiment.define_metric("trainer/global_step")
+                self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True)
 
         return self._experiment
 
@@ -434,7 +437,7 @@ def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None)
         assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0"
 
         metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR)
-        if step is not None:
+        if step is not None and not self._wandb_init.get("sync_tensorboard"):
             self.experiment.log(dict(metrics, **{"trainer/global_step": step}))
         else:
             self.experiment.log(metrics)
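
Both hunks key off wandb.init's sync_tensorboard option, which reaches the logger because WandbLogger forwards extra keyword arguments to wandb.init. A minimal sketch of the setup the change targets; "my-project" and the save directory are illustrative names.

# Sketch only: combine the W&B and TensorBoard loggers with sync_tensorboard=True
# ("my-project" is illustrative; extra WandbLogger kwargs are passed to wandb.init).
import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger

wandb_logger = WandbLogger(project="my-project", sync_tensorboard=True)
tb_logger = TensorBoardLogger(save_dir="lightning_logs")

# With sync_tensorboard enabled, the logger no longer injects its own
# "trainer/global_step" axis; W&B plots metrics against TensorBoard's global_step.
trainer = L.Trainer(logger=[wandb_logger, tb_logger], max_epochs=1)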

src/lightning/pytorch/trainer/call.py

Lines changed: 6 additions & 1 deletion

@@ -21,6 +21,7 @@
 import lightning.pytorch as pl
 from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin
 from lightning.pytorch.callbacks import Checkpoint, EarlyStopping
+from lightning.pytorch.loggers import WandbLogger
 from lightning.pytorch.strategies.launchers import _SubprocessScriptLauncher
 from lightning.pytorch.trainer.connectors.signal_connector import _get_sigkill_signal
 from lightning.pytorch.trainer.states import TrainerStatus
@@ -91,8 +92,12 @@ def _call_setup_hook(trainer: "pl.Trainer") -> None:
         if isinstance(module, _DeviceDtypeModuleMixin):
             module._device = trainer.strategy.root_device
 
+    # wandb.init must be called before any tensorboard writers are created in order to sync tensorboard logs to wandb:
+    # https://github.com/wandb/wandb/issues/1782#issuecomment-779161203
+    loggers = sorted(trainer.loggers, key=lambda logger: not isinstance(logger, WandbLogger))
+
     # Trigger lazy creation of experiment in loggers so loggers have their metadata available
-    for logger in trainer.loggers:
+    for logger in loggers:
         if hasattr(logger, "experiment"):
             _ = logger.experiment
 
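
The sort key above relies on False ordering before True and on sorted() being stable, so WandbLogger instances move to the front while the relative order of the remaining loggers is preserved. A small standalone illustration with string stand-ins for logger instances:

# Standalone illustration of the sort key: False sorts before True and
# sorted() is stable, so "wandb"-like entries move to the front while the
# relative order of everything else is preserved.
loggers = ["csv_logger", "tensorboard_logger", "wandb_logger"]  # stand-ins

ordered = sorted(loggers, key=lambda name: not name.startswith("wandb"))
print(ordered)  # ['wandb_logger', 'csv_logger', 'tensorboard_logger']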

tests/tests_pytorch/core/test_results.py

Lines changed: 3 additions & 0 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 from functools import partial
 
+import pytest
 import torch
 import torch.distributed as dist
 
@@ -48,6 +49,8 @@ def result_reduce_ddp_fn(strategy):
     assert actual.item() == dist.get_world_size()
 
 
+# flaky with "process 0 terminated with signal SIGABRT"
+@pytest.mark.flaky(reruns=3, only_rerun="torch.multiprocessing.spawn.ProcessExitedException")
 @RunIf(skip_windows=True)
 def test_result_reduce_ddp():
     spawn_launch(result_reduce_ddp_fn, [torch.device("cpu")] * 2)
