diff --git a/docs/source-pytorch/extensions/logging.rst b/docs/source-pytorch/extensions/logging.rst index f0c12464e6db2..6f3daa1fa7d43 100644 --- a/docs/source-pytorch/extensions/logging.rst +++ b/docs/source-pytorch/extensions/logging.rst @@ -31,6 +31,7 @@ The following are loggers we support: CSVLogger MLFlowLogger NeptuneLogger + NeptuneScaleLogger TensorBoardLogger WandbLogger diff --git a/docs/source-pytorch/visualize/supported_exp_managers.rst b/docs/source-pytorch/visualize/supported_exp_managers.rst index 79c15f1c1309e..90bd3b7d0c56d 100644 --- a/docs/source-pytorch/visualize/supported_exp_managers.rst +++ b/docs/source-pytorch/visualize/supported_exp_managers.rst @@ -60,9 +60,9 @@ Here's the full documentation for the :class:`~lightning.pytorch.loggers.MLFlowL ---- -Neptune.ai +Neptune 2.x ========== -To use `Neptune.ai `_ first install the neptune package: +To use `Neptune 2.x `_ first install the neptune package: .. code-block:: bash @@ -101,9 +101,9 @@ Here's the full documentation for the :class:`~lightning.pytorch.loggers.Neptune ---- -Neptune Scale +Neptune 3.x (Neptune Scale) ========== -To use `Neptune Scale `_ first install the neptune-scale package: +To use `Neptune 3.x `_ first install the neptune-scale package: .. code-block:: bash @@ -119,8 +119,8 @@ Configure the logger and pass it to the :class:`~lightning.pytorch.trainer.train from lightning.pytorch.loggers import NeptuneScaleLogger neptune_scale_logger = NeptuneScaleLogger( - api_key=, # replace with your own - project="common/pytorch-lightning-integration", # format "" + api_key="", # replace with your own + project="/", # replace with your own ) trainer = Trainer(logger=neptune_scale_logger) diff --git a/requirements/pytorch/loggers.info b/requirements/pytorch/loggers.info index 35f0126fcd629..ca0c8369935ab 100644 --- a/requirements/pytorch/loggers.info +++ b/requirements/pytorch/loggers.info @@ -1,7 +1,7 @@ # all supported loggers. this list is here as a reference, but they are not installed in CI neptune >=1.0.0 -neptune-scale +neptune-scale >= 0.12.0 comet-ml >=3.31.0 mlflow >=1.0.0 wandb >=0.12.10 diff --git a/src/lightning/pytorch/loggers/neptune.py b/src/lightning/pytorch/loggers/neptune.py index c0bd5bf3839b0..802e59a0fc254 100644 --- a/src/lightning/pytorch/loggers/neptune.py +++ b/src/lightning/pytorch/loggers/neptune.py @@ -69,7 +69,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: class NeptuneLogger(Logger): - r"""Log using `Neptune `_. + r"""Log using `Neptune `_. Install it with pip: @@ -129,7 +129,7 @@ def any_lightning_module_function_or_hook(self): Note that the syntax ``self.logger.experiment["your/metadata/structure"].append(metadata)`` is specific to Neptune and extends the logger capabilities. It lets you log various types of metadata, such as scores, files, images, interactive visuals, and CSVs. - Refer to the `Neptune docs `_ + Refer to the `Neptune docs `_ for details. You can also use the regular logger methods ``log_metrics()``, and ``log_hyperparams()`` with NeptuneLogger. @@ -184,7 +184,7 @@ def any_lightning_module_function_or_hook(self): ) trainer = Trainer(max_epochs=3, logger=neptune_logger) - Check `run documentation `_ + Check `run documentation `_ for more info about additional run parameters. **Details about Neptune run structure** @@ -196,18 +196,18 @@ def any_lightning_module_function_or_hook(self): See also: - Read about - `what objects you can log to Neptune `_. + `what objects you can log to Neptune `_. 
- Check out an `example run `_ with multiple types of metadata logged. - For more detailed examples, see the - `user guide `_. + `user guide `_. Args: api_key: Optional. Neptune API token, found on https://www.neptune.ai upon registration. You should save your token to the `NEPTUNE_API_TOKEN` environment variable and leave the api_key argument out of your code. - Instructions: `Setting your API token `_. + Instructions: `Setting your API token `_. project: Optional. Name of a project in the form "workspace-name/project-name", for example "tom/mask-rcnn". If ``None``, the value of `NEPTUNE_PROJECT` environment variable is used. @@ -377,7 +377,7 @@ def training_step(self, batch, batch_idx): is specific to Neptune and extends the logger capabilities. It lets you log various types of metadata, such as scores, files, images, interactive visuals, and CSVs. Refer to the - `Neptune docs `_ + `Neptune docs `_ for more detailed explanations. You can also use the regular logger methods ``log_metrics()``, and ``log_hyperparams()`` with NeptuneLogger. @@ -600,7 +600,7 @@ def version(self) -> Optional[str]: class NeptuneScaleLogger(Logger): - r"""Log using `Neptune Scale `_. + r"""Log using `Neptune Scale `_. Install it with pip: @@ -630,7 +630,6 @@ class NeptuneScaleLogger(Logger): .. code-block:: python - from neptune.types import File from lightning.pytorch import LightningModule @@ -647,7 +646,7 @@ def any_lightning_module_function_or_hook(self): Note that the syntax ``self.logger.run.log_metrics(data={"your/metadata/structure": metadata}, step=step)`` is specific to Neptune Scale. - Refer to the `Neptune Scale docs `_ for details. + Refer to the `Neptune Scale docs `_ for details. You can also use the regular logger methods ``log_metrics()``, and ``log_hyperparams()`` with NeptuneScaleLogger. **Log after fitting or testing is finished** @@ -670,6 +669,18 @@ def any_lightning_module_function_or_hook(self): neptune_logger.run.log_configs(data={"your/metadata/structure": metadata}) neptune_logger.run.add_tags(["tag1", "tag2"]) + **Log model checkpoint paths** + + If you have :class:`~lightning.pytorch.callbacks.ModelCheckpoint` configured, + the Neptune logger can log model checkpoint paths. + Paths will be logged to the "model/checkpoints" namespace in the Neptune run. + You can disable this option with: + + .. code-block:: python + + neptune_logger = NeptuneScaleLogger(log_model_checkpoints=False) + + Note: All model checkpoint paths will be logged. ``save_last`` and ``save_top_k`` are currently not supported. **Pass additional parameters to the Neptune run** @@ -688,7 +699,7 @@ def any_lightning_module_function_or_hook(self): ) trainer = Trainer(max_epochs=3, logger=neptune_scale_logger) - Check `run documentation `_ for more info about additional run + Check `run documentation `_ for more info about additional run parameters. **Details about Neptune run structure** @@ -712,26 +723,30 @@ def any_lightning_module_function_or_hook(self): Neptune API token, found on https://scale.neptune.ai upon registration. You should save your token to the `NEPTUNE_API_TOKEN` environment variable and leave the api_token argument out of your code. - Instructions: `Setting your API token `_. + Instructions: `Setting your API token `_. resume: Optional. If `False`, creates a new run. To continue an existing run, set to `True` and pass the ID of an existing run to the `run_id` argument. In this case, omit the `experiment_name` parameter. To fork a run, use `fork_run_id` and `fork_step` instead. mode: Optional. 
- `Mode `_ of operation. + `Mode `_ of operation. If "disabled", the run doesn't log any metadata. - If "offline", the run is only stored locally. For details, see `Offline logging `_. + If "offline", the run is only stored locally. For details, see `Offline logging `_. If this parameter and the - `NEPTUNE_MODE `_ + `NEPTUNE_MODE `_ environment variable are not set, the default is "async". experiment_name: Optional. - Name of the experiment to associate the run with. + Name of the experiment to associate the run with. Can't be used together with the `resume` parameter. To make the name easy to read in the app, ensure that it's at most 190 characters long. run: Optional. Default is ``None``. A Neptune ``Run`` object. If specified, this existing run will be used for logging, instead of a new run being created. prefix: Optional. Default is ``"training"``. Root namespace for all metadata logging. + log_model_checkpoints: Optional. Default is ``True``. Log model checkpoint paths to Neptune. + Works only if ``ModelCheckpoint`` is passed to the ``Trainer``. + NOTE: All model checkpoint paths will be logged. + ``save_last`` and ``save_top_k`` are currently not supported. neptune_run_kwargs: Additional arguments like ``creation_time``, ``log_directory``, ``fork_run_id``, ``fork_step``, ``*_callback``, etc. used when a run is created. @@ -757,6 +772,7 @@ def __init__( experiment_name: Optional[str] = None, run: Optional["Run"] = None, prefix: str = "training", + log_model_checkpoints: Optional[bool] = True, **neptune_run_kwargs: Any, ): if not _NEPTUNE_SCALE_AVAILABLE: @@ -778,16 +794,12 @@ def __init__( self._run_id = run_id self._experiment_name = experiment_name self._prefix = prefix + self._log_model_checkpoints = log_model_checkpoints self._neptune_run_kwargs = neptune_run_kwargs self._description = self._neptune_run_kwargs.pop("description", None) self._tags = self._neptune_run_kwargs.pop("tags", None) self._group_tags = self._neptune_run_kwargs.pop("group_tags", None) - if "log_model_checkpoints" in self._neptune_run_kwargs: - log.warning("Neptune Scale does not support logging model checkpoints.") - del self._neptune_run_kwargs["log_model_checkpoints"] - self._log_model_checkpoints = False - if self._run_instance is not None: self._retrieve_run_data() @@ -887,7 +899,7 @@ def training_step(self, batch, batch_idx): Note that the syntax ``self.logger.run.log_metrics(data={"your/metadata/structure": metadata}, step=step)`` is specific to Neptune Scale. Refer to the - `Neptune Scale docs `_ + `Neptune Scale docs `_ for more detailed explanations. You can also use the regular logger methods ``log_metrics()``, and ``log_hyperparams()`` with NeptuneScaleLogger. 
@@ -1004,7 +1016,7 @@ def finalize(self, status: str) -> None: # initialized there return if status: - self.run._status = status + self.run.log_configs({self._construct_path_with_prefix("status"): status}) super().finalize(status) @@ -1025,25 +1037,100 @@ def save_dir(self) -> Optional[str]: @rank_zero_only def log_model_summary(self, model: "pl.LightningModule", max_depth: int = -1) -> None: - """Not implemented for Neptune Scale.""" - log.warning("Neptune Scale does not support logging model summaries.") - return + """Logs a summary of all layers in the model to Neptune as a text file.""" + from neptune_scale.types import File + + model_str = str(ModelSummary(model=model, max_depth=max_depth)) + self.run.assign_files({ + self._construct_path_with_prefix("model/summary"): File( + source=model_str.encode("utf-8"), mime_type="text/plain" + ) + }) @override @rank_zero_only def after_save_checkpoint(self, checkpoint_callback: Checkpoint) -> None: - """Not implemented for Neptune Scale.""" - return + """Automatically log checkpointed model's path. Called after model checkpoint callback saves a new checkpoint. + + Args: + checkpoint_callback: the model checkpoint callback instance + + """ + if not self._log_model_checkpoints: + return + + file_names = set() + checkpoints_namespace = self._construct_path_with_prefix("model/checkpoints") + + # save last model + if hasattr(checkpoint_callback, "last_model_path") and checkpoint_callback.last_model_path: + model_last_name = self._get_full_model_name(checkpoint_callback.last_model_path, checkpoint_callback) + file_names.add(model_last_name) + self.run.log_configs({ + f"{checkpoints_namespace}/{model_last_name}": checkpoint_callback.last_model_path, + }) + + # save best k models + if hasattr(checkpoint_callback, "best_k_models"): + for key in checkpoint_callback.best_k_models: + model_name = self._get_full_model_name(key, checkpoint_callback) + file_names.add(model_name) + self.run.log_configs({ + f"{checkpoints_namespace}/{model_name}": key, + }) + + # log best model path and checkpoint + if hasattr(checkpoint_callback, "best_model_path") and checkpoint_callback.best_model_path: + self.run.log_configs({ + self._construct_path_with_prefix("model/best_model_path"): checkpoint_callback.best_model_path, + }) + + model_name = self._get_full_model_name(checkpoint_callback.best_model_path, checkpoint_callback) + file_names.add(model_name) + self.run.log_configs({ + f"{checkpoints_namespace}/{model_name}": checkpoint_callback.best_model_path, + }) + + # remove old models logged to experiment if they are not part of best k models at this point + # TODO: Implement after Neptune Scale supports `del` + # if self.run.exists(checkpoints_namespace): + # exp_structure = self.run.get_structure() + # uploaded_model_names = self._get_full_model_names_from_exp_structure( + # exp_structure, checkpoints_namespace + # ) + + # for file_to_drop in list(uploaded_model_names - file_names): + # del self.run[f"{checkpoints_namespace}/{file_to_drop}"] + + # log best model score + if hasattr(checkpoint_callback, "best_model_score") and checkpoint_callback.best_model_score: + self.run.log_configs({ + self._construct_path_with_prefix("model/best_model_score"): float( + checkpoint_callback.best_model_score.cpu().detach().numpy() + ), + }) @staticmethod - def _get_full_model_name(model_path: str, checkpoint_callback: Checkpoint) -> None: + def _get_full_model_name(model_path: str, checkpoint_callback: Checkpoint) -> str: """Returns model name which is string `model_path` appended to 
`checkpoint_callback.dirpath`.""" - return + if hasattr(checkpoint_callback, "dirpath"): + model_path = os.path.normpath(model_path) + expected_model_path = os.path.normpath(checkpoint_callback.dirpath) + if not model_path.startswith(expected_model_path): + raise ValueError(f"{model_path} was expected to start with {expected_model_path}.") + # Remove extension from filepath + filepath, _ = os.path.splitext(model_path[len(expected_model_path) + 1 :]) + return filepath.replace(os.sep, "/") + return model_path.replace(os.sep, "/") @classmethod - def _get_full_model_names_from_exp_structure(cls, exp_structure: dict[str, Any], namespace: str) -> set[None]: + def _get_full_model_names_from_exp_structure(cls, exp_structure: dict[str, Any], namespace: str) -> set[str]: """Returns all paths to properties which were already logged in `namespace`""" - return set() + structure_keys: list[str] = namespace.split(cls.LOGGER_JOIN_CHAR) + for key in structure_keys: + exp_structure = exp_structure[key] + uploaded_models_dict = exp_structure + return set(cls._dict_paths(uploaded_models_dict)) @classmethod def _dict_paths(cls, d: dict[str, Any], path_in_build: Optional[str] = None) -> Generator: diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index f3fb8cb2fd2b3..8e2bca207820c 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -252,9 +252,12 @@ trainer = Trainer(logger=loggers.CometLogger()) # mlflow trainer = Trainer(logger=loggers.MLFlowLogger()) -# neptune +# neptune 2.x trainer = Trainer(logger=loggers.NeptuneLogger()) +# neptune 3.x +trainer = Trainer(logger=loggers.NeptuneScaleLogger()) + # ... and dozens more ``` diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index 5fd82114f5a7a..a6b21d5ce2bc3 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -461,7 +461,8 @@ def test_neptune_scale_logger_finalize(neptune_scale_logger): """Test finalize method sets status correctly.""" logger, mock_run = neptune_scale_logger logger.finalize("success") - assert mock_run._status == "success" + expected_key = logger._construct_path_with_prefix("status") + mock_run.log_configs.assert_any_call({expected_key: "success"}) @pytest.mark.skipif(not _NEPTUNE_SCALE_AVAILABLE, reason="Neptune-Scale is required for this test.") @@ -472,13 +473,20 @@ def test_neptune_scale_logger_invalid_run(): @pytest.mark.skipif(not _NEPTUNE_SCALE_AVAILABLE, reason="Neptune-Scale is required for this test.") -def test_neptune_scale_logger_log_model_summary(neptune_scale_logger, caplog): - """Test that log_model_summary shows warning.""" - logger = NeptuneScaleLogger(log_model_checkpoints=True) - model = BoringModel() +def test_neptune_scale_logger_log_model_summary(neptune_scale_logger, monkeypatch): + from neptune_scale.types import File + model = BoringModel() + logger, mock_run = neptune_scale_logger + # Patch assign_files to track calls + assign_files_mock = mock.MagicMock() + monkeypatch.setattr(mock_run, "assign_files", assign_files_mock) logger.log_model_summary(model) - assert "Neptune Scale does not support logging model summaries" in caplog.text + # Check that assign_files was called with the correct key and a File instance + called_args = assign_files_mock.call_args[0][0] + assert list(called_args.keys())[0].endswith("model/summary") + file_val = list(called_args.values())[0] + assert isinstance(file_val, File) @pytest.mark.skipif(not 
_NEPTUNE_SCALE_AVAILABLE, reason="Neptune-Scale is required for this test.") @@ -496,3 +504,32 @@ def test_neptune_scale_logger_with_prefix(neptune_scale_logger): metrics = {"loss": 1.23} logger.log_metrics(metrics, step=5) mock_run.log_metrics.assert_called_once_with({"training/loss": 1.23}, step=5) + + +@pytest.mark.skipif(not _NEPTUNE_SCALE_AVAILABLE, reason="Neptune-Scale is required for this test.") +def test_neptune_scale_logger_after_save_checkpoint(neptune_scale_logger): + logger, mock_run = neptune_scale_logger + models_root_dir = os.path.join("path", "to", "models") + cb_mock = MagicMock( + dirpath=models_root_dir, + last_model_path=os.path.join(models_root_dir, "last"), + best_k_models={ + f"{os.path.join(models_root_dir, 'model1')}": None, + f"{os.path.join(models_root_dir, 'model2/with/slashes')}": None, + }, + best_model_path=os.path.join(models_root_dir, "best_model"), + best_model_score=None, + ) + logger.after_save_checkpoint(cb_mock) + prefix = logger._prefix + model_key_prefix = f"{prefix}/model" if prefix else "model" + expected_calls = [ + call.log_configs({f"{model_key_prefix}/checkpoints/model1": os.path.join(models_root_dir, "model1")}), + call.log_configs({ + f"{model_key_prefix}/checkpoints/model2/with/slashes": os.path.join(models_root_dir, "model2/with/slashes") + }), + call.log_configs({f"{model_key_prefix}/checkpoints/last": os.path.join(models_root_dir, "last")}), + call.log_configs({f"{model_key_prefix}/checkpoints/best_model": os.path.join(models_root_dir, "best_model")}), + call.log_configs({f"{model_key_prefix}/best_model_path": os.path.join(models_root_dir, "best_model")}), + ] + mock_run.log_configs.assert_has_calls(expected_calls, any_order=True)
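
A minimal usage sketch of the checkpoint-path logging introduced above, assuming ``neptune-scale >= 0.12.0`` is installed and ``NEPTUNE_API_TOKEN``/``NEPTUNE_PROJECT`` are set in the environment; the model and checkpoint settings below are placeholders:

.. code-block:: python

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import ModelCheckpoint
    from lightning.pytorch.demos.boring_classes import BoringModel
    from lightning.pytorch.loggers import NeptuneScaleLogger

    # log_model_checkpoints=True is the default; shown explicitly for clarity.
    neptune_scale_logger = NeptuneScaleLogger(log_model_checkpoints=True)

    # Any ModelCheckpoint attached to the Trainer triggers after_save_checkpoint(),
    # which records every saved checkpoint path under "training/model/checkpoints/<name>"
    # and the best model path under "training/model/best_model_path"
    # (the root namespace defaults to the "training" prefix).
    checkpoint_callback = ModelCheckpoint(dirpath="checkpoints/")

    trainer = Trainer(max_epochs=2, logger=neptune_scale_logger, callbacks=[checkpoint_callback])
    trainer.fit(BoringModel())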