Commit 3f7dabe

Merge pull request #7 from Flegyas/develop

Release version 0.0.3

2 parents: 7e55875 + 31211a7

File tree

4 files changed: 159 additions, 95 deletions


src/nn_core/callbacks.py

Lines changed: 13 additions & 66 deletions
@@ -1,80 +1,27 @@
-import dataclasses
 import logging
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
-import hydra
 import pytorch_lightning as pl
-from omegaconf import DictConfig
-from pytorch_lightning import Callback
-from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning import Callback, Trainer
 
-from nn_core.common import PROJECT_ROOT
 from nn_core.model_logging import NNLogger
 
 pylogger = logging.getLogger(__name__)
 
 
-@dataclasses.dataclass
-class Upload:
-    checkpoint: bool = True
-    source: bool = True
-
-
-class NNLoggerConfiguration(Callback):
-    def __init__(self, upload: Optional[Dict[str, bool]], logger: Optional[DictConfig], **kwargs):
-        self.upload: Upload = Upload(**upload)
-        self.logger_cfg = logger
-        self.kwargs = kwargs
-
-        self.wandb: bool = self.logger_cfg["_target_"].endswith("WandbLogger")
+class NNTemplateCore(Callback):
+    @staticmethod
+    def _is_nnlogger(trainer: Trainer) -> bool:
+        return isinstance(trainer.logger, NNLogger)
 
     def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
-        if isinstance(trainer.logger, NNLogger):
+        if self._is_nnlogger(trainer):
+            trainer.logger.upload_source()
             trainer.logger.log_configuration(model=pl_module)
-
-            if "wandb_watch" in self.kwargs:
-                trainer.logger.wrapped.watch(pl_module, **self.kwargs["wandb_watch"])
+            trainer.logger.watch_model(pl_module=pl_module)
 
     def on_save_checkpoint(
-        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any]
-    ) -> dict:
-        data = [
-            ("best_model_path", trainer.checkpoint_callback.best_model_path),
-            ("best_model_score", str(trainer.checkpoint_callback.best_model_score.detach().cpu().item())),
-        ]
-        trainer.logger.log_text(key="storage_info", columns=["key", "value"], data=data)
-
-        return checkpoint
-
-    # on_init_end can be employed since the Trainer doesn't use the logger until then.
-    def on_init_end(self, trainer: "pl.Trainer") -> None:
-        if self.logger_cfg is None:
-            return
-
-        pylogger.info(f"Instantiating <{self.logger_cfg['_target_'].split('.')[-1]}>")
-
-        if trainer.fast_dev_run and self.wandb:
-            # Switch wandb mode to offline to prevent online logging
-            self.logger_cfg.mode = "offline"
-
-        logger: LightningLoggerBase = hydra.utils.instantiate(self.logger_cfg)
-
-        if self.upload.source:
-            if self.wandb:
-                logger.experiment.log_code(
-                    root=PROJECT_ROOT,
-                    name=None,
-                    include_fn=(
-                        lambda path: path.startswith(
-                            (
-                                str(PROJECT_ROOT / "conf"),
-                                str(PROJECT_ROOT / "src"),
-                                str(PROJECT_ROOT / "setup.cfg"),
-                                str(PROJECT_ROOT / "env.yaml"),
-                            )
-                        )
-                        and path.endswith((".py", ".yaml", ".yml", ".toml", ".cfg"))
-                    ),
-                )
-
-        trainer.logger.wrapped = logger
+        self, trainer: pl.Trainer, pl_module: pl.LightningModule, checkpoint: Dict[str, Any]
+    ) -> None:
+        if self._is_nnlogger(trainer):
+            trainer.logger.on_save_checkpoint(trainer=trainer, pl_module=pl_module, checkpoint=checkpoint)
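
The refactored callback is now a thin dispatcher: logger instantiation, source upload, and wandb watching all moved onto the logger itself (see model_logging.py below). A minimal wiring sketch, assuming a Hydra-style config shaped like the keys NNLogger reads; the concrete values and layout here are hypothetical, not part of the commit:

    # Sketch: attaching NNTemplateCore and the new NNLogger to a Trainer.
    import pytorch_lightning as pl
    from omegaconf import OmegaConf

    from nn_core.callbacks import NNTemplateCore
    from nn_core.model_logging import NNLogger

    # Hypothetical config fragments mirroring the keys NNLogger.__init__ accesses.
    cfg = OmegaConf.create(
        {
            "core": {"storage_dir": "./storage"},
            "train": {"trainer": {"fast_dev_run": False}},
        }
    )
    logging_cfg = OmegaConf.create(
        {
            "logger": {"_target_": "pytorch_lightning.loggers.WandbLogger", "mode": "online"},
            "upload": {"source": True},
        }
    )

    logger = NNLogger(logging_cfg=logging_cfg, cfg=cfg, resume_id=None)
    trainer = pl.Trainer(logger=logger, callbacks=[NNTemplateCore()], fast_dev_run=False)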

src/nn_core/hooks.py

Lines changed: 0 additions & 23 deletions
@@ -1,23 +0,0 @@
-from typing import Callable, Dict, NoReturn
-
-from omegaconf import DictConfig, OmegaConf
-
-
-class OnSaveCheckpointInjection:
-    def __init__(
-        self,
-        cfg: DictConfig,
-        on_save_checkpoint: Callable[[Dict], NoReturn],
-    ):
-        """Inject the configuration into the checkpoint monkey patching the on_save_checkpoint hook.
-
-        Args:
-            cfg: the configuration to inject
-            on_save_checkpoint: the on_save_checkpoint to monkey patch
-        """
-        self.cfg = cfg
-        self.on_save_checkpoint = on_save_checkpoint
-
-    def __call__(self, checkpoint: Dict) -> None:
-        self.on_save_checkpoint(checkpoint)
-        checkpoint["cfg"] = OmegaConf.to_container(self.cfg, resolve=True)
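
Note that the cfg injection this deleted hook performed (checkpoint["cfg"] = OmegaConf.to_container(...)) is not lost: it resurfaces as a regular on_save_checkpoint method on NNLogger in model_logging.py below, with no monkey patching involved.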

src/nn_core/model_logging.py

Lines changed: 83 additions & 6 deletions
@@ -1,29 +1,101 @@
 import argparse
+import logging
 import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
+import hydra
 import pytorch_lightning
 from omegaconf import DictConfig, OmegaConf
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import LightningLoggerBase
 
+from nn_core.common import PROJECT_ROOT
+
+pylogger = logging.getLogger(__name__)
+
+
 _STATS_KEY: str = "stats"
 
 
 class NNLogger(LightningLoggerBase):
 
     __doc__ = LightningLoggerBase.__doc__
 
-    def __init__(self, logger: Optional[LightningLoggerBase], storage_dir: str, cfg):
+    def __init__(self, logging_cfg: DictConfig, cfg: DictConfig, resume_id: Optional[str]):
         super().__init__()
-        self.wrapped: LightningLoggerBase = logger
-        self.storage_dir: str = storage_dir
+        self.logging_cfg = logging_cfg
         self.cfg = cfg
+        self.resume_id = resume_id
+
+        self.storage_dir: str = cfg.core.storage_dir
+        self.wandb: bool = self.logging_cfg.logger["_target_"].endswith("WandbLogger")
+
+        if self.cfg.train.trainer.fast_dev_run and self.wandb:
+            # Switch wandb mode to offline to prevent online logging
+            pylogger.info("Setting the logger in 'offline' mode")
+            self.logging_cfg.logger.mode = "offline"
 
-    def __getattr__(self, item):
+        pylogger.info(f"Instantiating <{self.logging_cfg.logger['_target_'].split('.')[-1]}>")
+        self.wrapped: LightningLoggerBase = hydra.utils.instantiate(self.logging_cfg.logger, version=self.resume_id)
+
+        # force experiment lazy initialization
+        _ = self.wrapped.experiment
+
+    def __getattr__(self, item: str) -> Any:
         if self.wrapped is not None:
+            pylogger.debug(f"Delegation with '__getattr__': {self.wrapped.__class__.__qualname__}.{item}")
             return getattr(self.wrapped, item)
 
+    def watch_model(self, pl_module: LightningModule):
+        if self.wandb and "wandb_watch" in self.logging_cfg:
+            pylogger.info("Starting to 'watch' the module")
+            self.wrapped.watch(pl_module, **self.logging_cfg["wandb_watch"])
+
+    def upload_source(self) -> None:
+        if self.logging_cfg.upload.source and self.wandb:
+            pylogger.info("Uploading source code to wandb")
+            self.wrapped.experiment.log_code(
+                root=PROJECT_ROOT,
+                name=None,
+                include_fn=(
+                    lambda path: path.startswith(
+                        (
+                            str(PROJECT_ROOT / "conf"),
+                            str(PROJECT_ROOT / "src"),
+                            str(PROJECT_ROOT / "setup.cfg"),
+                            str(PROJECT_ROOT / "env.yaml"),
+                        )
+                    )
+                    and path.endswith((".py", ".yaml", ".yml", ".toml", ".cfg"))
+                ),
+            )
+
+    def on_save_checkpoint(self, trainer: Trainer, pl_module: LightningModule, checkpoint: Dict[str, Any]) -> None:
+        # Attach to each checkpoint saved the configuration and the wandb run path (to resume logging from
+        # only the checkpoint)
+        pylogger.debug("Attaching 'cfg' to the checkpoint")
+        checkpoint["cfg"] = OmegaConf.to_container(trainer.logger.cfg, resolve=True)
+
+        pylogger.debug("Attaching 'run_path' to the checkpoint")
+        checkpoint[
+            "run_path"
+        ] = f"{trainer.logger.experiment.entity}/{trainer.logger.experiment.project_name()}/{trainer.logger.version}"
+
+    def after_save_checkpoint(self, checkpoint_callback: ModelCheckpoint) -> None:
+        # Log the checkpoint meta information
+        self.add_path(obj_id="checkpoints/best", obj_path=checkpoint_callback.best_model_path)
+        self.add_path(
+            obj_id="checkpoints/best_score",
+            obj_path=str(checkpoint_callback.best_model_score.detach().cpu().item()),
+        )
+
+    def add_path(self, obj_id: str, obj_path: str) -> None:
+        key = f"paths/{obj_id}"
+        pylogger.debug(f"Logging '{key}'")
+        self.experiment.config.update({key: str(obj_path)}, allow_val_change=True)
+
     @property
     def save_dir(self) -> Optional[str]:
         return self.storage_dir
@@ -55,7 +127,8 @@ def log_hyperparams(self, params: argparse.Namespace, *args, **kwargs):
             kwargs: Optional keywoard arguments, depends on the specific logger being used
         """
         raise RuntimeError(
-            "This method is called automatically by PyTorch Lightning if save_hyperparameters(logger=True) is called. The whole configuration is already logged by logger.log_configuration, set logger=False"
+            "This method is called automatically by PyTorch Lightning if save_hyperparameters(logger=True) is called. "
+            "The whole configuration is already logged by logger.log_configuration, set logger=False"
        )
 
     def log_text(self, *args, **kwargs) -> None:
@@ -115,12 +188,16 @@ def log_configuration(
         yaml_conf: str = OmegaConf.to_yaml(cfg=cfg)
         run_dir: Path = Path(self.run_dir)
         run_dir.mkdir(exist_ok=True, parents=True)
-        (run_dir / "config.yaml").write_text(yaml_conf)
+        config_save_path = run_dir / "config.yaml"
+        pylogger.debug(f"Saving the configuration in: {config_save_path}")
+        config_save_path.write_text(yaml_conf)
 
         # save number of model parameters
+        pylogger.debug("Injecting model statistics in the 'cfg'")
         cfg[f"{_STATS_KEY}/params_total"] = sum(p.numel() for p in model.parameters())
         cfg[f"{_STATS_KEY}/params_trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad)
         cfg[f"{_STATS_KEY}/params_not_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad)
 
         # send hparams to all loggers
+        pylogger.debug("Logging 'cfg'")
         self.wrapped.log_hyperparams(cfg)
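
Because on_save_checkpoint now embeds both the resolved configuration and the wandb run path into every checkpoint, a checkpoint file alone identifies the run it came from. A small read-back sketch; the checkpoint path is hypothetical:

    # Sketch: inspecting the metadata NNLogger attaches to a checkpoint.
    import torch

    ckpt = torch.load("storage/checkpoints/best.ckpt", map_location="cpu")  # hypothetical path
    print(ckpt["run_path"])   # "<entity>/<project>/<run-id>", built in on_save_checkpoint
    print(type(ckpt["cfg"]))  # plain dict: the config resolved via OmegaConf.to_container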

src/nn_core/resume.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import re
+from pathlib import Path
+from typing import Optional
+
+import torch
+import wandb
+from wandb.apis.public import Run
+
+RUN_PATH_PATTERN = re.compile(r"^([^/]+)/([^/]+)/([^/]+)$")
+
+
+def resolve_ckpt(ckpt_or_run_path: str) -> str:
+    """Resolve the run path or ckpt to a checkpoint.
+
+    Args:
+        ckpt_or_run_path: run identifier or checkpoint path
+
+    Returns:
+        an existing path towards the best checkpoint
+    """
+    if Path(ckpt_or_run_path).exists():
+        return ckpt_or_run_path
+
+    try:
+        api = wandb.Api()
+        run: Run = api.run(path=ckpt_or_run_path)
+        ckpt_or_run_path = run.config["paths/checkpoints/best"]
+        return ckpt_or_run_path
+    except wandb.errors.CommError:
+        raise ValueError(f"Checkpoint or run not found: {ckpt_or_run_path}")
+
+
+def resolve_run_path(ckpt_or_run_path: str) -> str:
+    """Resolve the run path or ckpt to a run path.
+
+    Args:
+        ckpt_or_run_path: run identifier or checkpoint path
+
+    Returns:
+        a wandb run path identifier
+    """
+    if RUN_PATH_PATTERN.match(ckpt_or_run_path):
+        return ckpt_or_run_path
+
+    try:
+        return torch.load(ckpt_or_run_path)["run_path"]
+    except FileNotFoundError:
+        raise ValueError(f"Checkpoint or run not found: {ckpt_or_run_path}")
+
+
+def resolve_run_version(ckpt_or_run_path: Optional[str] = None, run_path: Optional[str] = None) -> str:
+    """Resolve the run path or ckpt to the wandb run version.
+
+    Args:
+        ckpt_or_run_path: run identifier or checkpoint path
+        run_path: the run path if already available
+
+    Returns:
+        a wandb run version
+    """
+    if run_path is None:
+        run_path = resolve_run_path(ckpt_or_run_path)
+    return RUN_PATH_PATTERN.match(run_path).group(3)
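
A usage sketch for the new helpers; the entity/project/run-id values are hypothetical:

    # Sketch: resolving a wandb run identifier to its best checkpoint and back.
    from nn_core.resume import resolve_ckpt, resolve_run_path, resolve_run_version

    ckpt_path = resolve_ckpt("my-entity/my-project/1a2b3c4d")  # queries wandb for "paths/checkpoints/best"
    run_path = resolve_run_path(ckpt_path)                     # reads "run_path" back out of the checkpoint
    version = resolve_run_version(run_path=run_path)           # -> "1a2b3c4d", the third run-path component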
