35 | 35 | from pytorch_lightning.utilities.exceptions import MisconfigurationException |
36 | 36 | from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE |
37 | 37 | from pytorch_lightning.utilities.types import LRSchedulerTypeTuple |
38 | | -from pytorch_lightning.utilities.warnings import _warn, LightningDeprecationWarning, warning_cache |
| 38 | +from pytorch_lightning.utilities.warnings import _warn, LightningDeprecationWarning |
39 | 39 |
40 | 40 | if _DEEPSPEED_AVAILABLE: |
41 | 41 | import deepspeed |
@@ -671,19 +671,18 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: |
671 | 671 | checkpoint: The checkpoint state dictionary |
672 | 672 | filepath: write-target file's path |
673 | 673 | """ |
674 | | - if self.zero_stage_3 and self._multi_device and self.is_global_zero: |
675 | | - warning_cache.warn( |
676 | | - "When saving the DeepSpeed Stage 3 checkpoint, " |
677 | | - "each worker will save a shard of the checkpoint within a directory. " |
678 | | - "If a single file is required after training, " |
679 | | - "see https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#" |
680 | | - "deepspeed-zero-stage-3-single-file for instructions." |
681 | | - ) |
682 | | - # Use deepspeed's internal checkpointing function to handle partitioned weights across processes |
683 | | - # dump states as a checkpoint dictionary object |
684 | | - _exclude_keys = ["state_dict", "optimizer_states", "lr_schedulers"] |
685 | | - checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys} |
686 | | - self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint) |
| 674 | + if self.world_size > 1 and self.zero_stage_3: |
| 675 | + if self.save_full_weights: |
| 676 | + # todo: expose this as general function in deepspeed |
| 677 | + state_dict = self.deepspeed_engine._zero3_consolidated_fp16_state_dict() |
| 678 | + if self.is_global_zero: |
| 679 | + # State dict keys will include a reference to the wrapper LightningDeepSpeedModule. |
| 680 | + # Delete the `module.` prefix before saving. |
| 681 | + state_dict = {k.partition("module.")[2]: state_dict[k] for k in state_dict.keys()} |
| 682 | + checkpoint["state_dict"] = state_dict |
| 683 | + return super().save_checkpoint(checkpoint, filepath) |
| 684 | + return |
| 685 | + |
687 | 686 | # Use deepspeed's internal checkpointing function to handle partitioned weights across processes |
688 | 687 | # dump states as a checkpoint dictionary object |
689 | 688 | save_dir = self._filepath_to_dir(filepath) |
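
For context on the new branch above: the comprehension at new line 681 strips the `module.` prefix that the LightningDeepSpeedModule wrapper adds to every parameter key, before the consolidated state dict is handed to the base-class save. A minimal standalone sketch of that renaming, using made-up keys:

    # Hypothetical keys, mimicking a state dict gathered through the LightningDeepSpeedModule wrapper.
    state_dict = {"module.layer.weight": 0.1, "module.layer.bias": 0.2}
    # str.partition("module.")[2] keeps everything after the first occurrence of the prefix,
    # so "module.layer.weight" becomes "layer.weight".
    stripped = {k.partition("module.")[2]: state_dict[k] for k in state_dict.keys()}
    print(stripped)  # {'layer.weight': 0.1, 'layer.bias': 0.2}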