
Commit 22bd118

awaelchli authored and Rohit Gupta committed
Reset metrics before each task starts (#9410)
Co-authored-by: Rohit Gupta <[email protected]>
1 parent a69b940 commit 22bd118

File tree: 5 files changed, +76 -16 lines


pytorch_lightning/loops/dataloader/evaluation_loop.py

Lines changed: 2 additions & 1 deletion
@@ -93,6 +93,7 @@ def on_skip(self) -> List:
     def on_run_start(self, *args: Any, **kwargs: Any) -> None:
         """Runs the ``on_evaluation_model_eval``, ``on_evaluation_start`` and ``on_evaluation_epoch_start`` hooks"""
         void(*args, **kwargs)
+
         # hook
         self.on_evaluation_model_eval()
         self.trainer.lightning_module.zero_grad()
@@ -208,7 +209,7 @@ def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None:
         self.trainer.profiler.describe()
 
         # reset any `torchmetrics.Metric` and the logger connector state
-        self.trainer.logger_connector.reset(metrics=True)
+        self.trainer.logger_connector.reset_results(metrics=True)
 
     def on_evaluation_epoch_start(self, *args: Any, **kwargs: Any) -> None:
         """Runs ``on_epoch_start`` and ``on_{validation/test}_epoch_start`` hooks"""

pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

Lines changed: 9 additions & 7 deletions
@@ -277,13 +277,15 @@ def should_reset_tensors(self, fx: str) -> bool:
         is_first_batch = self._batch_idx + self._split_idx == 0
         return is_different_fx and is_first_batch
 
-    def reset(self, metrics: Optional[bool] = None) -> None:
-        if self.trainer.sanity_checking:
-            # reset metrics
-            self._progress_bar_metrics = {}
-            self._logged_metrics = {}
-            self._callback_metrics = {}
-        self.trainer._results.reset(metrics=metrics)
+    def reset_metrics(self) -> None:
+        self._progress_bar_metrics = {}
+        self._logged_metrics = {}
+        self._callback_metrics = {}
+
+    def reset_results(self, metrics: Optional[bool] = None) -> None:
+        if self.trainer._results is not None:
+            self.trainer._results.reset(metrics=metrics)
+
         self._batch_idx = None
         self._split_idx = None
         self._current_fx = None
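The old `reset(metrics=...)` entry point is split in two: `reset_metrics()` clears the cached progress-bar/logged/callback metric dictionaries, while `reset_results()` clears the `ResultCollection` and the batch/fx bookkeeping. Call sites that previously used `reset()` now invoke both, as the `trainer.py` hunks below do. A minimal sketch of that call pattern follows; the helper name is illustrative and not part of this commit:

def _reset_logger_connector(trainer, metrics: bool = True) -> None:
    # illustrative helper: drop ResultCollection state (and torchmetrics.Metric state when requested)
    trainer.logger_connector.reset_results(metrics=metrics)
    # drop the cached callback / logged / progress-bar metric dicts
    trainer.logger_connector.reset_metrics()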

pytorch_lightning/trainer/trainer.py

Lines changed: 13 additions & 4 deletions
@@ -903,6 +903,11 @@ def _run(self, model: "pl.LightningModule") -> Optional[Union[_EVALUATE_OUTPUT,
         # ----------------------------
         # TRAIN
         # ----------------------------
+
+        # reset logger connector
+        self.logger_connector.reset_results()
+        self.logger_connector.reset_metrics()
+
         # hook
         if self.state.fn == TrainerFn.FITTING:
             self.call_hook("on_fit_start")
@@ -1103,8 +1108,11 @@ def _run_sanity_check(self, ref_model):
         stage = self.state.stage
         self.sanity_checking = True
 
-        # hook and callback
-        self.on_sanity_check_start()
+        # reset logger connector
+        self.logger_connector.reset_results()
+        self.logger_connector.reset_metrics()
+
+        self.call_hook("on_sanity_check_start")
 
         # reload dataloaders
         self._evaluation_loop.reload_evaluation_dataloaders()
@@ -1115,8 +1123,9 @@ def _run_sanity_check(self, ref_model):
 
         self.on_sanity_check_end()
 
-        # reset validation metrics
-        self.logger_connector.reset()
+        # reset logger connector
+        self.logger_connector.reset_results()
+        self.logger_connector.reset_metrics()
 
         # reset the seed to what it was before sanity check
         # prevents sanity check to affect random sampling in training

tests/trainer/logging_/test_eval_loop_logging.py

Lines changed: 6 additions & 4 deletions
@@ -550,6 +550,12 @@ def test_step(self, batch, batch_idx):
     # hp_metric + 2 steps + epoch + 2 steps + epoch
     expected_num_calls = 1 + 2 + 1 + 2 + 1
 
+    assert set(trainer.callback_metrics) == {
+        "train_loss",
+        "valid_loss_0_epoch",
+        "valid_loss_0",
+        "valid_loss_1",
+    }
     assert len(mock_log_metrics.mock_calls) == expected_num_calls
     assert mock_log_metrics.mock_calls[0] == call({"hp_metric": -1}, 0)
 
@@ -583,10 +589,6 @@ def get_metrics_at_idx(idx):
 
     results = trainer.test(model)
    assert set(trainer.callback_metrics) == {
-        "train_loss",
-        "valid_loss_0_epoch",
-        "valid_loss_0",
-        "valid_loss_1",
         "test_loss",
     }
     assert set(results[0]) == {"test_loss"}
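For context, a minimal sketch of the behaviour this test now asserts: metrics logged during `fit` no longer linger in `trainer.callback_metrics` once a new task such as `test` starts. The model and import path below are illustrative (assuming the repo's `BoringModel` test helper), not part of this commit:

from pytorch_lightning import Trainer
from tests.helpers.boring_model import BoringModel  # assumed test-helper location

class LoggingModel(BoringModel):  # illustrative model for this sketch
    def training_step(self, batch, batch_idx):
        out = super().training_step(batch, batch_idx)
        self.log("train_loss", out["loss"])  # logged during fit only
        return out

model = LoggingModel()
trainer = Trainer(fast_dev_run=True)
trainer.fit(model)   # "train_loss" is now in trainer.callback_metrics
trainer.test(model)  # the logger connector is reset before the test task starts
assert "train_loss" not in trainer.callback_metrics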

tests/trainer/test_trainer.py

Lines changed: 46 additions & 0 deletions
@@ -1897,3 +1897,49 @@ def current_memory():
     trainer_2.fit(model)
 
     assert current_memory() <= initial
+
+
+def test_trainer_metrics_reset_before_each_task(tmpdir):
+    """Test that callback, logged and progress bar metrics are reset before each task starts."""
+
+    class TestMetricRestartCallback(Callback):
+        def _make_assertions(self, trainer):
+            assert trainer.callback_metrics == {}
+            assert trainer.progress_bar_metrics == {}
+            assert trainer.logged_metrics == {}
+
+        def on_train_start(self, trainer, *args, **kwargs):
+            self._make_assertions(trainer)
+
+        def on_validation_start(self, trainer, *args, **kwargs):
+            if trainer.state.fn == TrainerFn.VALIDATING:
+                self._make_assertions(trainer)
+
+        def on_test_start(self, trainer, *args, **kwargs):
+            self._make_assertions(trainer)
+
+        def on_predict_start(self, trainer, *args, **kwargs):
+            self._make_assertions(trainer)
+
+    class CustomBoringModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+
+        def training_step(self, *args, **kwargs):
+            self.log("train/metric", 7.0)
+            return super().training_step(*args, **kwargs)
+
+        def validation_step(self, *args, **kwargs):
+            self.log("val/metric", 14.0)
+            return super().validation_step(*args, **kwargs)
+
+        def test_step(self, *args, **kwargs):
+            self.log("test/metric", 21.0)
+            return super().test_step(*args, **kwargs)
+
+    model = CustomBoringModel()
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=4, callbacks=[TestMetricRestartCallback()])
+    trainer.fit(model)
+    trainer.validate(model)
+    trainer.test(model)
+    trainer.predict(model)
