Commit f2c5f5b

justusschock authored and committed (with tchaton, carmocca, awaelchli)
Clear reference to training loss at the end of train step (#9336)
Without clearing this reference, the loss tensor stays alive through the next training step. This can be a problem for memory-intensive models that produce very deep backward graphs, such as neural ODEs. For these models, keeping the backward graph of the previous loss in memory can lead to OOM errors in the next training step, even though the step might have succeeded had we cleared (and thus garbage-collected) the previous backward graph.

Co-authored-by: tchaton <[email protected]>
Co-authored-by: Carlos Mocholi <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>
1 parent a5ad966 commit f2c5f5b
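
Why this matters, in miniature: the sketch below is not part of the commit; `held` is a hypothetical stand-in for `trainer._results.minimize`. It shows how a single lingering Python reference keeps the previous step's autograd graph reachable while the next step builds its own.

    import torch

    # a deep stack of layers stands in for a model with a very deep backward graph
    net = torch.nn.Sequential(*[torch.nn.Linear(64, 64) for _ in range(200)])
    data = torch.randn(8, 64)

    held = None  # hypothetical stand-in for `trainer._results.minimize`
    for step in range(3):
        # while `held` still points at the previous loss, the previous backward
        # graph stays reachable, so two graphs briefly coexist in memory here
        loss = net(data).pow(2).mean()
        loss.backward()
        held = loss
        # the commit's fix, in miniature: drop the reference once the step is
        # done, so the old graph can be garbage collected before the next step
        held = None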

File tree: 2 files changed (+22, -1 lines)

pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

Lines changed: 9 additions & 1 deletion

@@ -201,10 +201,18 @@ def update_eval_epoch_metrics(self) -> _EVALUATE_OUTPUT:
         """
 
     def on_train_split_start(self, batch_idx: int, split_idx: int, split_batch: Any) -> None:
-        self.trainer._results.extract_batch_size(split_batch)
+        assert self.trainer._results is not None
+        # when the user requests `dataloader_iter`, we can't track the batch_size
+        # and this is left to user responsibility.
+        if not isinstance(split_batch, pl.utilities.fetching.DataLoaderIterDataFetcher):
+            self.trainer._results.extract_batch_size(split_batch)
+
         self._batch_idx = batch_idx
         self._split_idx = split_idx
 
+        # clear reference to this step's training loss so that it can be garbage collected before the next training step
+        self.trainer._results.minimize = None
+
     def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return

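For context on the new guard: `extract_batch_size` has to infer a batch size by inspecting the batch object, which is impossible when the "batch" handed to `training_step` is a `dataloader_iter`. A rough, hypothetical sketch of that kind of inference follows; it is not Lightning's actual implementation, just an illustration of why an iterator yields nothing to inspect.

    from typing import Any, Mapping, Optional, Sequence

    import torch

    def guess_batch_size(batch: Any) -> Optional[int]:
        # best-effort inference: find the first tensor in the batch and use its
        # first dimension; returns None for opaque objects such as iterators,
        # which is why the guard above skips `dataloader_iter` batches entirely
        if torch.is_tensor(batch):
            return batch.size(0)
        if isinstance(batch, Mapping):
            candidates = batch.values()
        elif isinstance(batch, Sequence) and not isinstance(batch, str):
            candidates = batch
        else:
            return None
        for item in candidates:
            size = guess_batch_size(item)
            if size is not None:
                return size
        return None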
tests/trainer/loops/test_training_loop.py

Lines changed: 13 additions & 0 deletions

@@ -190,3 +190,16 @@ def training_epoch_end(self, outputs) -> None:
     trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2)
     trainer.fit(model)
     assert model.on_train_batch_end_called == 2
+
+
+def test_batch_loop_releases_loss(tmpdir):
+    """Test that loss/graph is released so that it can be garbage collected before the next training step"""
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            assert self.trainer._results.minimize is None
+            return super().training_step(batch, batch_idx)
+
+    model = TestModel()
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2)
+    trainer.fit(model)

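The test exercises the clearing indirectly, by asserting inside `training_step` that the previous loss is already gone. A more direct way to check that dropping the last strong reference really releases the tensor (hypothetical, not part of the commit) is a weakref probe:

    import gc
    import weakref

    import torch

    loss = (torch.randn(4, 4, requires_grad=True) ** 2).sum()
    probe = weakref.ref(loss)

    loss = None   # drop the only strong reference, as the connector now does
    gc.collect()  # not strictly needed under CPython, but makes the point explicit
    assert probe() is None  # the loss tensor and its backward graph are gone

To run just the new test locally: pytest tests/trainer/loops/test_training_loop.py::test_batch_loop_releases_loss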