
Commit 95d6b6b

Disable skipping training step in distributed training (#19918)
1 parent 5d79325 commit 95d6b6b

5 files changed: +41, -4 lines changed

src/lightning/pytorch/CHANGELOG.md
Lines changed: 2 additions & 1 deletion

@@ -28,7 +28,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Relaxed the requirement for custom batch samplers to expose `drop_last` for prediction ([#19678](https://github.com/Lightning-AI/pytorch-lightning/pull/19678))
 
--
+- It is no longer allowed to skip `training_step()` by returning `None` in distributed training ([#19918](https://github.com/Lightning-AI/pytorch-lightning/pull/19918))
+
 
 ### Deprecated
 
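
For readers coming from the changelog alone, here is a minimal sketch of the pattern this entry affects. The module below is hypothetical and not part of this commit: returning `None` from `training_step` tells Lightning to skip that optimizer step, and with the check added in this commit it now raises a `RuntimeError` whenever `trainer.world_size > 1`.

    import torch
    from lightning.pytorch import LightningModule


    class SkippingModel(LightningModule):
        """Hypothetical module that skips 'bad' batches by returning None."""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            loss = self.layer(batch).sum()
            if not torch.isfinite(loss):
                # Tolerated on a single process; with world_size > 1 this now raises
                # "Skipping the `training_step` by returning None in distributed
                # training is not supported."
                return None
            return loss

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)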

src/lightning/pytorch/loops/optimization/automatic.py
Lines changed: 7 additions & 1 deletion

@@ -314,8 +314,14 @@ def _training_step(self, kwargs: OrderedDict) -> ClosureResult:
         """
         trainer = self.trainer
 
-        # manually capture logged metrics
         training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
         self.trainer.strategy.post_training_step()  # unused hook - call anyway for backward compatibility
 
+        if training_step_output is None and trainer.world_size > 1:
+            raise RuntimeError(
+                "Skipping the `training_step` by returning None in distributed training is not supported."
+                " It is recommended that you rewrite your training logic to avoid having to skip the step in the first"
+                " place."
+            )
+
         return self.output_result_cls.from_training_step_output(training_step_output, trainer.accumulate_grad_batches)
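
The new error message recommends rewriting the training logic rather than skipping the step. One possible rewrite, sketched below under assumptions not taken from this commit (the dataset and its non-finite filter are made up for illustration), is to drop problematic samples before they ever reach `training_step`, so every rank executes the same number of optimizer steps and no batch has to be skipped.

    import torch
    from torch.utils.data import DataLoader, Dataset


    class FilteredDataset(Dataset):
        """Hypothetical dataset that drops non-finite samples up front,
        so training_step never needs to return None."""

        def __init__(self, samples: torch.Tensor):
            # keep only rows whose values are all finite
            keep = torch.isfinite(samples).all(dim=1)
            self.samples = samples[keep]

        def __len__(self):
            return len(self.samples)

        def __getitem__(self, idx):
            return self.samples[idx]


    # usage sketch: the cleaned data feeds the usual training loop, and every
    # process sees only valid batches
    loader = DataLoader(FilteredDataset(torch.randn(64, 32)), batch_size=8)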

tests/tests_pytorch/loops/optimization/test_optimizer_loop.py renamed to tests/tests_pytorch/loops/optimization/test_automatic_loop.py
Lines changed: 25 additions & 1 deletion

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from contextlib import nullcontext
 from typing import Dict, Generic, Iterator, Mapping, TypeVar
 
 import pytest

@@ -82,3 +82,27 @@ def training_step(self, batch, batch_idx):
 
     with pytest.raises(MisconfigurationException, match=match):
         trainer.fit(model)
+
+
+@pytest.mark.parametrize("world_size", [1, 2])
+def test_skip_training_step_not_allowed(world_size, tmp_path):
+    """Test that skipping the training_step in distributed training is not allowed."""
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            return None
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_steps=1,
+        barebones=True,
+    )
+    trainer.strategy.world_size = world_size  # mock world size without launching processes
+    error_context = (
+        pytest.raises(RuntimeError, match="Skipping the `training_step` .* is not supported")
+        if world_size > 1
+        else nullcontext()
+    )
+    with error_context:
+        trainer.fit(model)
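
A brief aside on the idiom the new test relies on (the toy function below is illustrative only, not from this repository): `contextlib.nullcontext` is a no-op context manager, so the same `with` block can expect an error in one parametrization and nothing in the other.

    from contextlib import nullcontext

    import pytest


    def might_fail(flag: bool) -> None:
        # illustrative helper: raises only when the flag is set
        if flag:
            raise RuntimeError("boom")


    @pytest.mark.parametrize("flag", [False, True])
    def test_conditional_context(flag):
        # expect the error only in the failing case; otherwise use a no-op context
        ctx = pytest.raises(RuntimeError, match="boom") if flag else nullcontext()
        with ctx:
            might_fail(flag)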

tests/tests_pytorch/models/test_hooks.py
Lines changed: 2 additions & 0 deletions

@@ -178,6 +178,8 @@ class TestModel(BoringModel):
     def training_step(self, batch, batch_idx):
         assert batch.samples.device == self.device
         assert isinstance(batch_idx, int)
+        # the actual training step is not needed for the assertions
+        return super().training_step(torch.rand(1, 32, device=self.device), batch_idx)
 
     def train_dataloader(self):
         return torch.utils.data.DataLoader(RandomDataset(32, 64), collate_fn=collate_fn)

tests/tests_pytorch/trainer/test_dataloaders.py
Lines changed: 5 additions & 1 deletion

@@ -641,6 +641,8 @@ def __init__(self):
 
     def training_step(self, batch, batch_idx):
         self.batches_seen.append(batch)
+        # the actual training step is not needed for the assertions below
+        return super().training_step(torch.rand(1, 32, device=self.device), batch_idx)
 
     def on_train_epoch_end(self):
         world_size = 2

@@ -810,8 +812,10 @@ def __init__(self):
         super().__init__()
         self.seen_samples = []
 
-    def training_step(self, batch):
+    def training_step(self, batch, batch_idx):
         self.seen_samples.extend(batch.tolist())
+        # the actual training step is not needed for the test
+        return super().training_step(torch.rand(1, 32, device=self.device), batch_idx)
 
     def on_train_end(self):
         seen_samples = self.all_gather(self.seen_samples)
