Split the save-checkpoint-on-exception test for exceptions raised in the training callbacks into individual tests for a better overview

vsey · vsey · commit 2113acc2d225 · 2025-06-19T06:58:56.000+02:00
diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py
@@ -764,52 +764,126 @@ def test_ckpt_every_n_train_steps(tmp_path):
     assert set(os.listdir(tmp_path)) == set(expected)
 
 
-def test_model_checkpoint_save_on_exception_in_train_callback(tmp_path):
-    """Test that the checkpoint is saved when an exception is raised in a callback on different events."""
+def test_model_checkpoint_save_on_exception_in_train_callback_on_train_batch_start(tmp_path):
+    """Test that the checkpoint is saved when an exception is raised in a callback on train_batch_start."""
     class TroublemakerOnTrainBatchStart(Callback):
         def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
             if batch_idx == 1:
                 raise RuntimeError("Trouble!")
 
+    model = BoringModel()
+    checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
+    trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, TroublemakerOnTrainBatchStart()], max_epochs=5, logger=False)
+    with pytest.raises(RuntimeError, match="Trouble!"):
+        trainer.fit(model)
+    assert os.path.isfile(tmp_path / "step=1.ckpt")
+
+
+def test_model_checkpoint_save_on_exception_in_train_callback_on_train_batch_end(tmp_path):
+    """Test that the checkpoint is saved when an exception is raised in a callback on train_batch_end."""
     class TroublemakerOnTrainBatchEnd(Callback):
         def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
             if batch_idx == 1:
                 raise RuntimeError("Trouble!")
+            
+    model = BoringModel()
+    checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
+    trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, TroublemakerOnTrainBatchEnd()], max_epochs=5, logger=False)
+    with pytest.raises(RuntimeError, match="Trouble!"):
+        trainer.fit(model)
 
+    assert os.path.isfile(tmp_path / "step=2.ckpt")
+
+
+def test_model_checkpoint_save_on_exception_in_train_callback_on_train_epoch_start(tmp_path):
+    """Test that the checkpoint is saved when an exception is raised in a callback on train_epoch_start."""
     class TroublemakerOnTrainEpochStart(Callback):
         def on_train_epoch_start(self, trainer, pl_module):
             if trainer.current_epoch == 1:
                 raise RuntimeError("Trouble!")
+            
+    model = BoringModel()
+    epoch_length = 64
+    checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
+    trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, TroublemakerOnTrainEpochStart()], max_epochs=5, logger=False)
+    with pytest.raises(RuntimeError, match="Trouble!"):
+        trainer.fit(model)
+    assert os.path.isfile(tmp_path / f"step={epoch_length}.ckpt")
+
 
+def test_model_checkpoint_save_on_exception_in_train_callback_on_train_epoch_end(tmp_path):
+    """Test that the checkpoint is saved when an exception is raised in a callback on train_epoch_end."""
     class TroublemakerOnTrainEpochEnd(Callback):
         def on_train_epoch_end(self, trainer, pl_module):
             if trainer.current_epoch == 1:
                 raise RuntimeError("Trouble!")
 
-
-    epoch_length = 64
     model = BoringModel()
-    # use every_n_epochs so that we can differentiate between the normal and the troublemaker checkpoints
+    epoch_length = 64
     checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
+    trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, TroublemakerOnTrainEpochEnd()], max_epochs=5, logger=False)
+    with pytest.raises(RuntimeError, match="Trouble!"):
+        trainer.fit(model)
+    assert os.path.isfile(tmp_path / f"step={2*epoch_length}.ckpt")
+
+
+# def test_model_checkpoint_save_on_exception_in_train_callback(tmp_path):
+#     """Test that the checkpoint is saved when an exception is raised in a callback on different events."""
+#     class TroublemakerOnTrainBatchStart(Callback):
+#         def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+#             if batch_idx == 1:
+#                 raise RuntimeError("Trouble!")
+
+#     class TroublemakerOnTrainBatchEnd(Callback):
+#         def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+#             if batch_idx == 1:
+#                 raise RuntimeError("Trouble!")
+
+#     class TroublemakerOnTrainEpochStart(Callback):
+#         def on_train_epoch_start(self, trainer, pl_module):
+#             if trainer.current_epoch == 1:
+#                 raise RuntimeError("Trouble!")
+
+#     class TroublemakerOnTrainEpochEnd(Callback):
+#         def on_train_epoch_end(self, trainer, pl_module):
+#             if trainer.current_epoch == 1:
+#                 raise RuntimeError("Trouble!")
+
+
+#     epoch_length = 64
+#     model = BoringModel()
+#     # use every_n_epochs so that we can differentiate between the normal and the troublemaker checkpoints
+#     checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
+
+#     troublemakers = [
+#                      TroublemakerOnTrainBatchStart(),
+#                      TroublemakerOnTrainBatchEnd(),
+#                      TroublemakerOnTrainEpochStart(),
+#                      TroublemakerOnTrainEpochEnd()
+#                      ]
+
+#     expected_ckpts = ["step=1.ckpt",
+#                       'step=2.ckpt',
+#                       f'step={epoch_length}.ckpt',
+#                       f'step={2*epoch_length}.ckpt',
+#                      ]
+
+#     for troublemaker in troublemakers:
+#         trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, troublemaker], max_epochs=5, logger=False)
+
+#         with pytest.raises(RuntimeError, match="Trouble!"):
+#             trainer.fit(model)
 
-    troublemakers = [
-                     TroublemakerOnTrainBatchStart(),
-                     TroublemakerOnTrainBatchEnd(),
-                     TroublemakerOnTrainEpochStart(),
-                     TroublemakerOnTrainEpochEnd()
-                     ]
+#     assert set(os.listdir(tmp_path)) == set(expected_ckpts)
 
-    expected_ckpts = ["step=1.ckpt",
-                      'step=2.ckpt',
-                      f'step={epoch_length}.ckpt',
-                      f'step={2*epoch_length}.ckpt',
-                     ]
 
-    for troublemaker in troublemakers:
-        trainer = Trainer(default_root_dir=tmp_path, callbacks=[checkpoint_callback, troublemaker], max_epochs=5, logger=False)
+#     # use every_n_epochs so that we can differentiate between the normal and the troublemaker checkpoints
+#     checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="{step}", save_on_exception=True, every_n_epochs=4)
 
-        with pytest.raises(RuntimeError, match="Trouble!"):
-            trainer.fit(model)
+#     troublemakers = [
+#                     #  TroublemakerOnValidationBatchStart(),
+#                      TroublemakerOnValidationBatchEnd(),
+#     expected_ckpts = [f"step={2*epoch_length}.ckpt",
 
     assert set(os.listdir(tmp_path)) == set(expected_ckpts)