@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import glob
 import logging
 import math
 import os
@@ -750,3 +751,52 @@ def __init__(self):
     assert not torch.allclose(gradients, gradients_no_spacing, rtol=0.1), (
         "Gradients should differ significantly in exponential mode when using proper spacing"
     )
+
+
+def test_lr_finder_checkpoint_cleanup_on_error(tmp_path):
+    """Test that temporary checkpoint files are cleaned up even when an error occurs during lr finding."""
+
+    class FailingModel(BoringModel):
+        def __init__(self, fail_on_step=2):
+            super().__init__()
+            self.fail_on_step = fail_on_step
+            self.current_step = 0
+            self.learning_rate = 1e-3
+
+        def training_step(self, batch, batch_idx):
+            self.current_step += 1
+            if self.current_step >= self.fail_on_step:
+                raise RuntimeError("Intentional failure for testing cleanup")
+            return super().training_step(batch, batch_idx)
+
+        def configure_optimizers(self):
+            optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate)
+            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
+            return [optimizer], [lr_scheduler]
+
+    model = FailingModel()
+    lr_finder = LearningRateFinder(num_training_steps=5)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        enable_checkpointing=False,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        logger=False,
+        callbacks=[lr_finder],
+    )
+
+    # Check no lr_find checkpoint files exist initially
+    lr_find_checkpoints = glob.glob(os.path.join(tmp_path, ".lr_find_*.ckpt"))
+    assert len(lr_find_checkpoints) == 0, "No lr_find checkpoint files should exist initially"
+
+    # Run lr finder and expect it to fail
+    with pytest.raises(RuntimeError, match="Intentional failure for testing cleanup"):
+        trainer.fit(model)
+
+    # Check that no lr_find checkpoint files are left behind
+    lr_find_checkpoints = glob.glob(os.path.join(tmp_path, ".lr_find_*.ckpt"))
+    assert len(lr_find_checkpoints) == 0, (
+        f"lr_find checkpoint files should be cleaned up, but found: {lr_find_checkpoints}"
+    )
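
For reference, the behaviour this test exercises is the usual save/restore pattern for tuner-style runs: snapshot the model to a temporary `.lr_find_*.ckpt` file before probing learning rates, then restore and delete it in a `finally` block so the file disappears even when `training_step` raises. The sketch below only illustrates that pattern and is not Lightning's actual implementation; `search_with_checkpoint_cleanup`, `save_fn`, `restore_fn`, and `search_fn` are hypothetical names.

```python
import os
import uuid


def search_with_checkpoint_cleanup(root_dir, save_fn, restore_fn, search_fn):
    """Illustrative cleanup pattern: the temporary checkpoint is removed in
    ``finally`` so it never survives an exception raised during the search."""
    ckpt_path = os.path.join(root_dir, f".lr_find_{uuid.uuid4()}.ckpt")
    save_fn(ckpt_path)  # snapshot model/optimizer state before probing learning rates
    try:
        return search_fn()  # may raise, e.g. from a failing training_step
    finally:
        restore_fn(ckpt_path)  # put the original state back
        if os.path.exists(ckpt_path):
            os.remove(ckpt_path)  # cleanup happens on success *and* on error
```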