Commit 5c84ec7

add custom fit loop for custom hook handling
1 parent 85656da commit 5c84ec7

File tree: 1 file changed, +38 -0 lines changed

chebai/trainer/CustomTrainer.py

Lines changed: 38 additions & 0 deletions
@@ -4,8 +4,11 @@
 import pandas as pd
 import torch
 from lightning import LightningModule, Trainer
+from lightning.fabric.utilities.data import _set_sampler_epoch
 from lightning.fabric.utilities.types import _PATH
 from lightning.pytorch.loggers import WandbLogger
+from lightning.pytorch.loops.fit_loop import _FitLoop
+from lightning.pytorch.trainer import call
 from torch.nn.utils.rnn import pad_sequence

 from chebai.loggers.custom import CustomLogger
@@ -39,6 +42,9 @@ def __init__(self, *args, **kwargs):
                 log_kwargs[log_key] = log_value
             self.logger.log_hyperparams(log_kwargs)

+        # use custom fit loop (https://lightning.ai/docs/pytorch/LTS/extensions/loops.html#overriding-the-default-loops)
+        self.fit_loop = LoadDataLaterFitLoop(self, self.min_epochs, self.max_epochs)
+
     def _resolve_logging_argument(self, key: str, value: Any) -> Tuple[str, Any]:
         """
         Resolves logging arguments, handling nested structures such as lists and complex objects.
@@ -147,3 +153,35 @@ def log_dir(self) -> Optional[str]:

         dirpath = self.strategy.broadcast(dirpath)
         return dirpath
+
+
+class LoadDataLaterFitLoop(_FitLoop):
+
+    def on_advance_start(self) -> None:
+        """Calls the hook ``on_train_epoch_start`` **before** the dataloaders are set up. This is necessary
+        so that the dataloaders can get information from the model. For example, the ``on_train_epoch_start``
+        hook sets the ``curr_epoch`` attribute of the PubChemBatched dataset. With Lightning's default fit
+        loop, the dataloaders would always load batch 0 first, run an epoch, then get the epoch number
+        (usually 0, unless resuming from a checkpoint), and then load batch 0 again (or some other batch).
+        With this implementation, the dataloaders are set up after the epoch number is set, so that the
+        correct batch is loaded."""
+        trainer = self.trainer
+
+        # update the epoch value for all samplers
+        assert self._combined_loader is not None
+        for i, dl in enumerate(self._combined_loader.flattened):
+            _set_sampler_epoch(dl, self.epoch_progress.current.processed)
+
+        self.restarted  # no effect: the looked-up value is not used
+        if not self.restarted_mid_epoch and not self.restarted_on_epoch_end:
+            if not self.restarted_on_epoch_start:
+                self.epoch_progress.increment_ready()
+
+            call._call_callback_hooks(trainer, "on_train_epoch_start")
+            call._call_lightning_module_hook(trainer, "on_train_epoch_start")
+
+        self.epoch_progress.increment_started()
+
+        # this is usually at the front of on_advance_start, but here we need it at the end
+        # might need to setup data again depending on `trainer.reload_dataloaders_every_n_epochs`
+        self.setup_data()
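
Why the reordering matters in practice: the sketch below shows the pattern the docstring describes, where a dataset serves a different shard of the corpus each epoch and relies on the model's on_train_epoch_start hook to tell it which epoch is running. The names (ShardedPretrainingDataset, ShardAwareModule) are hypothetical stand-ins, not the chebai implementation; in chebai the hook setting curr_epoch might equally live on a callback.

import torch
from torch.utils.data import DataLoader, Dataset
from lightning import LightningModule


class ShardedPretrainingDataset(Dataset):
    """Hypothetical PubChemBatched-style dataset: each epoch reads a different shard."""

    def __init__(self, shards):
        self.shards = shards        # e.g. a list of per-epoch sample lists
        self.curr_epoch = 0         # updated externally before each epoch

    def _active_shard(self):
        return self.shards[self.curr_epoch % len(self.shards)]

    def __len__(self):
        return len(self._active_shard())

    def __getitem__(self, idx):
        return self._active_shard()[idx]


class ShardAwareModule(LightningModule):
    """Hypothetical model whose epoch-start hook tells the dataset which shard to serve."""

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.layer = torch.nn.Linear(1, 1)

    def on_train_epoch_start(self):
        # LoadDataLaterFitLoop calls this hook before setup_data(), so the
        # dataloader built afterwards already sees the correct epoch number.
        self.dataset.curr_epoch = self.current_epoch

    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch.float().unsqueeze(-1)).mean()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

As the relocated comment notes, setup_data() only rebuilds the dataloaders when trainer.reload_dataloaders_every_n_epochs asks for it, so a run relying on per-epoch shards would typically pass reload_dataloaders_every_n_epochs=1 to the trainer.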
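
The override mechanism itself follows the pattern from the linked Lightning docs: build a _FitLoop subclass with the trainer and its epoch bounds, then assign it to the trainer's fit_loop attribute, which is exactly what CustomTrainer.__init__ does above. A self-contained sketch of that pattern (VerboseFitLoop is hypothetical, and assigning the loop after construction is assumed to behave like the in-constructor assignment above):

from lightning import Trainer
from lightning.pytorch.loops.fit_loop import _FitLoop


class VerboseFitLoop(_FitLoop):
    """Hypothetical loop: adds a log line, then defers to the default behaviour."""

    def on_advance_start(self) -> None:
        print(f"starting epoch {self.epoch_progress.current.processed}")
        super().on_advance_start()


trainer = Trainer(max_epochs=3)
# same constructor arguments CustomTrainer passes: the trainer and its epoch bounds
trainer.fit_loop = VerboseFitLoop(trainer, trainer.min_epochs, trainer.max_epochs)

A subsequent trainer.fit(model) would then run with the replaced loop instead of the stock _FitLoop.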
