Skip to content

Commit 5ac9695

Browse files
committed
add weights_only to trainer.fit, validate, test, predict
1 parent: a3183ba — commit: 5ac9695

File tree

3 files changed

+33
-16
lines changed

3 files changed

+33
-16
lines changed

src/lightning/pytorch/strategies/strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,9 @@ def lightning_module(self) -> Optional["pl.LightningModule"]:
363363
"""Returns the pure LightningModule without potential wrappers."""
364364
return self._lightning_module
365365

366-
def load_checkpoint(self, checkpoint_path: _PATH) -> dict[str, Any]:
366+
def load_checkpoint(self, checkpoint_path: _PATH, weights_only: bool) -> dict[str, Any]:
367367
torch.cuda.empty_cache()
368-
return self.checkpoint_io.load_checkpoint(checkpoint_path)
368+
return self.checkpoint_io.load_checkpoint(checkpoint_path, weights_only=weights_only)
369369

370370
def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None:
371371
assert self.lightning_module is not None

src/lightning/pytorch/trainer/connectors/checkpoint_connector.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _hpc_resume_path(self) -> Optional[str]:
6464
return dir_path_hpc + fs.sep + f"hpc_ckpt_{max_version}.ckpt"
6565
return None
6666

67-
def resume_start(self, checkpoint_path: Optional[_PATH] = None) -> None:
67+
def resume_start(self, checkpoint_path: Optional[_PATH] = None, weights_only: bool = False) -> None:
6868
"""Attempts to pre-load the checkpoint file to memory, with the source path determined in this priority:
6969
7070
1. from HPC weights if `checkpoint_path` is ``None`` and on SLURM or passed keyword `"hpc"`.
@@ -80,7 +80,7 @@ def resume_start(self, checkpoint_path: Optional[_PATH] = None) -> None:
8080

8181
rank_zero_info(f"Restoring states from the checkpoint path at {checkpoint_path}")
8282
with pl_legacy_patch():
83-
loaded_checkpoint = self.trainer.strategy.load_checkpoint(checkpoint_path)
83+
loaded_checkpoint = self.trainer.strategy.load_checkpoint(checkpoint_path, weights_only)
8484
self._loaded_checkpoint = _pl_migrate_checkpoint(loaded_checkpoint, checkpoint_path)
8585

8686
def _select_ckpt_path(
@@ -403,9 +403,11 @@ def restore_lr_schedulers(self) -> None:
403403
for config, lrs_state in zip(self.trainer.lr_scheduler_configs, lr_schedulers):
404404
config.scheduler.load_state_dict(lrs_state)
405405

406-
def _restore_modules_and_callbacks(self, checkpoint_path: Optional[_PATH] = None) -> None:
406+
def _restore_modules_and_callbacks(
407+
self, checkpoint_path: Optional[_PATH] = None, weights_only: bool = False
408+
) -> None:
407409
# restore modules after setup
408-
self.resume_start(checkpoint_path)
410+
self.resume_start(checkpoint_path, weights_only)
409411
self.restore_model()
410412
self.restore_datamodule()
411413
self.restore_callbacks()

src/lightning/pytorch/trainer/trainer.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,7 @@ def fit(
526526
val_dataloaders: Optional[EVAL_DATALOADERS] = None,
527527
datamodule: Optional[LightningDataModule] = None,
528528
ckpt_path: Optional[_PATH] = None,
529+
weights_only: bool = False,
529530
) -> None:
530531
r"""Runs the full optimization routine.
531532
@@ -573,7 +574,14 @@ def fit(
573574
self.training = True
574575
self.should_stop = False
575576
call._call_and_handle_interrupt(
576-
self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
577+
self,
578+
self._fit_impl,
579+
model,
580+
train_dataloaders,
581+
val_dataloaders,
582+
datamodule,
583+
ckpt_path,
584+
weights_only,
577585
)
578586

579587
def _fit_impl(
@@ -583,6 +591,7 @@ def _fit_impl(
583591
val_dataloaders: Optional[EVAL_DATALOADERS] = None,
584592
datamodule: Optional[LightningDataModule] = None,
585593
ckpt_path: Optional[_PATH] = None,
594+
weights_only: bool = False,
586595
) -> None:
587596
log.debug(f"{self.__class__.__name__}: trainer fit stage")
588597

@@ -610,7 +619,7 @@ def _fit_impl(
610619
model_provided=True,
611620
model_connected=self.lightning_module is not None,
612621
)
613-
self._run(model, ckpt_path=ckpt_path)
622+
self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
614623

615624
assert self.state.stopped
616625
self.training = False
@@ -621,6 +630,7 @@ def validate(
621630
model: Optional["pl.LightningModule"] = None,
622631
dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None,
623632
ckpt_path: Optional[_PATH] = None,
633+
weights_only: bool = False,
624634
verbose: bool = True,
625635
datamodule: Optional[LightningDataModule] = None,
626636
) -> _EVALUATE_OUTPUT:
@@ -676,14 +686,15 @@ def validate(
676686
self.state.status = TrainerStatus.RUNNING
677687
self.validating = True
678688
return call._call_and_handle_interrupt(
679-
self, self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule
689+
self, self._validate_impl, model, dataloaders, ckpt_path, weights_only, verbose, datamodule
680690
)
681691

682692
def _validate_impl(
683693
self,
684694
model: Optional["pl.LightningModule"] = None,
685695
dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None,
686696
ckpt_path: Optional[_PATH] = None,
697+
weights_only: bool = False,
687698
verbose: bool = True,
688699
datamodule: Optional[LightningDataModule] = None,
689700
) -> Optional[Union[_PREDICT_OUTPUT, _EVALUATE_OUTPUT]]:
@@ -717,7 +728,7 @@ def _validate_impl(
717728
ckpt_path = self._checkpoint_connector._select_ckpt_path(
718729
self.state.fn, ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None
719730
)
720-
results = self._run(model, ckpt_path=ckpt_path)
731+
results = self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
721732
# remove the tensors from the validation results
722733
results = convert_tensors_to_scalars(results)
723734

@@ -731,6 +742,7 @@ def test(
731742
model: Optional["pl.LightningModule"] = None,
732743
dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None,
733744
ckpt_path: Optional[_PATH] = None,
745+
weights_only: bool = False,
734746
verbose: bool = True,
735747
datamodule: Optional[LightningDataModule] = None,
736748
) -> _EVALUATE_OUTPUT:
@@ -787,14 +799,15 @@ def test(
787799
self.state.status = TrainerStatus.RUNNING
788800
self.testing = True
789801
return call._call_and_handle_interrupt(
790-
self, self._test_impl, model, dataloaders, ckpt_path, verbose, datamodule
802+
self, self._test_impl, model, dataloaders, ckpt_path, weights_only, verbose, datamodule
791803
)
792804

793805
def _test_impl(
794806
self,
795807
model: Optional["pl.LightningModule"] = None,
796808
dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None,
797809
ckpt_path: Optional[_PATH] = None,
810+
weights_only: bool = False,
798811
verbose: bool = True,
799812
datamodule: Optional[LightningDataModule] = None,
800813
) -> Optional[Union[_PREDICT_OUTPUT, _EVALUATE_OUTPUT]]:
@@ -828,7 +841,7 @@ def _test_impl(
828841
ckpt_path = self._checkpoint_connector._select_ckpt_path(
829842
self.state.fn, ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None
830843
)
831-
results = self._run(model, ckpt_path=ckpt_path)
844+
results = self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
832845
# remove the tensors from the test results
833846
results = convert_tensors_to_scalars(results)
834847

@@ -844,6 +857,7 @@ def predict(
844857
datamodule: Optional[LightningDataModule] = None,
845858
return_predictions: Optional[bool] = None,
846859
ckpt_path: Optional[_PATH] = None,
860+
weights_only: bool = False,
847861
) -> Optional[_PREDICT_OUTPUT]:
848862
r"""Run inference on your data. This will call the model forward function to compute predictions. Useful to
849863
perform distributed and batched predictions. Logging is disabled in the predict hooks.
@@ -899,7 +913,7 @@ def predict(
899913
self.state.status = TrainerStatus.RUNNING
900914
self.predicting = True
901915
return call._call_and_handle_interrupt(
902-
self, self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path
916+
self, self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path, weights_only
903917
)
904918

905919
def _predict_impl(
@@ -909,6 +923,7 @@ def _predict_impl(
909923
datamodule: Optional[LightningDataModule] = None,
910924
return_predictions: Optional[bool] = None,
911925
ckpt_path: Optional[_PATH] = None,
926+
weights_only: bool = False,
912927
) -> Optional[_PREDICT_OUTPUT]:
913928
# --------------------
914929
# SETUP HOOK
@@ -939,15 +954,15 @@ def _predict_impl(
939954
ckpt_path = self._checkpoint_connector._select_ckpt_path(
940955
self.state.fn, ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None
941956
)
942-
results = self._run(model, ckpt_path=ckpt_path)
957+
results = self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
943958

944959
assert self.state.stopped
945960
self.predicting = False
946961

947962
return results
948963

949964
def _run(
950-
self, model: "pl.LightningModule", ckpt_path: Optional[_PATH] = None
965+
self, model: "pl.LightningModule", ckpt_path: Optional[_PATH] = None, weights_only: bool = False
951966
) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]:
952967
if self.state.fn == TrainerFn.FITTING:
953968
min_epochs, max_epochs = _parse_loop_limits(
@@ -992,7 +1007,7 @@ def _run(
9921007
# check if we should delay restoring checkpoint till later
9931008
if not self.strategy.restore_checkpoint_after_setup:
9941009
log.debug(f"{self.__class__.__name__}: restoring module and callbacks from checkpoint path: {ckpt_path}")
995-
self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path)
1010+
self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path, weights_only)
9961011

9971012
# reset logger connector
9981013
self._logger_connector.reset_results()

0 commit comments

Comments (0)