Commit 8bfbe0c

Fix strict loading from distributed checkpoints vs PyTorch nightly (#19946)
* strict loading
* docstring
1 parent 19f0fb9 commit 8bfbe0c

File tree: 1 file changed (+3, -8 lines)

src/lightning/fabric/strategies/model_parallel.py

Lines changed: 3 additions & 8 deletions
@@ -275,12 +275,7 @@ def load_checkpoint(
         state: Optional[Union[Module, Optimizer, Dict[str, Union[Module, Optimizer, Any]]]] = None,
         strict: bool = True,
     ) -> Dict[str, Any]:
-        """Load the contents from a checkpoint and restore the state of the given objects.
-
-        Currently does not support loading the optimizer state if the model is distributed but the checkpoint is a full,
-        non-distributed checkpoint.
-
-        """
+        """Load the contents from a checkpoint and restore the state of the given objects."""
         if not state:
             raise ValueError(
                 f"Got {type(self).__name__}.load_checkpoint(..., state={state!r}) but a state with at least "
@@ -559,14 +554,14 @@ def _load_raw_module_state(
         state_dict_options = StateDictOptions(
             broadcast_from_rank0=True,  # type: ignore[call-arg]
             full_state_dict=True,
-            strict=strict,  # gets ignored at the moment
+            # must be set False to allow loading each param separately below
+            strict=False,
         )
 
         for submodule_name, submodule in module.named_modules():
             for param_name, _ in _named_parameters_and_buffers_to_load(submodule):
                 full_param_name = f"{submodule_name}{'.' if submodule_name else ''}{param_name}"
                 if full_param_name not in state_dict:
-                    # Note: PyTorch does not currently respect the `strict` setting in state_dict_options!
                     if not strict:
                         continue
                     raise KeyError(
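The functional fix is in the second hunk: PyTorch's `StateDictOptions.strict` flag cannot be honored when each parameter is loaded individually (every call would see an "incomplete" state dict), so the option is pinned to `False` and strictness is enforced by hand in the surrounding loop. Below is a minimal, self-contained sketch of that pattern, not the Lightning source itself: `load_full_state_dict_per_param` is a hypothetical name, it loads only parameters for brevity (the real helper also handles buffers via `_named_parameters_and_buffers_to_load`), and it assumes a recent PyTorch build where `broadcast_from_rank0` exists, running with an initialized process group.

# Hedged sketch of the pattern this commit adopts; assumes torch >= 2.4
# nightly (broadcast_from_rank0) and an initialized process group.
from typing import Any, Dict

from torch import nn
from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict


def load_full_state_dict_per_param(  # hypothetical helper, not Lightning API
    module: nn.Module, state_dict: Dict[str, Any], strict: bool = True
) -> None:
    options = StateDictOptions(
        broadcast_from_rank0=True,
        full_state_dict=True,
        # Must be False: each set_model_state_dict call below receives a
        # single-parameter dict, which PyTorch's own strict check would
        # otherwise reject as incomplete.
        strict=False,
    )
    for submodule_name, submodule in module.named_modules():
        # Parameters only, for brevity; the real code also loads buffers.
        for param_name, _ in submodule.named_parameters(recurse=False):
            full_param_name = f"{submodule_name}{'.' if submodule_name else ''}{param_name}"
            if full_param_name not in state_dict:
                if not strict:
                    continue  # strictness enforced manually, not by PyTorch
                raise KeyError(f"Checkpoint is missing parameter {full_param_name!r}")
            # Load exactly one parameter into this (possibly sharded) submodule.
            set_model_state_dict(submodule, {param_name: state_dict[full_param_name]}, options=options)

Checking keys manually while passing `strict=False` to PyTorch keeps the user-facing `strict` semantics intact, which is what `load_checkpoint(..., strict=...)` in the first hunk threads down to this helper (and, presumably, what Fabric's checkpoint loading exposes to users).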
