Commit 42a9917

Fix ModelParallel single-file checkpoint with compiled modules

1 parent d4e476f

File tree

2 files changed: +18 −2 lines

src/lightning/pytorch/CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ---
 
+### Fixed
+
+- Fixed ``ModelParallelStrategy`` single-file checkpointing when ``torch.compile`` wraps the model so optimizer states no longer raise ``KeyError`` during save ([#21357](https://github.com/Lightning-AI/pytorch-lightning/issues/21357))
+
+---
+
 ## [2.6.0] - 2025-11-21
 
 ### Added
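To illustrate the failure mode the changelog entry describes: ``torch.compile`` returns a wrapper module whose parameter names carry an ``_orig_mod.`` prefix, so an optimizer state dict keyed by the compiled module's names no longer matches the original module's parameter names. A minimal torch-free sketch (the layer names and state contents are illustrative, not taken from the commit):

```python
def lookup(optimizer_state, names):
    """Fetch per-parameter optimizer states by name; report the first missing key."""
    try:
        return [optimizer_state[name] for name in names]
    except KeyError as exc:
        return f"missing: {exc.args[0]}"

# Parameter names as seen through the torch.compile wrapper vs. the original module.
compiled_names = ["_orig_mod.layer.weight", "_orig_mod.layer.bias"]
original_names = ["layer.weight", "layer.bias"]

# State keyed by the compiled (prefixed) names, as after get_optimizer_state_dict.
state = {name: {"exp_avg": 0.0} for name in compiled_names}

# Looking the states up by the original, unprefixed names fails.
print(lookup(state, original_names))  # missing: layer.weight
```

This is the mismatch that surfaced as a ``KeyError`` when the strategy saved a single-file checkpoint.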

src/lightning/pytorch/strategies/model_parallel.py

Lines changed: 12 additions & 2 deletions

@@ -286,8 +286,18 @@ def optimizer_state(self, optimizer: Optimizer) -> dict[str, Any]:
 
         state_dict = get_optimizer_state_dict(self.model, optimizer, options=state_dict_options)
         if not self._save_distributed_checkpoint and self.global_rank == 0:
-            # Store the optimizer state dict in standard format
-            state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, self.model)
+            # ``torch.compile`` wraps the module, so state dict keys are prefixed with ``_orig_mod.``.
+            # Rekey on the wrapped module first, then rekey again on the original module so parameter
+            # names match what the Trainer expects when saving a single-file checkpoint.
+            compiled_model = self.model
+            original_model = getattr(compiled_model, "_orig_mod", None)
+            if original_model is not None:
+                state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, compiled_model)
+                state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_NAME, compiled_model)
+                state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_NAME, original_model)
+                state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, original_model)
+            else:
+                state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, compiled_model)
         return state_dict
 
     @override
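The core idea of the patch is a rekeying round trip: map the name-keyed state to integer parameter ids using the compiled module's naming, then back to names using the original module's naming, which drops the ``_orig_mod.`` prefix. A hedged sketch with plain dicts standing in for ``FSDP.rekey_optim_state_dict`` (the helper functions and layer names here are illustrative, not the library's API):

```python
def rekey_to_ids(state, name_to_id):
    """Re-key a name-keyed state dict to integer parameter ids."""
    return {name_to_id[name]: value for name, value in state.items()}

def rekey_to_names(state, id_to_name):
    """Re-key an id-keyed state dict back to parameter names."""
    return {id_to_name[pid]: value for pid, value in state.items()}

# The compiled wrapper and the original module name the same parameters
# differently, but the underlying parameters (and hence ids) are shared.
compiled_name_to_id = {"_orig_mod.layer.weight": 0, "_orig_mod.layer.bias": 1}
original_id_to_name = {0: "layer.weight", 1: "layer.bias"}

state = {"_orig_mod.layer.weight": {"step": 1}, "_orig_mod.layer.bias": {"step": 1}}
state = rekey_to_ids(state, compiled_name_to_id)    # keys become shared ids
state = rekey_to_names(state, original_id_to_name)  # keys become unprefixed names
print(sorted(state))  # ['layer.bias', 'layer.weight']
```

The fallback branch in the patch keeps the pre-existing single rekey for the uncompiled case, where ``_orig_mod`` is absent and the names already match.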
