Skip to content

Commit db3d718

Browse files
committed
Add regression test for ModelParallel single-file checkpoint
1 parent b09e96e commit db3d718

File tree

1 file changed: +32 −0 lines changed

tests/tests_pytorch/strategies/test_model_parallel_integration.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,38 @@ def training_step(self, batch):
237237
trainer.fit(model)
238238

239239

240+
@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=2)
def test_model_parallel_single_file_checkpoint_with_compile(distributed, tmp_path):
    """Ensure assembling non-distributed checkpoints works when the model is compiled (torch.compile)."""
    seed_everything(0)

    # Run a tiny 2-GPU tensor-parallel fit; `save_distributed_checkpoint=False`
    # forces the strategy to assemble a single consolidated checkpoint file.
    trainer = Trainer(
        accelerator="auto",
        devices=2,
        strategy=ModelParallelStrategy(
            data_parallel_size=1,
            tensor_parallel_size=2,
            save_distributed_checkpoint=False,
        ),
        max_steps=2,
        limit_train_batches=2,
        logger=False,
        enable_model_summary=False,
        default_root_dir=tmp_path,
    )

    # Materialize parameters lazily on the target device before fitting.
    with trainer.init_module(empty_init=True):
        model = FSDP2Model(compile=True)

    trainer.fit(model)

    checkpoint_path = tmp_path / "compiled-model.ckpt"
    trainer.save_checkpoint(checkpoint_path)

    # Only rank 0 writes the consolidated file, so check there only.
    if trainer.is_global_zero:
        assert checkpoint_path.is_file()
271+
240272
@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=4)
241273
@pytest.mark.parametrize(
242274
"compile",

0 commit comments

Comments
 (0)