Add regression test for ModelParallel single-file checkpoint

littlebullGit · littlebullGit · commit d4e476fa7b65 · 2025-11-26T23:56:54.000-05:00
diff --git a/tests/tests_pytorch/strategies/test_model_parallel_integration.py b/tests/tests_pytorch/strategies/test_model_parallel_integration.py
@@ -135,6 +135,33 @@ def configure_model(self):
         parallelize(self.model, device_mesh=self.device_mesh)
 
 
+class SimpleCompiledModule(LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 32))
+        self._loss = nn.MSELoss()
+
+    def configure_model(self):
+        self.model = torch.compile(self.model)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        preds = self.model(x)
+        return self._loss(preds, y)
+
+    def configure_optimizers(self):
+        return torch.optim.AdamW(self.parameters(), lr=1e-3)
+
+
+def _compiled_model_dataloader(batch_size: int = 32, num_batches: int = 2):
+    total_samples = batch_size * num_batches
+    generator = torch.Generator().manual_seed(0)
+    features = torch.randn(total_samples, 32, generator=generator)
+    targets = torch.randn(total_samples, 32, generator=generator)
+    dataset = torch.utils.data.TensorDataset(features, targets)
+    return DataLoader(dataset, batch_size=batch_size)
+
+
 @RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=4)
 def test_setup_device_mesh(distributed):
     from torch.distributed.device_mesh import DeviceMesh
@@ -237,6 +264,44 @@ def training_step(self, batch):
     trainer.fit(model)
 
 
+@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=2)
+def test_model_parallel_single_file_checkpoint_with_compile(distributed, tmp_path):
+    """Replicate the reporter's setup: compiled model + ModelParallel single-file checkpointing."""
+
+    seed_everything(0)
+    strategy = ModelParallelStrategy(
+        data_parallel_size=1,
+        tensor_parallel_size=1,
+        save_distributed_checkpoint=False,
+    )
+
+    trainer = Trainer(
+        accelerator="auto",
+        devices=1,
+        strategy=strategy,
+        max_steps=2,
+        limit_train_batches=2,
+        enable_checkpointing=False,
+        logger=False,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        default_root_dir=tmp_path,
+    )
+
+    dataloader = _compiled_model_dataloader(batch_size=32, num_batches=2)
+
+    with trainer.init_module(empty_init=True):
+        model = SimpleCompiledModule()
+
+    trainer.fit(model, dataloader)
+
+    if trainer.is_global_zero:
+        checkpoint_path = tmp_path / "compiled-model.ckpt"
+        trainer.save_checkpoint(checkpoint_path)
+
+    trainer.strategy.barrier()
+
+
 @RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=4)
 @pytest.mark.parametrize(
     "compile",