
Commit 9631d1a

H-Huang authored and pytorchmergebot committed
[pipelining] throw error with ZB and compile (pytorch#143599)
Zero bubble will SIGSEGV when operating on a `torch.compile`'d model, so raising this error while I am still investigating the cause / design for a fix.
Pull Request resolved: pytorch#143599
Approved by: https://github.com/wconstab
1 parent 3797143 commit 9631d1a
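
For illustration, a minimal sketch of how the new guard surfaces to a user, adapted from the test added in this commit; the single-rank "fake" process group and the toy Sequential module are stand-ins for a real pipeline setup, not part of the original change:

import torch
from torch.distributed.pipelining import PipelineStage, ScheduleInterleavedZeroBubble
from torch.testing._internal.distributed.fake_pg import FakeStore

# A single-rank fake process group is enough to construct a stage locally.
torch.distributed.init_process_group(
    backend="fake", rank=0, world_size=1, store=FakeStore()
)

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())  # stand-in stage module
compiled_model = torch.compile(model)  # wraps it in a torch._dynamo.OptimizedModule

stage = PipelineStage(compiled_model, 0, 1, torch.device("cpu"))

# With this commit, the schedule constructor raises a RuntimeError up front
# instead of hitting a SIGSEGV later during execution.
try:
    ScheduleInterleavedZeroBubble([stage], 2)
except RuntimeError as err:
    print(err)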

File tree

test/distributed/pipelining/test_schedule.py
torch/distributed/pipelining/schedules.py

2 files changed: +33 -0 lines changed


test/distributed/pipelining/test_schedule.py

Lines changed: 23 additions & 0 deletions
@@ -58,6 +58,7 @@
 class MockPipelineStage(_PipelineStageBase):
     def __init__(self, *args, **kwargs):
         # Mock the necessary attributes
+        self.submod = None
         self.num_stages = kwargs.get("num_stages", 1)
         self.group_size = kwargs.get("group_size", 1)
         self.group_rank = kwargs.get("group_rank", 0)
@@ -197,6 +198,28 @@ def test_schedule_with_single_stage(self, ScheduleClass):
 
         torch.distributed.destroy_process_group()
 
+    def test_zero_bubble_schedule_errors_with_compile(self):
+        """
+        Test that zero bubble schedules raise an error when used with torch.compile.
+        """
+        store = FakeStore()
+        torch.distributed.init_process_group(
+            backend="fake", rank=0, world_size=1, store=store
+        )
+        n_stages = 1
+        device = torch.device("cpu")
+        model = MultiMLP(8, n_layers=n_stages)
+        # full_mod
+        compiled_model = torch.compile(model)
+        stage = PipelineStage(
+            compiled_model,
+            0,
+            n_stages,
+            device,
+        )
+        with self.assertRaises(RuntimeError):
+            ScheduleInterleavedZeroBubble([stage], 2)
+
 
 instantiate_parametrized_tests(ScheduleTest)

torch/distributed/pipelining/schedules.py

Lines changed: 10 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 import torch
 import torch.distributed as dist
+from torch._dynamo import OptimizedModule
 from torch.distributed.fsdp import FSDPModule, UnshardHandle
 from torch.profiler import record_function
 
@@ -2020,6 +2021,15 @@ def __init__(
         kwargs_chunk_spec: Optional[Dict[str, TensorChunkSpec]] = None,
         output_merge_spec: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
     ):
+        # TODO: we don't support Zero Bubble with torch.compile so we
+        # should disable it for now
+        for stage in stages:
+            if isinstance(stage.submod, OptimizedModule):
+                raise RuntimeError(
+                    "The Zero Bubble schedule is not supported with \
+                    stage modules that have used torch.compile"
+                )
+
         self.pp_group_size = stages[0].group_size
         super().__init__(
             stages=stages,
