Skip to content

Commit 1a0a198

Browse files
eellison authored and pytorchmergebot committed
Add multiple hiding nodes (pytorch#167847)
With smaller aten nodes, we might want to overlap a single collective with multiple nodes. This updates the overlapping and bucketing code so that a collective can be hidden by multiple nodes.

Pull Request resolved: pytorch#167847
Approved by: https://github.com/fmassa
1 parent 39f5e0e commit 1a0a198

File tree

4 files changed

+210
-47
lines changed

4 files changed

+210
-47
lines changed

test/distributed/test_aten_comm_compute_reordering.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,63 @@ def func(a, b, c):
10611061
correct = func(a, b, c)
10621062
self.assertTrue(same(out, correct))
10631063

1064+
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
1065+
@torch._inductor.config.patch(get_bucket_patches())
1066+
def test_multiple_hiding_nodes_bucketing(self):
1067+
"""Test that collectives hidden by multiple compute ops can bucket together."""
1068+
1069+
# Use 0.5 compute multiplier so each collective needs 2 matmuls to be fully hidden
1070+
def estimate_with_half_compute(fx_node, override_size=None):
1071+
return estimate_aten_runtime(fx_node, compute_multiplier=0.5)
1072+
1073+
def func(a, b, *, ranks):
1074+
# Two all_gathers that will be hidden by multiple compute operations
1075+
ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks)
1076+
ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks)
1077+
1078+
# Multiple compute operations that can hide the collectives
1079+
# With 0.5 multiplier: mm1 and mm2 together hide ag1, mm2 and mm3 together hide ag2
1080+
mm1 = torch.matmul(a, a.T)
1081+
mm2 = torch.matmul(b, b.T)
1082+
mm3 = torch.matmul(a + b, (a + b).T)
1083+
1084+
return ag1.sum() + ag2.sum() + mm1.sum() + mm2.sum() + mm3.sum()
1085+
1086+
with _dynamo_dist_per_rank_init(
1087+
self.rank,
1088+
self.world_size,
1089+
self.backend(device_type),
1090+
fake_pg=not at_least_x_gpu(2),
1091+
):
1092+
a = torch.ones(8, 8, dtype=torch.float, device=device_type)
1093+
b = torch.ones(8, 8, dtype=torch.float, device=device_type) * 2
1094+
ranks = list(range(self.world_size))
1095+
1096+
func_c = functools.partial(func, ranks=ranks)
1097+
1098+
# Patch with custom estimation that uses 0.5 multiplier
1099+
with torch._inductor.config.patch(
1100+
{
1101+
"aten_distributed_optimizations.custom_runtime_estimation": estimate_with_half_compute
1102+
}
1103+
):
1104+
compiled = torch.compile(func_c)
1105+
out, aten_graph_str = run_and_get_aten_graph(compiled, a, b)
1106+
1107+
# Should have 1 bucketed all_gather (both ag1 and ag2 bucketed together)
1108+
FileCheck().check_count(
1109+
"torch.ops._c10d_functional.wait_tensor.default", 1, exactly=True
1110+
).run(aten_graph_str)
1111+
1112+
# Verify bucketed collective is scheduled before all matmuls
1113+
FileCheck().check("functional.all_gather_into_tensor").check(
1114+
"aten.mm"
1115+
).check("aten.mm").check("aten.mm").check("wait_tensor").run(aten_graph_str)
1116+
1117+
# Verify correctness
1118+
correct = func(a, b, ranks=ranks)
1119+
self.assertTrue(same(out, correct))
1120+
10641121

10651122
def get_toy_model(device_type: str):
10661123
"""

test/distributed/test_overlap_bucketing_unit.py

Lines changed: 104 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ def build_collective_info(graph, hiding_annotations):
4949
"""
5050
Build CollectiveInfo dict from manual hiding annotations.
5151
52-
hiding_annotations: dict mapping collective_start -> hiding_compute_node
52+
hiding_annotations: dict mapping collective_start -> hiding_compute_node(s)
53+
Can be a single node or a list/OrderedSet of nodes
5354
"""
5455
from torch._inductor.fx_passes.overlap_scheduling import CollectiveInfo
5556

@@ -65,20 +66,28 @@ def build_collective_info(graph, hiding_annotations):
6566

6667
# Build CollectiveInfo for each collective
6768
for start_node, wait_node in start_to_wait.items():
68-
hiding_node = hiding_annotations.get(start_node)
69+
hiding_annotation = hiding_annotations.get(start_node)
70+
71+
# Convert to OrderedSet
72+
hiding_nodes = OrderedSet()
73+
if hiding_annotation is not None:
74+
if isinstance(hiding_annotation, list | OrderedSet):
75+
hiding_nodes = OrderedSet(hiding_annotation)
76+
else:
77+
hiding_nodes = OrderedSet([hiding_annotation])
6978

7079
# Estimate size and time
7180
size_bytes = 16 * 4 # 4x4 tensor of floats
7281
estimated_time_ms = 1.0 # Dummy time
73-
exposed_time_ms = 0.0 if hiding_node else 1.0 # Hidden if has hiding_node
82+
exposed_time_ms = 0.0 if hiding_nodes else 1.0 # Hidden if has hiding_nodes
7483

7584
collective_info[start_node] = CollectiveInfo(
7685
start_node=start_node,
7786
wait_node=wait_node,
7887
size_bytes=size_bytes,
7988
estimated_time_ms=estimated_time_ms,
8089
exposed_time_ms=exposed_time_ms,
81-
hiding_node=hiding_node,
90+
hiding_nodes=hiding_nodes,
8291
)
8392

8493
return collective_info
@@ -567,6 +576,97 @@ def func(a, b):
567576
graph_str
568577
)
569578

579+
def test_can_bucket_with_multiple_hiding_nodes(self):
580+
"""
581+
Test that collectives with multiple hiding nodes CAN bucket.
582+
583+
Graph structure:
584+
ag1_start -> ag2_start -> mm1 -> mm2 -> mm3 -> ag1_wait -> ag2_wait
585+
586+
Where:
587+
- ag1 is hidden by mm1 and mm2
588+
- ag2 is hidden by mm2 and mm3
589+
- Both collectives share mm2 as a hiding node
590+
"""
591+
592+
def func(a, b):
593+
group_name = "0"
594+
group_size = 1
595+
596+
# Start both collectives
597+
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
598+
a, group_size, group_name
599+
)
600+
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
601+
b, group_size, group_name
602+
)
603+
604+
# Three compute operations that hide the collectives
605+
mm1 = torch.mm(a, a)
606+
mm2 = torch.mm(b, b)
607+
mm3 = torch.mm(a + b, a + b)
608+
609+
# Wait for both
610+
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
611+
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
612+
613+
return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum() + mm3.sum()
614+
615+
# Use fake mode to trace without executing
616+
with FakeTensorMode():
617+
a = torch.ones(4, 4, device=self.device)
618+
b = torch.ones(4, 4, device=self.device) * 2
619+
620+
# Trace with make_fx
621+
traced = make_fx(func)(a, b)
622+
623+
# Find nodes using find_nodes
624+
ag1, ag2 = traced.graph.find_nodes(
625+
op="call_function",
626+
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
627+
)
628+
mm1, mm2, mm3 = traced.graph.find_nodes(
629+
op="call_function", target=torch.ops.aten.mm.default
630+
)
631+
632+
# Manually annotate hiding relationships with multiple hiding nodes
633+
hiding_annotations = {
634+
ag1: [mm1, mm2], # ag1 is hidden by mm1 and mm2
635+
ag2: [mm2, mm3], # ag2 is hidden by mm2 and mm3
636+
}
637+
638+
# Build collective info and ancestors
639+
collective_info = build_collective_info(traced.graph, hiding_annotations)
640+
node_ancestors = compute_ancestors(traced.graph)
641+
scheduled = OrderedSet(traced.graph.nodes)
642+
643+
# Verify hiding_nodes are correctly set
644+
self.assertEqual(len(collective_info[ag1].hiding_nodes), 2)
645+
self.assertIn(mm1, collective_info[ag1].hiding_nodes)
646+
self.assertIn(mm2, collective_info[ag1].hiding_nodes)
647+
self.assertEqual(len(collective_info[ag2].hiding_nodes), 2)
648+
self.assertIn(mm2, collective_info[ag2].hiding_nodes)
649+
self.assertIn(mm3, collective_info[ag2].hiding_nodes)
650+
651+
# Run bucketing
652+
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
653+
OverlapPreservingBucketer,
654+
)
655+
656+
bucketer = OverlapPreservingBucketer(
657+
traced.graph,
658+
collective_info,
659+
node_ancestors,
660+
scheduled,
661+
)
662+
bucketer.bucket_collectives()
663+
664+
FileCheck().check_count(
665+
"all_gather_into_tensor_out", 1, exactly=False
666+
).check_count("torch.ops.aten.mm.default", 3, exactly=True).run(
667+
str(traced.graph)
668+
)
669+
570670

571671
if __name__ == "__main__":
572672
run_tests()

torch/_inductor/fx_passes/overlap_preserving_bucketer.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -176,18 +176,20 @@ def build_timeline(self, pg: str) -> Optional[PGEvent]:
176176
head = None
177177
prev_event = None
178178
position = 0
179+
hiding_nodes = OrderedSet()
179180

180181
for node in self.scheduled:
181182
node_type = None
182183

183184
# Determine if this node is relevant for this PG
184185
if node in self.collective_info and get_group_name(node) == pg:
185186
node_type = "starts"
187+
hiding_nodes |= self.collective_info[node].hiding_nodes
186188
elif is_wait_tensor(node):
187189
wait_input = node.args[0]
188190
if isinstance(wait_input, fx.Node) and get_group_name(wait_input) == pg:
189191
node_type = "waits"
190-
elif is_compute_node(node):
192+
elif is_compute_node(node) or node in hiding_nodes:
191193
node_type = "compute"
192194

193195
if node_type is None:
@@ -205,7 +207,6 @@ def build_timeline(self, pg: str) -> Optional[PGEvent]:
205207

206208
prev_event = event
207209
position += 1
208-
209210
return head
210211

211212
def _populate_node_to_event(self, pg: str) -> None:
@@ -222,10 +223,12 @@ def _add_hiding_interval_constraints(self) -> None:
222223
Add hiding interval constraints: start -> compute -> wait.
223224
"""
224225
for start, info in self.collective_info.items():
225-
if info.hiding_node and not info.is_exposed:
226+
if info.is_exposed:
227+
continue
228+
for hn in info.hiding_nodes:
226229
# Enforce: start -> compute -> wait
227-
self.aug_graph.add_extra_dep(n=info.hiding_node, dep=start)
228-
self.aug_graph.add_extra_dep(n=info.wait_node, dep=info.hiding_node)
230+
self.aug_graph.add_extra_dep(n=hn, dep=start)
231+
self.aug_graph.add_extra_dep(n=info.wait_node, dep=hn)
229232

230233
def bucket_collectives(self) -> None:
231234
"""Main entry point for bucketing collectives."""
@@ -358,13 +361,13 @@ def _ancestor_dep(self, n1: fx.Node, n2: fx.Node) -> bool:
358361

359362
def _get_intervals(
360363
self, event: PGEvent
361-
) -> tuple[Optional[tuple[int, int]], Optional[tuple[int, int]]]:
362-
"""Get (execution_interval, hiding_interval) for a collective event.
364+
) -> tuple[Optional[tuple[int, int]], list[tuple[int, int]]]:
365+
"""Get (execution_interval, hiding_intervals) for a collective event.
363366
364367
Returns:
365-
(execution_interval, hiding_interval) where:
368+
(execution_interval, hiding_intervals) where:
366369
- execution_interval is (start_pos, wait_pos) or None
367-
- hiding_interval is (start_pos, compute_pos) or None if no hiding node
370+
- hiding_intervals is a list of (start_pos, compute_pos) tuples, one for each hiding node
368371
369372
Works for both start and wait events by looking up the collective info.
370373
"""
@@ -375,28 +378,31 @@ def _get_intervals(
375378
elif event.is_wait:
376379
wait_input = event.node.args[0]
377380
if not isinstance(wait_input, fx.Node):
378-
return None, None
381+
return None, []
379382
coll = wait_input
380383
else:
381-
return None, None
384+
return None, []
382385

383386
if coll not in self.collective_info:
384-
return None, None
387+
return None, []
385388

386389
info = self.collective_info[coll]
387390
start_event = self.node_to_event[coll]
388391
wait_event = self.node_to_event[info.wait_node]
389392

390393
execution_interval = (start_event.position, wait_event.position)
391394

392-
hiding_interval = None
393-
if info.hiding_node:
394-
hiding_interval = (
395-
start_event.position,
396-
self.node_to_event[info.hiding_node].position,
397-
)
395+
hiding_intervals = []
396+
if info.hiding_nodes:
397+
for hiding_node in info.hiding_nodes:
398+
hiding_intervals.append(
399+
(
400+
start_event.position,
401+
self.node_to_event[hiding_node].position,
402+
)
403+
)
398404

399-
return execution_interval, hiding_interval
405+
return execution_interval, hiding_intervals
400406

401407
def _preserves_hiding_intervals(
402408
self,
@@ -424,9 +430,9 @@ def _preserves_hiding_intervals(
424430
# Collect hiding compute positions for the bucket
425431
bucket_hiding_compute_positions = []
426432
for coll in all_bucketed_colls:
427-
if hiding_node := self.collective_info[coll].hiding_node:
433+
for coll_hiding_node in self.collective_info[coll].hiding_nodes:
428434
bucket_hiding_compute_positions.append(
429-
self.node_to_event[hiding_node].position
435+
self.node_to_event[coll_hiding_node].position
430436
)
431437

432438
# Get new positions
@@ -478,11 +484,10 @@ def get_pos(n: fx.Node) -> int:
478484
curr_event.node not in all_bucketed_colls
479485
and curr_event.node not in all_bucketed_waits
480486
):
481-
exec_interval, hiding_interval = self._get_intervals(curr_event)
487+
exec_interval, hiding_interval_list = self._get_intervals(curr_event)
482488
if exec_interval:
483489
execution_intervals.append(exec_interval)
484-
if hiding_interval:
485-
hiding_intervals.append(hiding_interval)
490+
hiding_intervals.extend(hiding_interval_list)
486491
curr_event = curr_event.next
487492

488493
curr_event = new_wait_event.prev
@@ -491,11 +496,10 @@ def get_pos(n: fx.Node) -> int:
491496
curr_event.node not in all_bucketed_colls
492497
and curr_event.node not in all_bucketed_waits
493498
):
494-
exec_interval, hiding_interval = self._get_intervals(curr_event)
499+
exec_interval, hiding_interval_list = self._get_intervals(curr_event)
495500
if exec_interval:
496501
execution_intervals.append(exec_interval)
497-
if hiding_interval:
498-
hiding_intervals.append(hiding_interval)
502+
hiding_intervals.extend(hiding_interval_list)
499503
curr_event = curr_event.prev
500504

501505
# Check: no hiding interval should be enclosed by any execution interval
@@ -659,12 +663,12 @@ def _has_ancestor_conflicts(
659663
return True
660664

661665
# Check if existing hiding node conflicts with candidate wait
662-
if hiding_node := self.collective_info[coll].hiding_node:
663-
if self._ancestor_dep(hiding_node, candidate_wait):
666+
for old_hiding_node in self.collective_info[coll].hiding_nodes:
667+
if self._ancestor_dep(old_hiding_node, candidate_wait):
664668
return True
665669

666670
# Check if candidate hiding node conflicts with existing wait
667-
if new_hiding_node := candidate_info.hiding_node:
671+
for new_hiding_node in candidate_info.hiding_nodes:
668672
if self._ancestor_dep(new_hiding_node, coll_wait):
669673
return True
670674

0 commit comments

Comments
 (0)