Revert "bucketing compile time improve (pytorch#168122)"

pytorchmergebot · pytorchmergebot · commit 654c5fba3e6c · 2025-11-24T14:24:17.000Z
This reverts commit 1328a02. pytorch#168122 was reverted on behalf of https://github.com/facebook-github-bot because the corresponding diff was reverted internally (see the revert comment on pytorch#168122).
diff --git a/test/distributed/test_overlap_bucketing_unit.py b/test/distributed/test_overlap_bucketing_unit.py
@@ -93,6 +93,28 @@ def build_collective_info(graph, hiding_annotations):
     return collective_info
 
 
+def compute_ancestors(graph):
+    """Compute ancestor sets for all nodes in the graph."""
+    node_ancestors = {}
+
+    for node in graph.nodes:
+        ancestors = OrderedSet()
+        stack = list(node.all_input_nodes)
+        visited = set()
+
+        while stack:
+            current = stack.pop()
+            if current in visited:
+                continue
+            visited.add(current)
+            ancestors.add(current)
+            stack.extend(current.all_input_nodes)
+
+        node_ancestors[node] = ancestors
+
+    return node_ancestors
+
+
 @requires_accelerator_dist_backend()
 @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
 @instantiate_parametrized_tests
@@ -168,8 +190,9 @@ def func(a, b):
             ag2: mm2,  # mm2 hides ag2
         }
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing
@@ -180,6 +203,7 @@ def func(a, b):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
         bucketer.bucket_collectives()
@@ -254,8 +278,9 @@ def func(a, b):
             ag2: mm2,  # mm2 hides ag2
         }
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing
@@ -266,6 +291,7 @@ def func(a, b):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
         bucketer.bucket_collectives()
@@ -355,8 +381,9 @@ def func(a, b, c):
         if final_mm_hidden:
             hiding_annotations[rs] = mm2
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing logic to find buckets (without applying them, which would require process groups)
@@ -367,6 +394,7 @@ def func(a, b, c):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
 
@@ -439,6 +467,7 @@ def func(a, b):
 
         # Build collective info
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing
@@ -449,6 +478,7 @@ def func(a, b):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
         bucketer.bucket_collectives()
@@ -520,8 +550,9 @@ def func(a, b):
             ag2: mm2,  # mm2 hides ag2
         }
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing with multidtype mode
@@ -532,6 +563,7 @@ def func(a, b):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
             bucket_mode="custom_ops_multidtype",
         )
@@ -603,8 +635,9 @@ def func(a, b):
             ag2: [mm2, mm3],  # ag2 is hidden by mm2 and mm3
         }
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Verify hiding_nodes are correctly set
@@ -623,6 +656,7 @@ def func(a, b):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
         bucketer.bucket_collectives()
@@ -695,8 +729,9 @@ def func(a, b, c):
             ag3: mm,
         }
 
-        # Build collective info and scheduled
+        # Build collective info and ancestors
         collective_info = build_collective_info(traced.graph, hiding_annotations)
+        node_ancestors = compute_ancestors(traced.graph)
         scheduled = OrderedSet(traced.graph.nodes)
 
         # Run bucketing
@@ -707,6 +742,7 @@ def func(a, b, c):
         bucketer = OverlapPreservingBucketer(
             traced.graph,
             collective_info,
+            node_ancestors,
             scheduled,
         )
         bucketer.bucket_collectives()
diff --git a/torch/_inductor/fx_passes/overlap_manual_scheduling.py b/torch/_inductor/fx_passes/overlap_manual_scheduling.py
@@ -182,6 +182,7 @@ def __init__(
         self.bucketer = ManualOverlapPreservingBucketer(
             graph=self.graph,
             collective_info=self.collective_info,
+            node_ancestors=self.node_ancestors,
             node_users=self.node_users,
             scheduled=OrderedSet(self.graph.nodes),
         )
diff --git a/torch/_inductor/fx_passes/overlap_preserving_bucketer.py b/torch/_inductor/fx_passes/overlap_preserving_bucketer.py
@@ -1,4 +1,3 @@
-import itertools
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
@@ -131,6 +130,7 @@ def __init__(
         self,
         graph: fx.Graph,
         collective_info: dict[fx.Node, CollectiveInfo],
+        node_ancestors: dict[fx.Node, OrderedSet[fx.Node]],
         scheduled: OrderedSet[fx.Node],
         max_bucket_memory_gb: float = 1.0,
         max_coll_distance: int = 1000,
@@ -139,45 +139,18 @@ def __init__(
     ):
         self.graph = graph
         self.collective_info = collective_info
+        self.node_ancestors = node_ancestors
         self.scheduled = scheduled
         self.max_bucket_memory_gb = max_bucket_memory_gb
         self.node_idx = {n: i for i, n in enumerate(scheduled)}
+        self.aug_graph = AugmentedGraphHelper(self.graph, self.node_ancestors)
         self.max_coll_distance = max_coll_distance
         self.insert_overlap_deps = insert_overlap_deps
         self.bucket_mode = bucket_mode
         self.node_to_event: dict[fx.Node, PGEvent] = {}
-
-        # Compute ancestors including original graph edges and hiding interval dependencies
-        self.node_ancestors = self._compute_node_ancestors()
-        self.aug_graph = AugmentedGraphHelper(self.graph, self.node_ancestors)
-
-        # Build timelines and add constraints to aug_graph
         self.pg_to_timeline_head: dict[str, Optional[PGEvent]] = self.build_timelines()
-        self._add_hiding_interval_constraints()
-
-    def _compute_node_ancestors(self) -> dict[fx.Node, OrderedSet[fx.Node]]:
-        """
-        Compute ancestor sets for all nodes including:
-        1. Original graph edges
-        2. Hiding interval deps: collective_start -> hiding_node -> wait
-        """
-        augmented_inputs: dict[fx.Node, OrderedSet[fx.Node]] = defaultdict(OrderedSet)
-        for start, info in self.collective_info.items():
-            if info.is_exposed:
-                continue
-            for hiding_node in info.hiding_nodes:
-                augmented_inputs[hiding_node].add(start)
-                augmented_inputs[info.wait_node].add(hiding_node)
 
-        node_ancestors: dict[fx.Node, OrderedSet[fx.Node]] = defaultdict(OrderedSet)
-        for node in self.scheduled:
-            for input_node in itertools.chain(
-                augmented_inputs[node], node.all_input_nodes
-            ):
-                node_ancestors[node].add(input_node)
-                node_ancestors[node] |= node_ancestors[input_node]
-
-        return node_ancestors
+        self._add_hiding_interval_constraints()
 
     def build_timelines(self) -> dict[str, Optional[PGEvent]]:
         "Construct each process groups ordered series of event"
@@ -364,30 +337,21 @@ def _find_buckets(
             )
             processed.add(start_node)
 
-            # Greedy optimization: stop after consecutive failures
-            consecutive_failures = 0
-            max_consecutive_failures = 20
-
             # Check candidates in sorted order, break when beyond max distance
             for candidate in sorted_collectives[i + 1 : i + 1 + self.max_coll_distance]:
+                if candidate in processed:
+                    continue
+
                 candidate_bytes = self.collective_info[candidate].size_bytes
                 # proxy on memory use, if we see a too large bucket,
                 # dont look for another, later bucket
                 if bucket_info.total_bytes + candidate_bytes > max_bucket_bytes:
                     break
 
-                if candidate in processed:
-                    continue
-
                 if self._can_add_to_bucket(bucket_info, candidate):
                     bucket_info.collectives.append(candidate)
                     bucket_info.total_bytes += candidate_bytes
                     processed.add(candidate)
-                    consecutive_failures = 0  # Reset on success
-                else:
-                    consecutive_failures += 1
-                    if consecutive_failures >= max_consecutive_failures:
-                        break
 
             if len(bucket_info.collectives) > 1:
                 buckets.append(bucket_info)
@@ -692,28 +656,23 @@ def _has_ancestor_conflicts(
         candidate_wait = candidate_info.wait_node
 
         for coll in bucket_info.collectives:
-            if (
-                coll in self.node_ancestors[candidate]
-                or candidate in self.node_ancestors[coll]
-            ):
+            # Check if collectives are ancestors of each other
+            if self._ancestor_dep(coll, candidate):
                 return True
 
             # Check if waits are ancestors of each other
             coll_wait = self.collective_info[coll].wait_node
-            if (
-                coll_wait in self.node_ancestors[candidate_wait]
-                or candidate_wait in self.node_ancestors[coll_wait]
-            ):
+            if self._ancestor_dep(candidate_wait, coll_wait):
                 return True
 
             # Check if existing hiding node conflicts with candidate wait
             for old_hiding_node in self.collective_info[coll].hiding_nodes:
-                if candidate_wait in self.node_ancestors[old_hiding_node]:
+                if self._ancestor_dep(old_hiding_node, candidate_wait):
                     return True
 
             # Check if candidate hiding node conflicts with existing wait
             for new_hiding_node in candidate_info.hiding_nodes:
-                if coll_wait in self.node_ancestors[new_hiding_node]:
+                if self._ancestor_dep(new_hiding_node, coll_wait):
                     return True
 
         return False
diff --git a/torch/_inductor/fx_passes/overlap_scheduling.py b/torch/_inductor/fx_passes/overlap_scheduling.py
@@ -1125,6 +1125,7 @@ def _bucket_collectives(self) -> None:
         bucketer = OverlapPreservingBucketer(
             graph=self.graph,
             collective_info=self.collective_info,
+            node_ancestors=self.node_ancestors,
             scheduled=self.scheduled,
             max_bucket_memory_gb=2.0,  # Could make this configurable
             max_coll_distance=self.max_node_distance,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -182,6 +182,7 @@ def __init__(` |
| `182` | `182` | `self.bucketer = ManualOverlapPreservingBucketer(` |
| `183` | `183` | `graph=self.graph,` |
| `184` | `184` | `collective_info=self.collective_info,` |
| | `185` | `+ node_ancestors=self.node_ancestors,` |
| `185` | `186` | `node_users=self.node_users,` |
| `186` | `187` | `scheduled=OrderedSet(self.graph.nodes),` |
| `187` | `188` | `)` |