Skip to content

Commit 1181920

Browse files
mlazos authored and pytorchmergebot committed
[Hierarchical Compile] Add mutation dependencies to topological sorting (pytorch#152410)
Pull Request resolved: pytorch#152410 Approved by: https://github.com/anijain2305 ghstack dependencies: pytorch#152389, pytorch#152505
1 parent 3592cb5 commit 1181920

File tree

2 files changed

+173
-23
lines changed

2 files changed

+173
-23
lines changed

test/dynamo/test_graph_deduplication.py

Lines changed: 119 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
# Owner(s): ["module: dynamo"]
22
# flake8: noqa: B950
3+
import contextlib
4+
35
import torch
46
import torch.fx
57
from torch._dynamo.graph_utils import _detect_cycles
68
from torch._dynamo.test_case import TestCase
7-
from torch._dynamo.testing import AotEagerAndRecordGraphs, normalize_gm
9+
from torch._dynamo.testing import (
10+
AotEagerAndRecordGraphs,
11+
extract_graph_and_tracker,
12+
normalize_gm,
13+
)
814

915

1016
def extract_graph(fn, *args, **kwargs):
@@ -18,9 +24,19 @@ def graph_str(gm):
1824

1925

2026
class GraphDededuplicationTests(TestCase):
27+
def setUp(self):
28+
self.exit_stack = contextlib.ExitStack()
29+
self.exit_stack.enter_context(
30+
torch._dynamo.config.patch("use_graph_deduplication", True)
31+
)
32+
super().setUp()
33+
34+
def tearDown(self):
35+
self.exit_stack.close()
36+
super().tearDown()
37+
2138
def run_and_return_graphs(self, fn, *args, **kwargs):
22-
with torch._dynamo.config.patch("use_graph_deduplication", True):
23-
return extract_graph(fn, *args, **kwargs)
39+
return extract_graph(fn, *args, **kwargs)
2440

2541
def test_single_subgraph(self):
2642
def inner_fn(x, y):
@@ -691,7 +707,7 @@ def get_node(name):
691707
sum_2 = get_node("sum_2")
692708
exit_autocast = mod.graph.call_function(torch.amp._exit_autocast)
693709
sum_2.append(exit_autocast)
694-
additional_deps = _populate_additional_deps(mod.graph)
710+
additional_deps = _populate_additional_deps(mod.graph, {})
695711
invoke_subgraph = get_node("invoke_subgraph")
696712
invoke_subgraph.append(enter_autocast)
697713
getitem_1 = get_node("getitem_1")
@@ -906,6 +922,105 @@ def forward(self, arg0_1: "f32[10, 10]", arg1_1: "f32[10, 20]"):
906922
""",
907923
)
908924

925+
def test_mutation_ordering(self):
926+
from torch._dynamo.graph_deduplication import (
927+
_populate_additional_deps,
928+
_stable_topological_sort,
929+
)
930+
931+
def inner_fn(x, y):
932+
x0 = x.view(x.size())
933+
return x0.view(x.size())
934+
935+
def inner_fn2(x, y):
936+
x = x * 2
937+
y = y * 2
938+
return x.sum() + y.sum()
939+
940+
def fn(x, y):
941+
o0 = inner_fn(x, y)
942+
o1 = inner_fn(x, y)
943+
x.add_(x)
944+
o2 = inner_fn2(x, y)
945+
y.mul_(y)
946+
o3 = inner_fn2(x, y)
947+
return o0 + o1 + o2.sum() + o3.sum()
948+
949+
x = torch.rand(10, 10)
950+
y = torch.rand(10, 20)
951+
x_clone = x.clone()
952+
y_clone = y.clone()
953+
954+
graph, tracker = extract_graph_and_tracker(fn, x_clone, y_clone)
955+
956+
def get_node(name):
957+
return next(n for n in graph.nodes if n.name == name)
958+
959+
additional_deps = _populate_additional_deps(
960+
graph, tracker.node_to_mutated_arg_positions
961+
)
962+
963+
self.assertExpectedInline(
964+
additional_deps,
965+
"""defaultdict(<class 'torch.utils._ordered_set.OrderedSet'>, {add_: OrderedSet([x0, x0_1]), invoke_subgraph: OrderedSet([add_]), invoke_subgraph_1: OrderedSet([add_, mul_]), mul_: OrderedSet([invoke_subgraph])})""",
966+
)
967+
968+
add_ = get_node("add_")
969+
mul_ = get_node("mul_")
970+
x0 = get_node("x0")
971+
x0.append(mul_)
972+
o1 = get_node("o1")
973+
o1.append(add_)
974+
self.assertExpectedInline(
975+
graph,
976+
"""\
977+
graph():
978+
%subgraph_0 : [num_users=2] = get_attr[target=subgraph_0]
979+
%l_x_ : torch.Tensor [num_users=5] = placeholder[target=L_x_]
980+
%l_y_ : torch.Tensor [num_users=3] = placeholder[target=L_y_]
981+
%x0 : [num_users=1] = call_method[target=view](args = (%l_x_, (10, 10)), kwargs = {})
982+
%mul_ : [num_users=0] = call_method[target=mul_](args = (%l_y_, %l_y_), kwargs = {})
983+
%o0 : [num_users=1] = call_method[target=view](args = (%x0, (10, 10)), kwargs = {})
984+
%x0_1 : [num_users=1] = call_method[target=view](args = (%l_x_, (10, 10)), kwargs = {})
985+
%o1 : [num_users=1] = call_method[target=view](args = (%x0_1, (10, 10)), kwargs = {})
986+
%add_ : [num_users=0] = call_method[target=add_](args = (%l_x_, %l_x_), kwargs = {})
987+
%add_2 : [num_users=1] = call_function[target=operator.add](args = (%o0, %o1), kwargs = {})
988+
%invoke_subgraph : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%subgraph_0, subgraph_0, %l_x_, %l_y_), kwargs = {})
989+
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph, 0), kwargs = {})
990+
%sum_5 : [num_users=1] = call_method[target=sum](args = (%getitem,), kwargs = {})
991+
%add_3 : [num_users=1] = call_function[target=operator.add](args = (%add_2, %sum_5), kwargs = {})
992+
%invoke_subgraph_1 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%subgraph_0, subgraph_0, %l_x_, %l_y_), kwargs = {})
993+
%getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_1, 0), kwargs = {})
994+
%sum_6 : [num_users=1] = call_method[target=sum](args = (%getitem_1,), kwargs = {})
995+
%add_4 : [num_users=1] = call_function[target=operator.add](args = (%add_3, %sum_6), kwargs = {})
996+
return (add_4,)""",
997+
)
998+
_stable_topological_sort(graph, additional_deps)
999+
self.assertExpectedInline(
1000+
graph,
1001+
"""\
1002+
graph():
1003+
%subgraph_0 : [num_users=2] = get_attr[target=subgraph_0]
1004+
%l_x_ : torch.Tensor [num_users=5] = placeholder[target=L_x_]
1005+
%l_y_ : torch.Tensor [num_users=3] = placeholder[target=L_y_]
1006+
%x0 : [num_users=1] = call_method[target=view](args = (%l_x_, (10, 10)), kwargs = {})
1007+
%o0 : [num_users=1] = call_method[target=view](args = (%x0, (10, 10)), kwargs = {})
1008+
%x0_1 : [num_users=1] = call_method[target=view](args = (%l_x_, (10, 10)), kwargs = {})
1009+
%o1 : [num_users=1] = call_method[target=view](args = (%x0_1, (10, 10)), kwargs = {})
1010+
%add_ : [num_users=0] = call_method[target=add_](args = (%l_x_, %l_x_), kwargs = {})
1011+
%add_2 : [num_users=1] = call_function[target=operator.add](args = (%o0, %o1), kwargs = {})
1012+
%invoke_subgraph : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%subgraph_0, subgraph_0, %l_x_, %l_y_), kwargs = {})
1013+
%mul_ : [num_users=0] = call_method[target=mul_](args = (%l_y_, %l_y_), kwargs = {})
1014+
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph, 0), kwargs = {})
1015+
%sum_5 : [num_users=1] = call_method[target=sum](args = (%getitem,), kwargs = {})
1016+
%add_3 : [num_users=1] = call_function[target=operator.add](args = (%add_2, %sum_5), kwargs = {})
1017+
%invoke_subgraph_1 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%subgraph_0, subgraph_0, %l_x_, %l_y_), kwargs = {})
1018+
%getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_1, 0), kwargs = {})
1019+
%sum_6 : [num_users=1] = call_method[target=sum](args = (%getitem_1,), kwargs = {})
1020+
%add_4 : [num_users=1] = call_function[target=operator.add](args = (%add_3, %sum_6), kwargs = {})
1021+
return (add_4,)""",
1022+
)
1023+
9091024

9101025
if __name__ == "__main__":
9111026
from torch._dynamo.test_case import run_tests

torch/_dynamo/graph_deduplication.py

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ def apply_graph_deduplication(output_graph) -> dict[str, torch.fx.GraphModule]:
5757
duplicated_region_groups = output_graph.region_tracker.get_identical_regions(
5858
output_graph.graph
5959
)
60-
node_to_additional_deps = _populate_additional_deps(output_graph.graph)
60+
node_to_additional_deps = _populate_additional_deps(
61+
output_graph.graph, output_graph.region_tracker.node_to_mutated_arg_positions
62+
)
6163

6264
sub_gms: dict[str, torch.fx.GraphModule] = {}
6365

@@ -107,7 +109,7 @@ def _replace_region_with_subgraph(
107109
inds_with_external_users: list[int],
108110
sub_gm: torch.fx.GraphModule,
109111
subgraph_name: str,
110-
node_to_additional_deps: dict[torch.fx.Node, list[torch.fx.Node]],
112+
node_to_additional_deps: dict[torch.fx.Node, OrderedSet[torch.fx.Node]],
111113
) -> None:
112114
sub_args = []
113115
for node_ind, arg_ind in node_ind_arg_ind:
@@ -143,11 +145,12 @@ def _replace_region_with_subgraph(
143145
# Erase in reverse topological order
144146
for node in reversed(region):
145147
graph.erase_node(node)
146-
node_to_additional_deps.pop(node)
147-
for dep_list in node_to_additional_deps.values():
148+
node_to_additional_deps.pop(node, None)
149+
for deps in node_to_additional_deps.values():
148150
try:
149-
dep_list.remove(node)
150-
except ValueError:
151+
deps.remove(node)
152+
deps.add(invoke_subgraph_node)
153+
except KeyError:
151154
pass
152155

153156
if config.graph_deduplication_lint:
@@ -294,23 +297,29 @@ def _stable_topological_sort(
294297

295298

296299
def _populate_additional_deps(
297-
graph: torch.fx.Graph,
298-
) -> dict[torch.fx.Node, list[torch.fx.Node]]:
300+
graph: torch.fx.Graph, node_to_mutated_arg_positions: dict[Node, OrderedSet[int]]
301+
) -> dict[Node, OrderedSet[Node]]:
302+
node_to_additional_deps: dict[Node, OrderedSet[Node]] = defaultdict(OrderedSet)
303+
_add_mutation_dependencies(node_to_mutated_arg_positions, node_to_additional_deps)
304+
_add_global_state_dependencies(graph, node_to_additional_deps)
305+
return node_to_additional_deps
306+
307+
308+
def _add_global_state_dependencies(
309+
graph: torch.fx.Graph, node_to_additional_deps: dict[Node, OrderedSet[Node]]
310+
) -> None:
299311
import torch.amp
300312

301-
node_to_additional_deps: dict[torch.fx.Node, list[torch.fx.Node]] = defaultdict(
302-
list
303-
)
304313
all_nodes = list(graph.nodes)
305314

306315
# These are targets of the nodes which need to stay in the same relative place in the graph
307316
global_state_targets = {torch.amp._enter_autocast, torch.amp._exit_autocast}
308-
all_nodes_dep_on: list[torch.fx.Node] = []
317+
all_nodes_dep_on: list[Node] = []
309318

310319
def prev_cur_nodes(
311-
all_nodes: list[torch.fx.Node],
312-
) -> Generator[tuple[list[torch.fx.Node], torch.fx.Node]]:
313-
prev_nodes: list[torch.fx.Node] = []
320+
all_nodes: list[Node],
321+
) -> Generator[tuple[list[Node], Node], None, None]:
322+
prev_nodes: list[Node] = []
314323
next_nodes = list(reversed(all_nodes))
315324

316325
while next_nodes:
@@ -320,10 +329,36 @@ def prev_cur_nodes(
320329

321330
for prev_nodes, cur_node in prev_cur_nodes(all_nodes):
322331
args_unique = _get_flat_args_unique(cur_node, {})
323-
additional_deps = node_to_additional_deps[cur_node]
324-
additional_deps.extend(n for n in all_nodes_dep_on if n not in args_unique)
332+
new_deps = [n for n in all_nodes_dep_on if n not in args_unique]
333+
334+
if new_deps:
335+
additional_deps = node_to_additional_deps[cur_node]
336+
additional_deps.update(new_deps)
337+
325338
if cur_node.target in global_state_targets:
326-
additional_deps.extend(n for n in prev_nodes if n not in args_unique)
339+
additional_deps = node_to_additional_deps[cur_node]
340+
additional_deps.update(n for n in prev_nodes if n not in args_unique)
327341
all_nodes_dep_on.append(cur_node)
328342

329-
return node_to_additional_deps
343+
344+
def _add_mutation_dependencies(
345+
node_to_mutated_arg_positions: dict[Node, OrderedSet[int]],
346+
node_to_additional_deps: dict[Node, OrderedSet[Node]],
347+
) -> None:
348+
for node, indices in node_to_mutated_arg_positions.items():
349+
flat_args_kwargs = _get_flat_args(node, {})
350+
351+
# for all mutated args,
352+
# add dependency on usages which occur after node to ensure
353+
# node will always be ordered before them
354+
# also add node as a dependency on usages which
355+
# occur before node to ensure node is ordered after them
356+
for index in indices:
357+
mutated_arg = flat_args_kwargs[index]
358+
for user in mutated_arg.users:
359+
if user is node:
360+
continue
361+
elif user < node:
362+
node_to_additional_deps[node].add(user)
363+
elif user > node:
364+
node_to_additional_deps[user].add(node)

0 commit comments

Comments (0)