Constant folding for lifted graph (pytorch#135060)

trieuat · pytorchmergebot · commit 633dcf1a2d80 · 2024-10-28T06:28:31.000Z
Summary: The current implementation for lifted graphs takes a dict of [constant name: constant value]. The constant values are used to run_node and execute the constant graph to get the folded values, and then new get_attr nodes are created for the folded values. We don't have constant values for lifted graphs during model compilation on MTIA. I think it is more general to allow the constant folding pass to take only the constant names to produce the constant graph, and to represent the folded nodes as placeholders to make it consistent with the lifted graph. Additionally, this mimics the real situation on Sigmoid, where Sigmoid executes the constant graph, gets the folded values, and sets the folded values on the main graph. This diff updates the pass to work with a list of constant names. Test Plan: ``` buck run mode/opt caffe2/test:test_export -- -r split_const_gm ``` Differential Revision: D62144791 Pull Request resolved: pytorch#135060 Approved by: https://github.com/SherlockNoMad Co-authored-by: Tuan Trieu <tuant@meta.com>
diff --git a/test/export/test_export.py b/test/export/test_export.py
@@ -8089,7 +8089,9 @@ def forward(self, x):
                 w_transpose = torch.transpose(self.w_pre, 0, 1)
                 w_relu = torch.nn.functional.relu(w_transpose)
                 w = w_relu + self.b
-                return torch.matmul(x, w)
+                return (
+                    torch.matmul(x, w) + self.b + torch.arange(4, dtype=torch.float16)
+                )
 
         example_inputs = (torch.randn(4, 4),)
         mod = Model()
@@ -8105,17 +8107,38 @@ def forward(self, x):
             for n, spec in zip(placeholder_nodes, new_sig.input_specs)
             if spec.target is not None
         }
-        const_gm, _ = split_const_gm(new_gm, lifted_constants)
+        # [self.w_pre, self.b]
+        lifted_constant_names = list(lifted_constants)
+        lifted_constant_values = [lifted_constants[n] for n in lifted_constant_names]
+        const_gm, _ = split_const_gm(new_gm, False, lifted_constant_names)
         counter = 0
         for node in const_gm.graph.nodes:
             if node.op == "call_function":
                 counter += 1
-        self.assertTrue(counter > 0)
+        self.assertTrue(counter == 4)
+        counter = 0
+        for n in new_gm.graph.nodes:
+            if n.op == "placeholder":
+                counter += 1
+        # expect 3 existing placeholders and 2 folded constant
+        self.assertTrue(counter == 5)
+        # return (self.b, folded_const, folded_const)
+        const_folded_value = const_gm(*lifted_constant_values)
+
         test_input = torch.randn(4, 4)
-        expected = new_gm(None, None, test_input)[0]
-        actual = mod(test_input)
+        # new_gm(c_w_pre, b, x, folded_const, folded_const)
+        actual = new_gm(
+            lifted_constant_values[0],
+            const_folded_value[0],
+            test_input,
+            const_folded_value[1],
+            const_folded_value[2],
+        )[0]
+        expected = mod(test_input)
         self.assertEqual(actual, expected)
-        const_gm, _ = split_const_gm(ep.graph_module, lifted_constants, lambda x: True)
+        const_gm, _ = split_const_gm(
+            ep.graph_module, False, lifted_constant_names, lambda x: True
+        )
         counter = 0
         for node in const_gm.graph.nodes:
             if node.op == "call_function":
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
@@ -350,7 +350,8 @@ def _recursive_post_grad_passes(gm: GraphModule, is_inference: bool = False) ->
 
 def split_const_gm(
     gm: GraphModule,
-    lifted_constants: Optional[Dict[str, Any]] = None,
+    skip_constructor: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
     skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
 ) -> Tuple[GraphModule, Dict[str, int]]:
     """
@@ -377,9 +378,10 @@ def split_const_gm(
         run_and_get_constant_graph,
     )
 
-    const_gm, const_result = run_and_get_constant_graph(
-        gm, lifted_constants, skip_folding_node_fn
+    const_gm = run_and_get_constant_graph(
+        gm, skip_constructor, lifted_constant_names, skip_folding_node_fn
     )
+    const_result = const_gm() if lifted_constant_names is None else None
 
     const_outputs = {
         x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0])
@@ -399,7 +401,11 @@ def split_const_gm(
         replace_node_with_constant(
             gm,
             node,
-            const_result[const_outputs[node.name]],
+            (
+                const_result[const_outputs[node.name]]
+                if lifted_constant_names is None
+                else None
+            ),
             new_const_name,
         )
         const_output_index[new_const_name] = const_outputs[node.name]
diff --git a/torch/_inductor/constant_folding.py b/torch/_inductor/constant_folding.py
@@ -1,5 +1,5 @@
 import collections
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 import torch.utils._pytree as pytree
@@ -18,7 +18,7 @@
 def replace_node_with_constant(
     gm: torch.fx.GraphModule,
     node: torch.fx.Node,
-    constant: torch.Tensor,
+    constant: Optional[torch.Tensor] = None,
     name: Optional[str] = None,
 ) -> None:
     g = gm.graph
@@ -39,32 +39,33 @@ def replace_node_with_constant(
         gm._frozen_param_count = i + 1
 
     with g.inserting_before(node):
-        new_input_node = g.create_node("get_attr", qualname, (), {})
+        if constant is not None:
+            new_input_node = g.create_node("get_attr", qualname, (), {})
+        else:
+            # this is the case for lifted constants
+            new_input_node = g.create_node("placeholder", qualname, (), {})
         node.replace_all_uses_with(new_input_node)
         new_input_node.meta.update(node.meta)
         g.erase_node(node)
 
-    # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
-    gm.register_buffer(qualname, constant)
-    setattr(gm, qualname, constant)
+    if constant is not None:
+        # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
+        gm.register_buffer(qualname, constant)
+        setattr(gm, qualname, constant)
 
 
 def is_const_source(
-    node: torch.fx.Node, lifted_constants: Optional[Dict[str, Any]]
+    node: torch.fx.Node, lifted_constant_names: Optional[List[str]]
 ) -> bool:
-    return node.op == "get_attr" or (
-        node.op == "placeholder"
-        and lifted_constants is not None
-        and node.name in lifted_constants
-    )
+    return node.op == "get_attr" or node.name in (lifted_constant_names or ())
 
 
 class ConstantFolder(torch.fx.Interpreter):
     def __init__(
         self,
         gm: torch.fx.GraphModule,
         skip_constructors: bool = False,
-        lifted_constants: Optional[Dict[str, torch.Tensor]] = None,
+        lifted_constant_names: Optional[List[str]] = None,
         skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
     ) -> None:
         super().__init__(gm)
@@ -76,14 +77,27 @@ def __init__(
         # overwrite this to deallocate env values if their only remaining use
         # is the output
         self.user_to_last_uses = self.node_to_last_non_output_use()
-        self.lifted_constants = lifted_constants
+        self.lifted_constant_names = lifted_constant_names
+        self.deferred_value = object()
 
     def _support_dynamic_shape(self) -> bool:
         # ConstantFolder not support dynamic shape now
         return False
 
     def _deduce_value(self, node: torch.fx.Node) -> Any:
-        return super().run_node(node)
+        if self.lifted_constant_names is None:
+            return super().run_node(node)
+        # if lifted_constant_names is passed in, no concrete value is available
+        # so we just check if all inputs have values
+        flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+        for inp in flattened_node_inps:
+            if (
+                isinstance(inp, torch.fx.Node)
+                and inp.name not in (self.lifted_constant_names or ())
+                and self.env[inp] != self.deferred_value
+            ):
+                return self.unknown_value
+        return self.deferred_value
 
     def is_impure(self, node: torch.fx.node.Node) -> bool:
         def is_woq_int8_pattern(node: torch.fx.node.Node) -> bool:
@@ -103,7 +117,7 @@ def is_woq_int8_pattern(node: torch.fx.node.Node) -> bool:
                 and is_woq_int8_pattern(next(iter(node.users)))
             )
         ) and is_const_source(
-            node.args[0], self.lifted_constants  # type: ignore[arg-type]
+            node.args[0], self.lifted_constant_names  # type: ignore[arg-type]
         ):
             # Case 1: int8_weight -> dq -> bf16_weight
             # Case 2: int8_weight -> permute -> dq -> bf16_weight
@@ -191,7 +205,7 @@ def set_env(arg: torch.fx.Node) -> None:
         # TODO - more complicated strategy
         if (
             self.skip_constructors
-            and not is_const_source(node, self.lifted_constants)
+            and not is_const_source(node, self.lifted_constant_names)
             and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
         ):
             return self.unknown_value
@@ -207,10 +221,10 @@ def set_env(arg: torch.fx.Node) -> None:
         if out == self.unknown_value:
             return self.unknown_value
 
-        if not is_const_source(node, self.lifted_constants) and isinstance(
-            out, torch.Tensor
+        if not is_const_source(node, self.lifted_constant_names) and (
+            isinstance(out, torch.Tensor) or out == self.deferred_value
         ):
-            if out.device.type == "meta":
+            if out != self.deferred_value and out.device.type == "meta":
                 return out
 
             if not self.insertable_tensor_check(out):
@@ -248,10 +262,12 @@ def run(self) -> Any:  # type: ignore[override]
 
     def insert_placerholder_values(self, env: Dict[torch.fx.Node, Any]) -> None:
         for n in self.module.graph.find_nodes(op="placeholder"):
-            if self.lifted_constants is not None and n.name in self.lifted_constants:
-                env[n] = self.lifted_constants[n.name]
-            else:
-                env[n] = self.unknown_value  # type: ignore[assignment]
+            env[n] = self.unknown_value  # type: ignore[assignment]
+        if self.lifted_constant_names is None:
+            return
+        for n in self.module.graph.nodes:
+            if n.name in (self.lifted_constant_names or ()):
+                env[n] = self.deferred_value
 
 
 def constant_fold(
@@ -284,12 +300,15 @@ def constant_fold(
 
 def constant_graph_tag(
     gm: torch.fx.GraphModule,
-    lifted_constants: Optional[Dict[str, Any]],
-    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
+    skip_constructors: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
 ) -> None:
     with torch.utils._python_dispatch._disable_current_modes():
         cf = ConstantFolder(
-            gm, skip_constructors=True, lifted_constants=lifted_constants
+            gm,
+            skip_constructors=skip_constructors,
+            lifted_constant_names=lifted_constant_names,
         )
         cf.run()
 
@@ -298,7 +317,7 @@ def constant_graph_tag(
                 node.meta[META_TAG] = MODULE_TAG
                 continue
             if (
-                is_const_source(node, lifted_constants)
+                is_const_source(node, lifted_constant_names)
                 or node in cf.node_replacements
                 or node in cf.replaced_uses
             ):
@@ -309,15 +328,18 @@ def constant_graph_tag(
 
 def run_and_get_constant_graph(
     gm: torch.fx.GraphModule,
-    lifted_constants: Optional[Dict[str, Any]],
-    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
-) -> Tuple[torch.fx.GraphModule, Tuple[torch.Tensor, ...]]:
+    skip_constructors: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+) -> torch.fx.GraphModule:
     """
     Construct a GraphModule which corresponds to the part which could be
     constant folded in provided gm.
     """
 
-    constant_graph_tag(gm, lifted_constants, skip_folding_node_fn)
+    constant_graph_tag(
+        gm, skip_constructors, lifted_constant_names, skip_folding_node_fn
+    )
 
     def untag(node: torch.fx.Node) -> bool:
         used_to_fold = False
@@ -329,19 +351,11 @@ def untag(node: torch.fx.Node) -> bool:
             node.meta[META_TAG] = MODULE_TAG
         return used_to_fold
 
-    const_args = []
-    if lifted_constants is not None:
-        placeholders = list(gm.graph.find_nodes(op="placeholder"))
-        for node in placeholders:
-            if node.meta[META_TAG] == MODULE_TAG:
-                continue
-            if untag(node):
-                const_args.append(lifted_constants[node.name])
-
     # We rewrite the tags, if it's a constant being directly consumed, without
     # any folding opportunity, we keep it in main gm.
-    for node in gm.graph.find_nodes(op="get_attr"):
-        untag(node)
+    for node in gm.graph.nodes:
+        if node.op == "getattr" or (node.name in (lifted_constant_names or ())):
+            untag(node)
 
     new_graph = torch.fx.Graph()
 
@@ -363,5 +377,4 @@ def untag(node: torch.fx.Node) -> bool:
     new_graph.lint()
     new_gm = torch.fx.GraphModule(gm, new_graph)
 
-    const_result = new_gm(*const_args)
-    return new_gm, const_result
+    return new_gm