
Commit 7aa12ab

metascroy authored and facebook-github-bot committed
Enable quant fusion and const propagation by default (pytorch#10394)
Summary: This diff enables quant fusion and constant propagation by default in ExecuTorch. The new passes run after all to_edge passes but before memory planning.

Differential Revision: D73513516
1 parent 6b877de commit 7aa12ab
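
With this change, quant fusion and constant propagation run automatically inside to_executorch(), so existing callers pick up the passes without code changes. A minimal sketch of the new default and of opting out via the new config flag (the toy module and inputs below are illustrative assumptions, not taken from this commit):

    import torch
    from torch.export import export
    from executorch.exir import ExecutorchBackendConfig, to_edge

    class MyModule(torch.nn.Module):  # hypothetical example model
        def forward(self, x):
            return x + 1

    ep = export(MyModule(), (torch.ones(3, 2),), strict=True)

    # New default: quant fusion + constant propagation run after the to_edge
    # passes and before memory planning.
    default_program = to_edge(ep).to_executorch()

    # Opt out, e.g. for graphs with a backward signature used for on-device
    # training, which this commit rejects when the flag is left on.
    config = ExecutorchBackendConfig(do_quant_fusion_and_const_prop=False)
    opt_out_program = to_edge(ep).to_executorch(config)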

File tree: 11 files changed, +166 −52 lines

exir/capture/_config.py

Lines changed: 3 additions & 0 deletions
@@ -102,3 +102,6 @@ class ExecutorchBackendConfig:
     # serialized in the PTE file. Its value is ignored if mutable buffers are not
     # memory planned as the names must be serialized in that case.
     emit_mutable_buffer_names: bool = False
+
+    # If set to true, we run quant fusion and constant propagation passes
+    do_quant_fusion_and_const_prop: bool = True

exir/emit/test/test_emit.py

Lines changed: 36 additions & 22 deletions
@@ -431,8 +431,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             .executorch_program
         )
         # The value for beta should appear before alpha
-        self.assertEqual(program.execution_plan[0].values[12].val, Int(3))
-        self.assertEqual(program.execution_plan[0].values[13].val, Int(2))
+        self.assertEqual(program.execution_plan[0].values[4].val, Int(3))
+        self.assertEqual(program.execution_plan[0].values[5].val, Int(2))

     def test_kwargs2(self) -> None:
         """Tests that the kwargs are placed in the order specified by
@@ -451,10 +451,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             to_edge(export(f, (x,), strict=True)).to_executorch().executorch_program
         )
         # The value for right should appear before side
-        self.assertEqual(program.execution_plan[0].values[6].val, Bool(False))
-        self.assertEqual(program.execution_plan[0].values[7].val, Bool(True))
-        self.assertEqual(program.execution_plan[0].values[8].val, String("right"))
-        self.assertEqual(program.execution_plan[0].values[9].val, Null())
+        self.assertEqual(program.execution_plan[0].values[3].val, Bool(False))
+        self.assertEqual(program.execution_plan[0].values[4].val, Bool(True))
+        self.assertEqual(program.execution_plan[0].values[5].val, String("right"))
+        self.assertEqual(program.execution_plan[0].values[6].val, Null())

     def _assertCallLength(self, program: Program, idx: int, expected_len: int) -> None:
         instr_args = program.execution_plan[0].chains[0].instructions[idx].instr_args
@@ -532,24 +532,24 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Check the mul operator's stack trace contains f -> g -> h
         self.assertTrue(
             "return torch.mul(x, torch.randn(3, 2))"
-            in program.execution_plan[0].chains[0].stacktrace[1].items[-1].context
+            in program.execution_plan[0].chains[0].stacktrace[0].items[-1].context
         )
         self.assertEqual(
-            program.execution_plan[0].chains[0].stacktrace[1].items[-1].name, "f"
+            program.execution_plan[0].chains[0].stacktrace[0].items[-1].name, "f"
         )
         self.assertEqual(
-            program.execution_plan[0].chains[0].stacktrace[1].items[-2].name, "g"
+            program.execution_plan[0].chains[0].stacktrace[0].items[-2].name, "g"
         )
         self.assertEqual(
-            program.execution_plan[0].chains[0].stacktrace[1].items[-3].name, "forward"
+            program.execution_plan[0].chains[0].stacktrace[0].items[-3].name, "forward"
         )

         # Check the sin operator's stack trace contains g -> h
         self.assertEqual(
-            program.execution_plan[0].chains[0].stacktrace[2].items[-1].name, "g"
+            program.execution_plan[0].chains[0].stacktrace[1].items[-1].name, "g"
         )
         self.assertEqual(
-            program.execution_plan[0].chains[0].stacktrace[2].items[-2].name, "forward"
+            program.execution_plan[0].chains[0].stacktrace[1].items[-2].name, "forward"
         )

     def test_stacktrace_off(self) -> None:
@@ -878,10 +878,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             .executorch_program.execution_plan[0]
             .non_const_buffer_sizes
         )
-
+
+        config = ExecutorchBackendConfig(
+            do_quant_fusion_and_const_prop=False,
+        )
         edge_program_manager = to_edge(export(f, (torch.ones(3, 2),), strict=True))
         non_const_buffer_size_without_const_prop_pass = (
-            edge_program_manager.to_executorch()
+            edge_program_manager.to_executorch(config)
             .executorch_program.execution_plan[0]
             .non_const_buffer_sizes
         )
@@ -1510,7 +1513,12 @@ def forward(self, x):
         self.assertEqual(model.W1.untyped_storage().nbytes(), 8)
         self.assertEqual(model.W2.nbytes, 4)
         self.assertEqual(model.W2.untyped_storage().nbytes(), 8)
-        program = to_edge(export(model, (torch.ones(1),), strict=True)).to_executorch()
+
+        # Without this, the views get
+        config = exir.ExecutorchBackendConfig(
+            do_quant_fusion_and_const_prop=False,
+        )
+        program = to_edge(export(model, (torch.ones(1),), strict=True)).to_executorch(config)

         program = program._emitter_output.program
         # each emitted weight is not a view
@@ -1531,7 +1539,10 @@ def forward(self, x):
         program = program._emitter_output.program
         # confirm that the buffer was emitted
         self.assertEqual(len(program.constant_buffer), 2)
-        self.assertEqual(len(program.constant_buffer[1].storage), 8)
+
+        # executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default
+        # converts the buffer from i64 to fp32 (4 bytes), which gets const propagated
+        self.assertEqual(len(program.constant_buffer[1].storage), 4)

     def test_emit_lifted_tensor_constant(self) -> None:
         class LiftedTensorConstants(nn.Module):
@@ -1544,7 +1555,7 @@ def forward(self, x):

         model = LiftedTensorConstants()
         # Specify that we want to move non-lifted constants to external file
-        et_cfg = ExecutorchBackendConfig(external_constants=True)
+        et_cfg = ExecutorchBackendConfig(external_constants=True, do_quant_fusion_and_const_prop=False)
         program = to_edge(
             export(model, (torch.ones(3, 2),), strict=True)
         ).to_executorch(et_cfg)
@@ -1566,7 +1577,7 @@ def forward(self, x):

         model = LiftedConstants()
         # Specify that we want to move non-lifted constants to external file
-        et_cfg = ExecutorchBackendConfig(external_constants=True)
+        et_cfg = ExecutorchBackendConfig(external_constants=True, do_quant_fusion_and_const_prop=False)
         program = to_edge(
             export(model, (torch.ones(3, 2),), strict=True)
         ).to_executorch(et_cfg)
@@ -1658,7 +1669,10 @@ def forward(self, x):
         model = to_edge(export(InfinityMaskModel(), (torch.randn(2, 2),), strict=True))

         # Confirm that we can serialize the model with infinity in it.
-        model = model.to_executorch()
+        config = ExecutorchBackendConfig(
+            do_quant_fusion_and_const_prop=False,
+        )
+        model = model.to_executorch(config)

         # Assert that the infinity is stored as a string "-inf".
         values = model.executorch_program.execution_plan[0].values
@@ -1716,8 +1730,8 @@ def forward(self, x):
         external_map = emitter_output.external_constant_map[
             "_default_external_constant"
         ]
-        self.assertEqual(external_map["linear.weight"], 0)
-        self.assertEqual(external_map["linear.bias"], 1)
+        self.assertEqual(external_map["_prop_tensor_constant0"], 1)
+        self.assertEqual(external_map["linear.bias"], 0)

     def test_delegate_deduplicate(self) -> None:
         class SharedModule(torch.nn.Module):
@@ -1804,7 +1818,7 @@ def forward(self, input, label):
         ep = to_edge(ep)
         # Lower the graph to executorch.
         ep = ep.to_executorch(
-            config=ExecutorchBackendConfig(external_mutable_weights=True)
+            config=ExecutorchBackendConfig(external_mutable_weights=True, do_quant_fusion_and_const_prop=False)
         )

         emitter_output = ep._emitter_output

exir/passes/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -154,6 +154,8 @@ python_library(
         "//caffe2:torch",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
+        "//pytorch/ao:torchao",
+        "//executorch/exir/passes:constant_prop_pass",
     ],
 )
exir/passes/constant_prop_pass.py

Lines changed: 27 additions & 2 deletions
@@ -8,6 +8,7 @@

 from collections import OrderedDict
 from typing import cast, Mapping, Optional
+import logging

 import torch
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -29,6 +30,31 @@
 # Propagating aten.full can significantly increase compiled model size.
 _DEFAULT_SKIP_TARGETS = {exir_ops.edge.aten.full.default}

+# Do not const prop quantization primitives
+_QDQ_OPS = [
+    exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.convert_element_type.no_fuse,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.choose_qparams.tensor,
+]
+try:
+    import torchao  # noqa: F401
+    _QDQ_OPS.extend(
+        [
+            exir_ops.edge.torchao.dequantize_affine.default,
+            exir_ops.edge.torchao.quantize_affine.default,
+            exir_ops.edge.torchao.choose_qparams_affine.default,
+        ]
+    )
+except ImportError:
+    pass
+_DEFAULT_SKIP_TARGETS.update(set(_QDQ_OPS))
+
+
 _PRIMITIVE_TYPES = (
     float,
     int,
@@ -40,7 +66,6 @@
     torch.layout,
 )

-
 def is_const(
     arg,
     exported_program: ExportedProgram,
@@ -308,7 +333,7 @@ def constant_prop_pass(
         if node.target == torch.ops.higher_order.cond
     ]
     if len(has_control_flow) > 0:
-        raise RuntimeError("constant_prop_pass for control flow is not supported yet.")
+        logging.warning("constant_prop_pass does not constant propagate in control flow modules")

     const_node_to_tensor = get_propagated_const_tensor_dict(
         exported_program, custom_skip_targets
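
The net effect of the hunks above: quantize/dequantize primitives are added to the default skip set so they are never folded, and a program containing torch.ops.higher_order.cond now logs a warning and is still propagated outside the control-flow submodules instead of raising. A minimal sketch of running the pass directly, mirroring the new test in exir/tests/test_passes.py (the module below is illustrative, not part of this commit):

    import torch
    from torch.export import export
    from executorch.exir import to_edge
    from executorch.exir.passes.constant_prop_pass import constant_prop_pass

    class Folded(torch.nn.Module):  # illustrative module, not from this commit
        def __init__(self):
            super().__init__()
            self.w = torch.randn(3, 3)

        def forward(self, x):
            # A dtype round-trip on a constant weight is foldable at export time.
            return torch.nn.functional.linear(x, self.w.to(torch.float16).to(torch.float32))

    edge = to_edge(export(Folded(), (torch.randn(2, 3),), strict=True))

    # Quant/dequant primitives stay in the graph (they are in the default skip
    # set); the dtype casts above are folded into _prop_tensor_constant* buffers.
    program = constant_prop_pass(edge.exported_program())
    print(program.graph_module.code)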

exir/passes/quant_fusion_pass.py

Lines changed: 12 additions & 0 deletions
@@ -10,6 +10,8 @@
 from torch.fx import GraphModule, subgraph_rewriter
 from torch.fx.passes.infra.pass_base import PassResult
 from torch.utils import _pytree as pytree
+from executorch.exir.passes.constant_prop_pass import constant_prop_pass
+from torch.export import ExportedProgram

 from ._quant_patterns_and_replacements import get_quant_patterns_and_replacements

@@ -139,3 +141,13 @@ def call(self, graph_module: GraphModule) -> PassResult:
         graph_module.graph.lint()
         graph_module.graph.eliminate_dead_code()
         return PassResult(graph_module, True)
+
+
+def quant_fusion_and_const_prop_pass(program: ExportedProgram) -> ExportedProgram:
+    gm = program.graph_module
+    gm_res = QuantFusionPass(_fix_node_meta_val=True)(gm)
+    gm = gm_res.graph_module
+
+    # Do const prop pass to remove packing/dtype conversion ops
+    program = constant_prop_pass(program)
+    return program
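
The new quant_fusion_and_const_prop_pass helper chains the two existing passes: it runs QuantFusionPass with _fix_node_meta_val=True on the program's graph module, then calls constant_prop_pass to fold the packing and dtype-conversion ops that fusion leaves behind, and returns the updated ExportedProgram. A minimal sketch of calling it directly on an edge program, modeled on the dequantize example used in the new tests (the module and inputs are assumptions, not part of this diff):

    import torch
    from torch.export import export
    from executorch.exir import EdgeCompileConfig, to_edge
    from executorch.exir.passes.quant_fusion_pass import quant_fusion_and_const_prop_pass

    class DequantLinear(torch.nn.Module):  # illustrative, not from this commit
        def __init__(self):
            super().__init__()
            self.w_int = torch.ones(3, 3, dtype=torch.int8)

        def forward(self, x):
            w_dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default(
                self.w_int, 3.0, 3, -127, 128, torch.int8
            )
            return torch.nn.functional.linear(x, w_dq)

    edge = to_edge(
        export(DequantLinear(), (torch.randn(3),), strict=True),
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )

    # Fuse q/dq patterns, then const-propagate what remains; this is what
    # to_executorch() now does for each program when the flag is enabled.
    program = quant_fusion_and_const_prop_pass(edge.exported_program())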

exir/program/_program.py

Lines changed: 8 additions & 1 deletion
@@ -52,6 +52,7 @@
 from executorch.exir.passes.normalize_view_copy_base_pass import (
     NormalizeViewCopyBasePass,
 )
+from executorch.exir.passes.quant_fusion_pass import quant_fusion_and_const_prop_pass
 from executorch.exir.passes.remove_graph_asserts_pass import (
     RemoveGraphAssertsPass,
     RemoveNonCoreAtenOpGraphAssertsPass,
@@ -1524,9 +1525,15 @@ def to_executorch(
             after it has been transformed to the ExecuTorch backend.
         """
         config = config if config else ExecutorchBackendConfig()
-
         execution_programs: Dict[str, ExportedProgram] = {}
         for name, program in self._edge_programs.items():
+            if config.do_quant_fusion_and_const_prop:
+                if program.graph_signature.backward_signature is not None:
+                    raise Exception(
+                        "Cannot run do_quant_fusion_and_const_prop on a graph with a backward signature intended for on-device training."
+                        " Please set do_quant_fusion_and_const_prop to False in the ExecutorchBackendConfig."
+                    )
+                program = quant_fusion_and_const_prop_pass(program)
             program = weights_to_outputs_pass(program)
             program = unsafe_remove_auto_functionalized_pass(program)
             gm, new_signature = insert_write_back_for_buffers_pass(program)

exir/tests/test_memory_planning.py

Lines changed: 2 additions & 1 deletion
@@ -769,7 +769,8 @@ def forward(self, input, label):
         ep = export(net, inputs, strict=True)
         ep = _export_forward_backward(ep)
         ep = to_edge(ep)
-        ep = ep.to_executorch()
+        config = ExecutorchBackendConfig(do_quant_fusion_and_const_prop=False)
+        ep = ep.to_executorch(config)

         ep.dump_executorch_program(True)

exir/tests/test_passes.py

Lines changed: 48 additions & 10 deletions
@@ -1085,7 +1085,16 @@ def forward(self) -> torch.Tensor:
         self.assertEqual(ep.graph_signature.input_specs[1].arg.name, "b_a")

         # Validate that the program successfully passes validation to executorch:
-        edge.to_executorch()
+
+        # The test fails when do_quant_fusion_and_const_prop=True, but it is not related to
+        # the pass, but rather that memory planning fails (AssertionError: graph_output_allocated not set)
+        # when a graph has no user inputs and no operations. We can construct a failure case
+        # even with do_quant_fusion_and_const_prop = False by changing the forward method in NoUserInputs
+        # to just return self.a
+        config = exir.ExecutorchBackendConfig(
+            do_quant_fusion_and_const_prop=False,
+        )
+        edge.to_executorch(config)

     def test_constant_prop_pass_for_parameter(self) -> None:
         def count_additions(gm: torch.fx.GraphModule) -> int:
@@ -1279,6 +1288,7 @@ class Module(torch.nn.Module):
             def __init__(self):
                 super().__init__()
                 self.linear = torch.nn.Linear(3, 3)
+                self.w = torch.randn(3, 3)

             def t(self, val):
                 return val + 1
@@ -1293,8 +1303,9 @@ def false_fn(self, val):
                 return self.linear(val) - self.f(val)

             def forward(self, pred, x):
+                out = torch.nn.functional.linear(x, self.w.to(torch.float16).to(torch.float32))
                 return torch.ops.higher_order.cond(
-                    pred, self.true_fn, self.false_fn, [x]
+                    pred, self.true_fn, self.false_fn, [out]
                 )

         mod = Module()
@@ -1304,14 +1315,41 @@ def forward(self, pred, x):
             export(mod, (pred, x), strict=True),
             compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
         )
-        error_msg = r"constant_prop_pass for control flow is not supported yet."
-
-        # TODO(chenlai): enable constant prop pass for control flow
-        with self.assertRaisesRegex(
-            RuntimeError,
-            error_msg,
-        ):
-            _ = constant_prop_pass(edge.exported_program())
+        expected_out = edge.exported_program().module()(pred, x)
+
+        warn_log = "constant_prop_pass does not constant propagate in control flow modules"
+        with self.assertLogs(level="WARNING") as log:
+            program = constant_prop_pass(edge.exported_program())
+            self.assertIn(warn_log, log.output[0])
+
+        out = program.module()(pred, x)
+        self.assertTrue(torch.allclose(expected_out, out))
+
+        # dtype casts in parent module are const propagated
+        FileCheck().check("executorch_exir_dialects_edge__ops_aten_mm_default(x, _prop_tensor_constant").run(program.graph_module.code)
+
+    def test_constant_prop_pass_quant_primitives(self) -> None:
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w_int = torch.ones(3, 3, dtype=torch.int8)
+                self.w_scale = 3.0
+                self.w_zero_point = 3
+
+            def forward(self, x):
+                w_dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default(
+                    self.w_int, self.w_scale, self.w_zero_point, -127, 128, torch.int8)
+                return torch.nn.functional.linear(x, w_dq)
+
+        mod = M()
+        x = torch.randn([3])
+        mod(x)
+        edge = to_edge(
+            export(mod, (x,), strict=True),
+            compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
+        )
+        constant_prop_pass(edge.exported_program())
+        FileCheck().check("executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default").run(edge.exported_program().graph_module.code)

     def test_mutable_buffers(self) -> None:
         def count_copies(gm: torch.fx.GraphModule) -> int:
