From f3ed763ebb46e4d2d57c3e1ef69e574b5c263b60 Mon Sep 17 00:00:00 2001
From: Matthias Cremon
Date: Fri, 2 May 2025 12:55:04 -0700
Subject: [PATCH] Make quantize_pt2 return an ExportedProgram instead of a
 GraphModule (#10644)

Summary:
This will help differentiate the fp32 models from the quantized models, and
prevent people from using the wrong APIs.

For fp32 cases, we have a `torch.nn.Module`, which we trace and then lower.
For quantized cases, we trace, quantize, and lower.

After this diff, `export_to_` will ONLY handle non-quantized cases, and
importantly, the sequence of `quantize_pt2` and then `export_to_` will not
work anymore. Those cases should use the (existing) `lower_ep_to_` instead.

Note that in subsequent diffs, both `quantize_pt2` and `lower_ep_to_` [...]

diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
-) -> torch.fx.GraphModule:
+) -> ExportedProgram:
     """
     Trace, prepare, convert and fuse the model using the given quantizer.
     If calibration data is provided, it will be used to calibrate the model. If
@@ -178,7 +178,9 @@ def quantize_pt2(
     logging.info("Graph after quantization and fusion:")
     logging.info(fused_gm.graph.print_tabular())
 
-    return fused_gm
+    program = torch.export.export(fused_gm, inputs, strict=True)
+
+    return program
 
 
 # Export the model and lower it to an ExportedProgram (in aten IR)
@@ -260,6 +262,9 @@ def quantize_and_export_to_edge(
     dump_graphs: bool = False,
     constant_methods: Optional[dict[str, object]] = None,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize and lower a model/inputs pair to edge IR.
+    """
     quantized_model = quantize_pt2(
         model,
         inputs,
@@ -267,14 +272,33 @@ def quantize_and_export_to_edge(
         dump_graphs=dump_graphs,
     )
 
-    return export_to_edge(
+    return lower_ep_to_edge(
         quantized_model,
-        inputs,
         dump_graphs=dump_graphs,
         constant_methods=constant_methods,
     )
 
 
+def lower_ep_to_cadence(
+    program: ExportedProgram,
+    dump_graphs: bool = False,
+    opt_level: int = 1,
+) -> EdgeProgramManager:
+    """
+    Lower an existing ExportedProgram to edge IR and apply frontend optimization
+    passes.
+    """
+    edge_prog_manager = lower_ep_to_edge(program, dump_graphs=dump_graphs)
+    cadence_passes = get_cadence_passes(opt_level)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )
+    return cadence_prog_manager
+
+
 def export_to_cadence(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
@@ -299,11 +323,14 @@ def quantize_and_export_to_cadence(
     dump_graphs: bool = False,
     opt_level: int = 1,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize, lower a model/inputs pair to edge IR and apply frontend
+    optimization passes.
+ """ quantized_model = quantize_pt2(model, inputs) - return export_to_cadence( + return lower_ep_to_cadence( quantized_model, - inputs, opt_level=opt_level, dump_graphs=dump_graphs, ) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index b8ebe21832c..85077db93ca 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -16,7 +16,6 @@ from executorch.backends.cadence.aot.compiler import ( export_to_edge, quantize_and_export_to_edge, - quantize_pt2, ) from executorch.backends.cadence.aot.graph_builder import ( GraphBuilder, @@ -113,9 +112,8 @@ def forward(self, x, y): Y = torch.randn(y_shape) p = ReplaceMatmulWithTransposedMatmulPass() inputs = (X, Y) - quantized_model = quantize_pt2(model, inputs) graph_module = ( - export_to_edge(quantized_model, inputs).exported_program().graph_module + quantize_and_export_to_edge(model, inputs).exported_program().graph_module ) # pyre-fixme[16]: Optional type has no attribute `graph_module` graph_after_passes = p(graph_module).graph_module