From f3ed763ebb46e4d2d57c3e1ef69e574b5c263b60 Mon Sep 17 00:00:00 2001
From: Matthias Cremon
Date: Fri, 2 May 2025 12:55:04 -0700
Subject: [PATCH] Make quantize_pt2 return an ExportedProgram instead of a
 GraphModule (#10644)

Summary:
This will help differentiate the fp32 models from the quantized models, and
prevent people from using the wrong APIs.

For fp32 cases, we have a `torch.nn.Module`, which we trace and then lower.
For quantized cases, we trace, quantize, and lower.

After this diff, `export_to_` will ONLY handle non-quantized cases, and
importantly, the sequence of `quantize_pt2` and then `export_to_` will not
work anymore. Those cases should use the (existing) `lower_ep_to_` instead.

Note that in subsequent diffs, both `quantize_pt2` and `lower_ep_to_` [...]

diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
-) -> torch.fx.GraphModule:
+) -> ExportedProgram:
     """
     Trace, prepare, convert and fuse the model using the given quantizer.
     If calibration data is provided, it will be used to calibrate the model. If
@@ -178,7 +178,9 @@ def quantize_pt2(
     logging.info("Graph after quantization and fusion:")
     logging.info(fused_gm.graph.print_tabular())
 
-    return fused_gm
+    program = torch.export.export(fused_gm, inputs, strict=True)
+
+    return program
 
 
 # Export the model and lower it to an ExportedProgram (in aten IR)
@@ -260,6 +262,9 @@ def quantize_and_export_to_edge(
     dump_graphs: bool = False,
     constant_methods: Optional[dict[str, object]] = None,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize and lower a model/inputs pair to edge IR.
+    """
     quantized_model = quantize_pt2(
         model,
         inputs,
@@ -267,14 +272,33 @@ def quantize_and_export_to_edge(
         dump_graphs=dump_graphs,
     )
 
-    return export_to_edge(
+    return lower_ep_to_edge(
         quantized_model,
-        inputs,
         dump_graphs=dump_graphs,
         constant_methods=constant_methods,
     )
 
 
+def lower_ep_to_cadence(
+    program: ExportedProgram,
+    dump_graphs: bool = False,
+    opt_level: int = 1,
+) -> EdgeProgramManager:
+    """
+    Lower an existing ExportedProgram to edge IR and apply frontend optimization
+    passes.
+    """
+    edge_prog_manager = lower_ep_to_edge(program, dump_graphs=dump_graphs)
+    cadence_passes = get_cadence_passes(opt_level)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )
+    return cadence_prog_manager
+
+
 def export_to_cadence(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
@@ -299,11 +323,14 @@ def quantize_and_export_to_cadence(
     dump_graphs: bool = False,
     opt_level: int = 1,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize, lower a model/inputs pair to edge IR and apply frontend
+    optimization passes.
+ """ quantized_model = quantize_pt2(model, inputs) - return export_to_cadence( + return lower_ep_to_cadence( quantized_model, - inputs, opt_level=opt_level, dump_graphs=dump_graphs, ) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index b8ebe21832c..85077db93ca 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -16,7 +16,6 @@ from executorch.backends.cadence.aot.compiler import ( export_to_edge, quantize_and_export_to_edge, - quantize_pt2, ) from executorch.backends.cadence.aot.graph_builder import ( GraphBuilder, @@ -113,9 +112,8 @@ def forward(self, x, y): Y = torch.randn(y_shape) p = ReplaceMatmulWithTransposedMatmulPass() inputs = (X, Y) - quantized_model = quantize_pt2(model, inputs) graph_module = ( - export_to_edge(quantized_model, inputs).exported_program().graph_module + quantize_and_export_to_edge(model, inputs).exported_program().graph_module ) # pyre-fixme[16]: Optional type has no attribute `graph_module` graph_after_passes = p(graph_module).graph_module