
Commit 8e48a75

Export recipes integration in export_llama
1 parent: 04bf288

File tree: 3 files changed, +76 −0 lines changed

examples/models/llama/export_llama_lib.py (31 additions, 0 deletions)

@@ -21,6 +21,8 @@
 from pathlib import Path
 from typing import Callable, List, Optional, Union
 
+import executorch
+
 import pkg_resources
 import torch
 
@@ -50,6 +52,7 @@
     get_qnn_quantizer,
     get_vulkan_quantizer,
 )
+from executorch.extension.llm.export.recipes import get_llm_recipe
 from executorch.util.activation_memory_profiler import generate_memory_trace
 
 from ..model_factory import EagerModelFactory
@@ -546,6 +549,13 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="If true, stops right after torch.export() and saves the exported model.",
     )
+
+    parser.add_argument(
+        "--recipe_flow",
+        default=False,
+        action="store_true",
+        help="Experimental feature, this will use the executorch.export + recipe based flow",
+    )
     return parser
 
 
@@ -610,6 +620,9 @@ def export_llama(args) -> str:
            "Please run `pip install snakeviz` to install required dependencies for cProfiler flamegraph."
        )
        return ""
+    elif args.recipe_flow:
+        filename = _recipe_based_export_llama(args)
+        return filename
    else:
        builder = _export_llama(args)
        assert (
@@ -1102,6 +1115,24 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
    return builder
 
 
+def _recipe_based_export_llama(args) -> str:  # noqa: C901
+    _validate_args(args)
+    assert args.xnnpack, "Recipe based flow only supports xnnpack backend currently."
+
+    builder = _prepare_for_llama_export(args)
+    session = executorch.export.export(
+        builder.model,
+        [builder.example_inputs],
+        get_llm_recipe(args),
+        dynamic_shapes=builder._get_dynamic_shape(),
+    )
+
+    session.print_delegation_info()
+    session.save_to_pte(builder.modelname)
+
+    return builder.modelname
+
+
 def _load_llama_model_metadata(
     weight_type: WeightType,
     use_kv_cache: bool,
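
With this change, the experimental path is opted into from the existing CLI parser. A minimal sketch of exercising it in Python, assuming the module is importable under the path shown in the diff and that the parser accepts this short argument list; only --xnnpack and --recipe_flow below come from this commit, everything else (checkpoint, params, quantization flags) would be supplied in a real run:

    from examples.models.llama.export_llama_lib import build_args_parser, export_llama

    # Hypothetical minimal argument list; real invocations also pass checkpoint,
    # params, dtype, and quantization options.
    args = build_args_parser().parse_args(["--xnnpack", "--recipe_flow"])

    # Because args.recipe_flow is set, export_llama() dispatches to
    # _recipe_based_export_llama() and returns the name of the saved .pte file.
    pte_name = export_llama(args)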

extension/llm/export/export_passes.py (9 additions, 0 deletions)

@@ -3,6 +3,7 @@
 import torch
 
 from executorch.exir.pass_base import ExportPass
+from executorch.exir.program._program import _update_exported_program_graph_module
 from torch._subclasses import FakeTensor
 from torch.fx.passes.infra.pass_base import PassResult
 
@@ -99,6 +100,14 @@ def call(self, graph_module: torch.fx.GraphModule):
        return PassResult(graph_module, graph_changed)
 
 
+def remove_redundant_transposes(
+    ep: torch.export.ExportedProgram,
+) -> torch.export.ExportedProgram:
+    res = RemoveRedundantTransposes()(ep.graph_module)
+    assert res is not None
+    return _update_exported_program_graph_module(ep, res.graph_module)
+
+
 class ReplaceSDPAWithCustomSDPAPass(ExportPass):
     """
     This pass replaces aten.scaled_dot_product_attention.default with llama.custom_sdpa.default.
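
The new remove_redundant_transposes wrapper lifts the existing graph-module pass to the ExportedProgram level so it can be listed in a recipe's pre_edge_transform_passes. A self-contained sketch of calling it directly; the toy module below is illustrative and not part of this commit, but the pair of cancelling transposes is the kind of pattern the pass is named for:

    import torch
    from executorch.extension.llm.export.export_passes import remove_redundant_transposes

    class Toy(torch.nn.Module):
        def forward(self, x):
            # Two back-to-back transposes of the same dims are a no-op.
            return x.transpose(1, 2).transpose(1, 2) + 1

    ep = torch.export.export(Toy(), (torch.randn(2, 3, 4),))
    ep = remove_redundant_transposes(ep)  # returns an updated ExportedProgram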

extension/llm/export/recipes.py (new file: 36 additions, 0 deletions)

@@ -0,0 +1,36 @@
+from executorch.export.recipe import ExportRecipe, QuantizationRecipe
+from executorch.exir import EdgeCompileConfig
+from executorch.extension.llm.export.quantizer_lib import get_quantizer_and_quant_params
+from executorch.extension.llm.export.export_passes import remove_redundant_transposes
+from executorch.extension.llm.export.partitioner_lib import (
+    get_coreml_partitioner,
+    get_mps_partitioner,
+    get_qnn_partitioner,
+    get_vulkan_partitioner,
+    get_xnnpack_partitioner,
+)
+
+def get_llm_recipe(args) -> ExportRecipe:
+    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
+
+    if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None:
+        # Force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False
+        args.xnnpack = True
+
+    quant_recipe = QuantizationRecipe(
+        quantizers=quantizers,
+    )
+
+    partitioners = []
+    if args.xnnpack:
+        partitioners.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=True))
+        if args.xnnpack_extended_ops:
+            partitioners.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=False))
+
+    return ExportRecipe(
+        quantization_recipe=quant_recipe,
+        edge_compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        pre_edge_transform_passes=[remove_redundant_transposes],
+        edge_transform_passes=[],
+        partitioners=partitioners,
+    )
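
get_llm_recipe bundles the quantizers, partitioners, edge compile config, and the pre-edge pass into a single ExportRecipe, which is what _recipe_based_export_llama hands to executorch.export.export. A hedged sketch of driving that call directly with the recipe; the tiny module, its inputs, and the output name are placeholders for the Llama model that _prepare_for_llama_export builds in the real flow, and it assumes the parser accepts this minimal argument list:

    import executorch
    import torch
    from examples.models.llama.export_llama_lib import build_args_parser
    from executorch.extension.llm.export.recipes import get_llm_recipe

    class Tiny(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)

    # Hypothetical minimal arguments; real runs carry the full Llama export config.
    args = build_args_parser().parse_args(["--xnnpack", "--recipe_flow"])

    session = executorch.export.export(
        Tiny(),
        [(torch.randn(1, 8),)],  # mirrors the [builder.example_inputs] list in the diff
        get_llm_recipe(args),
    )
    session.print_delegation_info()
    session.save_to_pte("tiny_model")  # placeholder output name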
