Skip to content

Commit 56502a7

Browse files
limintang authored and facebook-github-bot committed
Add a parameter to output delegate summary in llama export (#8174)
Summary: Print delegation summary when the verbose parameter is set. Differential Revision: D68991594
1 parent 0a936e0 commit 56502a7

File tree

3 files changed

+24
-1
lines changed

3 files changed

+24
-1
lines changed

devtools/backend_debug/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from executorch.devtools.backend_debug.delegation_info import (
88
DelegationBreakdown,
99
get_delegation_info,
10+
print_delegation_info,
1011
)
1112

12-
__all__ = ["DelegationBreakdown", "get_delegation_info"]
13+
__all__ = ["DelegationBreakdown", "get_delegation_info", "print_delegation_info"]

devtools/backend_debug/delegation_info.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import re
88
from collections import defaultdict
99
from dataclasses import asdict, dataclass
10+
from tabulate import tabulate
1011
from typing import Dict
1112

1213
import pandas as pd
@@ -174,3 +175,10 @@ def _insert_op_occurrences_dict(node_name: str, delegated: bool) -> None:
174175
num_delegated_subgraphs=delegated_subgraph_counter,
175176
delegation_by_operator=op_occurrences_dict,
176177
)
178+
179+
180+
def print_delegation_info(graph_module: torch.fx.GraphModule):
181+
delegation_info = get_delegation_info(graph_module)
182+
print(delegation_info.get_summary())
183+
df = delegation_info.get_operator_delegation_dataframe()
184+
print(tabulate(df, headers="keys", tablefmt="fancy_grid"))

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
from executorch.examples.models.llama.source_transformation.quantize import (
5555
get_quant_embedding_transform,
5656
)
57+
58+
from executorch.devtools.backend_debug import print_delegation_info
59+
5760
from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
5861
from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import (
5962
LlamaModel,
@@ -389,6 +392,7 @@ def lowering_modules(
389392
num_sharding=1,
390393
passes_job=OrderedDict(),
391394
shared_buffer=False,
395+
verbose=False,
392396
):
393397
executorch_config = ExecutorchBackendConfig(
394398
# For shared buffer, user must pass the memory address
@@ -440,6 +444,10 @@ def lowering_modules(
440444
edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
441445
if num_sharding > 1:
442446
update_spill_fill_size(edge_prog_mgr.exported_program())
447+
448+
if verbose:
449+
print_delegation_info(edge_prog_mgr.exported_program().graph_module)
450+
443451
exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
444452
with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file:
445453
exec_prog_mgr.write_to_file(file)
@@ -667,6 +675,10 @@ def compile(args, pte_filename, tokenizer):
667675
)
668676
compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options)
669677

678+
if args.verbose:
679+
for exported_program in exported_programs:
680+
print_delegation_info(exported_program.graph_module)
681+
670682
executorch_config = ExecutorchBackendConfig(
671683
# For shared buffer, user must pass the memory address
672684
# which is allocated by RPC memory to executor runner.
@@ -980,6 +992,8 @@ def _build_parser():
980992
help="Fallback to cpu embedding operator and type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '4,32'.",
981993
)
982994

995+
parser.add_argument("-v", "--verbose", action="store_true")
996+
983997
return parser
984998

985999

0 commit comments

Comments (0)