24 changes: 21 additions & 3 deletions examples/arm/run.sh
@@ -42,6 +42,7 @@ toolchain=arm-none-eabi-gcc
select_ops_list="aten::_softmax.out"
qdq_fusion_op=false
model_explorer=false
perf_overlay=false

function help() {
echo "Usage: $(basename $0) [options]"
@@ -72,7 +73,8 @@ function help() {
echo " --et_build_root=<FOLDER> Executorch build output root folder to use, defaults to ${et_build_root}"
echo " --scratch-dir=<FOLDER> Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}"
echo " --qdq_fusion_op Enable QDQ fusion op"
echo " --model_explorer Generate and open a visual graph of the compiled model."
echo " --model_explorer Enable model explorer to visualize TOSA graph."
echo " --perf_overlay With --model_explorer, include performance data from FVP PMU trace."
exit 0
}

@@ -102,11 +104,17 @@ for arg in "$@"; do
--scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;;
--qdq_fusion_op) qdq_fusion_op=true;;
--model_explorer) model_explorer=true ;;
--perf_overlay) perf_overlay=true ;;
*)
;;
esac
done

if [ "$perf_overlay" = true ] && [ "$model_explorer" != true ]; then
echo "Error: --perf_overlay requires --model_explorer" >&2
exit 1
fi

if ! [[ ${pte_placement} == "elf" ]]; then
if ! [[ "$pte_placement" =~ ^0x[0-9a-fA-F]{1,16}$ ]]; then
echo "ERROR: Placing the PTE in memory failed, address is larger then 64bit $pte_placement"
@@ -204,6 +212,7 @@ bundleio_flag=""
etrecord_flag=""
et_dump_flag=""
qdq_fusion_op_flag=""
fvp_pmu_flag=""
if [ "$build_with_etdump" = true ] ; then
et_dump_flag="--etdump"
etrecord_flag="--etrecord"
@@ -273,6 +282,11 @@ for i in "${!test_model[@]}"; do
output_folder=${et_build_root}/${model_short_name}
fi

if [ "$perf_overlay" = true ] ; then
model_compiler_flags+=" --enable_debug_mode tosa"
fvp_pmu_flag="--trace_file=${output_folder}/pmu_trace.gz"
fi

mkdir -p ${output_folder}
output_folder=$(realpath ${output_folder})
pte_file="${output_folder}/${model_filename_ext}"
@@ -330,14 +344,18 @@ for i in "${!test_model[@]}"; do
if [ "$build_only" = false ] ; then
# Execute the executor_runner on FVP Simulator

backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag}
backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag} ${fvp_pmu_flag}
fi
set +x
fi

if [ "$model_explorer" = true ]; then
tosa_flatbuffer_path=$(find ${output_folder} -name "*TOSA*.tosa" | head -n 1)
python3 ${script_dir}/visualize.py ${tosa_flatbuffer_path}
perf_flags=""
if [ "$perf_overlay" = true ]; then
perf_flags+="--trace ${output_folder}/pmu_trace.gz --tables ${output_folder}/output/out_debug.xml"
fi
python3 ${script_dir}/visualize.py --model_path ${tosa_flatbuffer_path} ${perf_flags}
fi
done
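
As a usage sketch, the two new options compose like this (any model/target selection flags accepted elsewhere by run.sh are outside this diff and omitted here):

./examples/arm/run.sh --model_explorer --perf_overlay

With --perf_overlay, the script compiles with --enable_debug_mode tosa, passes --trace_file=<output_folder>/pmu_trace.gz to run_fvp.sh, and then forwards that trace together with <output_folder>/output/out_debug.xml to visualize.py.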

277 changes: 269 additions & 8 deletions examples/arm/visualize.py
@@ -4,28 +4,289 @@
# LICENSE file in the root directory of this source tree.

import argparse
import gzip
import io
import json
import xml.etree.ElementTree as ET
from pathlib import Path

import model_explorer
from typing import Any, Callable, Dict, Iterable, NamedTuple, Union

import pandas as pd

from executorch.devtools.visualization.visualization_utils import (
visualize_model_explorer,
)
from model_explorer import config as model_explorer_config, node_data_builder as ndb

COMPILER_OP_ID = "scheduled_id"


class Tables(NamedTuple):
queue: pd.DataFrame
group: pd.DataFrame
perf: pd.DataFrame
source: pd.DataFrame


def parse_tables(tables_path: Path) -> Tables:
"""
Parse the XML debug tables file and extract required tables as pandas DataFrames.
"""
required_tables = {"queue", "group", "perf", "source"}
try:
tree = ET.parse(tables_path)
except ET.ParseError as e:
raise ValueError(f"Failed to parse XML tables file {tables_path}: {e}")

tables: Dict[str, pd.DataFrame] = {}
for table in tree.getroot().findall("table"):
name = table.attrib.get("name")
if name in required_tables:
text = table.text or ""
tables[name] = pd.read_csv(io.StringIO(text))

missing = required_tables - tables.keys()
if missing:
raise ValueError(f"Missing required tables in XML: {missing}")

return Tables(**tables)
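
For reference, parse_tables expects each required table as CSV text inside a <table name="..."> child of the root element; a minimal self-contained sketch (the root tag and the CSV columns are illustrative assumptions):

import io
import xml.etree.ElementTree as ET

import pandas as pd

# Hypothetical debug-tables document; parse_tables only relies on the
# <table name="..."> elements and their CSV payloads.
xml_text = """<debug>
<table name="queue">scheduled_id,offset
0,0
1,4</table>
</debug>"""

root = ET.fromstring(xml_text)
frames = {
    t.attrib["name"]: pd.read_csv(io.StringIO(t.text or ""))
    for t in root.findall("table")
}
print(frames["queue"].columns.tolist())  # ['scheduled_id', 'offset']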


def get_trace_file_objects(trace_file_path: Path) -> list[Dict[str, Any]]:
"""
Load and return the 'traceEvents' list from a gzip-compressed JSON trace file.
"""
try:
with gzip.open(trace_file_path, "rt", encoding="utf-8") as file:
data = json.load(file)
except (OSError, json.JSONDecodeError) as e:
raise ValueError(f"Failed to read or parse trace file {trace_file_path}: {e}")

if "traceEvents" not in data:
raise KeyError(f"'traceEvents' key not found in {trace_file_path}")

return data["traceEvents"]


def get_subops(df_group: pd.DataFrame) -> set:
return set(df_group[df_group["id"] != df_group["group_id"]]["id"])
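get_subops treats every row whose id differs from its group_id as a fused sub-operation; with illustrative ids:

import pandas as pd

group = pd.DataFrame({"id": [0, 1, 2], "group_id": [0, 0, 2]})
# The row with id=1 belongs to group 0 rather than itself, so it is a sub-op.
print(set(group[group["id"] != group["group_id"]]["id"]))  # {1}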


def transform_events(
objects: Iterable[Dict[str, Any]], queue_df: pd.DataFrame, sub_ops: set
) -> None:
"""
Annotate the 'queue' table in-place with duration based on trace events.
"""
queue_df_len = len(queue_df)
offsets = queue_df["offset"].astype(int)

start_ts, cmd_index, chain_len = 0, 0, 1

def is_end_of_command(qread_offset: int, end_idx: int) -> bool:
if end_idx >= queue_df_len:
return qread_offset > offsets[cmd_index]
return qread_offset == offsets[end_idx]

for event in (e for e in objects if e.get("tid") == "qread"):
if cmd_index >= queue_df_len:
break

qread_offset = 4 * int(event["args"]["qread"])

end_idx = cmd_index + chain_len
if is_end_of_command(qread_offset, end_idx):
end_ts = int(event["ts"]) - 1
queue_df.loc[cmd_index, "duration"] = end_ts - start_ts
start_ts = end_ts
cmd_index += chain_len
chain_len = 1
while (cmd_index + chain_len <= queue_df_len - 1) and queue_df.iloc[
cmd_index + chain_len
]["scheduled_id"] in sub_ops:
chain_len += 1
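
A worked micro-example of that bookkeeping, assuming this module is importable as visualize (each command's duration ends one tick before the event that moves qread past its offset):

import pandas as pd

from visualize import transform_events  # assumes examples/arm is on sys.path

queue = pd.DataFrame({"scheduled_id": [0, 1], "offset": [0, 4]})
events = [
    {"tid": "qread", "ts": 100, "args": {"qread": 1}},  # reaches offset 4: closes command 0
    {"tid": "qread", "ts": 250, "args": {"qread": 2}},  # runs past offset 4: closes command 1
]
transform_events(events, queue, set())
print(queue["duration"].tolist())  # [99.0, 150.0]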


Agg = Union[str, Callable[[pd.Series], Any]]


def list_unique(s: pd.Series) -> list[Any]:
return sorted(set(s.dropna()))


def build_perf_df(tables: Tables) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Build a performance DataFrame summarizing queue metrics grouped by source_id.
Returns a tuple of (perf_df, cmd_to_op_df) where cmd_to_op_df is needed for unmapped op tracking.
"""
tables.queue["cmd_id"] = tables.queue.index

excluded = {"optimised_id", "scheduled_id", "offset"}
col_funcs: Dict[str, Agg] = {
c: "sum" for c in tables.queue.columns if c not in excluded
}

col_funcs.update({"cmdstream_id": list_unique, "cmd_id": list_unique})

cmd_to_op_df = tables.queue.groupby(COMPILER_OP_ID).agg(col_funcs).reset_index()

opt_df = (
pd.merge(tables.perf[["id", "source_id"]], tables.group, on="id", how="left")
.rename(columns={"id": COMPILER_OP_ID})
.merge(cmd_to_op_df, on=COMPILER_OP_ID, how="inner")
)

exclude_columns = ["source_id"]
src_col_funcs: Dict[str, Agg] = {
col: "sum" for col in opt_df.columns if col not in exclude_columns
}
src_col_funcs[COMPILER_OP_ID] = list_unique

perf_df = opt_df.groupby("source_id").agg(src_col_funcs).reset_index()

return perf_df, cmd_to_op_df
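
The aggregation mixes built-in reducers with the list_unique callable; a standalone illustration of the pattern (column names are illustrative, and list_unique is re-declared so the snippet runs on its own):

import pandas as pd

def list_unique(s: pd.Series) -> list:
    return sorted(set(s.dropna()))

df = pd.DataFrame(
    {"scheduled_id": [0, 0, 1], "npu_cycles": [10, 5, 7], "cmd_id": [0, 1, 2]}
)
out = df.groupby("scheduled_id").agg({"npu_cycles": "sum", "cmd_id": list_unique})
print(out.loc[0, "npu_cycles"], out.loc[0, "cmd_id"])  # 15 [0, 1]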


def check_unmapped_ops(
tables: Tables, src_df: pd.DataFrame, cmd_to_op_df: pd.DataFrame
) -> None:
"""
Identify operators in the performance data that are not mapped to any source operation.
"""
opt_ids_in_src_table = set()
for opt_ids in src_df[COMPILER_OP_ID].dropna():
if isinstance(opt_ids, list):
opt_ids_in_src_table.update(opt_ids)

opt_df = pd.merge(
tables.perf[["id", "source_id"]], tables.group, on="id", how="left"
)
opt_df = opt_df.rename(columns={"id": COMPILER_OP_ID})
opt_df = pd.merge(opt_df, cmd_to_op_df, on=COMPILER_OP_ID, how="inner")

unmapped_operators = opt_df[
~opt_df[COMPILER_OP_ID].isin(list(opt_ids_in_src_table))
]

if not unmapped_operators.empty:
print("Warning: There are unmapped operators in the performance data.")
print(unmapped_operators)
return None


def build_src_df(tables: Tables, perf_df: pd.DataFrame) -> pd.DataFrame:
"""
Merge source table with performance metrics and total NPU cycles.
Returns the merged source-level DataFrame.
"""
return pd.merge(
tables.source.rename(columns={"id": "source_id"})[["ext_key", "source_id"]],
perf_df,
on="source_id",
how="left",
).merge(
tables.perf[["source_id", "npu_cycles"]]
.groupby("source_id")
.sum(numeric_only=True)
.reset_index(),
on="source_id",
how="left",
)


def get_model_node_data(df: pd.DataFrame) -> ndb.ModelNodeData:
"""
Convert source-level metrics into ModelExplorer node data for duration.
"""
durations = df["duration"].fillna(0).astype(int)

duration_results: Dict[str, ndb.NodeDataResult] = {}

for src, dur in zip(df["ext_key"], durations):
node_id = f"main/op{int(src)}"
duration_results[node_id] = ndb.NodeDataResult(value=int(dur))

gradient = [
ndb.GradientItem(stop=0.0, bgColor="#ffffff"),
ndb.GradientItem(stop=0.1, bgColor="#33FF00"),
ndb.GradientItem(stop=0.2, bgColor="#66FF00"),
ndb.GradientItem(stop=0.5, bgColor="#FFFF00"),
ndb.GradientItem(stop=0.7, bgColor="#FF6600"),
ndb.GradientItem(stop=1.0, bgColor="#FF0000"),
]

return ndb.ModelNodeData(
graphsData={
"main": ndb.GraphNodeData(results=duration_results, gradient=gradient)
}
)


def build_overlay_data(trace_path: Path, tables_path: Path) -> ndb.ModelNodeData:
"""
Build ModelExplorer node data from trace and tables files.
"""
tables = parse_tables(tables_path)
events = get_trace_file_objects(trace_path)
transform_events(events, tables.queue, get_subops(tables.group))
perf_df, cmd_to_op_df = build_perf_df(tables)
src_df = build_src_df(tables, perf_df)
check_unmapped_ops(tables, src_df, cmd_to_op_df)

return get_model_node_data(src_df)


def validate_file_exists(file_path: Path) -> None:
if not file_path.exists():
raise FileNotFoundError(f"{file_path} not found")


def validate_perf_mode_args(trace: str, tables: str) -> None:
if not (trace and tables):
raise ValueError(
"Both --trace and --tables must be provided for perf mode, or neither for default mode"
)


def main() -> None:
parser = argparse.ArgumentParser(
description="Visualize a model using model explorer."
)
parser.add_argument("model_path", type=str, help="Path to the model file.")
parser.add_argument(
"--model_path", required=True, type=str, help="Path to the model file"
)
parser.add_argument(
"--trace",
required=False,
help="(perf mode) PMU trace JSON.gz file with performance data",
)
parser.add_argument(
"--tables",
required=False,
help="(perf mode) Vela debug database tables XML file",
)

args = parser.parse_args()
model_file = Path(args.model_path).resolve()
validate_file_exists(model_file)

config = model_explorer.config()
(config.add_model_from_path(args.model_path))
config = model_explorer_config().add_model_from_path(str(model_file))

visualize_model_explorer(
config=config,
extensions=["tosa_adapter_model_explorer"],
)
if args.trace or args.tables:
validate_perf_mode_args(args.trace, args.tables)
trace_file = Path(args.trace).resolve()
tables_file = Path(args.tables).resolve()
validate_file_exists(trace_file)
validate_file_exists(tables_file)

config.add_node_data(
"Duration (Cycles)", build_overlay_data(trace_file, tables_file)
)

visualize_model_explorer(config=config, extensions=["tosa_adapter_model_explorer"])
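
Matching the wiring in run.sh above, a perf-mode invocation looks like this (the .tosa file name is illustrative; the trace and tables paths are the ones run.sh produces):

python3 examples/arm/visualize.py --model_path <output_folder>/model_TOSA.tosa \
    --trace <output_folder>/pmu_trace.gz --tables <output_folder>/output/out_debug.xml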


if __name__ == "__main__":
    main()