Commit 4d30776

Commit message: final

Signed-off-by: Yang Wang <[email protected]>
1 parent 908ee26 commit 4d30776

2 files changed: +178 -54 lines changed
Lines changed: 91 additions & 39 deletions

@@ -1,74 +1,126 @@
 # Benchmark Tooling
 
-A library providing tools for benchmarking ExecutorchBenchmark data.
+A library providing tools for fetching, processing, and analyzing ExecutorchBenchmark data from the HUD Open API.
 
-## Read Benchmark Data
-`get_benchmark_analysis_data.py` fetches benchmark data from HUD Open API, clean the data that only contains FAILURE_REPORT column,
-and get all private device metrics and associated public device metrics if any based on [model,backend,device,ios]
-
-### Quick Start
+## Installation
 
 Install dependencies:
 ```bash
 pip install -r requirements.txt
 ```
 
-Run with csv output (CLI):
+## Tools
+
+### get_benchmark_analysis_data.py
+
+This script fetches benchmark data from the HUD Open API, drops records that contain only FAILURE_REPORT entries, and retrieves all private device metrics along with any associated public device metrics, matched on [model, backend, device, arch].
+
+#### Quick Start
+
 ```bash
-python3 .ci/scripts/benchmark_tooling/get_benchmark_analysis_data.py --startTime "2025-06-11T00:00:00" --endTime "2025-06-17T18:00:00" --outputType "csv"
+python3 .ci/scripts/benchmark_tooling/get_benchmark_analysis_data.py \
+  --startTime "2025-06-11T00:00:00" \
+  --endTime "2025-06-17T18:00:00" \
+  --outputType "csv"
 ```
 
-Additional options:
-- `--not-silent`: show processing logs, otherwise only show results & minimum loggings
-- `--outputType df`: Display results in DataFrame format
-- `--outputType excel --outputDir "{YOUR_LOCAL_DIRECTORY}"`: Generate Excel file with multiple sheets (`res_private.xlsx` and `res_public.xlsx`)
-- `--outputType csv --outputDir "{YOUR_LOCAL_DIRECTORY}"`: Generate CSV files in folders (`private` and `public`)
+#### Command Line Options
+
+##### Basic Options:
+- `--startTime`: Start time in ISO format (e.g., "2025-06-11T00:00:00") (required)
+- `--endTime`: End time in ISO format (e.g., "2025-06-17T18:00:00") (required)
+- `--env`: Choose environment ("local" or "prod", default: "prod")
+- `--not-silent`: Show processing logs (default: show only results and minimal logging)
+
+##### Output Options:
+- `--outputType`: Choose output format (default: "print")
+  - `print`: Display results in the console
+  - `json`: Generate a JSON file
+  - `df`: Display results in DataFrame format
+  - `excel`: Generate Excel files with multiple sheets
+  - `csv`: Generate CSV files in separate folders
+- `--outputDir`: Directory to save output files (default: current directory)
+
+##### Filtering Options:
+- `--devices`: Filter by specific device names (e.g., "samsung-galaxy-s22-5g", "samsung-galaxy-s22plus-5g")
+- `--backends`: Filter by specific backend names
+- `--models`: Filter by specific model names
 
-you can then call methods in common.py to convert the file date back to df version
-```python3
+#### Working with Output Files
+
+You can use methods in `common.py` to convert the file data back to DataFrame format:
+
+```python
 import logging
 logging.basicConfig(level=logging.INFO)
-from common.py import
+from common import read_all_csv_with_metadata, read_excel_with_json_header  # run from .ci/scripts/benchmark_tooling
 
-# assume the folder private for csv is in cunrrent directory
+# For CSV files (assuming the 'private' folder is in the current directory)
 folder_path = './private'
 res = read_all_csv_with_metadata(folder_path)
 logging.info(res)
 
-# assume the excel file for private device is in cunrrent directory
-folder_path = "./private.xlsx"
-res = read_excel_with_json_header(folder_path)
+# For Excel files (assuming the Excel file is in the current directory)
+file_path = "./private.xlsx"
+res = read_excel_with_json_header(file_path)
 logging.info(res)
 ```
 
-### Python API Usage
+#### Python API Usage
 
 To use the benchmark fetcher in your own scripts:
 
 ```python
-import ExecutorchBenchmarkFetcher from benchmark_tooling.get_benchmark_analysis_data
-fetcher = ExecutorchBenchmarkFetcher()
-# Must call run first
-fetcher.run()
-res = fetcher.
-```
+from get_benchmark_analysis_data import BenchmarkFilters, ExecutorchBenchmarkFetcher  # with .ci/scripts/benchmark_tooling on sys.path
 
-## analyze_benchmark_stability.py
-`analyze_benchmark_stability.py` analyzes the stability of benchmark data, comparing the results of private and public devices.
+# Initialize the fetcher
+fetcher = ExecutorchBenchmarkFetcher(env="prod", disable_logging=False)
 
-### Quick Start
-Install dependencies:
-```bash
-pip install -r requirements.txt
-```
+# Fetch data for a specific time range (filters is required; None values mean no filtering)
+fetcher.run(
+    start_time="2025-06-11T00:00:00",
+    end_time="2025-06-17T18:00:00",
+    filters=BenchmarkFilters(models=None, backends=None, devices=None),
+)
 
+# Get results in different formats
+# As DataFrames
+df_results = fetcher.to_df()
+
+# Export to Excel
+fetcher.to_excel(output_dir="./results")
+
+# Export to CSV
+fetcher.to_csv(output_dir="./results")
+
+# Export to JSON
+json_path = fetcher.to_json(output_dir="./results")
+
+# Get raw dictionary results
+dict_results = fetcher.to_dict()
+
+# Use the output_data method for flexible output
+results = fetcher.output_data(output_type="excel", output_dir="./results")
 ```
+
+### analyze_benchmark_stability.py
+
+This script analyzes the stability of benchmark data, comparing results from private and public devices.
+
+#### Quick Start
+
+```bash
 python .ci/scripts/benchmark_tooling/analyze_benchmark_stability.py \
-    Benchmark\ Dataset\ with\ Private\ AWS\ Devices.xlsx \
-    --reference_file Benchmark\ Dataset\ with\ Public\ AWS\ Devices.xlsx
+    "Benchmark Dataset with Private AWS Devices.xlsx" \
+    --reference_file "Benchmark Dataset with Public AWS Devices.xlsx"
 ```
-## Run unittest
-```
-cd execuTorch/
+
+## Running Unit Tests
+
+The benchmark tooling includes unit tests that cover the main functionality.
+
+### Using pytest
+
+```bash
+# From the executorch root directory
 pytest -c /dev/null .ci/scripts/tests/test_get_benchmark_analysis_data.py
 ```
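Taken together, the README and script changes enable a filtered fetch from Python. Below is a minimal end-to-end sketch; it assumes the code runs from the executorch root with `.ci/scripts/benchmark_tooling` added to `sys.path`, and the model and device values are illustrative placeholders, not guaranteed to exist in the data.

```python
# A sketch combining the new BenchmarkFilters with ExecutorchBenchmarkFetcher.run().
import sys

sys.path.append(".ci/scripts/benchmark_tooling")

from get_benchmark_analysis_data import BenchmarkFilters, ExecutorchBenchmarkFetcher

fetcher = ExecutorchBenchmarkFetcher(env="prod", disable_logging=True)
fetcher.run(
    start_time="2025-06-11T00:00:00",
    end_time="2025-06-17T18:00:00",
    # Keep only rows whose model is "mv3" AND whose device name contains
    # "samsung-galaxy-s22-5g"; within each list, any value matches (OR).
    filters=BenchmarkFilters(
        models=["mv3"],
        backends=None,
        devices=["samsung-galaxy-s22-5g"],
    ),
)
fetcher.to_csv(output_dir="./results")
```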

.ci/scripts/benchmark_tooling/get_benchmark_analysis_data.py

Lines changed: 87 additions & 15 deletions
@@ -7,7 +7,6 @@
 and customizing data retrieval parameters.
 """
 
-from yaspin import yaspin
 import argparse
 import json
 import logging
@@ -21,6 +20,7 @@
 
 import pandas as pd
 import requests
+from yaspin import yaspin
 
 logging.basicConfig(level=logging.INFO)
 
@@ -80,6 +80,13 @@ class MatchingGroupResult:
     data: list
 
 
+@dataclass
+class BenchmarkFilters:
+    models: list
+    backends: list
+    devices: list
+
+
 BASE_URLS = {
     "local": "http://localhost:3000",
     "prod": "https://hud.pytorch.org",
@@ -156,19 +163,21 @@ def run(
         self,
         start_time: str,
         end_time: str,
+        filters: BenchmarkFilters,
     ) -> None:
+        # reset group & raw data for new run
+        self.matching_groups = {}
+        self.data = None
+
         data = self._fetch_execu_torch_data(start_time, end_time)
         if data is None:
             logging.warning("no data fetched from the HUD API")
             return None
-
-        res = self._process(data)
+        res = self._process(data, filters)
         self.data = res.get("data", [])
         private_list = res.get("private", [])
         public_list = self._filter_public_result(private_list, res["public"])
 
-        # reset group
-        self.matching_groups = {}
         self.matching_groups["private"] = MatchingGroupResult(
             category="private", data=private_list
         )
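Moving the reset of `matching_groups` and `self.data` to the top of `run()` means a single fetcher instance can be reused across time windows without results from an earlier run leaking into the next. A sketch of that usage (window boundaries and output paths are arbitrary examples):

```python
# Sketch: reusing one fetcher across two windows; run() resets state each time.
fetcher = ExecutorchBenchmarkFetcher(env="prod", disable_logging=True)
no_filters = BenchmarkFilters(models=None, backends=None, devices=None)

for start, end in [
    ("2025-06-11T00:00:00", "2025-06-14T00:00:00"),
    ("2025-06-14T00:00:00", "2025-06-17T18:00:00"),
]:
    fetcher.run(start, end, filters=no_filters)
    # Each window's results go to their own folder (path is illustrative).
    fetcher.to_json(output_dir=f"./results/{start[:10]}")
```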
@@ -456,13 +465,18 @@ def print_all_groups_info(self) -> None:
         if not self.data or not self.matching_groups:
             logging.info("No data found, please call get_data() first")
             return
-        logging.info(f" all clean benchmark table info from HUD")
+        logging.info(
+            "=========== Full list of table info from HUD API =============\n"
+            " please use values in field `info` for filtering, "
+            "while `groupInfo` holds the original benchmark metadata"
+        )
         names = []
         for item in self.data:
             names.append(
                 {
                     "table_name": item.get("table_name", ""),
-                    "groupInfo": item.get("groupInfo", ""),
+                    "groupInfo": item.get("groupInfo", {}),
+                    "info": item.get("info", {}),
                     "counts": len(item.get("rows", [])),
                 }
             )
@@ -492,7 +506,7 @@ def _generate_matching_name(self, group_info: dict, fields: list[str]) -> str:
         # name = name +'(private)'
         return name
 
-    def _process(self, input_data: List[Dict[str, Any]]):
+    def _process(self, input_data: List[Dict[str, Any]], filters: BenchmarkFilters):
         """
         Process raw benchmark data.
 
@@ -509,9 +523,9 @@ def _process(self, input_data: List[Dict[str, Any]]):
         # filter out rows whose arch is exactly "", "ios", or "android"; these normally indicate a job-level failure
         logging.info(f"fetched {len(input_data)} data from HUD")
         data = self._clean_data(input_data)
-
         private = []
         public = []
+
         for item in data:
             # normalize string values from groupInfo into info
             item["info"] = {
@@ -528,17 +542,30 @@ def _process(self, input_data: List[Dict[str, Any]]):
             # Mark aws_type: private or public
             if group.get("device", "").find("private") != -1:
                 item["info"]["aws_type"] = "private"
-                private.append(item)
             else:
                 item["info"]["aws_type"] = "public"
                 public.append(item)
-        data.sort(key=lambda x: x["table_name"])
-        private.sort(key=lambda x: x["table_name"])
-        public.sort(key=lambda x: x["table_name"])
+        raw_data = deepcopy(data)
+
+        # apply customized filters, if any
+        data = self.filter_results(data, filters)
+        # generate private and public results
+        private = sorted(
+            (
+                item
+                for item in data
+                if item.get("info", {}).get("aws_type") == "private"
+            ),
+            key=lambda x: x["table_name"],
+        )
+        public = sorted(
+            (item for item in data if item.get("info", {}).get("aws_type") == "public"),
+            key=lambda x: x["table_name"],
+        )
         logging.info(
             f"fetched clean data {len(data)}, private:{len(private)}, public:{len(public)}"
         )
-        return {"data": data, "private": private, "public": public}
+        return {"data": raw_data, "private": private, "public": public}
 
     def _clean_data(self, data_list):
         removed_gen_arch = [
@@ -575,6 +602,7 @@ def _fetch_execu_torch_data(self, start_time, end_time):
 
     def normalize_string(self, s: str) -> str:
         s = s.lower().strip()
+        s = s.replace("+", "plus")
         s = s.replace("_", "-")
         s = s.replace(" ", "-")
         s = re.sub(r"[^\w\-\.\(\)]", "-", s)
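The new `+` to `plus` rule is what lets a CLI filter like `samsung-galaxy-s22plus-5g` match a raw device string such as "Samsung Galaxy S22+ 5G". A standalone sketch of just the steps visible in this hunk (the real method applies further cleanup after the `re.sub`):

```python
import re

def normalize_string_sketch(s: str) -> str:
    # Reproduces only the transformations shown in the hunk above.
    s = s.lower().strip()
    s = s.replace("+", "plus")
    s = s.replace("_", "-")
    s = s.replace(" ", "-")
    s = re.sub(r"[^\w\-\.\(\)]", "-", s)
    return s

print(normalize_string_sketch("Samsung Galaxy S22+ 5G"))
# samsung-galaxy-s22plus-5g
```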
@@ -583,6 +611,37 @@ def normalize_string(self, s: str) -> str:
         s = s.replace(")-", ")").replace("-)", ")")
         return s
 
+    def filter_results(self, data: List, filters: BenchmarkFilters):
+        backends = filters.backends
+        devices = filters.devices
+        models = filters.models
+
+        if not backends and not devices and not models:
+            return data
+        logging.info(
+            f"applying OR filter: backends={backends}, devices={devices}, models={models}"
+        )
+        pre_len = len(data)
+        results = []
+        for item in data:
+            info = item.get("info", {})
+            if backends and info.get("backend") not in backends:
+                continue
+            if devices and not any(dev in info.get("device", "") for dev in devices):
+                continue
+            if models and info.get("model", "") not in models:
+                continue
+            results.append(item)
+        after_len = len(results)
+        logging.info(f"applied customized filter before: {pre_len}, after: {after_len}")
+        if after_len == 0:
+            logging.info(
+                "it seems no results match the filter values; "
+                "please rerun with --not-silent and check the field "
+                "'info' for the right format"
+            )
+        return results
+
 
 def argparsers():
     parser = argparse.ArgumentParser(description="Benchmark Analysis Runner")
622681
parser.add_argument(
623682
"--outputDir", default=".", help="Output directory, default is ."
624683
)
625-
684+
parser.add_argument(
685+
"--backends",
686+
nargs="+",
687+
help="Filter results by one or more backend full name(e.g. --backend qlora mv3) (OR logic)",
688+
)
689+
parser.add_argument(
690+
"--devices",
691+
nargs="+",
692+
help="Filter results by device names (e.g. --devices samsung-galaxy-s22-5g)(OR logic)",
693+
)
694+
parser.add_argument("--models", nargs="+", help="Filter by models (OR logic)")
626695
return parser.parse_args()
627696

628697

@@ -632,6 +701,9 @@ def argparsers():
632701
result = fetcher.run(
633702
args.startTime,
634703
args.endTime,
704+
filters=BenchmarkFilters(
705+
models=args.models, backends=args.backends, devices=args.devices
706+
),
635707
)
636708
if not args.silent:
637709
fetcher.print_all_groups_info()
