Skip to content

Commit 04dbd97

Browse files
committed
final
Signed-off-by: Yang Wang <[email protected]>
1 parent 2878e96 commit 04dbd97

File tree

3 files changed

+59
-16
lines changed

3 files changed

+59
-16
lines changed

.ci/scripts/benchmark_tooling/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ python .ci/scripts/benchmark_tooling/analyze_benchmark_stability.py \
119119

120120
## Running Unit Tests
121121

122-
The benchmark tooling includes comprehensive unit tests to ensure functionality.
122+
The benchmark tooling includes unit tests to ensure functionality.
123123

124124
### Using pytest
125125

.ci/scripts/benchmark_tooling/common.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import json
22
import os
3+
from typing import Any, Dict, List
34

45
import pandas as pd
56

67

7-
def read_excel_with_json_header(path: str):
8+
def read_excel_with_json_header(path: str) -> List[Dict[str, Any]]:
89
# Read all sheets into a dict of DataFrames, without altering
910
all_sheets = pd.read_excel(path, sheet_name=None, header=None, engine="openpyxl")
1011

@@ -21,7 +22,7 @@ def read_excel_with_json_header(path: str):
2122
return results
2223

2324

24-
def read_all_csv_with_metadata(folder_path: str):
25+
def read_all_csv_with_metadata(folder_path: str) -> List[Dict[str, Any]]:
2526
results = [] # {filename: {"meta": dict, "df": DataFrame}}
2627
for fname in os.listdir(folder_path):
2728
if not fname.lower().endswith(".csv"):

.ci/scripts/benchmark_tooling/get_benchmark_analysis_data.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from dataclasses import dataclass
1717
from datetime import datetime
1818
from enum import Enum
19-
from typing import Any, Dict, List
19+
from typing import Any, Dict, List, Optional, Union
2020

2121
import pandas as pd
2222
import requests
@@ -126,8 +126,8 @@ class ExecutorchBenchmarkFetcher:
126126

127127
def __init__(
128128
self,
129-
env="prod",
130-
disable_logging=False,
129+
env: str = "prod",
130+
disable_logging: bool = False,
131131
group_table_fields=None,
132132
group_row_fields=None,
133133
):
@@ -189,7 +189,13 @@ def _filter_out_failure_only(
189189
self, data_list: List[Dict[str, Any]]
190190
) -> List[Dict[str, Any]]:
191191
"""
192-
clean FAILURE_REPORT only metrics
192+
Clean data by removing rows that only contain FAILURE_REPORT metrics.
193+
194+
Args:
195+
data_list: List of benchmark data dictionaries
196+
197+
Returns:
198+
Filtered list with rows containing only FAILURE_REPORT removed
193199
"""
194200
ONLY = {"workflow_id", "granularity_bucket", "job_id", "FAILURE_REPORT"}
195201
for item in data_list:
@@ -230,7 +236,13 @@ def _filter_public_result(self, private_list, public_list):
230236
filtered_public = [item for item in public_list if item["table_name"] in common]
231237
return filtered_public
232238

233-
def get_result(self):
239+
def get_result(self) -> Dict[str, List[Dict[str, Any]]]:
240+
"""
241+
Get a deep copy of the benchmark results.
242+
243+
Returns:
244+
Dictionary containing benchmark results grouped by category
245+
"""
234246
return deepcopy(self.to_dict())
235247

236248
def to_excel(self, output_dir: str = ".") -> None:
@@ -270,7 +282,7 @@ def _write_multi_sheet_excel(self, data_list, output_dir, file_name):
270282
worksheet.write_string(0, 0, json_str)
271283

272284
logging.info(
273-
f"Wrting excel sheet to file {file} with sheet name {sheet_name} for {entry["table_name"]}"
285+
f"Wrting excel sheet to file {file} with sheet name {sheet_name} for {entry['table_name']}"
274286
)
275287
# Write DataFrame starting at row 2 (index 1)
276288
df.to_excel(writer, sheet_name=sheet_name, startrow=1, index=False)
@@ -366,7 +378,7 @@ def generate_json_file(self, data, file_name, output_dir: str = "."):
366378
json.dump(data, f, indent=2)
367379
return path
368380

369-
def to_dict(self) -> Any:
381+
def to_dict(self) -> Dict[str, List[Dict[str, Any]]]:
370382
"""
371383
Convert benchmark results to a dictionary.
372384
@@ -378,15 +390,16 @@ def to_dict(self) -> Any:
378390
result[item.category] = item.data
379391
return result
380392

381-
def to_df(self) -> Any:
393+
def to_df(self) -> Dict[str, List[Dict[str, Union[Dict[str, Any], pd.DataFrame]]]]:
382394
"""
383395
Convert benchmark results to pandas DataFrames.
384396
385397
Creates a dictionary with categories as keys and lists of DataFrames as values.
386398
Each DataFrame represents one benchmark configuration.
387399
388400
Returns:
389-
Dictionary mapping categories to lists of DataFrames with metadata
401+
Dictionary mapping categories ['private','public'] to lists of DataFrames "df" with metadata "groupInfo".
402+
390403
"""
391404
result = {}
392405
for item in self.matching_groups.values():
@@ -423,7 +436,20 @@ def to_csv(self, output_dir: str = ".") -> None:
423436
path = os.path.join(output_dir, item.category)
424437
self._write_multiple_csv_files(item.data, path)
425438

426-
def _write_multiple_csv_files(self, data_list, output_dir, prefix=""):
439+
def _write_multiple_csv_files(
440+
self, data_list: List[Dict[str, Any]], output_dir: str, prefix: str = ""
441+
) -> None:
442+
"""
443+
Write multiple benchmark results to CSV files.
444+
445+
Creates a CSV file for each benchmark configuration, with metadata
446+
as a JSON string in the first row and data in subsequent rows.
447+
448+
Args:
449+
data_list: List of benchmark result dictionaries
450+
output_dir: Directory to save CSV files
451+
prefix: Optional prefix for CSV filenames
452+
"""
427453
os.makedirs(output_dir, exist_ok=True)
428454
for idx, entry in enumerate(data_list):
429455
filename = f"{prefix}_table{idx+1}.csv" if prefix else f"table{idx+1}.csv"
@@ -506,7 +532,9 @@ def _generate_matching_name(self, group_info: dict, fields: list[str]) -> str:
506532
# name = name +'(private)'
507533
return name
508534

509-
def _process(self, input_data: List[Dict[str, Any]], filters: BenchmarkFilters):
535+
def _process(
536+
self, input_data: List[Dict[str, Any]], filters: BenchmarkFilters
537+
) -> Dict[str, Any]:
510538
"""
511539
Process raw benchmark data.
512540
@@ -578,7 +606,9 @@ def _clean_data(self, data_list):
578606
data = self._filter_out_failure_only(removed_gen_arch)
579607
return data
580608

581-
def _fetch_execu_torch_data(self, start_time, end_time):
609+
def _fetch_execu_torch_data(
610+
self, start_time: str, end_time: str
611+
) -> Optional[List[Dict[str, Any]]]:
582612
url = f"{self.base_url}/api/benchmark/group_data"
583613
params_object = BenchmarkQueryGroupDataParams(
584614
repo="pytorch/executorch",
@@ -611,7 +641,19 @@ def normalize_string(self, s: str) -> str:
611641
s = s.replace(")-", ")").replace("-)", ")")
612642
return s
613643

614-
def filter_results(self, data: List, filters: BenchmarkFilters):
644+
def filter_results(self, data: List[Dict[str, Any]], filters: BenchmarkFilters) -> List[Dict[str, Any]]:
645+
"""
646+
Filter benchmark results based on specified criteria.
647+
648+
Applies OR logic for filtering - results match if they match any of the specified filters.
649+
650+
Args:
651+
data: List of benchmark data dictionaries
652+
filters: BenchmarkFilters object containing filter criteria
653+
654+
Returns:
655+
Filtered list of benchmark data dictionaries
656+
"""
615657
backends = filters.backends
616658
devices = filters.devices
617659
models = filters.models

0 commit comments

Comments (0)