@@ -139,188 +139,6 @@ def argparser():
 
     return parser.parse_args()
 
-
-class BenchmarkFetcher:
-    def __init__(
-        self,
-        env="prod",
-        repo="pytorch/pytorch",
-        benchmark_name="",
-        disable_logging=False,
-        group_table_fields=None,
-        group_row_fields=None,
-        processor_funcs=None,
-    ):
-        """
-        Initialize the BenchmarkFetcher.
-
-        Args:
-            env: Environment to use ("local" or "prod")
-            repo: Repository whose benchmark results are queried (e.g. "pytorch/pytorch")
-            benchmark_name: Benchmark name to query for
-            disable_logging: Whether to suppress log output
-            group_table_fields: Custom fields to group tables by (defaults to device, backend, arch, model)
-            group_row_fields: Custom fields to group rows by (defaults to workflow_id, job_id, granularity_bucket)
-        """
-        self.env = env
-        self.base_url = self._get_base_url()
-        self.query_group_table_by_fields = (
-            group_table_fields
-            if group_table_fields
-            else ["device", "backend", "arch", "model"]
-        )
-        self.query_group_row_by_fields = (
-            group_row_fields
-            if group_row_fields
-            else ["workflow_id", "job_id", "granularity_bucket"]
-        )
-        self.data = None
-        self.disable_logging = disable_logging
-        self.processor_funcs = processor_funcs if processor_funcs else []
-
-        self.repo = repo
-        self.benchmark_name = benchmark_name
-
-    def _get_base_url(self) -> str:
-        """
-        Get the base URL for API requests based on environment.
-
-        Returns:
-            Base URL string for the configured environment
-        """
-        base_urls = {
-            "local": "http://localhost:3000",
-            "prod": "https://hud.pytorch.org",
-        }
-        return base_urls[self.env]
-
-    def _fetch_data(
-        self, start_time: str, end_time: str
-    ) -> Optional[List[Dict[str, Any]]]:
-        """
-        Fetch and process benchmark data for the specified time range.
-
-        Args:
-            start_time: ISO8601 formatted start time
-            end_time: ISO8601 formatted end time
-
-        Returns:
-            Processed benchmark data, or None if the fetch failed
-        """
-        data = self._call_api(start_time, end_time)
-        if data is None:
-            return None
-        self.data = self._process(data)
-        return self.data
-
-    def to_df(self) -> Any:
-        # Wrap each fetched table view's rows in a pandas DataFrame, keeping its groupInfo.
-        if not self.data:
-            return
-        dfs = [
-            {"groupInfo": item["groupInfo"], "df": pd.DataFrame(item["rows"])}
-            for item in self.data
-        ]
-        return dfs
-
-    def _process(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        Process raw benchmark data.
-
-        This method:
-        1. Normalizes string values in groupInfo
-        2. Creates table_name from group info components
-        3. Determines aws_type (public/private) based on device name
-        4. Sorts results by table_name
-
-        Args:
-            data: Raw benchmark data from API
-
-        Returns:
-            Processed benchmark data
-        """
-        for item in data:
-            # normalize string values in groupInfo
-            item["groupInfo"] = {
-                k: self.normalize_string(v)
-                for k, v in item.get("groupInfo", {}).items()
-                if v is not None and isinstance(v, str)
-            }
-            group = item.get("groupInfo", {})
-            name = self._generate_table_name(group, self.query_group_table_by_fields)
-
-            # Add full name joined by the group key fields
-            item["table_name"] = name
-
-            # Mark aws_type: private or public
-            if group.get("device", "").find("private") != -1:
-                item["groupInfo"]["aws_type"] = "private"
-            else:
-                item["groupInfo"]["aws_type"] = "public"
-
-        data.sort(key=lambda x: x["table_name"])
-        logging.info(f"fetched {len(data)} table views")
-        return data
-
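
To make the numbered steps in the `_process` docstring concrete, here is a small worked example of what one item looks like after processing; the field values are invented for illustration and assume the default group-by fields:

# Hypothetical input item (values invented for illustration).
item = {
    "groupInfo": {"device": "Apple iPhone 15 (private)", "backend": "coreml",
                  "arch": "iOS 17", "model": "llama3"},
    "rows": [],
}
# After BenchmarkFetcher._process([item]):
#   groupInfo values are lower-cased with spaces replaced by "_"
#   item["table_name"] == "apple_iphone_15_(private)|coreml|ios_17|llama3"
#   item["groupInfo"]["aws_type"] == "private"   (the device name contains "private")
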
-    def normalize_string(self, s, replace="_"):
-        return s.lower().replace(" ", replace)
-
-    def _generate_table_name(self, group_info: dict, fields: list[str]) -> str:
-        name = "|".join(
-            group_info[k] for k in fields if k in group_info and group_info[k]
-        )
-        return self.normalize_string(name)
-
-    def _call_api(self, start_time, end_time):
-        url = f"{self.base_url}/api/benchmark/group_data"
-        params_object = BenchmarkQueryGroupDataParams(
-            repo=self.repo,
-            benchmark_name=self.benchmark_name,
-            start_time=start_time,
-            end_time=end_time,
-            group_table_by_fields=self.query_group_table_by_fields,
-            group_row_by_fields=self.query_group_row_by_fields,
-        )
-        params = {k: v for k, v in params_object.__dict__.items() if v is not None}
-        response = requests.get(url, params=params)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            logging.info(f"Failed to fetch benchmark data ({response.status_code})")
-            logging.info(response.text)
-            return None
-
-    def _write_multiple_csv_files(
-        self, data_list: List[Dict[str, Any]], output_dir: str, file_prefix=""
-    ) -> None:
-        """
-        Write multiple benchmark results to separate CSV files.
-
-        Each entry in `data_list` becomes its own CSV file.
-
-        Args:
-            data_list: List of benchmark result dictionaries
-            output_dir: Directory to save the CSV files
-            file_prefix: Optional prefix prepended to each generated file name
-        """
-        os.makedirs(output_dir, exist_ok=True)
-        logging.info(
-            f"\n========= Generating multiple CSV files in {output_dir} =========\n"
-        )
-        for idx, entry in enumerate(data_list):
-            file_name = entry.get("short_name", f"file{idx + 1}")
-
-            if file_prefix:
-                file_name = file_prefix + file_name
-            if len(file_name) > 100:
-                logging.warning(
-                    f"File name '{file_name}' is too long, truncating to 100 characters"
-                )
-                file_name = file_name[:100]
-            file_path = os.path.join(output_dir, f"{file_name}.csv")
-
-            rows = entry.get("rows", [])
-            logging.info(f"Writing CSV: {file_path} with {len(rows)} rows")
-            df = pd.DataFrame(rows)
-            df.to_csv(file_path, index=False)
-
-
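
For context, a minimal sketch of how the removed BenchmarkFetcher could be driven end to end, using the repo and benchmark name it previously hardcoded; the time window and output directory are illustrative, not taken from the change:

# Illustrative only: fetch one week of data and dump each table view to CSV.
fetcher = BenchmarkFetcher(env="prod", repo="pytorch/executorch", benchmark_name="ExecuTorch")
data = fetcher._fetch_data("2025-01-01T00:00:00", "2025-01-08T00:00:00")
if data is not None:
    fetcher._write_multiple_csv_files(data, output_dir="./benchmark_csv")
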
 class ExecutorchBenchmarkFetcher:
     """
     Fetch and process benchmark data from HUD API for ExecutorchBenchmark.